# Objetivo del notebook

El actual notebook tiene como objetivo el desarrollo de un modelo de clasificacion binaria, capaz de aprender a partir del conjunto de datos de entrenamiento, y generar predicciones acorde al conjunto de prueba.

### Instalacion de dependencias

In [1]:
# Catboost
!pip install catboost

# Importar las librerias a utilizar

La siguiente celda reune el codigo necesario para importar todas las librerias de las que se hacen uso en el presente notebook.

In [2]:
# Librerias y metodos para analisis y manipulacion de datos
import numpy as np
import pandas as pd

# Librerias y metodos para el desarrollo del modelo
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.over_sampling import SMOTE

# Otros
import os
import time
import joblib

In [3]:
## Cargo en memoria el conjunto de entrenamiento

# Ruta del fichero train.csv
train_dataset__route = "../data/processed/train/train.csv"

# Instancio un objeto dataframe que cargue el conjunto de datos de entrenamiento
train_df = pd.read_csv(train_dataset__route, low_memory = False)

# 10 primeros registros del dataframe
train_df.head(10)

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0.461538,0.666667,0.47619,0.5,0.091837,0.091837,0.0,0.0,0.364198,0.377358,...,0.24507,0.023131,0.58642,0.0,0.078261,0.00738,0.004806,0.022066,0.0,1
1,0.538462,0.583333,0.47619,0.538462,0.112245,0.142857,0.0,0.0,0.234568,0.339623,...,0.104225,0.063475,0.623457,0.0,0.078261,0.017528,0.010985,0.018054,0.0,1
2,0.692308,0.333333,0.190476,0.474359,0.040816,0.071429,0.0,0.0,0.487654,0.471698,...,0.126761,0.080151,0.481481,0.0,0.06087,0.015683,0.004806,0.019057,0.0,0
3,0.692308,0.5,0.333333,0.358974,0.020408,0.030612,0.0,0.0,0.376543,0.415094,...,0.174648,0.068316,0.537037,0.0,0.069565,0.012915,0.004463,0.022066,1.0,0
4,0.307692,0.666667,0.47619,0.516667,0.091837,0.091837,0.0,0.0,0.302469,0.433962,...,0.101408,0.071544,0.67284,0.0,0.069565,0.027675,0.013732,0.118355,0.0,1
5,0.615385,0.25,0.285714,0.346154,0.112245,0.112245,0.0,0.0,0.401235,0.509434,...,0.183099,0.050027,0.549383,0.0,0.06087,0.01476,0.007896,0.02006,0.0,0
6,0.076923,0.833333,0.47619,0.4,0.163265,0.071429,0.0,0.0,0.518519,0.518868,...,0.180282,0.051641,0.660494,0.0,0.078261,0.016605,0.010642,0.117352,1.0,1
7,0.307692,0.416667,0.238095,0.269231,0.112245,0.112245,0.0,0.0,0.216049,0.273585,...,0.166197,0.069392,0.493827,0.0,0.052174,0.009225,0.00309,0.015045,0.0,0
8,0.153846,0.5,0.428571,0.5,0.091837,0.05102,0.0,0.0,0.401235,0.424528,...,0.101408,0.080689,0.703704,0.2,0.086957,0.011993,0.007552,0.042126,1.0,1
9,0.153846,0.583333,0.238095,0.294872,0.112245,0.112245,0.0,0.0,0.240741,0.377358,...,0.225352,0.04411,0.574074,0.2,0.078261,0.012915,0.004119,0.012036,1.0,1


In [4]:
## Cargo en memoria el conjunto de prueba

# Ruta del fichero train.csv
test_dataset__route = "../data/processed/test/test.csv"

# Instancio un objeto dataframe que cargue el conjunto de datos de entrenamiento
test_df = pd.read_csv(test_dataset__route, low_memory = False)

# 10 primeros registros del dataframe
test_df.head(10)

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,id
0,0.307692,0.545455,0.4,0.430248,0.112245,0.112245,0.0,0.0,0.415493,0.49,...,0.238462,0.093519,0.571429,0.0,0.081633,0.016839,0.008239,0.03009,0.0,159256
1,0.923077,0.454545,0.3,0.547588,0.091837,0.091837,1.0,1.0,0.514085,0.32,...,0.130769,0.084495,0.496894,0.0,0.102041,0.018135,0.003776,0.022066,0.0,159257
2,0.615385,0.636364,0.4,0.462842,0.05102,0.061224,0.0,0.0,0.323944,0.35,...,0.161538,0.07137,0.645963,0.0,0.132653,0.041451,0.020254,0.034102,0.0,159258
3,0.307692,0.454545,0.2,0.208605,0.020408,0.030612,0.0,0.0,0.316901,0.22,...,0.438462,0.104184,0.590062,0.0,0.05102,0.024611,0.005836,0.008024,1.0,159259
4,0.307692,0.636364,0.45,0.500652,0.091837,0.081633,0.0,0.0,0.429577,0.54,...,0.161538,0.100082,0.714286,0.0,0.091837,0.031088,0.013045,0.025075,1.0,159260
5,0.307692,0.545455,0.25,0.323338,0.091837,0.091837,0.0,0.0,0.161972,0.12,...,0.176923,0.087777,0.627329,0.0,0.091837,0.027202,0.004806,0.054162,1.0,159261
6,0.307692,0.545455,0.5,0.469361,0.142857,0.142857,0.0,0.0,0.401408,0.32,...,0.215385,0.130435,0.652174,0.0,0.091837,0.023316,0.009955,0.023069,0.0,159262
7,0.538462,0.272727,0.2,0.237288,0.091837,0.081633,0.0,0.0,0.570423,0.51,...,0.323077,0.079573,0.571429,0.0,0.061224,0.025907,0.006522,0.034102,0.0,159263
8,0.769231,0.454545,0.3,0.365059,0.040816,0.040816,0.0,0.0,0.556338,0.5,...,0.3,0.091879,0.552795,0.0,0.112245,0.016839,0.007896,0.016048,0.0,159264
9,0.769231,0.181818,0.25,0.404172,0.091837,0.091837,0.0,0.0,0.514085,0.5,...,0.238462,0.089418,0.590062,0.0,0.091837,0.018135,0.004806,0.025075,0.0,159265


In [5]:
## Separo las caracteristicas predictoras de la variable objetivo
X = train_df.drop(columns = 'smoking')
y = train_df['smoking']

### Balanceo de clases con SMOTE


In [6]:
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X, y)

In [7]:
y.value_counts()

smoking
1    114269
0    114269
Name: count, dtype: int64

## Defino un primer modelo

Para el objetivo que tenemos entre manos, voy a definir un modelo Catboost para clasificacion binaria.
Ademas, el entrenamiento del modelo se va a ejecutar mediante optimizacion dinamica de hiperparametros, llevando a cabo el propio entrenamiento con un objeto instanciado de la clase GridSearchCV.

In [8]:
# Instancio el modelo LGBMClassifier
model_1 = LGBMClassifier(random_state=42, device_type = 'gpu', objective = 'binary')
# Defino un diccionario de parametros a optimizar
param_grid = {
    'learning_rate': [0.05], # Taxa de aprendizaje. Valores comunes van de 0.01 a 0.2
    'n_estimators': [150], # Número de árboles a construir.
    'num_leaves': [31], # Número máximo de nodos hoja en un árbol. Valores típicos van de 20 a 40
    'max_depth': [-1], # Máxima profundidad de cada árbol. -1 significa sin límite.
    'subsample': [0.8],
    'colsample_bytree': [0.4], # Fracción de características a considerar para cada árbol.
    'min_data_in_leaf': [20],
    'gpu_use_dp': [True],
    'max_bin': [255]
}

# Instancio un objeto de la clase GridSearchCV
grid_cv_1 = GridSearchCV(model_1, param_grid, verbose=False)

# Entreno el modelo con el conjunto de entrenamiento
inicio_modelo_1 = time.time()
grid_cv_1.fit(X, y)
fin_modelo_1 = time.time()
tiempo_modelo_1 = fin_modelo_1 - inicio_modelo_1

[LightGBM] [Info] Number of positive: 91415, number of negative: 91415
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4577
[LightGBM] [Info] Number of data points in the train set: 182830, number of used features: 22
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 18 dense feature groups (3.49 MB) transferred to GPU in 0.003250 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 91415, number of negative: 91415
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4577
[LightGBM] [Info] Number of data points in the train set: 182830, number of used features: 22
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070, Vendor: NVIDIA Co

In [9]:
# Parametros del mejor modelo
best_params_1 = grid_cv_1.best_params_

# Mejor modelo entrenado
best_model_1 = grid_cv_1.best_estimator_

print('Parametros del mejor modelo entrenado ==>', best_params_1)

Parametros del mejor modelo entrenado ==> {'colsample_bytree': 0.4, 'gpu_use_dp': True, 'learning_rate': 0.05, 'max_bin': 255, 'max_depth': -1, 'min_data_in_leaf': 20, 'n_estimators': 150, 'num_leaves': 31, 'subsample': 0.8}


In [10]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
model_2 = LGBMClassifier(random_state=42, device_type = 'gpu', objective = 'binary', verbose_eval=False)
# Defino un diccionario de parametros a optimizar
param_grid = {
    'learning_rate': [0.15], # Taxa de aprendizaje. Valores comunes van de 0.01 a 0.2
    'n_estimators': [150], # Número de árboles a construir.
    'num_leaves': [35], # Número máximo de nodos hoja en un árbol. Valores típicos van de 20 a 40
    'max_depth': [-1], # Máxima profundidad de cada árbol. -1 significa sin límite.
    'subsample': [0.8],
    'colsample_bytree': [0.6], # Fracción de características a considerar para cada árbol.
    'min_data_in_leaf': [30],
    'gpu_use_dp': [True],
    'max_bin': [255]
}

# Instancio un objeto de la clase GridSearchCV
grid_cv_2 = GridSearchCV(model_2, param_grid, verbose = False)

inicio_modelo_2 = time.time()
# Entreno el modelo con el conjunto de entrenamiento
grid_cv_2.fit(X_train, y_train)
fin_modelo_2 = time.time()
tiempo_modelo_2 = fin_modelo_2 - inicio_modelo_2

[LightGBM] [Info] Number of positive: 73130, number of negative: 73134
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4547
[LightGBM] [Info] Number of data points in the train set: 146264, number of used features: 22
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4070, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 16
[LightGBM] [Info] 18 dense feature groups (2.79 MB) transferred to GPU in 0.002790 secs. 1 sparse feature groups
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499986 -> initscore=-0.000055
[LightGBM] [Info] Start training from score -0.000055
[LightGBM] [Info] Number of positive: 73130, number of negative: 73134
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 4549
[LightGBM] [Info] Number of data points in the train set: 146264, number of used features: 22
[LightGBM] [Info] Using

In [12]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
model_3 = RandomForestClassifier(n_estimators=100, random_state=42)

inicio_modelo_3 = time.time()
model_3.fit(X_train, y_train)
fin_modelo_3 = time.time()
tiempo_modelo_3 = fin_modelo_3 - inicio_modelo_3

In [14]:
# Obtenemos las predicciones de los modelos
preds_1 = grid_cv_1.best_estimator_.predict_proba(X_val)[:, 1]
preds_2 = grid_cv_2.best_estimator_.predict_proba(X_val)[:, 1]
preds_3 = model_3.predict_proba(X_val)[:, 1]

# Creamos el nuevo conjunto de entrenamiento para el meta-modelo
X_new = np.column_stack((preds_1, preds_2, preds_3))



In [15]:
# Entrenamos el meta-modelo
meta_model = LogisticRegression(random_state=42)

inicio_modelo_meta = time.time()
meta_model.fit(X_new, y_val)
fin_modelo_meta = time.time()
tiempo_modelo_meta = fin_modelo_meta - inicio_modelo_meta

In [16]:
print('TIEMPOS DE ENTRENAMIENTO (en segundos):')
print('Modelo 1 (LightGBM) ==>', tiempo_modelo_1)
print('Modelo 2 (LightGBM) ==>', tiempo_modelo_2)
print('Modelo 3 (RandomForestClassifier) ==>', tiempo_modelo_3)
print('Modelo meta (LogisticRegression) ==>', tiempo_modelo_meta)

TIEMPOS DE ENTRENAMIENTO (en segundos):
Modelo 1 (LightGBM) ==> 9.013382196426392
Modelo 2 (LightGBM) ==> 6.368925094604492
Modelo 3 (RandomForestClassifier) ==> 30.05014967918396
Modelo meta (LogisticRegression) ==> 0.02691507339477539


## Hiperparametros de los modelos

In [17]:
print('HIPERPARAMETROS DE LOS MODELOS:')
print('Modelo 1 (LightGBM) ==>', grid_cv_1.best_params_)
print('Modelo 2 (LightGBM) ==>', grid_cv_2.best_params_)
print('Modelo 3 (RandomForestClassifier) ==>')
print('Theta Modelo meta (LogisticRegression) ==>')
print('Bias Modelo meta (LogisticRegression) ==>')

HIPERPARAMETROS DE LOS MODELOS:
Modelo 1 (LightGBM) ==> {'colsample_bytree': 0.4, 'gpu_use_dp': True, 'learning_rate': 0.05, 'max_bin': 255, 'max_depth': -1, 'min_data_in_leaf': 20, 'n_estimators': 150, 'num_leaves': 31, 'subsample': 0.8}
Modelo 2 (LightGBM) ==> {'colsample_bytree': 0.6, 'gpu_use_dp': True, 'learning_rate': 0.15, 'max_bin': 255, 'max_depth': -1, 'min_data_in_leaf': 30, 'n_estimators': 150, 'num_leaves': 35, 'subsample': 0.8}
Modelo 3 (RandomForestClassifier) ==>
Theta Modelo meta (LogisticRegression) ==>
Bias Modelo meta (LogisticRegression) ==>


## Coeficientes de los modelos entrenados

In [18]:
print('COEFICIENTES DE THETA:')
print('Modelo 1 (LightGBM) ==>', grid_cv_1.best_estimator_.feature_importances_)
print('Modelo 2 (LightGBM) ==>', grid_cv_2.best_estimator_.feature_importances_)
print('Modelo 3 (RandomForestClassifier) ==>', model_3.feature_importances_)
print('Theta Modelo meta (LogisticRegression) ==>', meta_model.coef_)
print('Bias Modelo meta (LogisticRegression) ==>', meta_model.intercept_)

COEFICIENTES DE THETA:
Modelo 1 (LightGBM) ==> [404 377 279  91 109  81   3   7 142 139 127 135 269 141 204 353   3 622
 195 251 488  80]
Modelo 2 (LightGBM) ==> [312 246 217 238 171 167  14  16 231 242 305 243 341 240 270 393  21 362
 277 337 418  39]
Modelo 3 (RandomForestClassifier) ==> [0.03523531 0.12882504 0.04682345 0.04240012 0.02602392 0.02582706
 0.00145281 0.00151369 0.03890588 0.03645506 0.04002612 0.04386789
 0.07236651 0.04532802 0.04590697 0.119981   0.00370751 0.05330746
 0.0384999  0.04471238 0.1024197  0.0064142 ]
Theta Modelo meta (LogisticRegression) ==> [[-1.59108502  4.32310047  4.01457941]]
Bias Modelo meta (LogisticRegression) ==> [-3.55955573]


In [19]:
meta_model.classes_

array([0, 1], dtype=int64)

# Inferencia

En las siguientes celdas, cargo en memoria el dataset de testing, y utilizo el ensamblaje de modelos de clasificacion para generar las predicciones del conjunto de datos.

In [20]:
test_data__route = '../data/processed/test/test.csv'
test_df = pd.read_csv(test_data__route, low_memory = False)

id_column = test_df['id']

# Elimino la columna 'id'
test_df.drop(columns = ['id'], inplace = True)
# 5 primeros registros del dataframe
test_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,0.307692,0.545455,0.4,0.430248,0.112245,0.112245,0.0,0.0,0.415493,0.49,...,0.32963,0.238462,0.093519,0.571429,0.0,0.081633,0.016839,0.008239,0.03009,0.0
1,0.923077,0.454545,0.3,0.547588,0.091837,0.091837,1.0,1.0,0.514085,0.32,...,0.277778,0.130769,0.084495,0.496894,0.0,0.102041,0.018135,0.003776,0.022066,0.0
2,0.615385,0.636364,0.4,0.462842,0.05102,0.061224,0.0,0.0,0.323944,0.35,...,0.305556,0.161538,0.07137,0.645963,0.0,0.132653,0.041451,0.020254,0.034102,0.0
3,0.307692,0.454545,0.2,0.208605,0.020408,0.030612,0.0,0.0,0.316901,0.22,...,0.072222,0.438462,0.104184,0.590062,0.0,0.05102,0.024611,0.005836,0.008024,1.0
4,0.307692,0.636364,0.45,0.500652,0.091837,0.081633,0.0,0.0,0.429577,0.54,...,0.17037,0.161538,0.100082,0.714286,0.0,0.091837,0.031088,0.013045,0.025075,1.0


In [21]:
# Aquí asumimos que X_test es tu conjunto de datos de prueba
preds_1_test = grid_cv_1.best_estimator_.predict_proba(test_df)[:, 1]
preds_2_test = grid_cv_2.best_estimator_.predict_proba(test_df)[:, 1]
preds_3_test = model_3.predict_proba(test_df)[:, 1]

X_test_new = np.column_stack((preds_1_test, preds_2_test, preds_3_test))

# Finalmente, hacemos las predicciones con el meta-modelo
final_predictions = meta_model.predict(X_test_new)



In [22]:
final_predictions[-100:]

array([1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0], dtype=int64)

In [23]:
# Genero un dataframe con las predicciones del ensamblaje, y guardo los resultados en un fichero .csv
submission_df = pd.DataFrame(data = id_column,
                             columns = ['id'])
# Concateno la columna de resultados
submission_df['smoking'] = final_predictions

# 5 primeros registros
submission_df.head()

Unnamed: 0,id,smoking
0,159256,1
1,159257,0
2,159258,1
3,159259,0
4,159260,1


In [24]:
# Guardo el dataframe como fichero .csv
submission_path = '../src/results'
if not os.path.exists(submission_path):
    os.mkdir(submission_path)

submission_df.to_csv(os.path.join(submission_path, 'submission.csv'), index = False)
print('Resultados almacenados con exito.')

Resultados almacenados con exito.


# --- RESULTADO ---

A continuacion, se muestra una captura de pantalla de la puntuacion obtenida tras publicar el .csv generado con los resultados predichos por el ensamblaje.

![Submission Score](../src/data/images/submission_score.png)

## Guardo los modelos entrenados


In [27]:
# Ruta donde almaceno los modelos entrenados
models_path = "../src/models/"

## Guardo los modelos
try:
    # Modelo 1
    joblib.dump(grid_cv_1.best_estimator_, os.path.join(models_path, 'lightgbm_1.joblib'))
    
    # Modelo 2
    joblib.dump(grid_cv_2.best_estimator_, os.path.join(models_path, 'lightgbm_2.joblib'))
    
    # Modelo 3
    joblib.dump(model_3, os.path.join(models_path, 'rf.joblib'))
    
    # Meta modelo
    joblib.dump(meta_model, os.path.join(models_path, 'meta_model.joblib'))
    
    print('Los modelos entrenados han sido guardados con exito.')

except Exception as e:
    print('No ha sido posible guardar los modelos entrenados, debido al siguiente error ==>', e)

Los modelos entrenados han sido guardados con exito.
