# DiploDatos Kaggle Competition

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import random

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
# from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report

## Leer el dataset

### Train

Cargamos los datos de entrenamiento que vamos a utilizar para generar nuestro modelo.

In [2]:
# Load the labeled training set used to build the model.
df = pd.read_csv('data/diabetes_prediction_dataset_train-labeled.csv')
# Quick sanity check: dimensions first, then per-column summary statistics.
print(df.shape, df.describe(), sep='\n')

(95000, 10)
             patient           age  hypertension  heart_disease           bmi  \
count   95000.000000  95000.000000  95000.000000   95000.000000  95000.000000   
mean    50016.501389     41.935269      0.075074       0.039463     27.320879   
std     28868.357071     22.514788      0.263512       0.194695      6.626335   
min         1.000000      0.080000      0.000000       0.000000     10.010000   
25%     25021.750000     24.000000      0.000000       0.000000     23.650000   
50%     50024.000000     43.000000      0.000000       0.000000     27.320000   
75%     75024.250000     60.000000      0.000000       0.000000     29.580000   
max    100000.000000     80.000000      1.000000       1.000000     95.690000   

        HbA1c_level  blood_glucose_level      diabetes  
count  95000.000000         95000.000000  95000.000000  
mean       5.527659           138.070537      0.085074  
std        1.070261            40.739962      0.278993  
min        3.500000           

#### Separamos a las columnas en numéricas y categóricas

In [3]:
# Categorical features (one-hot encoded by the preprocessing pipeline).
cat_cols = ['gender', 'smoking_history']
# Numeric features: every remaining column except "patient" (the patient id)
# and "diabetes" (the target variable).
num_cols = [col for col in df.columns if col not in cat_cols + ['patient', 'diabetes']]

#### Preprocesamiento

In [4]:
# Target vector and feature matrix (the id and the target are dropped from the features).
y = df['diabetes']
X = df.drop(columns=['patient', 'diabetes'])
# 80/20 hold-out split with a fixed random_state so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=8,
)

In [5]:
# Load the preprocessing pipeline from disk.
# NOTE: joblib.load unpickles the file, so only load pipeline files from a trusted source.
pipeline = joblib.load('pipeline.pkl')
# Fit the pipeline on the training split only, then apply the fitted transform to the
# hold-out split so both splits receive the same treatment.
x_train_transformed = pipeline.fit_transform(x_train)
x_test_transformed = pipeline.transform(x_test)

Cargamos el conjunto de test con el que vamos a generar la salida.

In [7]:
# Load the competition test set; its predictions are what gets written to the submission file.
test_df = pd.read_csv('data/diabetes_prediction_dataset_test.csv')


# Acá empiezo con la red neuronal

Defino una clase que expone los datos en el formato que puede leer el `DataLoader` de PyTorch:


In [47]:
class set_up_data(Dataset):
    """Map-style dataset meant to be consumed by PyTorch's DataLoader.

    Parameters
    ----------
    data : dict
        Mapping with two entries: ``'x'`` (features, one row per sample) and
        ``'y'`` (targets, one entry per sample). Both are converted with
        ``np.asarray``, so array-likes such as pandas objects are accepted.

    Attributes
    ----------
    x_data, y_data : np.ndarray
        Features and targets, indexed by position.
    xmin, xmax, ymin, ymax
        Global minimum / maximum of the features and targets. They are stored
        for use with ``norm``; no normalization is applied automatically.
    """

    def __init__(self, data):
        # Store positional NumPy arrays. A pandas Series coming out of a shuffled
        # split keeps its original labels, so `series[i]` would be a label lookup
        # (wrong row or KeyError) instead of "the i-th sample".
        self.x_data = np.asarray(data['x'])
        self.y_data = np.asarray(data['y'])

        self.xmin, self.xmax = np.amin(self.x_data), np.amax(self.x_data)
        self.ymin, self.ymax = np.amin(self.y_data), np.amax(self.y_data)

    def __len__(self):
        """Return the total number of samples."""
        return len(self.y_data)

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair at position ``index``.

        Both are float32 tensors. The target is flattened to 1-D, so a scalar
        label becomes shape ``(1,)`` and a batch of labels collates to
        ``(batch, 1)``.
        """
        features = torch.as_tensor(self.x_data[index], dtype=torch.float32)
        target = torch.as_tensor(self.y_data[index], dtype=torch.float32).reshape(-1)
        return features, target

    def norm(self, data, datamin, datamax):
        """Min-max scale ``data`` to [0, 1] given its bounds ``datamin`` / ``datamax``."""
        return (data - datamin) / (datamax - datamin)

In [27]:
#fijo semilla

def define_seed(seed):
    """Seed the torch, NumPy and stdlib `random` generators with the same value.

    Parameters
    ----------
    seed : int
        Value handed to each generator's seeding function.
    """
    # Same order as before: torch first, then NumPy, then the stdlib generator.
    for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
        seed_fn(seed)

# Fix every random generator before any stochastic step runs.
define_seed(seed=100)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [28]:
# Training hyperparameters
batch_size = 100
max_epochs = 30
learning_rate = 1e-2

# Cost function to minimize; reduction='mean' averages the squared error over the batch.
# NOTE(review): the target is a binary label, so a cross-entropy loss may fit better — confirm.
MSE_Loss = nn.MSELoss(reduction='mean')

In [48]:
# Bundle the transformed features with their targets for each split.
train_data = dict(x=x_train_transformed, y=y_train)
test_data = dict(x=x_test_transformed, y=y_test)

# Wrap each split in the Dataset class defined earlier in the notebook.
train_subset = set_up_data(train_data)
test_subset = set_up_data(test_data)

# Training batches are reshuffled every epoch; the hold-out split is served as one
# single, unshuffled batch. (No validation loader is built yet.)
dataloader_train = DataLoader(train_subset, shuffle=True, batch_size=batch_size)
dataloader_test = DataLoader(test_subset, shuffle=False, batch_size=len(test_subset))

Armo el modelo:

In [None]:
# Take the input width from the transformed training matrix instead of hard-coding it:
# the column list built later in this notebook has 7 one-hot columns plus 6 numeric
# ones (13 features), so a fixed in_features=12 would fail on the first forward pass.
n_features = x_train_transformed.shape[1]

# Fully connected network: n_features -> 25 -> 15 -> 1, with a sigmoid after every layer,
# so the single output lies in (0, 1).
model = nn.Sequential(
    nn.Linear(in_features=n_features, out_features=25, bias=True),
    nn.Sigmoid(),
    nn.Linear(in_features=25, out_features=15, bias=True),
    nn.Sigmoid(),
    nn.Linear(in_features=15, out_features=1, bias=True),
    nn.Sigmoid()
)

# Move the model's parameters to the selected device (GPU if available, otherwise CPU).
model.to(device)

Para poder evaluar nuestra predicción, los datos de prueba deben recibir exactamente el mismo tratamiento que los datos de entrenamiento.

In [8]:
# Keep the patient ids aside: they are needed to build the submission file.
PatientId_test = test_df['patient']
# Target column as provided in the test file.
Y_test = test_df['diabetes']
# Features only: drop the id and the target, exactly as done for the training data.
X_test = test_df.drop(columns=['patient', 'diabetes'])


In [10]:
# Apply the already-fitted pipeline to the competition test features (transform only, no refit).
X_test_transformed = pipeline.transform(X_test)

In [11]:
# To get the names of the columns created by the OneHotEncoder, the encoder itself
# has to be reached through the fitted pipeline like this:
pipeline.transformers_[0][1]

In [12]:
# The get_feature_names_out method returns the names of the columns the encoder created.
pipeline.transformers_[0][1].get_feature_names_out()

array(['gender_Male', 'gender_Other', 'smoking_history_current',
       'smoking_history_ever', 'smoking_history_former',
       'smoking_history_never', 'smoking_history_not current'],
      dtype=object)

In [13]:
# Full column list: encoder-generated names first, then the numeric columns.
cols = [*pipeline.transformers_[0][1].get_feature_names_out().tolist(), *num_cols]
# Wrap the transformed test matrix in a DataFrame labeled with those column names.
X_test_transformed = pd.DataFrame(data=X_test_transformed, columns=cols)

Generamos la salida

In [None]:
test_id = PatientId_test

# `xgb` is never defined in this notebook, so the original call raised NameError on a
# fresh kernel. Predict with the neural network defined above instead.
# NOTE(review): no training loop is visible in this notebook; make sure `model` has
# been trained before running this cell.
model.eval()
with torch.no_grad():
    test_features = torch.as_tensor(
        np.asarray(X_test_transformed, dtype=np.float32), device=device
    )
    # The network ends in a sigmoid and has one output unit: one value in (0, 1) per row.
    test_prob = model(test_features).squeeze(1).cpu().numpy()

# Threshold at 0.5 to obtain the 0/1 labels required by the submission.
test_pred = np.int64(test_prob >= 0.5)

Con el resultado predicho tenemos que generar el archivo `.csv` para subir a la competencia de kaggle:

In [None]:
# Pair every patient id with its predicted label, one row per patient.
submission = pd.DataFrame(
    data=[*zip(test_id, test_pred)],
    columns=["patient", "diabetes"],
)
# Write the Kaggle submission file: header row included, no index column.
submission.to_csv("sample_submission.csv", index=False, header=True)