# DiploDatos Kaggle Competition

In [2]:
import os
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

import sklearn.metrics
from sklearn import calibration as cal
# from sklearn.preprocessing import StandardScaler, OneHotEncoder
# from sklearn.compose import ColumnTransformer
# from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
# from sklearn.metrics import accuracy_score, ConfusionMatrixDisplay, classification_report


import numpy as np
from scipy.stats import spearmanr

def rmse(modeldata, targetdata):
    """Return the root-mean-square error between two arrays.

    Both inputs are flattened first, so only their total number of
    elements has to match.
    """
    residual = modeldata.flatten() - targetdata.flatten()
    mean_squared = np.mean(residual ** 2)
    return np.sqrt(mean_squared)

def bias(modeldata, targetdata):
    """Return the mean signed difference (model minus target) of the flattened inputs."""
    residual = modeldata.flatten() - targetdata.flatten()
    return np.mean(residual)

def corr_P(modeldata, targetdata):
    """Return the Pearson correlation coefficient between the flattened inputs."""
    predicted = modeldata.flatten()
    observed = targetdata.flatten()
    # np.corrcoef returns the 2x2 correlation matrix; the off-diagonal entry is the coefficient.
    correlation_matrix = np.corrcoef(predicted, observed)
    return correlation_matrix[0, 1]

def corr_S(modeldata, targetdata):
    """Return the Spearman rank correlation coefficient between the flattened inputs."""
    predicted = modeldata.flatten()
    observed = targetdata.flatten()
    # spearmanr returns (coefficient, p-value); only the coefficient is kept.
    result = spearmanr(predicted, observed)
    return result[0]

## Leer el dataset

### Train

Cargamos los datos de entrenamiento que vamos a utilizar para generar nuestro modelo.

In [3]:
# Read the labelled training set, then print its dimensions and the
# summary statistics of its numeric columns.
train_csv_path = 'data/diabetes_prediction_dataset_train-labeled.csv'
df = pd.read_csv(train_csv_path)
print(df.shape)
print(df.describe())

(95000, 10)
             patient           age  hypertension  heart_disease           bmi  \
count   95000.000000  95000.000000  95000.000000   95000.000000  95000.000000   
mean    50016.501389     41.935269      0.075074       0.039463     27.320879   
std     28868.357071     22.514788      0.263512       0.194695      6.626335   
min         1.000000      0.080000      0.000000       0.000000     10.010000   
25%     25021.750000     24.000000      0.000000       0.000000     23.650000   
50%     50024.000000     43.000000      0.000000       0.000000     27.320000   
75%     75024.250000     60.000000      0.000000       0.000000     29.580000   
max    100000.000000     80.000000      1.000000       1.000000     95.690000   

        HbA1c_level  blood_glucose_level      diabetes  
count  95000.000000         95000.000000  95000.000000  
mean       5.527659           138.070537      0.085074  
std        1.070261            40.739962      0.278993  
min        3.500000           

#### Separamos a las columnas en numéricas y categóricas

In [4]:
# Columns handled as categorical features.
cat_cols = ['gender', 'smoking_history']
# Everything else is treated as numeric, except "patient" (the patient id)
# and "diabetes" (the target variable).
non_numeric = cat_cols + ['patient', 'diabetes']
num_cols = [col for col in df.columns if col not in non_numeric]

#### Preprocesamiento

In [5]:
# Features: every column except the patient id and the target.
X = df.drop(['patient', 'diabetes'], axis=1)
# Target column.
y = df.diabetes
# Hold out 20 % of the rows for evaluation, with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified although the mean of "diabetes" is
# only about 0.085 (see the describe() output) -- consider passing stratify=y.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=8, train_size=0.8)

In [6]:
# Load the preprocessing pipeline stored in pipeline.pkl.
# NOTE(review): joblib.load unpickles arbitrary Python objects -- only load
# files that come from a trusted source.
pipeline = joblib.load('pipeline.pkl')
# Fit the pipeline on the training split only, then apply the fitted
# transformation (without refitting) to the hold-out split.
x_train_transformed = pipeline.fit_transform(x_train)
x_test_transformed = pipeline.transform(x_test)

# Acá empiezo con la red neuronal

Defino una clase para generar datos que pueda leer el dataloader de torch:


In [7]:
class set_up_data(Dataset):
    """Map-style dataset that feeds (features, one-hot target) pairs to a PyTorch DataLoader.

    Features are stored as an array of shape (n_features, n_samples) and are
    rescaled feature by feature; the target is stored one-hot encoded with
    shape (2, n_samples).

    Note: pandas Series can raise on positional key look-ups, which is why the
    inputs are converted to NumPy arrays before they are indexed.

    Parameters
    ----------
    data : dict
        Mapping with the keys 'x' (2-D array-like with one row per sample) and
        'y' (1-D array-like of labels, 0 meaning class 0 and anything else class 1).
        NOTE(review): 'x' is assumed to be dense; a scipy sparse matrix would
        not survive ``np.array`` -- confirm against the pipeline output.
    scaling : {'norm', '01'}
        'norm' applies min-max scaling, '01' divides by the per-feature maximum.
    xmin, xmax : array-like of shape (n_features,), optional
        Per-feature statistics used for the scaling. When omitted they are
        computed from ``data['x']`` itself. Pass the statistics of the training
        set here to scale another split consistently with it.

    Raises
    ------
    ValueError
        If ``scaling`` is not one of the supported names.
    """

    def __init__(self, data, scaling='norm', xmin=None, xmax=None):
        # Transposed so that axis 0 indexes features and axis 1 indexes samples.
        self.x_data = np.array(data['x']).T
        self.y_data = np.array(data['y'])
        self.xmin = np.amin(self.x_data, axis=1) if xmin is None else np.asarray(xmin)
        self.xmax = np.amax(self.x_data, axis=1) if xmax is None else np.asarray(xmax)

        scalers = {'norm': self.Norm, '01': self.ScaleTo01}
        if scaling not in scalers:
            raise ValueError(
                "scaling must be one of %s, got %r" % (sorted(scalers), scaling)
            )
        self.scaling_mtd = scalers[scaling]

        self.x_data = self.scaling_mtd(self.x_data, self.xmin, self.xmax)
        self.y_data = self.class_data(self.y_data)

    def __getitem__(self, index):
        """Return the float tensors (features, one-hot target) of sample ``index``."""
        x = torch.tensor(self.x_data[:, index], dtype=torch.float)
        y = torch.tensor(self.y_data[:, index], dtype=torch.float)
        return x, y

    def __len__(self):
        """Return the total number of samples."""
        return self.y_data.shape[1]

    @staticmethod
    def _safe_divisor(divisor):
        """Replace zeros in ``divisor`` by 1 so that dividing by it never yields NaN/inf."""
        divisor = np.asarray(divisor)
        return np.where(divisor == 0, 1, divisor)

    def Norm(self, data, datamin, datamax):
        """Min-max scale each feature (row) of ``data`` with the given statistics.

        A feature whose minimum equals its maximum has no spread; it is mapped
        to 0 instead of producing a division by zero.
        """
        span = self._safe_divisor(datamax - datamin)
        return (data - datamin[:, np.newaxis]) / span[:, np.newaxis]

    def ScaleTo01(self, data, datamin, datamax):
        """Divide each feature (row) of ``data`` by its maximum; ``datamin`` is unused.

        A feature whose maximum is 0 is left unscaled instead of being divided by zero.
        """
        return data / self._safe_divisor(datamax)[:, np.newaxis]

    def denorm(self, data, datamin, datamax):
        """Invert ``Norm``: map scaled values back to the original feature range."""
        return (data) * (datamax[:, np.newaxis] - datamin[:, np.newaxis]) + datamin[:, np.newaxis]

    def class_data(self, arr):
        """One-hot encode the labels into an array of shape (2, n_samples).

        Row 0 is 1 where the label equals 0, row 1 is 1 for every other label.
        """
        labels = np.asarray(arr)
        is_class_0 = labels == 0

        arr_out = np.zeros((2, len(labels)))
        arr_out[0, is_class_0] = 1
        arr_out[1, ~is_class_0] = 1

        return arr_out

In [8]:
# Fix the random seeds so that runs are repeatable.

def define_seed(seed):
    """Seed the Python, NumPy and PyTorch random number generators with ``seed``."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)


define_seed(seed=100)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Armo el modelo:

In [None]:
class LinearNN(nn.Module):
    """Fully connected classifier: three ReLU hidden layers followed by a softmax output.

    Parameters
    ----------
    dims : sequence of int
        Five layer widths ``[n_inputs, hidden_1, hidden_2, hidden_3, n_outputs]``.

    Notes
    -----
    ``forward`` returns class probabilities (rows sum to 1), not logits.
    ``nn.CrossEntropyLoss`` applies a log-softmax to whatever it receives, so
    pass it the logarithm of these probabilities rather than the probabilities
    themselves.
    """

    def __init__(self, dims):
        super().__init__()
        self.linear1 = nn.Linear(in_features=dims[0], out_features=dims[1], bias=True)
        self.linear2 = nn.Linear(in_features=dims[1], out_features=dims[2], bias=True)
        self.linear3 = nn.Linear(in_features=dims[2], out_features=dims[3], bias=True)
        self.linear4 = nn.Linear(in_features=dims[3], out_features=dims[4], bias=True)

        self.dims = dims
        self.activation = nn.ReLU()
        # forward() always works on (batch, features) tensors, so classes live on axis 1.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities of shape (batch, dims[4]) for the input ``x``."""
        # Flatten whatever comes in to (batch, n_inputs). Every linear layer then
        # already produces (batch, width), so no further reshaping is required.
        x = x.view(-1, self.dims[0])
        x = self.activation(self.linear1(x))
        x = self.activation(self.linear2(x))
        x = self.activation(self.linear3(x))
        return self.softmax(self.linear4(x))

    def initialize_weights(self):
        """Re-initialise every linear layer: Xavier-normal weights and biases set to 1."""
        for m in self.modules():
            # self.modules() also yields this module itself and the activation
            # modules, none of which has weight/bias attributes.
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 1)

# Layer widths. The input width is read from the transformed training features
# (it used to be hard-coded as 13), because a mismatch would make the reshape
# in forward() silently regroup the batch instead of raising an error.
n_features = x_train_transformed.shape[1]
nlayers = [n_features, 260, 260, 260, 2]
model = LinearNN(nlayers)

# Move the model parameters to the selected device.
model.to(device)

In [None]:
# Hyperparameters
# NOTE(review): the training mini-batch size is the size of the hold-out split -- confirm this is intended.
batch_size = len(y_test)
max_epochs = 30
learning_rate = 1e-3

# Bundle features and labels in the dict layout that set_up_data reads.
train_data = {'x': x_train_transformed, 'y': y_train}
test_data = {'x': x_test_transformed, 'y': y_test}

# Fraction of samples labelled 1 in each split.
ratio_1_0_train = float((y_train == 1).mean())
ratio_1_0_test = float((y_test == 1).mean())

# NOTE(review): each split is min-max scaled with its OWN per-feature minimum and
# maximum, so the hold-out features are not on exactly the same scale as the training ones.
train_subset = set_up_data(train_data, scaling='norm')
test_subset = set_up_data(test_data, scaling='norm')

# The hold-out loader serves the whole split as a single batch.
dataloader_train = DataLoader(train_subset, shuffle=False, batch_size=batch_size)
dataloader_test = DataLoader(test_subset, shuffle=False, batch_size=len(y_test))

In [None]:
# Number of training samples in each class.
count_0 = int((y_train == 0).sum())
count_1 = int((y_train == 1).sum())

counts = [count_0, count_1]

# The loss is weighted by the inverse of each class count so that class 1,
# which is much less frequent, is not swamped by class 0.
ww = 1. / np.array(counts)
ww_norm = ww / np.sum(ww)  # normalise so that the weights add up to 1

# Cross-entropy loss for classification. NumPy produces float64 weights on the
# host, so they are converted to float32 and placed on `device` to match the
# dtype and location of the model outputs.
Loss = nn.CrossEntropyLoss(
    weight=torch.as_tensor(ww_norm, dtype=torch.float32, device=device)
)

# Optimiser
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)


# Entreno la red

In [1]:
# Per-epoch histories: training loss, plus the loss and the evaluation metrics
# on the hold-out split.
RMSE, BIAS, Corr_P, Corr_S = [], [], [], []
loss_train = []
loss_val = []

# Lower bound applied before taking the logarithm, so that a probability of
# exactly 0 cannot turn into -inf.
PROB_FLOOR = 1e-12

# The hold-out loader serves the whole split as one fixed batch (no shuffling),
# so it is fetched once instead of being rebuilt on every epoch.
input_test, target_test = next(iter(dataloader_test))
input_test, target_test = input_test.to(device), target_test.to(device)
target_test_np = target_test.cpu().numpy()

for epoch in range(max_epochs):
    # Training phase.
    model.train()

    sum_loss = 0.0
    batch_counter = 0

    # Iterate over the mini-batches.
    for inputs, target in dataloader_train:
        inputs, target = inputs.to(device), target.to(device)

        optimizer.zero_grad()

        # No squeeze(): the output is already (batch, 2), and squeezing would
        # drop the batch axis of a batch that holds a single sample.
        outputs = model(inputs)

        # The model returns softmax probabilities, while CrossEntropyLoss applies
        # log-softmax to its input. Passing log-probabilities makes that step an
        # identity (the probabilities already sum to 1), so the loss is the
        # weighted cross-entropy of the model's own probabilities instead of the
        # cross-entropy of a second softmax on top of them.
        loss = Loss(torch.log(outputs.clamp_min(PROB_FLOOR)), target)

        loss.backward()
        optimizer.step()

        batch_counter += 1
        sum_loss += loss.item()

    # Mean loss over all mini-batches of the epoch.
    loss_train.append(sum_loss / batch_counter)

    # Evaluation phase on the hold-out split.
    model.eval()
    with torch.no_grad():
        output_test = model(input_test)
        loss_val.append(
            Loss(torch.log(output_test.clamp_min(PROB_FLOOR)), target_test).item()
        )

    print('Loss Train: ', str(loss_train[epoch]))
    print('Loss Val:   ', str(loss_val[epoch]))

    # RMSE, bias, Pearson and Spearman correlation. The metric helpers are
    # NumPy based, so the tensors are copied to host memory first.
    output_test_np = output_test.cpu().numpy()
    RMSE.append(rmse(output_test_np, target_test_np))
    BIAS.append(bias(output_test_np, target_test_np))
    Corr_P.append(corr_P(output_test_np, target_test_np))
    Corr_S.append(corr_S(output_test_np, target_test_np))

fig, ax = plt.subplots()
ax.plot(loss_train, label='train')
ax.plot(loss_val, label='validation')
ax.set_xlabel('Epoch')
ax.set_ylabel('Weighted cross-entropy loss')
ax.legend()
plt.show()

NameError: name 'max_epochs' is not defined

In [None]:
# Predict on the hold-out split.
model.eval()

input_test, target_test = next(iter(dataloader_test))
input_test, target_test = input_test.to(device), target_test.to(device)

with torch.no_grad():
    output_test = model(input_test)

# Copy to host memory before converting: NumPy cannot read a tensor that lives on the GPU.
output_test = output_test.cpu().numpy()

# Turn the probabilistic outputs into one-hot labels: the most probable class
# gets a 1 and the other a 0. (The result of argmax is no longer stored in a
# variable called `max`, which shadowed the builtin for the rest of the notebook.)
predicted_class = output_test.argmax(axis=1)
output_test0 = np.zeros(output_test.shape)
output_test0[np.arange(output_test.shape[0]), predicted_class] = 1
print(output_test0)

In [None]:
# scikit-learn works on host-side NumPy arrays; convert whatever the previous
# cells left behind (tensor or array) so this cell also runs when a GPU is used.
target_np = target_test.cpu().numpy() if torch.is_tensor(target_test) else np.asarray(target_test)
probs_np = output_test.cpu().numpy() if torch.is_tensor(output_test) else np.asarray(output_test)

# The figures are written below tmp/; create the folder if it does not exist yet.
os.makedirs("tmp", exist_ok=True)

# Confusion matrix, normalised over the true classes (each row sums to 1).
cmatrix = sklearn.metrics.confusion_matrix(
    np.argmax(target_np, axis=1), np.argmax(probs_np, axis=1), normalize="true"
)
disp = sklearn.metrics.ConfusionMatrixDisplay(cmatrix)
disp.plot()
disp.figure_.savefig("tmp/conf_mat")

# Reliability diagram of each output column. The curve names used to be
# "lluvia severa" / "lluvia moderada", left over from an unrelated project.
calibration_plots = [
    ("diabetes = 0", "tmp/diag_conf_lig"),
    ("diabetes = 1", "tmp/diag_conf_mod"),
]
for class_idx, (curve_name, out_path) in enumerate(calibration_plots):
    fig, ax = plt.subplots(1, 1)
    disp = cal.CalibrationDisplay.from_predictions(
        target_np[:, class_idx], probs_np[:, class_idx], n_bins=10, name=curve_name, ax=ax
    )
    ax.set_title('Diagrama de confiabilidad')
    ax.set_xlabel(f'Probabilidad de diabetes tipo {class_idx}')
    ax.set_ylabel(f'Frecuencia observada - diabetes tipo {class_idx}')
    fig.savefig(out_path)

In [None]:
# NOTE(review): `test_df` is not created by any cell shown above -- the frame
# (presumably the competition test set; confirm file and path) has to be loaded
# into `test_df` before this cell can run on a fresh kernel.
# Separate the target, the features and the patient ids of that frame.
Y_test = test_df.diabetes
X_test = test_df.drop(columns=['patient','diabetes'])
PatientId_test = test_df['patient']


In [None]:
# Apply the pipeline that was fitted on x_train (transform only, no refit).
X_test_transformed = pipeline.transform(X_test)

# --------------------------------------------------------------------------------

In [None]:
# The names of the columns created by the OneHotEncoder are obtained through
# the encoder itself, which is accessed like this:
pipeline.transformers_[0][1]

In [None]:
# get_feature_names_out returns the names of the columns created by that transformer.
pipeline.transformers_[0][1].get_feature_names_out()

In [None]:
# Column labels: the names generated by the first transformer, followed by num_cols.
# NOTE(review): assumes the pipeline outputs the first transformer's columns
# before the numeric ones, in num_cols order -- confirm against the pipeline definition.
cols = pipeline.transformers_[0][1].get_feature_names_out().tolist() + num_cols
# Wrap the transformed array in a DataFrame carrying those labels.
X_test_transformed = pd.DataFrame(X_test_transformed, columns=cols)

Generamos la salida

In [None]:
test_id = PatientId_test

# Predict with the neural network trained in this notebook; the `xgb` model
# this cell used to call is not defined anywhere in it.
model.eval()
# set_up_data keeps features as (n_features, n_samples), hence the transpose.
features_submit = np.asarray(X_test_transformed, dtype=float).T
# Apply the same min-max scaling as the training set, with the training statistics.
features_submit = train_subset.Norm(features_submit, train_subset.xmin, train_subset.xmax)
with torch.no_grad():
    probs_submit = model(torch.tensor(features_submit.T, dtype=torch.float, device=device))
# The predicted label is the class with the highest probability, stored as int64.
test_pred = probs_submit.argmax(dim=1).cpu().numpy().astype(np.int64)

Con el resultado predicho tenemos que generar el archivo `.csv` para subir a la competencia de kaggle:

In [None]:
# Pair every patient id with its predicted label and write the Kaggle submission file.
submission_rows = list(zip(test_id, test_pred))
submission = pd.DataFrame(submission_rows, columns=["patient", "diabetes"])
submission.to_csv("sample_submission.csv", index=False, header=True)