## Objetivo del notebook

El actual notebook tiene como objetivo aplicar todos los pasos de preprocesamiento de datos necesarios para la limpieza del conjunto de datos a utilizar para el entrenamiento del modelo final.

# Importar las librerias a utilizar

La siguiente celda reune el codigo necesario para importar todas las librerias de las que se hacen uso en el presente notebook.

In [1]:
# Librerias y metodos para analisis y manipulacion de datos
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import shuffle

# Otros
import os

In [2]:
## Cargo en memoria el conjunto de entrenamiento

# Ruta del fichero train.csv
train_dataset__route = "../data/raw/train/train.csv"

# Instancio un objeto dataframe que cargue el conjunto de datos de entrenamiento
data_df = pd.read_csv(train_dataset__route, low_memory = False)

# 10 primeros registros del dataframe
data_df.head(10)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0,55,165,60,81.0,0.5,0.6,1,1,135,...,40,75,16.5,1,1.0,22,25,27,0,1
1,1,70,165,65,89.0,0.6,0.7,2,2,146,...,57,126,16.2,1,1.1,27,23,37,1,0
2,2,20,170,75,81.0,0.4,0.5,1,1,118,...,45,93,17.4,1,0.8,27,31,53,0,1
3,3,35,180,95,105.0,1.5,1.2,1,1,131,...,38,102,15.9,1,1.0,20,27,30,1,0
4,4,30,165,60,80.5,1.5,1.0,1,1,121,...,44,93,15.4,1,0.8,19,13,17,0,1
5,5,50,170,55,51.0,1.2,1.2,1,1,146,...,31,99,15.9,1,0.7,24,42,119,1,1
6,6,45,160,55,69.0,1.5,1.2,1,1,150,...,69,122,13.0,1,0.7,17,12,16,0,0
7,7,55,155,60,84.5,0.7,0.9,1,1,137,...,51,198,14.5,1,0.7,16,15,16,0,0
8,8,40,165,70,89.0,0.7,1.0,1,1,130,...,59,150,15.7,1,0.9,24,21,31,0,1
9,9,40,155,50,73.0,1.5,1.5,1,1,105,...,55,122,13.2,1,0.7,22,16,14,0,0


# Preprocesamiento de datos

El estado de partida de nuestro dataset nos permite ahorrar bastante tiempo en el proceso de ajuste y limpieza de datos. Debido a la falta de registros nulos, y la ausencia favorable de variables de tipo string, solamente nos queda randomizar y estandarizar los valores de los registros con los que contamos.

In [3]:
# Cargo el conjunto de datos adicional (original, aquel a partir del cual esta basado el
# de esta competicion).

train_2__route = '../data/external/competition_based_dataset/train/train_dataset.csv'
train_data_2__df = pd.read_csv(train_2__route, low_memory = False)

# 10 primeros registros del dataframe
train_data_2__df.head(10)

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,35,170,85,97.0,0.9,0.9,1,1,118,78,...,70,142,19.8,1,1.0,61,115,125,1,1
1,20,175,110,110.0,0.7,0.9,1,1,119,79,...,71,114,15.9,1,1.1,19,25,30,1,0
2,45,155,65,86.0,0.9,0.9,1,1,110,80,...,57,112,13.7,3,0.6,1090,1400,276,0,0
3,45,165,80,94.0,0.8,0.7,1,1,158,88,...,46,91,16.9,1,0.9,32,36,36,0,0
4,20,165,60,81.0,1.5,0.1,1,1,109,64,...,47,92,14.9,1,1.2,26,28,15,0,0
5,60,160,50,78.0,1.0,0.9,2,2,126,75,...,98,64,13.9,1,1.0,47,23,70,0,1
6,40,175,90,95.0,0.9,1.0,1,1,130,88,...,39,102,16.5,1,1.0,19,22,19,0,0
7,40,180,75,85.0,1.5,1.5,1,1,110,60,...,58,99,14.0,2,1.4,29,20,32,1,1
8,40,170,60,74.0,1.2,1.5,1,1,89,57,...,60,104,12.9,2,0.7,17,17,14,0,0
9,45,155,55,78.0,0.7,1.0,1,1,114,81,...,41,107,13.1,1,0.6,22,15,56,0,0


In [4]:
# Incluyo los registros de este nuevo dataset en el conjunto de entrenamiento
data_df = pd.concat([data_df, train_data_2__df], axis= 0)
data_df.reset_index()
data_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 198240 entries, 0 to 38983
Data columns (total 24 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   159256 non-null  float64
 1   age                  198240 non-null  int64  
 2   height(cm)           198240 non-null  int64  
 3   weight(kg)           198240 non-null  int64  
 4   waist(cm)            198240 non-null  float64
 5   eyesight(left)       198240 non-null  float64
 6   eyesight(right)      198240 non-null  float64
 7   hearing(left)        198240 non-null  int64  
 8   hearing(right)       198240 non-null  int64  
 9   systolic             198240 non-null  int64  
 10  relaxation           198240 non-null  int64  
 11  fasting blood sugar  198240 non-null  int64  
 12  Cholesterol          198240 non-null  int64  
 13  triglyceride         198240 non-null  int64  
 14  HDL                  198240 non-null  int64  
 15  LDL                  19

## --- Conjunto de entrenamiento ---

In [5]:
# Defino mi matriz de caracteristicas, y mi matriz de una sola columna con la variable
# de salida.

X = data_df.drop(columns = ['id', 'smoking'])
column_names = X.columns
y = data_df['smoking']

In [6]:
## Randomizacion
X, y = shuffle(X, y, random_state = 42)

In [7]:
## Estandarizacion
scaler = MinMaxScaler()

X = scaler.fit_transform(X)
X

array([[0.46153846, 0.66666667, 0.47619048, ..., 0.00480604, 0.0220662 ,
        0.        ],
       [0.53846154, 0.58333333, 0.47619048, ..., 0.01098524, 0.01805416,
        0.        ],
       [0.69230769, 0.33333333, 0.19047619, ..., 0.00480604, 0.01905717,
        0.        ],
       ...,
       [0.38461538, 0.5       , 0.28571429, ..., 0.00411946, 0.01203611,
        0.        ],
       [0.61538462, 0.66666667, 0.52380952, ..., 0.01785101, 0.03811434,
        0.        ],
       [0.30769231, 0.75      , 0.52380952, ..., 0.00411946, 0.05315948,
        0.        ]])

In [8]:
# Unifico los datos en un nuevo dataframe
train_df = pd.DataFrame(data = X,
                        columns = column_names)

# Agrego la columna de mi variable objetivo
train_df['smoking'] = y.values

# 5 primeros registros del DataFrame
train_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,smoking
0,0.461538,0.666667,0.47619,0.5,0.091837,0.091837,0.0,0.0,0.364198,0.377358,...,0.24507,0.023131,0.58642,0.0,0.078261,0.00738,0.004806,0.022066,0.0,1
1,0.538462,0.583333,0.47619,0.538462,0.112245,0.142857,0.0,0.0,0.234568,0.339623,...,0.104225,0.063475,0.623457,0.0,0.078261,0.017528,0.010985,0.018054,0.0,1
2,0.692308,0.333333,0.190476,0.474359,0.040816,0.071429,0.0,0.0,0.487654,0.471698,...,0.126761,0.080151,0.481481,0.0,0.06087,0.015683,0.004806,0.019057,0.0,0
3,0.692308,0.5,0.333333,0.358974,0.020408,0.030612,0.0,0.0,0.376543,0.415094,...,0.174648,0.068316,0.537037,0.0,0.069565,0.012915,0.004463,0.022066,1.0,0
4,0.307692,0.666667,0.47619,0.516667,0.091837,0.091837,0.0,0.0,0.302469,0.433962,...,0.101408,0.071544,0.67284,0.0,0.069565,0.027675,0.013732,0.118355,0.0,1


In [9]:
# Guardo el conjunto de datos de entrenamiento como .csv
train_set__route = '../data/processed/train'

if not os.path.exists(train_set__route):
    os.mkdir(train_set__route)

train_df.to_csv(os.path.join(train_set__route, 'train.csv'), index = False)
print('Dataset procesado guardado con exito.')

Dataset procesado guardado con exito.


## --- Conjunto de testing ---

In [10]:
## Cargo en memoria el conjunto de prueba
# Ruta del fichero test.csv
train_dataset__route = "../data/raw/test/test.csv"

# Instancio un objeto dataframe que cargue el conjunto de datos de prueba
data_df = pd.read_csv(train_dataset__route, low_memory = False)

# 10 primeros registros del dataframe
data_df.head(10)

Unnamed: 0,id,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,triglyceride,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries
0,159256,40,165,70,84.0,1.2,1.2,1,1,130,...,186,49,115,14.2,1,0.9,19,25,32,0
1,159257,80,160,60,93.0,1.0,1.0,2,2,144,...,158,35,104,13.0,1,1.1,20,12,24,0
2,159258,60,170,70,86.5,0.6,0.7,1,1,117,...,173,39,88,15.4,1,1.4,38,60,36,0
3,159259,40,160,50,67.0,0.3,0.4,1,1,116,...,47,75,128,14.5,1,0.6,25,18,10,1
4,159260,40,170,75,89.4,1.0,0.9,1,1,132,...,100,39,123,16.5,1,1.0,30,39,27,1
5,159261,40,165,55,75.8,1.0,1.0,1,1,94,...,146,41,108,15.1,1,1.0,27,15,56,1
6,159262,40,165,80,87.0,1.5,1.5,1,1,128,...,95,46,160,15.5,1,1.0,24,30,25,0
7,159263,55,150,50,69.2,1.0,0.9,1,1,152,...,89,60,98,14.2,1,0.7,26,20,36,0
8,159264,70,160,60,79.0,0.5,0.5,1,1,150,...,116,57,113,13.9,1,1.2,19,24,18,0
9,159265,70,145,55,82.0,1.0,1.0,1,1,144,...,72,49,110,14.5,1,1.0,20,15,27,0


## Estandarizacion

In [16]:
id_column = data_df['id']
norm_data_df = scaler.fit_transform(data_df.drop(columns = 'id'))
column_names = data_df.drop(columns = 'id').columns
norm_data_df

array([[0.30769231, 0.54545455, 0.4       , ..., 0.00823893, 0.03009027,
        0.        ],
       [0.92307692, 0.45454545, 0.3       , ..., 0.00377618, 0.0220662 ,
        0.        ],
       [0.61538462, 0.63636364, 0.4       , ..., 0.02025403, 0.03410231,
        0.        ],
       ...,
       [0.23076923, 0.63636364, 0.55      , ..., 0.01064195, 0.0220662 ,
        1.        ],
       [0.30769231, 0.45454545, 0.3       , ..., 0.0120151 , 0.03009027,
        0.        ],
       [0.46153846, 0.27272727, 0.2       , ..., 0.00343289, 0.01504514,
        1.        ]])

In [17]:
# Unifico los datos en un nuevo dataframe
test_df = pd.DataFrame(data = norm_data_df,
                        columns = column_names)

test_df['id'] = id_column

# 5 primeros registros del DataFrame
test_df.head()

Unnamed: 0,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,relaxation,...,HDL,LDL,hemoglobin,Urine protein,serum creatinine,AST,ALT,Gtp,dental caries,id
0,0.307692,0.545455,0.4,0.430248,0.112245,0.112245,0.0,0.0,0.415493,0.49,...,0.238462,0.093519,0.571429,0.0,0.081633,0.016839,0.008239,0.03009,0.0,159256
1,0.923077,0.454545,0.3,0.547588,0.091837,0.091837,1.0,1.0,0.514085,0.32,...,0.130769,0.084495,0.496894,0.0,0.102041,0.018135,0.003776,0.022066,0.0,159257
2,0.615385,0.636364,0.4,0.462842,0.05102,0.061224,0.0,0.0,0.323944,0.35,...,0.161538,0.07137,0.645963,0.0,0.132653,0.041451,0.020254,0.034102,0.0,159258
3,0.307692,0.454545,0.2,0.208605,0.020408,0.030612,0.0,0.0,0.316901,0.22,...,0.438462,0.104184,0.590062,0.0,0.05102,0.024611,0.005836,0.008024,1.0,159259
4,0.307692,0.636364,0.45,0.500652,0.091837,0.081633,0.0,0.0,0.429577,0.54,...,0.161538,0.100082,0.714286,0.0,0.091837,0.031088,0.013045,0.025075,1.0,159260


In [18]:
# Guardo el conjunto de datos de prueba como .csv
test_set__route = '../data/processed/test'

if not os.path.exists(test_set__route):
    os.mkdir(test_set__route)

test_df.to_csv(os.path.join(test_set__route, 'test.csv'), index = False)
print('Dataset procesado guardado con exito.')

Dataset procesado guardado con exito.
