### 1. Importación de librerías

In [1]:
# Librerías para manejo de datos
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
import numpy as np
np.random.seed(3301)
import pandas as pd
# Para preparar los datos
from sklearn.preprocessing import LabelEncoder
# Para crear el arbol de decisión 
from sklearn.tree import DecisionTreeClassifier 
# Para usar KNN como clasificador
from sklearn.neighbors import KNeighborsClassifier
# Para realizar la separación del conjunto de aprendizaje en entrenamiento y test.
from sklearn.model_selection import train_test_split
# Para evaluar el modelo
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import plot_confusion_matrix
# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
#Librerías para la visualización
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns 
from sklearn import tree

### 2. Carga de los datos

In [2]:
# Se cargan los datos. 
df_tracks=pd.read_csv('202210_Laboratorio1_data_Datos_Clasificacion_2022.csv', sep=';', encoding = 'utf-8', index_col=0, low_memory=False)

### 3. Limpieza y preparación de los datos

In [3]:
# Cantidad de datos y número de variables
df_tracks.shape

(100000, 26)

In [4]:
# Mostrar los datos
df_tracks.head()

Unnamed: 0_level_0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,...,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
Diabetes_012,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
0,1,1,1,40,1,0,0,0,0,1,0,1,...,18,15,1,0,9,4,3,,,,,
0,0,0,0,25,1,0,0,1,0,0,0,0,...,0,0,0,0,7,6,1,,,,,
0,1,1,1,28,0,0,0,0,1,0,0,1,...,30,30,1,0,9,4,8,,,,,
0,1,0,1,27,0,0,0,1,1,1,0,1,...,0,0,0,0,11,3,6,,,,,
0,1,1,1,24,0,0,0,1,1,1,0,1,...,3,0,0,0,11,5,4,,,,,


In [5]:
# Podemos ver los tipos de todas la variables.
df_tracks.dtypes

HighBP                   object
HighChol                 object
CholCheck                object
BMI                      object
Smoker                   object
Stroke                   object
HeartDiseaseorAttack     object
PhysActivity             object
Fruits                   object
Veggies                  object
HvyAlcoholConsump        object
AnyHealthcare            object
NoDocbcCost              object
GenHlth                  object
MentHlth                 object
PhysHlth                 object
DiffWalk                 object
Sex                      object
Age                      object
Education                object
Income                   object
Unnamed: 22             float64
Unnamed: 23             float64
Unnamed: 24             float64
Unnamed: 25             float64
Unnamed: 26             float64
dtype: object

Notamos que hay columnas nulas, entonces tenemos que eliminarlas

In [6]:
# Eliminar las columnas en nulo
df_tracks_t = df_tracks.drop(['Unnamed: 22','Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26' ], axis=1)

In [7]:
# Mostramos otra vez la tabla sin las columnas en nulo
df_tracks_t.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,99982,99991,99984,99981,99979,99987,99987,99985,99976,99989,99988,99984,99978,99982,99986,99978,99987,99991,99988,99981,99999
unique,5,5,5,95,5,5,5,5,5,5,5,5,5,8,49,54,5,5,16,9,11
top,0,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,9,6,8
freq,56544,56814,96330,9624,55426,95818,90617,76135,63639,81464,94248,95003,91757,35060,68495,62518,83320,56163,13114,43815,36920


In [8]:
df_tracks_t.head()

Unnamed: 0_level_0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
Diabetes_012,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,40,1,0,0,0,0,1,0,1,0,5,18,15,1,0,9,4,3
0,0,0,0,25,1,0,0,1,0,0,0,0,1,3,0,0,0,0,7,6,1
0,1,1,1,28,0,0,0,0,1,0,0,1,1,5,30,30,1,0,9,4,8
0,1,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,11,3,6
0,1,1,1,24,0,0,0,1,1,1,0,1,0,2,3,0,0,0,11,5,4


In [9]:
df_tracks_t.isnull().sum()

HighBP                  18
HighChol                 9
CholCheck               16
BMI                     19
Smoker                  21
Stroke                  13
HeartDiseaseorAttack    13
PhysActivity            15
Fruits                  24
Veggies                 11
HvyAlcoholConsump       12
AnyHealthcare           16
NoDocbcCost             22
GenHlth                 18
MentHlth                14
PhysHlth                22
DiffWalk                13
Sex                      9
Age                     12
Education               19
Income                   1
dtype: int64

Ahora eliminamos las celdas con datos nulos y las que están duplicadas

In [10]:
# Eliminación registros con ausencias
df_tracks_t = df_tracks_t.dropna()
# Eliminación de registros duplicados.
df_tracks_t = df_tracks_t.drop_duplicates()

In [11]:
df_tracks_t.describe()

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
count,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384,93384
unique,5,5,5,95,5,5,5,5,5,5,5,5,5,8,49,54,5,5,16,9,11
top,0,0,1,27,0,0,0,1,1,1,0,1,0,2,0,0,0,0,9,6,8
freq,51273,52016,89768,8806,50265,89236,84103,69708,58052,75049,87723,88425,85195,32068,62160,56167,76783,52594,12134,38312,31577


In [12]:
# Podemos ver los estadísticos de la variable "diabetes"
df_tracks_t['HighBP'].describe()

count     93384
unique        5
top           0
freq      51273
Name: HighBP, dtype: object

Vemos que HighBP tiene 5 tipos de valores, pero según el codebook, solo puede tomar 2 valores, el 0 y el 1

In [13]:
df_tracks_t['HighBP'].value_counts()

0     51273
1     42067
-        19
Xx       18
?         7
Name: HighBP, dtype: int64

Por lo tanto, tenemos que eliminar los valores que no sean 0 o 1

In [14]:
df_tracks_t = df_tracks_t.set_index('HighBP')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['HighBP'].value_counts()

In [19]:
df_tracks_t['HighChol'].value_counts()

0     51989
1     41326
Xx       10
-         8
?         7
Name: HighChol, dtype: int64

Ahora hay que hacer lo mismo con HighChol

In [20]:
df_tracks_t = df_tracks_t.set_index('HighChol')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['HighChol'].value_counts()

0    51989
1    41326
Name: HighChol, dtype: int64

In [21]:
df_tracks_t['CholCheck'].value_counts()

1     89702
0      3583
-        15
Xx       10
?         5
Name: CholCheck, dtype: int64

In [22]:
df_tracks_t = df_tracks_t.set_index('CholCheck')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['CholCheck'].value_counts()

1    89702
0     3583
Name: CholCheck, dtype: int64

In [25]:
df_tracks_t['BMI'].value_counts()

27    8799
26    7222
24    6877
25    6045
28    6029
      ... 
88       1
85       1
91       1
86       1
83       1
Name: BMI, Length: 95, dtype: int64

BMI está en el rango correcto de 1 a 99

In [26]:
df_tracks_t['Smoker'].value_counts()

0     50211
1     43029
Xx       20
-        15
?        10
Name: Smoker, dtype: int64

In [27]:
df_tracks_t = df_tracks_t.set_index('Smoker')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['Smoker'].value_counts()

0    50211
1    43029
Name: Smoker, dtype: int64

In [28]:
df_tracks_t['Stroke'].value_counts()

0     89097
1      4111
Xx       16
-        13
?         3
Name: Stroke, dtype: int64

In [29]:
df_tracks_t = df_tracks_t.set_index('Stroke')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['Stroke'].value_counts()

0    89097
1     4111
Name: Stroke, dtype: int64

In [30]:
df_tracks_t['HeartDiseaseorAttack'].value_counts()

0     83943
1      9233
-        16
Xx        9
?         7
Name: HeartDiseaseorAttack, dtype: int64

In [31]:
df_tracks_t = df_tracks_t.set_index('HeartDiseaseorAttack')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['HeartDiseaseorAttack'].value_counts()

0    83943
1     9233
Name: HeartDiseaseorAttack, dtype: int64

In [32]:
df_tracks_t['PhysActivity'].value_counts()

1     69553
0     23590
-        18
Xx       10
?         5
Name: PhysActivity, dtype: int64

In [33]:
df_tracks_t = df_tracks_t.set_index('PhysActivity')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['PhysActivity'].value_counts()

1    69553
0    23590
Name: PhysActivity, dtype: int64

In [34]:
df_tracks_t['Fruits'].value_counts()

1     57900
0     35210
Xx       17
?         8
-         8
Name: Fruits, dtype: int64

In [35]:
df_tracks_t = df_tracks_t.set_index('Fruits')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['Fruits'].value_counts()

1    57900
0    35210
Name: Fruits, dtype: int64

In [36]:
df_tracks_t['Veggies'].value_counts()

1     74830
0     18240
Xx       18
-        13
?         9
Name: Veggies, dtype: int64

In [37]:
df_tracks_t = df_tracks_t.set_index('Veggies')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['Veggies'].value_counts()

1    74830
0    18240
Name: Veggies, dtype: int64

In [38]:
df_tracks_t['HvyAlcoholConsump'].value_counts()

0     87433
1      5614
-        11
?         7
Xx        5
Name: HvyAlcoholConsump, dtype: int64

In [39]:
df_tracks_t = df_tracks_t.set_index('HvyAlcoholConsump')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['HvyAlcoholConsump'].value_counts()

0    87433
1     5614
Name: HvyAlcoholConsump, dtype: int64

In [40]:
df_tracks_t['AnyHealthcare'].value_counts()

1     88112
0      4908
Xx       10
-         9
?         8
Name: AnyHealthcare, dtype: int64

In [41]:
df_tracks_t = df_tracks_t.set_index('AnyHealthcare')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['AnyHealthcare'].value_counts()

1    88112
0     4908
Name: AnyHealthcare, dtype: int64

In [42]:
df_tracks_t['NoDocbcCost'].value_counts()

0     84875
1      8114
Xx       14
-        13
?         4
Name: NoDocbcCost, dtype: int64

In [43]:
df_tracks_t = df_tracks_t.set_index('NoDocbcCost')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['NoDocbcCost'].value_counts()

0    84875
1     8114
Name: NoDocbcCost, dtype: int64

In [44]:
df_tracks_t['GenHlth'].value_counts()

2     31927
3     29023
1     15149
4     12161
5      4705
-        12
Xx        9
?         3
Name: GenHlth, dtype: int64

In [45]:
df_tracks_t = df_tracks_t.set_index('GenHlth')
df_tracks_t = df_tracks_t.drop('-', axis=0)
df_tracks_t = df_tracks_t.drop('Xx', axis=0)
df_tracks_t = df_tracks_t.drop('?', axis=0)
df_tracks_t = df_tracks_t.reset_index()
df_tracks_t['GenHlth'].value_counts()

2    31927
3    29023
1    15149
4    12161
5     4705
Name: GenHlth, dtype: int64

In [46]:
df_tracks_t['MentHlth'].value_counts()

0      61870
2       5043
30      4543
5       3632
1       3394
3       3024
10      2468
15      2127
4       1510
20      1344
7       1235
25       497
14       445
6        398
8        235
12       167
28       159
21        92
16        66
29        64
18        54
22        47
9         39
24        36
26        34
36        27
-21       26
-33       25
-29       25
-27       25
27        25
34        24
-25       23
32        23
-19       22
-35       21
-31       20
13        19
11        17
23        16
-17       15
-         15
-23       15
Xx        14
17        14
-15       13
19         9
-37        6
?          3
Name: MentHlth, dtype: int64

En este caso vemos que hay valores negativos