In [32]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import numpy as np
import pandas as pd
# Gráficos
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns
# Estandarización variables numéricas y Codificación variables categóricas
# ------------------------------------------------------------------------------
from sklearn.preprocessing import StandardScaler
# Gestión datos desbalanceados
# ------------------------------------------------------------------------------
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek
# Para separar los datos en train y test
# ------------------------------------------------------------------------------
from sklearn.model_selection import train_test_split
#  Gestión de warnings
# ------------------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

# Pair programming Preprocesado
### Ana Gonzalez y Ana Campos

En esta lección realizaremos los cambios oportunos para poder ejecutar el modelo de regresión logística.
Cuando nos enfrentamos a problemas de regresión lineal vimos que eran necesarios algunos cambios antes de poder ajustar los modelos.

En el caso de la regresión logística lo tendremos que hacer.

Estos cambios incluyen:

- **Estandarización** de las variables predictoras numéricas
- **Codificación** de las variables categóricas
- **Balanceo** de la variable respuesta

In [34]:
# cargamos el conjunto de datos que guardamos en la lección anterior
df = pd.read_pickle("../data-log/01-dataframe.pickle")
df.head()

Unnamed: 0,sex,is_smoking,education,id,age,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,F,YES,2.0,0.0,64.0,3.0,0.0,0.0,0.0,0.0,221.0,148.0,85.0,26.060925,90.0,80.0,1.0
1,F,YES,1.0,2.0,46.0,10.0,0.0,0.0,0.0,0.0,250.0,116.0,71.0,20.35,88.0,94.0,0.0
2,F,YES,1.0,4.0,64.0,30.0,0.0,0.0,0.0,0.0,241.0,136.5,85.0,26.42,70.0,77.0,0.0
3,M,YES,4.0,7.0,36.0,35.0,0.0,0.0,0.0,0.0,295.0,102.0,68.0,28.15,60.0,63.0,0.0
4,F,YES,2.0,8.0,41.0,20.0,0.0,0.0,0.0,0.0,220.0,126.0,78.0,20.7,86.0,79.0,0.0


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3390 entries, 0 to 3389
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   sex              3390 non-null   category
 1   is_smoking       3390 non-null   category
 2   education        3390 non-null   category
 3   id               3390 non-null   float64 
 4   age              3390 non-null   float64 
 5   cigsPerDay       3390 non-null   float64 
 6   BPMeds           3390 non-null   category
 7   prevalentStroke  3390 non-null   category
 8   prevalentHyp     3390 non-null   category
 9   diabetes         3390 non-null   category
 10  totChol          3390 non-null   float64 
 11  sysBP            3390 non-null   float64 
 12  diaBP            3390 non-null   float64 
 13  BMI              3390 non-null   float64 
 14  heartRate        3390 non-null   float64 
 15  glucose          3390 non-null   float64 
 16  TenYearCHD       3390 non-null   category


In [36]:
df.columns

Index(['sex', 'is_smoking', 'education', 'id', 'age', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [37]:
df=df[['id','sex','education',  'age','is_smoking' ,'cigsPerDay', 'BPMeds', 'prevalentStroke',
       'prevalentHyp', 'diabetes','glucose',  'totChol', 'sysBP', 'diaBP', 'BMI',
       'heartRate', 'TenYearCHD']]
df.head()

Unnamed: 0,id,sex,education,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,glucose,totChol,sysBP,diaBP,BMI,heartRate,TenYearCHD
0,0.0,F,2.0,64.0,YES,3.0,0.0,0.0,0.0,0.0,80.0,221.0,148.0,85.0,26.060925,90.0,1.0
1,2.0,F,1.0,46.0,YES,10.0,0.0,0.0,0.0,0.0,94.0,250.0,116.0,71.0,20.35,88.0,0.0
2,4.0,F,1.0,64.0,YES,30.0,0.0,0.0,0.0,0.0,77.0,241.0,136.5,85.0,26.42,70.0,0.0
3,7.0,M,4.0,36.0,YES,35.0,0.0,0.0,0.0,0.0,63.0,295.0,102.0,68.0,28.15,60.0,0.0
4,8.0,F,2.0,41.0,YES,20.0,0.0,0.0,0.0,0.0,79.0,220.0,126.0,78.0,20.7,86.0,0.0


In [39]:
# nos hacemos una copia del df que acabamos de cargar

df = df.copy()

# Estandarización

In [40]:
# iniciamos el método para escalar

scaler = StandardScaler()

In [41]:
# recordemos que la estandarización solo se hace para las variables predictoras numéricas

numericas = df.select_dtypes(include = np.number)
numericas.head()

Unnamed: 0,id,age,cigsPerDay,glucose,totChol,sysBP,diaBP,BMI,heartRate
0,0.0,64.0,3.0,80.0,221.0,148.0,85.0,26.060925,90.0
1,2.0,46.0,10.0,94.0,250.0,116.0,71.0,20.35,88.0
2,4.0,64.0,30.0,77.0,241.0,136.5,85.0,26.42,70.0
3,7.0,36.0,35.0,63.0,295.0,102.0,68.0,28.15,60.0
4,8.0,41.0,20.0,79.0,220.0,126.0,78.0,20.7,86.0


In [42]:
# Si queremos eliminar alguna columna
numericas.drop(["id"], axis = 1, inplace = True)

In [43]:
# ahora ya podemos ajustar nuestros datos.  

scaler.fit(numericas)

# transformamos los datos

X_escaladas = scaler.transform(numericas)

# por último convertiremos el array que nos devuelve en un dataframe. 

numericas_estandar = pd.DataFrame(X_escaladas, columns = numericas.columns)
numericas_estandar.head(2)

Unnamed: 0,age,cigsPerDay,glucose,totChol,sysBP,diaBP,BMI,heartRate
0,1.682783,-0.506751,-0.07735,-0.358276,0.690879,0.176093,0.064553,1.171617
1,-0.412284,0.083414,0.523186,0.286026,-0.744824,-0.988457,-1.325505,1.004509


In [44]:
# dropeamos las columnas sin estandarizar

df.drop(["id"], axis = 1, inplace=True)
df.head()

Unnamed: 0,sex,education,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,glucose,totChol,sysBP,diaBP,BMI,heartRate,TenYearCHD
0,F,2.0,64.0,YES,3.0,0.0,0.0,0.0,0.0,80.0,221.0,148.0,85.0,26.060925,90.0,1.0
1,F,1.0,46.0,YES,10.0,0.0,0.0,0.0,0.0,94.0,250.0,116.0,71.0,20.35,88.0,0.0
2,F,1.0,64.0,YES,30.0,0.0,0.0,0.0,0.0,77.0,241.0,136.5,85.0,26.42,70.0,0.0
3,M,4.0,36.0,YES,35.0,0.0,0.0,0.0,0.0,63.0,295.0,102.0,68.0,28.15,60.0,0.0
4,F,2.0,41.0,YES,20.0,0.0,0.0,0.0,0.0,79.0,220.0,126.0,78.0,20.7,86.0,0.0


In [45]:
# lo unomos al dataframe original 

df = pd.concat([df, numericas_estandar], axis = 1)
# chequeamos que esta todo bien
df.head()

Unnamed: 0,sex,education,age,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,glucose,...,heartRate,TenYearCHD,age.1,cigsPerDay.1,glucose.1,totChol,sysBP,diaBP,BMI,heartRate.1
0,F,2.0,64.0,YES,3.0,0.0,0.0,0.0,0.0,80.0,...,90.0,1.0,1.682783,-0.506751,-0.07735,-0.358276,0.690879,0.176093,0.064553,1.171617
1,F,1.0,46.0,YES,10.0,0.0,0.0,0.0,0.0,94.0,...,88.0,0.0,-0.412284,0.083414,0.523186,0.286026,-0.744824,-0.988457,-1.325505,1.004509
2,F,1.0,64.0,YES,30.0,0.0,0.0,0.0,0.0,77.0,...,70.0,0.0,1.682783,1.7696,-0.206036,0.08607,0.174923,0.176093,0.151952,-0.499456
3,M,4.0,36.0,YES,35.0,0.0,0.0,0.0,0.0,63.0,...,60.0,0.0,-1.57621,2.191146,-0.806571,1.285805,-1.372943,-1.238003,0.57304,-1.334993
4,F,2.0,41.0,YES,20.0,0.0,0.0,0.0,0.0,79.0,...,86.0,0.0,-0.994247,0.926507,-0.120245,-0.380493,-0.296167,-0.406182,-1.240314,0.837402


# Codificacion con datos ESTANDARIZADOS

In [None]:
df.head(2)

¿ tienen orden nuestras variables? 

# No tienen

In [None]:
lista_columnas = ["embarked", "maturity", "adult_male", "alone"]

df_encoded = pd.DataFrame()


for columna in lista_columnas:
    df_dummies = pd.get_dummies(df[columna], prefix_sep = "_", prefix = columna, dtype = int)

    df_encoded = pd.concat([df_encoded, df_dummies], axis = 1)



In [None]:
# ya tenemos nuestro dataframe con las variables codificadas,
df_encoded.head()

In [None]:
# el siguiente paso sería unir este dataframe con nuestro dataframe original para tener todos los datos en un mismo df

df_final = pd.concat([df, df_encoded], axis = 1)
df_final.head()

In [None]:
# por último nos queda eliminar las columnas categóricas originales porque ya no nos hacen falta. 

df_final.drop(lista_columnas, axis = 1, inplace=True)
df_final.head(2)

# Tienen

In [None]:
# definimos el diccionario

map_sex = {"male": 0, "female": 1}

In [None]:
df_final["sex"] = df_final["sex"].map(map_sex)
df_final.head(2)

In [None]:
# Dejo por si tenemos mas diccionariso que poner 

-------------------------------------------------------------

Vamos a aplicar esta codificación al *dataframe* original, el objetivo, tener dos datasets: 

- Uno con las variables categóricas codificadas y las numéricas estandarizadas.

- Unos con las variables categóricas codificadas y las numéricas sin estandarizar.  


Con estos dos csv ajustaremos nuestro modelo de regresión logística para comparar que modelo es mejor y como pueden cambiar las métricas. 

# Codificacion con datos SIN ESTANDARIZAR

In [None]:
df.head(2)

# sin orden

In [None]:
lista_columnas = ["embarked", "maturity", "adult_male", "alone"]

df_encoded2 = pd.DataFrame()


for columna in lista_columnas:
    df_dummies2 = pd.get_dummies(df_titanic[columna], prefix_sep = "_", prefix = columna, dtype = int)

    df_encoded2 = pd.concat([df_encoded2, df_dummies2], axis = 1)

In [None]:
# ya tenemos nuestro dataframe con las variables codificadas,
df_encoded2.head()

In [None]:
# el siguiente paso sería unir este dataframe con nuestro dataframe original para tener todos los datos en un mismo df

df_final2 = pd.concat([df_titanic, df_encoded2], axis = 1)
df_final2.head()

In [None]:
# por último nos queda eliminar las columnas categóricas originales porque ya no nos hacen falta. 

df_final2.drop(lista_columnas, axis = 1, inplace=True)
df_final2.head(2)

# con orden

In [None]:
# definimos el diccionario

map_sex = {"male": 0, "female": 1}

In [None]:
df_final2["sex"] = df_final2["sex"].map(map_sex)
df_final2.head(2)

# Balanceo de nuestra variable respuesta

# Pandas

In [None]:
# recordemos como estaban distribuidos nuestros datos

plt.figure(figsize=(8,5)) # para cambiar el tamaño de la figura

fig1 = sns.countplot(data = df_titanic, x = "survived",  color = "mediumaquamarine",  edgecolor='black')
fig1.set(xticklabels=["No", "Yes"]) 
plt.show(

## Downsampling

In [None]:
# lo primero que hacemos es sacar el número de registros que tenemos para la clase minoritaria
num_minoritarios = df_final["survived"].value_counts()[1]
num_minoritarios

In [None]:
# nos creamos un dataframe solo con las filas donde la variable respuesta sea Yes. 
minoritarios = df_final2[df_final["survived"] == 1]
minoritarios.head(2)

In [None]:
# extraemos una muestra de la categoría mayoritaria que sea del mismo tamaño que la clase minotaria

mayoritarios = df_final[df_final["survived"] == 0].sample(num_minoritarios, random_state = 42)
mayoritarios.head(2)

In [None]:
# Ahora es el momento de unir los *dataframes*
balanceado = pd.concat([minoritarios,mayoritarios],axis = 0)
balanceado.head(2)

In [None]:
# chequeamos como es nuestro dataframe ahora
balanceado["survived"].value_counts()

---
El resultado es : 

### Upsampling

In [None]:
num_mayoritarios = df_final["survived"].value_counts()[0]
num_mayoritarios

In [None]:
# seleccionamos ls datos de la clase mayoritaria

mayoritarios2 = df_final[df_final["survived"]== 0]
mayoritarios2.head(2)

In [None]:
# hacemos un selección aleatoria de datos de la clase minoritaria, para tener el mismo número que la clase mayoritaria

minoritarios2 =df_final[df_final["survived"]==1].sample(num_mayoritarios, replace=True)
minoritarios2.head(2)

In [None]:
# unimos los dos dataframes

balanceado2 = pd.concat([mayoritarios2,minoritarios2], axis = 0)
balanceado2.head(2)

In [None]:
# chequeamos como quedaron cada una de nuestras categorías

balanceado2["survived"].value_counts()

---
El resultado es : 