## Regresión Lineal Preprocesado


---

In [51]:
# Tratamiento de datos
import numpy as np
import pandas as pd

# Gráficos
import matplotlib.pyplot as plt
import seaborn as sns

# Estandarización variables numéricas y Codificación variables categóricas
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

# Gestión datos desbalanceados
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.combine import SMOTETomek

# Para separar los datos en train y test
from sklearn.model_selection import train_test_split

#  Gestión de warnings
import warnings
warnings.filterwarnings("ignore")

In [52]:
df = pd.read_pickle("../files_rl/1-heart-eda.pk")
df.head()

Unnamed: 0,age,sex,chestpaintype,restingbp,fastingbs,restingecg,maxhr,exerciseangina,oldpeak,st_slope,heartdisease,cholesterol
0,40,M,ATA,140,0,Normal,172,N,0.0,Up,0,289.0
1,49,F,NAP,160,0,Normal,156,N,1.0,Flat,1,180.0
2,37,M,ATA,130,0,ST,98,N,0.0,Up,0,283.0
3,48,F,ASY,138,0,Normal,108,Y,1.5,Flat,1,214.0
4,54,M,NAP,150,0,Normal,122,N,0.0,Up,0,195.0


### Usando el mismo dataset que usatéis ayer, los objetivos de los ejercicios de hoy son:


Variables predictoras:


- Age: edad del paciente en años

- Sex: sexo del paciente [M: Male, F: Female]

- ChestPainType: tipo de dolor de pecho [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]

- RestingBP: presion sanguinea en reposo[mm Hg]

- Cholesterol: colesterol [mm/dl]

- FastingBS: azucar en sangre [1: if FastingBS > 120 mg/dl, 0: otherwise]

- RestingECG: electrocardiograma resultados [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]

- MaxHR: frecuencia cardiaca máxima alcanzada [Numeric value between 60 and 202]

- ExerciseAngina:  angina inducida por el ejercicio [Y: Yes, N: No]

- Oldpeak: oldpeak = ST pico mas bajo del electrocardiograma [Numeric value measured in depression]

- ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]


Variable respuesta:

- HeartDisease: cardiopatia [1: heart disease, 0: Normal]

### 1)Codificar las variables categóricas. Recordad que tendréis que tener en cuenta si vuestras variables tienen orden o no.

In [53]:
df.dtypes

age                  int64
sex               category
chestpaintype     category
restingbp            int64
fastingbs         category
restingecg        category
maxhr                int64
exerciseangina    category
oldpeak            float64
st_slope          category
heartdisease      category
cholesterol        float64
dtype: object

In [54]:
df.restingecg.unique()

['Normal', 'ST', 'LVH']
Categories (3, object): ['LVH', 'Normal', 'ST']

In [55]:
df.exerciseangina.unique()

['N', 'Y']
Categories (2, object): ['N', 'Y']

In [56]:
df.st_slope.unique()

['Up', 'Flat', 'Down']
Categories (3, object): ['Down', 'Flat', 'Up']

sex = no tiene orden

chestpaintype = no tiene orden

fastingbs = tiene orden donde 1 tiene mas peso que 0

restingecg =  tiene orden  1 - Normal: Normal
                           2 - ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
                           3 - LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria
                           
exerciseangina = tiene orden 1 - 'N'
                             2 - 'Y'

st_slope = tiene orden  1- 'Flat'
                        2 - 'Down'
                        3- 'Up'


In [58]:
categoricas_sinorden = df[["sex","chestpaintype"]]
#categoricas_sinorden= categoricas.copy
#categoricas_sinorden.drop([["heartdisease", "fastingbd","restingecg", "exerciseangina", "st_slope"]], axis=1, inplace=True)


In [59]:
categoricas_sinorden.shape

(917, 2)

In [67]:
dummies_sex = pd.get_dummies(categoricas_sinorden["sex"], prefix_sep = "_", prefix = "sex", dtype = int)
dummies_chest = pd.get_dummies(categoricas_sinorden["chestpaintype"], prefix_sep = "_", prefix = "chest", dtype = int)


In [68]:
df_dummies = pd.concat([df, dummies_sex,dummies_chest], axis = 1)

In [70]:
df_dummies.drop(["sex","chestpaintype"], axis=1, inplace=True)

In [72]:
df_dummies.head()

Unnamed: 0,age,restingbp,fastingbs,restingecg,maxhr,exerciseangina,oldpeak,st_slope,heartdisease,cholesterol,sex_F,sex_M,chest_ASY,chest_ATA,chest_NAP,chest_TA
0,40,140,0,Normal,172,N,0.0,Up,0,289.0,0,1,0,1,0,0
1,49,160,0,Normal,156,N,1.0,Flat,1,180.0,1,0,0,0,1,0
2,37,130,0,ST,98,N,0.0,Up,0,283.0,0,1,0,1,0,0
3,48,138,0,Normal,108,Y,1.5,Flat,1,214.0,1,0,1,0,0,0
4,54,150,0,Normal,122,N,0.0,Up,0,195.0,0,1,0,0,1,0


### 2)Estandarizar las variables numéricas de vuestro set de datos.


In [None]:
numericas=df.select_dtypes(np.number)

In [None]:
robust= RobustScaler()

In [None]:
robust.fit(numericas)

RobustScaler()

In [None]:
x_robust= robust.transform(numericas)

In [None]:
numericas_robust=pd.DataFrame(x_robust, columns=numericas.columns)
numericas_robust.head(3)

Unnamed: 0,age,restingbp,maxhr,oldpeak,cholesterol
0,-1.076923,0.5,0.944444,-0.4,0.87084
1,-0.384615,1.5,0.5,0.266667,-1.185764
2,-1.307692,0.0,-1.111111,-0.4,0.757632


In [None]:
df.drop(numericas.columns, axis=1, inplace=True)
df.head()

Unnamed: 0,sex,chestpaintype,fastingbs,restingecg,exerciseangina,st_slope,heartdisease
0,M,ATA,0,Normal,N,Up,0
1,F,NAP,0,Normal,N,Flat,1
2,M,ATA,0,ST,N,Up,0
3,F,ASY,0,Normal,Y,Flat,1
4,M,NAP,0,Normal,N,Up,0


In [None]:
df[numericas_robust.columns]= numericas_robust

In [None]:
df.head()

Unnamed: 0,sex,chestpaintype,fastingbs,restingecg,exerciseangina,st_slope,heartdisease,age,restingbp,maxhr,oldpeak,cholesterol
0,M,ATA,0,Normal,N,Up,0,-1.076923,0.5,0.944444,-0.4,0.87084
1,F,NAP,0,Normal,N,Flat,1,-0.384615,1.5,0.5,0.266667,-1.185764
2,M,ATA,0,ST,N,Up,0,-1.307692,0.0,-1.111111,-0.4,0.757632
3,F,ASY,0,Normal,Y,Flat,1,-0.461538,0.4,-0.833333,0.6,-0.544255
4,M,NAP,0,Normal,N,Up,0,0.0,1.0,-0.444444,-0.4,-0.902745


### 3)Chequear si vuestros datos están balanceados. En caso de que no lo estén utilizad algunas de las herramientas aprendidas en la lección para balancearlos.


### 4)Guardad el dataframe con los cambios que habéis aplicado para utilizarlo en la siguiente lección.


In [None]:
df_final.to_csv("files/titanic_esta_enco.csv")
df_final2.to_csv("files/titanic_enco.csv")
balanceado4.to_csv("files/titanic_esta_enco_up.csv")
balanceado3.to_csv("files/titanic_esta_enco_down.csv")

NameError: name 'df_final' is not defined