## Estandarización

In [1]:
# Data handling, numeric dtypes and the scaler used to standardize the columns.
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler


# Plotting libraries (not referenced by the cells visible in this notebook).
import seaborn as sns
import matplotlib.pyplot as plt

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')

# Do not truncate wide DataFrames: display all columns.
pd.options.display.max_columns = None

# Add the directory two levels up (relative to the kernel's working directory)
# to the import path — presumably where the project's `src` package lives.
import sys
sys.path.append("../../")
from src import funciones as fun
from src import variables as var

In [27]:
# Load the cost-of-living dataset saved as a pickle by an earlier step
# and preview its first two rows.
df = pd.read_pickle("../archivos/coste_vida_boxcox.pkl")
df.head(n=2)

Unnamed: 0,country,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,basic,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary,basic_boxcox
0,South Korea,6.15,3.93,2.2,3.53,4.04,10.58,41.61,3.71,1.05,15.36,2.12,3.46,1.0,0.92,1.43,182.13,22.48,55.88,9.6,404.15,17902.55,1731.08,2689.62,16.684542
1,China,5.69,3.98,2.74,1.22,2.22,4.86,13.12,1.6,0.64,14.24,0.94,3.56,0.57,0.43,1.2,66.0,17.07,63.49,8.54,1382.62,26379.45,1561.59,1419.87,10.443987


En vuestro dataset habréis identificado unas variables predictoras y una variable respuesta. Los objetivos del pair programming de hoy son:

1- Estandarizad las variables predictoras de vuestro dataset. De nuevo, usad el método que prefiráis.

In [28]:
# Select every numeric column of `df` as the set of variables to scale.
# NOTE(review): the response columns `basic` and `basic_boxcox` are numeric too,
# so they are selected here and scaled together with the predictors; the drop
# that would exclude them is left disabled below.
numericas = df.select_dtypes(include="number")
# numericas = numericas.drop(['basic', 'basic_boxcox'], axis = 1)

In [31]:
# Preview one random row of the numeric columns. A fixed seed keeps the
# displayed row the same on every Restart & Run All.
numericas.sample(n=1, random_state=42)

Unnamed: 0,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,basic,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary,basic_boxcox
3132,6.32,2.144969,1.26,3.03,1.52,6.32,10.11,1.77,0.875946,7.963791,1.540208,3.181821,0.25,0.3,1.31,75.8,-3.623403,75.8,6.32,278.756688,7888.863203,1137.03,442.18,11.158397


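- Como hemos decidido dejar los outliers, porque consideramos que contienen información relevante dentro de nuestra muestra, optamos por estandarizar las variables predictoras con el método RobustScaler. Este método centra cada variable en su mediana y la escala por su rango intercuartílico, por lo que es menos sensible a los valores extremos que una estandarización basada en la media y la desviación típica.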

In [32]:
# Build the scaler. The arguments below are RobustScaler's documented defaults,
# written out so the configuration is visible: subtract the median and divide
# by the interquartile range (25th to 75th percentile).
robust = RobustScaler(
    with_centering=True,
    with_scaling=True,
    quantile_range=(25.0, 75.0),
)

In [33]:
# Fit the scaler: learn the per-column statistics from the numeric columns.
# Kept as a bare expression so the cell still displays the fitted estimator.
robust.fit(X=numericas)

In [34]:
# Apply the fitted scaler to the same numeric columns; the result is later
# wrapped back into a DataFrame with the original column names.
X_robust = robust.transform(X=numericas)

In [35]:
# Wrap the scaled array back into a labelled DataFrame. Reuse the row index of
# `numericas` as well as its column names: without it the new frame gets a
# fresh 0..n-1 index, and the index-aligned concat with `categoricas` further
# down would pair values with the wrong rows (or add NaN rows) whenever `df`
# does not carry a default RangeIndex.
num_robust = pd.DataFrame(
    X_robust,
    columns=numericas.columns,
    index=numericas.index,
)

In [36]:
# Preview three random rows of the scaled data. A fixed seed keeps the
# displayed rows the same on every Restart & Run All.
num_robust.sample(n=3, random_state=42)

Unnamed: 0,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,basic,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary,basic_boxcox
804,-1.096866,-0.875,-1.083333,-0.624277,-0.876812,-0.461818,-0.416522,-1.466192,-0.43956,2.722222,0.555043,-0.181224,-0.597633,-0.753327,-0.081967,-0.996429,-0.74818,-0.800517,-0.743711,-0.495883,-0.759037,-0.537302,-0.436579,-1.628681
2396,-0.111111,1.097222,3.416667,1.942197,1.630435,0.381818,0.197154,0.569395,0.054945,0.933007,5.054081,0.186829,1.816568,0.308032,-0.360656,0.000387,2.063699,2.304541,0.313525,0.987041,-0.186582,0.164184,0.800027,0.000371
4073,0.777778,1.226852,-0.541667,0.242775,-0.427536,0.778182,-0.365151,-0.02847,1.714286,0.751634,1.35736,0.659505,0.43787,0.299186,0.163934,0.833893,0.849997,1.881989,1.105359,1.566935,0.855215,2.241533,3.109884,0.663825


In [37]:
# Collect the non-numeric (object / category) columns in their own DataFrame
# so they can be joined with the standardized numeric columns.
tipos_no_numericos = ['object', 'category']
categoricas = df.select_dtypes(include=tipos_no_numericos)

In [21]:
# and add our original response variable
# NOTE(review): left disabled — `basic` is a numeric column, so it is already
# part of the scaled numeric frame (in scaled form).
# categoricas['basic'] = df['basic']

In [22]:
# and our response variable after the normalization (Box-Cox) step
# NOTE(review): left disabled — `basic_boxcox` is a numeric column, so it is
# already part of the scaled numeric frame (in scaled form).
# categoricas['basic_boxcox'] = df['basic_boxcox']

In [38]:
# Final standardized dataset: the categorical columns followed by the scaled
# numeric columns, placed side by side and matched on the row index.
df_estandar = pd.concat(
    objs=[categoricas, num_robust],
    axis="columns",
)

In [39]:
# Show the first five rows of the combined, standardized dataset.
df_estandar.head(n=5)

Unnamed: 0,country,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,basic,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary,basic_boxcox
0,South Korea,-0.219373,0.601852,2.395833,1.046243,1.304348,0.747273,4.294342,3.245552,0.307692,1.218954,0.504901,-0.33816,-0.153846,-0.258103,0.245902,0.465023,-0.302919,1.074934,0.208531,0.101671,1.262442,0.904358,0.517125,0.397744
1,China,-0.350427,0.625,3.520833,-0.289017,-0.014493,-0.292727,0.338771,0.241993,-0.142857,1.035948,-0.462312,-0.319477,-0.408284,-0.69155,-0.131148,-0.534202,-0.439942,1.455932,0.053956,1.617468,2.248014,0.750069,-0.00458,-0.617292
2,China,-0.552707,0.421296,1.791667,-0.398844,-0.384058,-0.490909,0.14856,0.014235,-0.285714,0.570261,-0.454115,-0.293321,-0.532544,-0.744625,-0.147541,-0.58884,-0.450326,-0.011987,0.053956,0.335639,2.036034,-0.045124,-0.090119,-0.698365
3,India,-0.923077,-0.069444,-0.625,-0.514451,-0.934783,-0.505455,-0.656716,-1.081851,-0.461538,0.315359,0.627852,-0.181224,-0.597633,-0.797701,0.114754,-0.727198,-0.635978,-0.452562,-0.564346,-0.358551,-0.390842,-0.034428,-0.324674,-0.930884
4,India,-0.746439,-0.398148,-0.666667,-0.50289,-0.884058,-0.483636,-0.690038,-0.967972,-0.450549,0.315359,0.029491,-0.181224,-0.455621,-0.877313,-0.065574,-0.602435,-0.67093,-0.661335,-0.475392,-0.409781,-0.590637,-0.371835,-0.347005,-0.719354


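2- Guardar en un csv el nuevo dataframe que habéis creado para seguir usándolo en los siguientes pair programmings. (Nota: en este notebook lo guardamos en formato pickle en lugar de csv, igual que el archivo de entrada.)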

In [40]:
# Persist only the scaled numeric columns as a pickle file.
num_robust.to_pickle(path='../archivos/numericas_robustas.pkl')

In [41]:
# Persist the full standardized dataset (categorical + scaled numeric columns)
# as a pickle file.
df_estandar.to_pickle(path='../archivos/coste_vida_estandar.pkl')