## Estandarización

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler


import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

import sys
sys.path.append("../../")
from src import funciones as fun
from src import variables as var

In [2]:
# Load the cost-of-living dataset from the project's own pickle file
# (the displayed columns include the Box-Cox-transformed response `basic_boxcox`).
# NOTE(review): pickle files should only be read from trusted sources.
df = pd.read_pickle("../archivos/coste_vida_boxcox.pkl")
# Preview the first two rows.
df.head(2)

Unnamed: 0,country,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,basic,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary,basic_boxcox
0,South Korea,6.15,3.93,2.2,3.53,4.04,10.58,41.61,3.71,1.05,15.36,2.12,3.46,1.0,0.92,1.43,182.13,22.48,55.88,9.6,404.15,17902.55,1731.08,2689.62,16.684542
1,China,5.69,3.98,2.74,1.22,2.22,4.86,13.12,1.6,0.64,14.24,0.94,3.56,0.57,0.43,1.2,66.0,17.07,63.49,8.54,1382.62,26379.45,1561.59,1419.87,10.443987


En vuestro dataset habréis identificado unas variables predictoras y una variable respuesta. Los objetivos del pair programming de hoy son:

1- Sobre las variables predictoras, en este caso deberéis estandarizar estas variables. De nuevo, usad el método que prefiráis.

In [4]:
# Numeric predictors only: keep the numeric columns and leave out the
# response variable in both its raw ('basic') and Box-Cox ('basic_boxcox') forms.
numericas = df.select_dtypes(include=np.number).drop(
    columns=['basic', 'basic_boxcox']
)

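- Como nosotras hemos decidido dejar los outliers porque consideramos que contienen información relevante dentro de nuestra muestra, optamos por estandarizar las variables predictoras con el método RobustScaler: al centrar con la mediana y escalar con el rango intercuartílico, es menos sensible a los valores atípicos que los métodos basados en la media y la desviación típica.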

In [5]:
# Build the scaler object, using RobustScaler with its default settings.
robust = RobustScaler()

In [6]:
# Fit the scaler on the numeric predictors.
robust.fit(numericas)

In [7]:
# Transform the data: apply the fitted scaler to the same predictors it was fitted on.
X_robust = robust.transform(numericas)

In [8]:
# Wrap the scaled values back into a DataFrame. Reuse both the column labels
# and the row index of `numericas`: the later column-wise concat aligns on the
# index, so a fresh default index here would misalign rows (and introduce NaNs)
# whenever `df` does not carry a plain 0..n-1 index.
num_robust = pd.DataFrame(X_robust, columns=numericas.columns, index=numericas.index)

In [9]:
# Show three random rows of the scaled predictors. A fixed seed keeps the
# displayed sample identical across "Restart & Run All" executions.
num_robust.sample(3, random_state=42)

Unnamed: 0,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary
2637,2.62963,0.731481,1.5,0.693642,1.268116,0.96,1.22874,1.437722,0.868132,1.035948,0.947524,0.936011,0.295858,0.095731,1.393443,-0.141075,2.956393,0.713088,1.067458,1.835919,0.887572,0.454278
974,-0.746439,-0.763889,-0.916667,-0.722543,-0.905797,-0.818182,-0.545644,-1.19573,-0.527473,-1.017974,-0.773788,-0.297057,-0.674556,-0.850776,-0.081967,-0.732223,-1.107919,-1.012031,-0.40075,-0.654237,-0.57457,-0.620824
3361,0.022792,0.569444,-0.8125,0.913295,-0.173913,2.030909,0.506768,0.042705,1.901099,1.486928,0.873753,0.883699,1.029586,0.339468,-0.147541,0.816311,-0.621283,0.704338,1.179647,1.506303,1.24019,1.234664


In [10]:
# Collect the categorical (object/category) columns so they can be joined to
# the standardized numeric predictors. Take an explicit copy: columns are added
# to this frame afterwards, and writing into a selection that may share data
# with `df` is what triggers pandas' SettingWithCopyWarning (silenced in this
# notebook by the global warnings filter).
categoricas = df.select_dtypes(include=['object', 'category']).copy()

In [17]:
# Add the original (untransformed) response variable; assignment aligns on the row index.
categoricas['basic'] = df['basic']

In [18]:
# Add the response variable after the normalization transform (the Box-Cox column).
categoricas['basic_boxcox'] = df['basic_boxcox']

In [19]:
# Final standardized dataset: categorical columns plus both response columns
# on the left, scaled numeric predictors on the right, aligned on the row index.
df_estandar = pd.concat(
    [categoricas, num_robust],
    axis="columns",
)

In [20]:
# Preview the first rows of the combined frame.
df_estandar.head()

Unnamed: 0,country,basic_boxcox,basic,mcdonalds,cappuccino,milk,rice,eggs,chicken,beef,banana,water,wine,beer,cigarettes_marlboro,public_transport_ticket,taxi,gasoline,internet,gym_monthly,cinema,preschool,primary_school,apt_3beds_outcentre,monthly_salary
0,South Korea,16.684542,182.13,-0.219373,0.601852,2.395833,1.046243,1.304348,0.747273,4.294342,3.245552,0.307692,1.218954,0.504901,-0.33816,-0.153846,-0.258103,0.245902,-0.302919,1.074934,0.208531,0.101671,1.262442,0.904358,0.517125
1,China,10.443987,66.0,-0.350427,0.625,3.520833,-0.289017,-0.014493,-0.292727,0.338771,0.241993,-0.142857,1.035948,-0.462312,-0.319477,-0.408284,-0.69155,-0.131148,-0.439942,1.455932,0.053956,1.617468,2.248014,0.750069,-0.00458
2,China,9.945543,59.65,-0.552707,0.421296,1.791667,-0.398844,-0.384058,-0.490909,0.14856,0.014235,-0.285714,0.570261,-0.454115,-0.293321,-0.532544,-0.744625,-0.147541,-0.450326,-0.011987,0.053956,0.335639,2.036034,-0.045124,-0.090119
3,India,8.515986,43.57,-0.923077,-0.069444,-0.625,-0.514451,-0.934783,-0.505455,-0.656716,-1.081851,-0.461538,0.315359,0.627852,-0.181224,-0.597633,-0.797701,0.114754,-0.635978,-0.452562,-0.564346,-0.358551,-0.390842,-0.034428,-0.324674
4,India,9.8165,58.07,-0.746439,-0.398148,-0.666667,-0.50289,-0.884058,-0.483636,-0.690038,-0.967972,-0.450549,0.315359,0.029491,-0.181224,-0.455621,-0.877313,-0.065574,-0.67093,-0.661335,-0.475392,-0.409781,-0.590637,-0.371835,-0.347005


2- Guardar en un csv el nuevo dataframe que habéis creado para seguir usándolo en los siguientes pair programmings

In [21]:
# Save the scaled numeric predictors on their own as a pickle file.
num_robust.to_pickle('../archivos/numericas_robustas.pkl')

In [22]:
# Save the full standardized dataset (categoricals, responses, scaled predictors) as a pickle file.
df_estandar.to_pickle('../archivos/coste_vida_estandar.pkl')