In [1]:
# Para que no se vean los warnings de sklearn
import warnings
warnings.simplefilter('ignore')

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

La **estandarización** es la transformación de una variable para que tenga media 0 y desviación estándar 1, es decir, la misma escala que una distribución gaussiana estándar.

In [3]:
# Load the feature table; the 'worldwide_gross' column is the regression target.
x = pd.read_csv('./data/x.csv')
y = x['worldwide_gross']
# Remove the target and 'gross' from the predictors in a single call
# (presumably 'gross' is dropped because it would leak the target — confirm).
x = x.drop(['worldwide_gross', 'gross'], axis=1)
# Fix the seed so the split, and every score computed from it, is
# reproducible after Restart Kernel -> Run All.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)


In [4]:
# Learn the per-feature mean and standard deviation from the training split.
scaler = StandardScaler()
# StandardScaler is unsupervised: fit() ignores the target, so only the
# features are passed. The fitted scaler is the cell's displayed value.
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
scaler.mean_ # per-feature means learned by the scaler during fit

array([3.34001395e+07, 2.95237617e+05, 5.14726189e+05, 3.66521672e+05,
       1.01876176e+04, 3.91913985e+07, 6.45399610e+00])

In [6]:
scaler.scale_  # per-feature standard deviations used to rescale each column

array([4.13313699e+07, 1.08350801e+07, 1.27104448e+07, 1.25192164e+07,
       1.93785461e+04, 2.29699834e+08, 1.07506227e+00])

In [7]:
# Preview the training features after standardization with the fitted scaler.
scaler.transform(x_train)

array([[ 1.00649605, -0.02706308, -0.04049617, ...,  2.60382704,
         0.15589302,  0.60089905],
       [-0.08226535, -0.02706252, -0.04049613, ..., -0.50832594,
        -0.01824729,  0.41486332],
       [-0.37260172, -0.02706335, -0.04049617, ...,  0.43983601,
        -0.09225692, -1.16644043],
       ...,
       [-0.66293809, -0.02706585, -0.04049617, ...,  0.27950406,
        -0.13648856,  0.87995265],
       [-0.44518581, -0.02706234,  0.33537168, ..., -0.51580844,
        -0.10531744, -0.7013511 ],
       [-0.25162823, -0.02706419, -0.04049617, ...,  0.04259259,
        -0.08790341,  0.32184545]])

Reescalamos los datos:

In [8]:
# Standardize both splits with the same fitted scaler.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [9]:
from sklearn.linear_model import Lasso

# Two identically configured Lasso regressors: one for the raw features,
# one for the standardized features.
model, model_scaled = Lasso(), Lasso()

# Fit each regressor on its own version of the training features; the last
# call is left as a bare expression so the fitted estimator is displayed.
model.fit(x_train, y_train)
model_scaled.fit(x_train_scaled, y_train)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [10]:
# Report the test-set score of each regressor, pairing every model with the
# version of the test features it was trained to consume.
for label, estimator, features in (
    ('Modelo: ', model, x_test),
    ('Modelo escalado: ', model_scaled, x_test_scaled),
):
    print(label, estimator.score(features, y_test))

Modelo:  0.5543224309607073
Modelo escalado:  0.5543224318355524


<div class='alert alert-info'> Las regresiones no necesitan re-escalamiento, pero los clasificadores sí. </div>

## Simplificar las transformaciones con pipelines

In [11]:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline

# Chain standardization and Lasso into a single estimator, so scaling is
# applied automatically whenever the pipeline is fitted or scored.
scaling_step = StandardScaler()
regression_step = Lasso()
model_scaled = make_pipeline(scaling_step, regression_step)

# Fit on the raw training features; the pipeline handles the scaling itself.
model_scaled.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [12]:
# Test-set score of the scaling + Lasso pipeline, computed from the raw test features.
pipeline_score = model_scaled.score(x_test, y_test)
print(pipeline_score)

0.5543224318355524


## Crear features de forma automática.

In [13]:
from sklearn.preprocessing import PolynomialFeatures

# Toy 3 x 2 matrix holding the integers 0..5.
A = np.arange(6).reshape((3, 2))

# Degree of the polynomial expansion to generate.
transformer = PolynomialFeatures(degree=2)
# fit_transform(A) is shorthand for transformer.fit(A) followed by
# transformer.transform(A).
transformer.fit_transform(A)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

**PolynomialFeatures** transforma la matriz [A<sub>1</sub>, A<sub>2</sub>] -> [1, A<sub>1</sub>, A<sub>2</sub>, A<sub>1</sub>², A<sub>1</sub>  $\cdot$ A<sub>2</sub>, A<sub>2</sub>²]

In [14]:
x.shape  # (rows, columns) of the feature table before any polynomial expansion

(4104, 7)

In [15]:
# Expand the full feature table to degree-2 polynomial terms and show the
# resulting (rows, columns) shape.
transformer = PolynomialFeatures(degree=2)
x_poly = transformer.fit_transform(x)
x_poly.shape

(4104, 36)

Pasamos de tener 7 features a tener 36 features.

In [16]:
# Lasso trained on degree-2 polynomial expansions of the raw features.
poly_expander = PolynomialFeatures(degree=2)
model_poly = make_pipeline(poly_expander, Lasso())

# fit() returns the pipeline itself, so fitting and scoring can be chained;
# the test-set score is the cell's displayed value.
model_poly.fit(x_train, y_train).score(x_test, y_test)


0.5770581324970545