In [11]:
# Hide warnings so they do not clutter the cell outputs.
# NOTE: simplefilter('ignore') silences warnings from every library, not only sklearn.
import warnings
warnings.simplefilter('ignore')

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

La **estandarización** es la transformación de una variable para que tenga media 0 y desviación estándar 1. No cambia la forma de su distribución: solo la centra y la re-escala.

In [2]:
# Fixed seed so the train/test split (and every score computed from it) is
# reproducible across kernel restarts.
SEED = 42

# Load the dataset; 'worldwide_gross' is the regression target.
x = pd.read_csv('./data/x.csv')
y = x['worldwide_gross']

# Remove the target and 'gross' from the feature matrix in a single call.
# NOTE(review): 'gross' is presumably dropped because it is closely tied to
# the target -- confirm against the dataset description.
x = x.drop(['worldwide_gross', 'gross'], axis=1)

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=SEED)


In [3]:
# Learn each feature's mean and standard deviation from the training set only,
# so that no information from the test set leaks into the scaling.
# StandardScaler.fit ignores its `y` argument, so only the features are passed.
scaler = StandardScaler()
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
scaler.mean_ # per-feature means computed by fit() on x_train

array([3.34338753e+07, 2.87597108e+05, 4.40683110e+05, 2.68485524e+05,
       9.83170663e+03, 3.98838314e+07, 6.45974659e+00])

In [6]:
scaler.scale_  # per-feature standard deviations used to scale the data

array([4.12879810e+07, 1.08165872e+07, 1.15992019e+07, 1.13499580e+07,
       1.44171102e+04, 2.32848395e+08, 1.07468396e+00])

In [7]:
# Preview the training features after standardizing them with the fitted scaler.
scaler.transform(x_train)

array([[ 0.15903235, -0.02640335, -0.03799238, ...,  0.908871  ,
        -0.00809038, -2.19575864],
       [-0.44647074, -0.02640298, -0.03799233, ...,  0.89569222,
        -0.10686709,  1.06101278],
       [ 0.64343482, -0.02640325, -0.03799238, ...,  1.04780315,
         0.1722845 , -0.14864518],
       ...,
       [-0.73711222, -0.02640307, -0.03799238, ..., -0.52095784,
        -0.15840277, -3.21931537],
       [-0.73711222, -0.02640233, -0.03799233, ...,  2.35680334,
        -0.15840277, -0.98610068],
       [-0.37381037, -0.02640288, -0.03799233, ..., -0.52831022,
        -0.09398317, -1.26525252]])

Reescalamos los datos:

In [9]:
# Apply the already-fitted scaler to both splits; the test set reuses the
# statistics learned from the training set.
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [12]:
from sklearn.linear_model import Lasso

# Two identically configured Lasso regressors: one trained on the raw
# features and one on the standardized features, to compare the effect of
# scaling.
model = Lasso()
model.fit(x_train, y_train)

model_scaled = Lasso()
model_scaled.fit(x_train_scaled, y_train)


Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [13]:
# Score each model on the held-out test split, using the matching
# (raw vs. scaled) version of the test features.
score_raw = model.score(x_test, y_test)
score_scaled = model_scaled.score(x_test_scaled, y_test)

print('Modelo: ', score_raw)
print('Modelo escalado: ', score_scaled)

Modelo:  0.6230147690820664
Modelo escalado:  0.6230147684399951


<div class='alert alert-info'> En este ejemplo la regresión no necesitó re-escalamiento (ambos scores son prácticamente iguales), pero muchos clasificadores sí lo necesitan. </div>

## Simplificar las transformaciones con pipelines

In [16]:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline

# Chain scaling and regression into a single estimator: fitting the pipeline
# fits the scaler on x_train and then trains the Lasso on the scaled output.
scaling_step = StandardScaler()
regression_step = Lasso()
model_scaled = make_pipeline(scaling_step, regression_step)

model_scaled.fit(x_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [17]:
# The pipeline receives the raw test features; scaling happens inside it.
pipeline_score = model_scaled.score(x_test, y_test)
print(pipeline_score)

0.6230147684399951


## Crear features de forma automática.

In [20]:
from sklearn.preprocessing import PolynomialFeatures

# Small 3 x 2 example matrix: [[0, 1], [2, 3], [4, 5]]
A = np.arange(6).reshape(3, 2)

# degree=2 is the degree of the polynomial features to generate.
transformer = PolynomialFeatures(degree=2)

# fit_transform(A) is shorthand for fit(A) followed by transform(A).
transformer.fit_transform(A)

array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

**PolynomialFeatures** transforma la matriz [A<sub>1</sub>, A<sub>2</sub>] -> [1, A<sub>1</sub>, A<sub>2</sub>, A<sub>1</sub>², A<sub>1</sub>  $\cdot$ A<sub>2</sub>, A<sub>2</sub>²]

In [22]:
# (rows, columns) of the feature matrix before the polynomial expansion
x.shape

(4104, 7)

In [23]:
# Expand the full feature matrix into degree-2 polynomial features and check
# how many columns result.
transformer = PolynomialFeatures(degree=2)
x_poly = transformer.fit_transform(x)
x_poly.shape

(4104, 36)

Pasamos de tener 7 features a tener 36 features.

In [24]:
# Pipeline that expands the features to degree-2 polynomials before the Lasso.
poly_step = PolynomialFeatures(degree=2)
model_poly = make_pipeline(poly_step, Lasso())

model_poly.fit(x_train, y_train)

# Score of the polynomial pipeline on the held-out test split.
model_poly.score(x_test, y_test)


0.6362003450850185