In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut

from sklearn.cluster import KMeans

## Template Data Science -- Hands On Machine Learning

### Parte 2 - Regressão

### Dados

In [3]:
def data_info(data):
    """Summarise every column of ``data`` in a small report table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` with the fields ``'var'`` (column
        name), ``'# missing'`` (count of null entries), ``'% missing'``
        (that count divided by the number of rows), ``'types'`` (column
        dtype) and ``'unique values'`` (number of distinct values as
        reported by ``Series.unique``).
    """
    n_rows = data.shape[0]
    null_counts = data.isnull().sum()

    summary = pd.DataFrame({'var': data.columns})
    summary['# missing'] = null_counts.tolist()
    summary['% missing'] = summary['# missing'] / n_rows
    summary['types'] = data.dtypes.tolist()
    summary['unique values'] = [len(data[col].unique()) for col in data.columns]

    return summary

#### Regressão

In [4]:
dataset = pd.read_csv('data/50_Startups.csv')

In [5]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [6]:
data_info(dataset)

Unnamed: 0,var,# missing,% missing,types,unique values
0,R&D Spend,0,0.0,float64,49
1,Administration,0,0.0,float64,50
2,Marketing Spend,0,0.0,float64,48
3,State,0,0.0,object,3
4,Profit,0,0.0,float64,50


### Preprocessing

In [9]:
X = pd.DataFrame(dataset.iloc[:, :4])
y = pd.DataFrame(dataset.iloc[:, -1])

In [10]:
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


#### Lidando com variáveis categóricas

Label Encoder: atribui um número inteiro a cada categoria distinta da variável categórica

    Uso: categóricas ordinais

In [11]:
from sklearn.preprocessing import LabelEncoder

# Replace every object-typed (text) column of X by integer codes.
# X itself is left untouched; the encoded copy lives in X_le.
# The same encoder instance is refitted for each column.
le = LabelEncoder()
X_le = X.copy()
for var in X.columns[X.dtypes == object]:
    X_le[var] = le.fit_transform(X[var])

One hot Encoder

In [12]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Label-encode every object-typed (text) column into X_le, then one-hot
# encode exactly those columns into X_ohe.
le = LabelEncoder()
X_le = X.copy()
X_ohe = X.copy()  # stays a plain copy of X when there is nothing to encode
categorical_vars = [var for var in X.columns if X[var].dtype == object]
for var in categorical_vars:
    X_le[var] = le.fit_transform(X[var])

if categorical_vars:
    # The previous call used OneHotEncoder(categorical_features=[0]), which
    # pointed at the first column (a numeric one) instead of the categorical
    # column, and that argument is no longer accepted by current scikit-learn.
    # Encoding only the detected categorical columns fixes both problems.
    ohe = OneHotEncoder()
    dummies = ohe.fit_transform(X_le[categorical_vars]).toarray()
    remaining = X_le.drop(categorical_vars, axis=1).values
    # Layout: dummy columns first, untouched columns stacked to the right.
    X_ohe = np.hstack([dummies, remaining])

In [13]:
X_ohe = pd.DataFrame(X_ohe)

#### Padronização e Normalização

In [14]:
X_encoded = X_le

Padronização (StandardScaler): remove a média de cada atributo e divide os atributos não-constantes pelo desvio padrão. Ex. de uso: se a variância de um atributo for MUITO maior do que a dos outros atributos, esse atributo pode dominar a função objetivo e impedir o estimador de aprender com as outras features

In [15]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()
std_scaler.fit(X_encoded)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [16]:
X_stds = pd.DataFrame(std_scaler.transform(X_encoded), columns = X_encoded.columns)

Normalização para intervalo: escalar os atributos para que seus valores fiquem dentro de um intervalo, dado o valor mínimo e o máximo

In [17]:
from sklearn.preprocessing import MinMaxScaler

minmaxscaler = MinMaxScaler()
minmaxscaler.fit(X_encoded)

MinMaxScaler(copy=True, feature_range=(0, 1))

In [18]:
X_minmaxs = pd.DataFrame(minmaxscaler.transform(X_encoded), columns = X_encoded.columns)

Normalização Robust Scaler: quando o dataset contém muitos outliers, usar média e variância não traz bons resultados. O Robust Scaler usa uma estimativa mais robusta para encontrar o centro dos dados: remove a mediana e escala os dados de acordo com o intervalo interquartil (IQR), isto é, o intervalo entre o primeiro quartil (25%) e o terceiro quartil (75%)

In [19]:
from sklearn.preprocessing import RobustScaler

robscaler = RobustScaler()
robscaler.fit(X_encoded)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)

In [20]:
X_robusts = pd.DataFrame(robscaler.transform(X_encoded), columns = X_encoded.columns)

Normalização por sample (linha): cada registro/sample com pelo menos um componente não zero é reescalado independentemente dos outros samples

-l1-norm: divide cada elemento de cada sample pela soma dos módulos dos elementos desse sample

-l2-norm: divide cada elemento de cada sample pela raiz quadrada da soma dos quadrados dos elementos desse sample

In [21]:
from sklearn.preprocessing import Normalizer

nscaler = Normalizer(norm = 'l1')
nscaler.fit(X_encoded)

Normalizer(copy=True, norm='l1')

In [22]:
X_norm = pd.DataFrame(nscaler.transform(X_encoded), columns = X_encoded.columns)

#### Train Test Split

In [29]:
#escolha do X
#X_preped = X_stds
X_preped = X_le

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_preped, y, test_size = 0.2, random_state = 0)

In [31]:
from sklearn.model_selection import StratifiedKFold, KFold, LeaveOneOut

# With shuffle=False the k folds are consecutive slices of the data, so no
# random_state is passed: a seed has no effect without shuffling, and recent
# scikit-learn releases raise a ValueError for that combination.
cv_kfold = KFold(5, shuffle = False)
cv_leave_one_out = LeaveOneOut()

### Modelagem -- Regressão

O modelo possui uma função com a cara $$y_{pred} = \beta_{0} + \beta_{1}*x$$

e devemos encontrar os coeficientes que minimizam a função de custo (soma dos erros ao quadrado): $$Error = \sum_{i = 1}^{n}(y_i - y_{pred_{i}})^2$$



In [53]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate

def reg_results_holdout(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and tabulate hold-out metrics.

    The estimator is fitted on ``X_train`` with ``y_train`` flattened to 1-D,
    then used to predict both splits. Mean squared error, mean absolute error
    and R2 are computed for each split.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and target
    X_test, y_test : hold-out features and target

    Returns
    -------
    pandas.DataFrame
        One row per metric (indexed by the metric function's name) with the
        columns ``'Treino'`` (train) and ``'Teste'`` (test). Values are
        strings formatted with three decimal places.
    """
    fitted = clf.fit(X_train, np.ravel(y_train))
    train_pred = fitted.predict(X_train)
    test_pred = fitted.predict(X_test)

    scorers = (mean_squared_error, mean_absolute_error, r2_score)
    row_labels = [str(scorer.__name__) for scorer in scorers]
    train_cells = ["{:.3f}".format(scorer(y_train, train_pred)) for scorer in scorers]
    test_cells = ["{:.3f}".format(scorer(y_test, test_pred)) for scorer in scorers]

    return pd.DataFrame({'Treino': train_cells, 'Teste': test_cells}, index = row_labels)

def reg_results_cv(clf, X_train, y_train, kcv):
    """Cross-validate ``clf`` and tabulate per-fold train/test scores.

    All metrics are evaluated in a single ``cross_validate`` run, so every
    metric is scored on exactly the same folds (this matters when ``kcv``
    shuffles without a fixed seed) and each fold is fitted only once.

    Parameters
    ----------
    clf : estimator exposing ``fit`` and ``predict``
    X_train, y_train : training features and target (target is flattened)
    kcv : cross-validation splitter or number of folds, passed as ``cv``

    Returns
    -------
    pandas.DataFrame
        One row per fold plus a final row labelled ``'media'`` holding the
        column means. For every metric ``m`` there is a ``m + '_Treino'``
        (train) and a ``m + '_Teste'`` (test) column.
    """
    metrics = ['explained_variance', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_median_absolute_error', 'r2' ]
    y_flat = np.ravel(y_train)

    # Kept so that, as before, ``clf`` is fitted on the full training data
    # when the function returns; the scores below do not depend on this fit.
    clf.fit(X_train, y_flat)

    cv_results = cross_validate(clf, X_train, y_flat, scoring = metrics, cv = kcv, return_train_score = True)

    # With several scorers the result keys are 'train_<metric>' / 'test_<metric>'.
    score_columns = {}
    column_order = []
    for m in metrics:
        score_columns['' + m + '_Treino'] = cv_results['train_' + m]
        score_columns['' + m + '_Teste'] = cv_results['test_' + m]
        column_order.extend(['' + m + '_Treino', '' + m + '_Teste'])
    results = pd.DataFrame(score_columns, columns = column_order)

    # Append the per-column average as an extra row labelled 'media'.
    results_mean = results.mean().to_frame('media').T
    results = pd.concat([results, results_mean], axis = 0)
    return results


def reg_grid_search(clf, X_train, y_train, params, score, cv):
    """Run a grid search over ``params`` for ``clf`` and report the best fit.

    Prints the best score and the best parameter combination.

    Parameters
    ----------
    clf : estimator to tune
    X_train, y_train : training features and target (target is flattened)
    params : parameter grid handed to ``GridSearchCV``
    score : value passed as ``scoring``
    cv : value passed as ``cv``

    Returns
    -------
    tuple
        ``(fitted_grid, best_estimator, cv_results)`` taken from the fitted
        ``GridSearchCV`` object.
    """
    # GridSearchCV is not imported by the notebook's import cells, so it is
    # imported here to keep the function usable on a fresh kernel.
    from sklearn.model_selection import GridSearchCV

    grid = GridSearchCV(clf, params, scoring = score, cv = cv, return_train_score=True)
    grid_fitted = grid.fit(X_train, np.ravel(y_train))
    print ("Best score: %.4f" % grid_fitted.best_score_)
    print ("Best parameters: %s" % grid_fitted.best_params_)
    return grid_fitted, grid_fitted.best_estimator_, grid_fitted.cv_results_

#### Regressão Linear Simples -- Ordinary Least Squares (OLS)

In [33]:
from sklearn.linear_model import LinearRegression

# `normalize` is no longer passed: False was its default, and the parameter
# has been removed from scikit-learn's linear models, so passing it fails on
# current releases. The fitted model is unchanged.
lin_reg = LinearRegression(fit_intercept=True, copy_X=True, n_jobs=1)

In [35]:
X_train.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
33,55493.95,103057.49,214634.81,1
35,46014.02,85047.44,205517.64,2
26,75328.87,144135.98,134050.07,1
34,46426.07,157693.92,210797.67,0
18,91749.16,114175.79,294919.57,1


In [34]:
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [36]:
lin_reg.predict([[55490, 104050.6, 163948, 1]])

array([[94973.48038961]])

In [61]:
lin_reg.coef_

array([7.77203690e-01, 3.02432445e-02, 3.49236072e-02, 3.15877202e+02])

In [37]:
reg_results_holdout(lin_reg, X_train, y_train, X_test, y_test)

Unnamed: 0,Treino,Teste
mean_squared_error,81871927.547,78413822.172
mean_absolute_error,6363.728,7400.616
r2_score,0.95,0.939


#### l1 e l2

`l1`: Adiciona à função de custo a soma dos valores absolutos dos coeficientes como penalidade
$$F_{cost} = \sum_{i = 1}^{n}(y_i - y_{pred_{i}})^2 + \lambda \sum_{j = 1}^{p}|\beta_j|$$
`l2`: Adiciona à função de custo a soma dos quadrados dos coeficientes como penalidade
$$F_{cost} = \sum_{i = 1}^{n}(y_i - y_{pred_{i}})^2 + \lambda \sum_{j = 1}^{p}\beta_j^2$$

A norma l1 tem a capacidade de zerar coeficientes pela região em que intercepta as curvas $\beta$, enquanto a norma l2 diminui os coeficientes mas quase nunca zera.

<img src="l1_l2.png" width="400">

`Nota`: **Lasso** é um modelo que utiliza a norma l1 como penalização dos coeficientes e **Ridge** é um modelo que utiliza a norma l2 como penalização dos coeficientes

##### Lasso

In [40]:
from sklearn.linear_model import Lasso
# alpha plays the role of lambda in the cost function above
# with alpha = 0 the model is equivalent to OLS
# `normalize` is no longer passed: False was its default, and the parameter
# has been removed from scikit-learn's linear models.
lasso = Lasso(alpha=1.0, fit_intercept=True, precompute=False, copy_X=True, max_iter=1000, tol=0.0001, 
              warm_start=False, positive=False, random_state=None, selection='cyclic')

In [41]:
lasso.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [42]:
lasso.predict([[55490, 104050.6, 163948, 1]])

array([94973.40252365])

In [44]:
#coeficientes do modelo linear
lasso.coef_

array([7.77210914e-01, 3.02394872e-02, 3.49226656e-02, 3.14483927e+02])

Holdout

In [49]:
reg_results_holdout(lasso, X_train, y_train, X_test, y_test)

Unnamed: 0,Treino,Teste
mean_squared_error,81871928.941,78409578.915
mean_absolute_error,6363.547,7400.263
r2_score,0.95,0.939


CV

In [54]:
reg_results_cv(lasso, X_train, y_train, cv_kfold)

Unnamed: 0,explained_variance_Treino,explained_variance_Teste,neg_mean_absolute_error_Treino,neg_mean_absolute_error_Teste,neg_mean_squared_error_Treino,neg_mean_squared_error_Teste,neg_median_absolute_error_Treino,neg_median_absolute_error_Teste,r2_Treino,r2_Teste
0,0.960281,0.79958,-5995.127755,-9326.727466,-74612570.0,-135031100.0,-4369.811,-6044.517151,0.960281,0.799436
1,0.953117,0.890481,-6636.670264,-7340.832735,-83834980.0,-98867900.0,-5769.919739,-4365.056802,0.953117,0.884993
2,0.961225,0.915803,-5900.715688,-8230.275844,-59327950.0,-179743400.0,-3758.395888,-4744.869898,0.961225,0.911928
3,0.949317,0.949031,-6693.014494,-6601.670245,-85996780.0,-72075610.0,-4942.248444,-5819.876883,0.949317,0.934711
4,0.929199,0.974227,-6338.701819,-7054.842046,-89836800.0,-80169300.0,-3771.095384,-4526.070776,0.929199,0.974221
media,0.950628,0.905824,-6312.846004,-7710.869667,-78721820.0,-113177500.0,-4522.294091,-5100.078302,0.950628,0.901058


##### Ridge

In [74]:
from sklearn.linear_model import Ridge

# `normalize` is no longer passed: False was its default, and the parameter
# has been removed from scikit-learn's linear models.
ridge = Ridge(alpha=1.0, fit_intercept=True, copy_X=True, max_iter=None, tol=0.001, solver='auto', 
              random_state=None)

In [75]:
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [76]:
ridge.predict([[55490, 104050.6, 163948, 1]])

array([[94972.88593105]])

In [77]:
ridge.coef_

array([[7.77258802e-01, 3.02145889e-02, 3.49164259e-02, 3.05245020e+02]])

Holdout

In [78]:
reg_results_holdout(ridge, X_train, y_train, X_test, y_test)

Unnamed: 0,Treino,Teste
mean_squared_error,81872008.683,78381502.762
mean_absolute_error,6362.343,7397.918
r2_score,0.95,0.939


CV

In [80]:
reg_results_cv(ridge, X_train, y_train, cv_kfold)

Unnamed: 0,explained_variance_Treino,explained_variance_Teste,neg_mean_absolute_error_Treino,neg_mean_absolute_error_Teste,neg_mean_squared_error_Treino,neg_mean_squared_error_Teste,neg_median_absolute_error_Treino,neg_median_absolute_error_Teste,r2_Treino,r2_Teste
0,0.96028,0.800175,-6001.766534,-9315.80136,-74613700.0,-134625100.0,-4358.943134,-6041.913114,0.96028,0.800039
1,0.953113,0.892227,-6624.587765,-7278.052389,-83841740.0,-97382430.0,-5805.083318,-4261.041581,0.953113,0.886721
2,0.961225,0.915829,-5901.484766,-8232.364633,-59328040.0,-179696400.0,-3758.645474,-4754.80151,0.961225,0.911951
3,0.949316,0.949181,-6684.881539,-6570.251944,-85998200.0,-71837790.0,-4929.204523,-5764.040497,0.949316,0.934927
4,0.929198,0.974296,-6344.131238,-7034.265889,-89837300.0,-79953860.0,-3772.569525,-4500.878077,0.929198,0.97429
media,0.950627,0.906342,-6311.370368,-7686.147243,-78723800.0,-112699100.0,-4524.889195,-5064.534956,0.950627,0.901585


##### Elastic Net

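Combinação da norma l1 com a norma l2, onde $\alpha$ (`alpha`) controla a intensidade total da penalização e $\rho$ (`l1_ratio`) controla a proporção entre l1 e l2
$$J = \frac{1}{2n}\sum_{i = 1}^{n}(y_i - y_{pred_{i}})^2 + \alpha \rho \sum_{j = 1}^{p}|\beta_j| + \frac{\alpha (1-\rho)}{2} \sum_{j = 1}^{p}\beta_j^2$$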

In [87]:
from sklearn.linear_model import ElasticNet
# with l1_ratio = 0 the penalty is the pure l2 norm
# `normalize` is no longer passed: False was its default, and the parameter
# has been removed from scikit-learn's linear models.
elastic = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True, precompute=False, max_iter=1000, copy_X=True, 
                     tol=0.0001, warm_start=False, positive=False, random_state=None, selection='cyclic')

In [88]:
elastic.fit(X_train, y_train)

ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [89]:
elastic.predict([[55490, 104050.6, 163948, 1]])

array([94966.20585529])

In [90]:
elastic.coef_

array([7.77878112e-01, 2.98925769e-02, 3.48357278e-02, 1.85768305e+02])

Holdout

In [91]:
reg_results_holdout(elastic, X_train, y_train, X_test, y_test)

Unnamed: 0,Treino,Teste
mean_squared_error,81884077.657,78026839.429
mean_absolute_error,6346.78,7367.593
r2_score,0.95,0.939


CV

In [92]:
reg_results_cv(elastic, X_train, y_train, cv_kfold)

Unnamed: 0,explained_variance_Treino,explained_variance_Teste,neg_mean_absolute_error_Treino,neg_mean_absolute_error_Teste,neg_mean_squared_error_Treino,neg_mean_squared_error_Teste,neg_median_absolute_error_Treino,neg_median_absolute_error_Teste,r2_Treino,r2_Teste
0,0.960219,0.805715,-6074.936267,-9212.389021,-74729420.0,-130854300.0,-4653.578968,-6126.480142,0.960219,0.80564
1,0.952764,0.906736,-6577.614293,-6725.524427,-84467410.0,-85047140.0,-5278.097973,-3345.606971,0.952764,0.90107
2,0.96122,0.916077,-5909.130637,-8253.130817,-59335960.0,-179239600.0,-3761.126781,-4853.538023,0.96122,0.912175
3,0.949239,0.950353,-6622.901644,-6335.825214,-86128430.0,-69906340.0,-4813.106567,-5267.065384,0.949239,0.936676
4,0.92916,0.974926,-6405.029251,-6842.847637,-89885270.0,-78001730.0,-3786.283001,-4266.512765,0.92916,0.974918
media,0.95052,0.910761,-6317.922418,-7473.943423,-78909300.0,-108609800.0,-4458.438658,-4771.840657,0.95052,0.906096


##### KNN Regressor

##### Árvore de Regressão

##### SVR