# 3_modeling_evaluation_suto

In [1]:
import os

from feature_engine.encoding import RareLabelEncoder, CountFrequencyEncoder
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pycaret.regression import *
import seaborn as sns
from sklearn.exceptions import NotFittedError
from sklearn.linear_model import LinearRegression, ElasticNet, Ridge
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import (
    cross_val_score, cross_validate, RepeatedStratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.utils.validation import check_is_fitted

# Step up to the project root so the local `src` package can be imported, and
# always return to the directory the notebook started in. Restoring the saved
# path (instead of a hard-coded './notebooks') keeps the working directory
# intact even when one of the imports fails or the folder has another name.
_notebook_dir = os.getcwd()
os.chdir('../')
try:
    import src.config as config
    from src.preprocessing import non_numeric_to_nan, get_car_brand
    from src.preprocessing import KNNImputerDataframe
    from src.pipeline import pipe_categorical
finally:
    os.chdir(_notebook_dir)

# Data folders, written relative to the notebook's working directory.
path_interim_data = "../data/interim/"
path_data = "../data/raw/"

# Aesthetic parameters for the plots:
axes_style = "ticks"
set_palette = "Paired"

# A 50-character rule made of dashes.
linha = "-" * 50

# Carregamento de dados

Usando o pipeline obtido na etapa anterior.

In [2]:
# Names of the numeric features, as declared in the project configuration.
lst_numeric = config.NUMERICAL_FEATURES

# Read the raw CSV from the raw-data folder and preview its first rows.
raw_csv_path = path_data + config.DATA_FILE
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


## Separação em treino e validação

In [3]:
# Separate the predictors from the target column named in the configuration.
features = df.drop(columns=[config.TARGET])
target = df[config.TARGET]

# Hold out 2.5% of the rows for validation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.025, random_state=config.RANDOM_STATE)

print(f"Dimensão do conjunto de treino: {X_train.shape}")
print(f"Dimensão do conjunto de validação: {X_test.shape}")

Dimensão do conjunto de treino: (388, 8)
Dimensão do conjunto de validação: (10, 8)


# Preparação dos dados usando o Pipeline

## Primeiro: usando uma pipeline para as variáveis categóricas

- criação do fator "car brand" contendo o nome do fabricante; e
- transformação dos dados categóricos em numéricos para serem consumidos pelo modelo.

In [4]:
# Preview the first rows of the training features before any encoding is applied.
X_train.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
9,8,390.0,190,3850,8.5,70,1,amc ambassador dpl
383,4,91.0,67,1965,15.0,82,3,honda civic
56,4,91.0,70,1955,20.5,71,1,plymouth cricket
72,8,304.0,150,3892,12.5,72,1,amc matador (sw)
132,4,140.0,75,2542,17.0,74,1,chevrolet vega


In [8]:
# Clean the numeric columns of both splits.
# NOTE(review): presumably non_numeric_to_nan replaces non-numeric entries of the
# listed columns with NaN (judging by its name) - confirm in src.preprocessing.
X_train = non_numeric_to_nan(X_train, lst_numeric)
X_test = non_numeric_to_nan(X_test, lst_numeric)

# Derive the "car brand" column from "car name" on each split.
X_train = X_train.assign(**{"car brand": X_train["car name"].apply(get_car_brand)})
X_test = X_test.assign(**{"car brand": X_test["car name"].apply(get_car_brand)})

# Fit the categorical pipeline on the training split only, then apply it to both splits.
pipe_categorical.fit(X_train)
X_test_encoded = pipe_categorical.transform(X_test)
X_train_encoded = pipe_categorical.transform(X_train)

X_train_encoded.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car brand
9,8,390.0,190.0,3850,8.5,70,1,amc ambassador dpl,0.069588
383,4,91.0,67.0,1965,15.0,82,3,honda civic,0.427835
56,4,91.0,70.0,1955,20.5,71,1,plymouth cricket,0.074742
72,8,304.0,150.0,3892,12.5,72,1,amc matador (sw),0.069588
132,4,140.0,75.0,2542,17.0,74,1,chevrolet vega,0.110825


In [10]:
# Preview the first rows of the encoded validation features.
X_test_encoded.head()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,car brand
198,4,91.0,53.0,1795,17.4,76,3,honda civic,0.427835
396,4,120.0,79.0,2625,18.6,82,1,ford ranger,0.123711
33,6,232.0,100.0,2634,13.0,71,1,amc gremlin,0.069588
208,8,318.0,150.0,3940,13.2,76,1,plymouth volare premier v8,0.074742
93,8,318.0,150.0,4237,14.5,73,1,plymouth fury gran sedan,0.074742


## Segundo: substituir os valores faltantes dos fatores numéricos usando KNN

## Testando vários modelos de regressão usando o PyCaret

In [None]:
# PyCaret expects the features and the target in a single frame, but the target
# was split off into y_train earlier, so X_train_encoded has no target column.
# Re-attach it (aligned on the index) before calling setup.
train_with_target = X_train_encoded.assign(**{config.TARGET: y_train})

# session_id seeds the experiment so the model ranking is reproducible.
regression = setup(data=train_with_target, target=config.TARGET,
                   session_id=config.RANDOM_STATE)

# Rank the candidate regressors by mean squared error.
best = compare_models(sort='MSE')

## Testando regressão linear com regularização de rede elástica

### Rodando com valores padrões (*default*)

In [None]:
# The target was separated from the features at the train/validation split, so it
# is taken from y_train rather than looked up inside X_train_encoded (which has no
# target column). Only numeric columns are kept, because the linear model cannot
# consume the raw "car name" strings.
# NOTE(review): "horsepower" may still contain NaN at this point (the KNN
# imputation section above has no code yet) and ElasticNet rejects NaN - confirm.
X = (X_train_encoded
     .drop(columns=[config.TARGET], errors="ignore")
     .select_dtypes(include="number"))
y = y_train
scoring = 'neg_mean_squared_error'
random_state = 42

# Baseline ElasticNet with default hyperparameters, fitted on all training rows.
model = ElasticNet(random_state=random_state)
model.fit(X, y)

# 3-fold cross-validation, keeping both train and test scores for comparison.
cross = cross_validate(model, X, y, cv=3, scoring=scoring,
                       return_train_score=True, return_estimator=True, n_jobs=-1)

# Scores are negated MSE; flip the sign so the printed value is a plain MSE.
for key in ['test_score', 'train_score']:
    print(f"""{key}: {round(-cross[key].mean(), 3)}""")

### Tuning do modelo

In [None]:
# Hyperparameter grid explored for the ElasticNet.
params = {"alpha": [0.001, 0.01, 0.05, 0.1],
          "l1_ratio": [0.75, 0.9, 0.98, 1],
          "selection": ['cyclic', 'random']}

# Exhaustive search with 3-fold cross-validation, scored with `scoring`.
grid_search = GridSearchCV(estimator=model, param_grid=params,
                           scoring=scoring, cv=3, n_jobs=-1, return_train_score=True)
grid_search.fit(X, y)

print(f"""Best hyperparameters: {grid_search.best_params_}""")
print(f"""{scoring}: {round(grid_search.score(X, y),3)}""")

# Report the cross-validated scores of the selected candidate. Averaging
# cv_results_ over every candidate in the grid would mix in the discarded
# configurations and say nothing about the chosen model.
best_idx = grid_search.best_index_
for item in ["mean_test_score", "mean_train_score"]:
    print(f"""{item}: {round(grid_search.cv_results_[item][best_idx], 3)}""")
    

In [None]:
# Rebuild the ElasticNet with the hyperparameters chosen by the grid search
# and fit it on all training rows.
model = ElasticNet(random_state=random_state, **grid_search.best_params_)
model.fit(X, y)

# 3-fold cross-validation of the tuned model, keeping train and test scores.
cross = cross_validate(estimator=model, X=X, y=y, scoring=scoring, cv=3,
                       n_jobs=-1, return_train_score=True, return_estimator=True)

# The scorer returns negated MSE, so the sign is flipped before printing.
print(f"test_score: {round(-cross['test_score'].mean(), 3)}")
print(f"train_score: {round(-cross['train_score'].mean(), 3)}")

In [None]:
# Report whether `model` has been fitted. Only NotFittedError is caught so that
# any other problem (for example `model` not being an estimator) still surfaces
# instead of being reported as "not fitted".
try:
    check_is_fitted(model)
except NotFittedError:
    print("Não fittado.")
else:
    print("Fittado.")


In [None]:
# Label the coefficients with the columns the model was actually fitted on.
# Slicing X_train_encoded.columns[:-1] assumed the target was the last column,
# but the last column there is "car brand", so a real feature was dropped and
# the remaining labels no longer lined up with model.coef_.
lst_columns = X.columns.tolist()
coef_importance = pd.DataFrame(data=model.coef_.reshape(1, -1), columns=lst_columns)

## Plotando

In [None]:
def create_linspace_df(df, features_list=None, elements_number=100):
    """
    Build a DataFrame of evenly spaced values for each requested feature,
    intended as an input grid for regression predictions.

    For every feature the grid runs from the column minimum to the column
    maximum, each widened by 5% of its own magnitude (away from the data, so
    negative bounds are widened too). Bounds are kept as floats: truncating
    them to integers would collapse features whose values lie in a narrow
    fractional range (for example 0.07-0.43) into a constant column.

    Inputs:
        df: DataFrame from which the evenly spaced domain is derived.
        features_list: iterable with the names of the numeric columns to
            include. When None (or the legacy ``[None]``), every float column
            of ``df`` is used. An empty iterable yields an empty DataFrame.
        elements_number: number of points generated per feature (default 100).

    Output:
        df_result: DataFrame with ``elements_number`` rows and one column per
            feature, in the order given by ``features_list``.
    """
    # `[None]` is still accepted for backward compatibility with the old default.
    use_all_floats = features_list is None or (
        len(features_list) > 0 and features_list[0] is None)
    if use_all_floats:
        features_list = df.select_dtypes(include=[float, "float32", "float64"]).columns

    grid = {}
    for feature in features_list:
        lower = df[feature].min()
        upper = df[feature].max()
        # Widen each bound by 5% away from the observed range, whatever its sign.
        lower = lower * 0.95 if lower >= 0 else lower * 1.05
        upper = upper * 1.05 if upper >= 0 else upper * 0.95
        grid[feature] = np.linspace(lower, upper, num=elements_number, endpoint=True)

    # Build the frame once instead of inserting columns one by one.
    df_result = pd.DataFrame(grid)
    return df_result

In [None]:
# Evenly spaced grid with 50 points for every column in lst_columns.
df_reg = create_linspace_df(X_train_encoded, features_list=lst_columns, elements_number=50)

# Container for the line equations of each "feature vs 'mpg'" relation.
dct_equations = dict()

# Tuned ElasticNet, fitted on the full training data.
model = ElasticNet(random_state=random_state, **grid_search.best_params_)
model.fit(X, y)

# Build the textual equation, leaving out terms whose coefficient lies
# strictly between -0.1 and 0.1.
equation_terms = [
    " + (" + str(round(coef, 2)) + "*" + feature + ")"
    for coef, feature in zip(model.coef_, lst_columns)
    if not (-0.1 < coef < 0.1)
]
str_equation = "y = " + str(round(model.intercept_, 2)) + "".join(equation_terms)

# Predictions of the fitted model over the grid.
df_reg["y_pred_mpg"] = model.predict(df_reg)

print(f"Dimensão da matriz: {df_reg.shape}")

str_equation

### Fatores mais importantes:

In [None]:
# One row per feature the model was fitted on. The labels come from X.columns
# rather than lst_columns because this cell rebinds lst_columns at the end:
# building from lst_columns would pair the already-filtered names with the full
# coefficient vector when the cell is run a second time.
coef_importance = pd.DataFrame({"coeficient": model.coef_}, index=X.columns)

# Keep only coefficients whose magnitude exceeds 0.1.
coef_importance = coef_importance.loc[coef_importance["coeficient"].abs() > 0.1, :]

coef_importance.sort_values(by='coeficient', ascending=False).plot(kind='bar');

# From here on lst_columns holds only the retained (important) features.
lst_columns = coef_importance.index

In [None]:
# Two panels per row; round the row count up and keep at least one row.
n_features = len(lst_columns)
nrows = max(1, (n_features + 1) // 2)

figsize_height = nrows*5

# squeeze=False keeps `axs` two-dimensional even with a single row, so the same
# code works for one or two features as well.
fig, axs = plt.subplots(ncols=2, nrows=nrows, figsize=(14, figsize_height), sharey=True,
                        squeeze=False, gridspec_kw={'hspace': .35, 'wspace': .1})

for ax, feature in zip(axs.flat, lst_columns):
    # The observed target lives in y_train (X_train_encoded has no 'mpg' column);
    # both share the row order of the training split.
    ax.scatter(X_train_encoded[feature], y_train)
    # Model predictions over the evenly spaced grid.
    ax.plot(df_reg[feature], df_reg["y_pred_mpg"], color='red')
    ax.set_title("Regressão linear simples de\n'"+feature+"' vs 'mpg'",
                 fontdict={'fontsize':14}, pad=12)
    ax.set(ylim=[0,50], xlabel=feature)

for ax in axs.flat:
    ax.set(ylabel='mpg')

# With an odd number of features the last panel has nothing to show; hide it.
for ax in axs.ravel()[n_features:]:
    ax.set_visible(False)