# Introdução ao Scikit-learn (Regressores)

<img src="img/scikit_learn.png" alt="drawing" width="300"/>

O scikit-learn é uma poderosa biblioteca de código aberto (open source) para aprendizado de máquina em Python, que pode ser usada comercialmente (licença BSD). Construída sobre as bibliotecas NumPy, SciPy e matplotlib, possui ferramentas simples e eficientes para mineração e análise de dados, sendo uma biblioteca acessível e reutilizável em vários contextos.

#### O scikit-learn requer Python (>= 3.5)

In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [29]:
# Load the tab-separated dataset and name the row index "cidade" in one expression.
so2 = pd.read_csv('../../scr/datasets/so2.tsv', sep="\t").rename_axis("cidade")

In [30]:
so2.columns

Index(['so2', 'temp', 'manuf', 'pop', 'vento', 'precip', 'dias'], dtype='object')

In [31]:
len(so2)

41

In [32]:
so2.head()

Unnamed: 0_level_0,so2,temp,manuf,pop,vento,precip,dias
cidade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,10,70.3,213,582,6.0,7.05,36
1,13,61.0,91,132,8.2,48.52,100
2,12,56.7,453,716,8.7,20.66,67
3,17,51.9,454,515,9.0,12.95,86
4,56,49.1,412,158,9.0,43.37,127


## Treino e Teste

In [36]:
from sklearn.model_selection import train_test_split

In [37]:
# Response: the "so2" column. Predictors: every remaining column.
resposta = so2["so2"]
explicativa = so2.drop(columns=["so2"])

In [39]:
# Hold out 30% of the rows for testing. No random_state is passed, so the
# partition is not pinned to a seed.
(X_train, X_test,
 y_train, y_test) = train_test_split(explicativa, resposta, test_size=0.3)

In [40]:
X_train.head()

Unnamed: 0_level_0,temp,manuf,pop,vento,precip,dias
cidade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
39,55.2,35,71,6.5,40.75,148
30,50.0,343,179,10.6,42.75,125
35,51.0,137,176,8.7,15.17,89
16,55.0,625,905,9.6,41.31,111
11,52.3,361,746,9.7,38.74,121


In [41]:
X_test.head()

Unnamed: 0_level_0,temp,manuf,pop,vento,precip,dias
cidade,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
19,54.5,381,507,10.0,37.0,99
8,75.5,207,335,9.0,59.8,128
18,43.5,699,744,10.6,25.94,137
5,54.0,80,80,9.0,40.25,114
9,61.5,368,497,9.1,48.34,115


In [42]:
y_train.head()

cidade
39    31
30    94
35    28
16    47
11    28
Name: so2, dtype: int64

In [43]:
y_test.head()

cidade
19    14
8     10
18    29
5     36
9     24
Name: so2, dtype: int64

## Modelos

In [136]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [158]:
# Fit a linear regression on the training split and report its test-set score.
lr = LinearRegression().fit(X_train, y_train)
print(f"R² = {lr.score(X_test, y_test)}")

R² = 0.23955052023065382


In [159]:
# Fit a k-nearest-neighbours regressor (default settings) and report its test-set score.
knnr = KNeighborsRegressor().fit(X_train, y_train)
print(f"R² = {knnr.score(X_test, y_test)}")

R² = 0.1309965172452533


In [160]:
# Fit a 100-tree random forest and report its test-set score.
rf = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
print(f"R² = {rf.score(X_test, y_test)}")

R² = 0.4130642034602852


## Avaliação do modelo

In [171]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [172]:
# Evaluate the linear regression on the held-out split.
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
y_pred = lr.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {lr.score(X_test, y_test)}")

MAE = 19.46112969176843
MSQE = 640.8294266913089
R² = 0.23955052023065382


In [173]:
# Evaluate the k-nearest-neighbours regressor on the held-out split.
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
y_pred = knnr.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {knnr.score(X_test, y_test)}")

MAE = 19.63076923076923
MSQE = 732.3076923076923
R² = 0.1309965172452533


In [174]:
# Evaluate the random forest on the held-out split.
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order.
y_pred = rf.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {rf.score(X_test, y_test)}")

MAE = 16.65076923076923
MSQE = 494.6097538461539
R² = 0.4130642034602852


## KFold e KFold Repetido (RepeatedKFold)

In [126]:
from sklearn.model_selection import KFold
# Cross-validation splitter configured for 3 folds; all other options keep their defaults.
kf = KFold(n_splits=3)
kf  # bare expression so the notebook displays the splitter's configuration

KFold(n_splits=3, random_state=None, shuffle=False)

In [127]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its score on the test split.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Targets used for fitting.
        y_test: Targets used for scoring.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [134]:
from sklearn.model_selection import RepeatedKFold

# 3-fold cross-validation repeated 10 times (3 x 10 = 30 train/test splits).
folds = RepeatedKFold(n_splits=3, n_repeats=10, random_state=None)

# Test-split score of each model, one entry per split.
scores_linear = []
scores_knn = []
scores_rf = []

for train_index, test_index in folds.split(explicativa, resposta):
    # `iloc` is an indexer: select rows by position with [] rather than calling it.
    X_train = explicativa.iloc[train_index].values
    X_test = explicativa.iloc[test_index].values
    # NOTE(review): dropna() is applied to the targets only; if the target ever
    # contained missing values, X and y would no longer have matching lengths.
    y_train = resposta.iloc[train_index].dropna()
    y_test = resposta.iloc[test_index].dropna()

    # New estimators are created for every split.
    linear_regression = LinearRegression()
    knn_regression = KNeighborsRegressor()
    random_florest = RandomForestRegressor(n_estimators=5)

    # Score the linear model created above (the previous code referenced an
    # undefined `logistic_regression` name here).
    scores_linear.append(get_score(linear_regression, X_train, X_test, y_train, y_test))
    scores_knn.append(get_score(knn_regression, X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(random_florest, X_train, X_test, y_train, y_test))

In [129]:
# Score the linear model on the variables left over from the last split of the
# loop; `logistic_regression` is not defined anywhere in this notebook.
get_score(linear_regression, X_train, X_test, y_train, y_test)

0.2610453848661288

In [130]:
from statistics import mean

In [131]:
mean(scores_linear)

0.008394624287208325

In [132]:
mean(scores_knn)

-0.12444574330865361

In [133]:
mean(scores_rf)

-0.6386212506409281

## Cross-Validation

In [114]:
from sklearn.model_selection import cross_val_score

In [123]:
# One score per fold for a linear regression, using 3-fold cross-validation.
cross_val_score(estimator=LinearRegression(), X=explicativa, y=resposta, cv=3)

array([0.78513796, 0.27277237, 0.34954727])

In [124]:
# One score per fold for a k-nearest-neighbours regressor, using 3-fold cross-validation.
cross_val_score(estimator=KNeighborsRegressor(), X=explicativa, y=resposta, cv=3)

array([ 0.3678977 , -0.49433709, -0.2715298 ])

In [125]:
# One score per fold for a 100-tree random forest, using 3-fold cross-validation.
cross_val_score(
    estimator=RandomForestRegressor(n_estimators=100),
    X=explicativa,
    y=resposta,
    cv=3,
)

array([ 0.5814977 , -0.07752934,  0.08227919])