# Introdução ao Scikit-learn (Regressores)

<img src="img/scikit_learn.png" alt="Logo do scikit-learn" width="300"/>

O scikit-learn é uma poderosa biblioteca de código aberto (open source) para aprendizado de máquina em Python, que pode ser usada comercialmente (licença BSD). Construída sobre as bibliotecas NumPy, SciPy e matplotlib, oferece ferramentas simples e eficientes para mineração e análise de dados, sendo acessível e reutilizável em vários contextos.

#### O scikit-learn requer Python (>= 3.5)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the housing dataset and label the row index "house" in one expression.
house = pd.read_csv('../../scr/datasets/house.csv').rename_axis("house")

In [3]:
# Number of rows in the dataset.
len(house)

21613

In [4]:
# Preview the first rows of the dataset.
house.head()

Unnamed: 0_level_0,preco,quarto,banheiro,area_convivio,area_total,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,221900.0,3,1.0,1180,5650,1.0
1,538000.0,3,2.25,2570,7242,2.0
2,180000.0,2,1.0,770,10000,1.0
3,604000.0,4,3.0,1960,5000,1.0
4,510000.0,3,2.0,1680,8080,1.0


## Treino e Teste

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Target is the `preco` column; the features are every remaining column.
resposta = house["preco"]
explicativa = house.drop(columns=["preco"])

In [7]:
# Hold out 30% of the rows for testing (test_size=0.3).
# NOTE(review): no random_state is passed, so the split is not seeded here and
# the rows shown in the recorded outputs may differ between runs.
X_train, X_test, y_train, y_test = train_test_split(explicativa, 
                                                    resposta, 
                                                    test_size=0.3)

In [8]:
# First rows of the training features.
X_train.head()

Unnamed: 0_level_0,quarto,banheiro,area_convivio,area_total,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
8082,4,2.75,2710,41811,1.5
21385,4,2.5,2130,4028,2.0
4278,3,2.0,1900,7980,1.0
6249,4,2.0,1440,4855,2.0
10234,2,1.75,1300,4000,2.0


In [9]:
# First rows of the test features.
X_test.head()

Unnamed: 0_level_0,quarto,banheiro,area_convivio,area_total,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12037,4,1.5,1400,8500,2.0
10551,2,1.0,1270,4500,1.5
9297,5,2.5,3020,24750,1.0
7250,4,3.0,3230,438213,2.0
12390,2,1.0,1750,60872,1.0


In [10]:
# First values of the training target.
y_train.head()

house
8082     710000.0
21385    392000.0
4278     235000.0
6249     332500.0
10234    440000.0
Name: preco, dtype: float64

In [11]:
# First values of the test target.
y_test.head()

house
12037    257000.0
10551    510000.0
9297     449228.0
7250     950000.0
12390    650100.0
Name: preco, dtype: float64

## Modelos

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [13]:
# Linear regression trained on the training split (fit() returns the estimator),
# then scored on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
print(f"R² = {lr.score(X_test, y_test)}")

R² = 0.11079140427465861


In [14]:
# K-nearest-neighbours regressor with default hyperparameters, trained on the
# training split (fit() returns the estimator) and scored on the test split.
knnr = KNeighborsRegressor().fit(X_train, y_train)
print(f"R² = {knnr.score(X_test, y_test)}")

R² = -0.08237363894440408


In [15]:
# Random forest with n_estimators=100, trained on the training split
# (fit() returns the estimator) and scored on the test split.
rf = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
print(f"R² = {rf.score(X_test, y_test)}")

R² = -0.012681074364135902


## Avaliação do modelo

In [16]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [17]:
# Hold-out evaluation of the linear regression model.
y_pred = lr.predict(X_test)
# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE are
# symmetric in their arguments, but the documented order is used so the call
# stays correct if an asymmetric metric is substituted.
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {lr.score(X_test, y_test)}")

MAE = 94094811.96873894
MSQE = 5.475059919704132e+16
R² = 0.11079140427465861


In [18]:
# Hold-out evaluation of the k-nearest-neighbours regressor.
y_pred = knnr.predict(X_test)
# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE are
# symmetric in their arguments, but the documented order is used so the call
# stays correct if an asymmetric metric is substituted.
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {knnr.score(X_test, y_test)}")

MAE = 67668195.72988896
MSQE = 6.664421101209485e+16
R² = -0.08237363894440408


In [19]:
# Hold-out evaluation of the random forest regressor.
y_pred = rf.predict(X_test)
# scikit-learn metrics are declared as metric(y_true, y_pred). MAE and MSE are
# symmetric in their arguments, but the documented order is used so the call
# stays correct if an asymmetric metric is substituted.
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {rf.score(X_test, y_test)}")

MAE = 70631660.93070695
MSQE = 6.235308102449542e+16
R² = -0.012681074364135902


## KFold e KFold Repetido

In [20]:
from sklearn.model_selection import KFold
# 3-fold splitter built with default settings apart from n_splits.
kf = KFold(n_splits=3)
# NOTE(review): `kf` is only displayed here; the cross-validation loop further
# down builds its own RepeatedKFold and never references this object.
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [21]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and return its test-split score.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Targets used for fitting.
        y_test: Targets used for scoring.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [22]:
from sklearn.model_selection import RepeatedKFold

# Repeated K-fold: 3 splits repeated 10 times, i.e. 30 train/test partitions.
# random_state=None leaves the shuffling unseeded.
folds = RepeatedKFold(n_splits=3, n_repeats=10, random_state=None)

# One score per partition for each model.
scores_linear = []
scores_knn = []
scores_rf = []

for train_index, test_index in folds.split(explicativa, resposta):

    # NOTE: these names overwrite the hold-out split produced earlier by
    # train_test_split.
    # Features and target are sliced with the same positional indices so rows
    # stay aligned; no rows are dropped from the target alone, since that
    # would leave X and y with different lengths.
    X_train = explicativa.iloc[train_index].values
    X_test = explicativa.iloc[test_index].values
    y_train = resposta.iloc[train_index]
    y_test = resposta.iloc[test_index]

    # Fresh, unfitted estimators for every partition.
    linear_regression = LinearRegression()
    knn_regression = KNeighborsRegressor()
    random_florest = RandomForestRegressor(n_estimators=5)

    scores_linear.append(get_score(linear_regression, X_train, X_test, y_train, y_test))
    scores_knn.append(get_score(knn_regression, X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(random_florest, X_train, X_test, y_train, y_test))

In [23]:
from statistics import mean

In [24]:
# Average linear-regression score over all repeated K-fold partitions.
mean(scores_linear)

0.1080134506900842

In [25]:
# Average k-nearest-neighbours score over all repeated K-fold partitions.
mean(scores_knn)

-0.032491287774119075

In [26]:
# Average random-forest score over all repeated K-fold partitions.
mean(scores_rf)

-0.1155040977987657

## Cross-Validation

In [27]:
from sklearn.model_selection import cross_val_score

In [28]:
# 3-fold cross-validated R² (cv=3, scoring='r2') for linear regression,
# using all rows of the features and target.
cross_val_score(LinearRegression(), 
                explicativa, resposta, cv=3, scoring='r2')

array([0.09502369, 0.12362138, 0.10328607])

In [29]:
# 3-fold cross-validated R² (cv=3, scoring='r2') for the k-nearest-neighbours
# regressor, using all rows of the features and target.
cross_val_score(KNeighborsRegressor(), 
                explicativa, resposta, cv=3, scoring='r2')

array([-0.05332119, -0.02238985, -0.07018131])

In [30]:
# 3-fold cross-validated R² (cv=3, scoring='r2') for a random forest with
# n_estimators=100, using all rows of the features and target.
cross_val_score(RandomForestRegressor(n_estimators=100), 
                explicativa, resposta, cv=3, scoring='r2')

array([-0.0032743 ,  0.09566181, -0.02314308])