# Introdução ao Scikit-learn (Regressores)

<img src="img/scikit_learn.png" alt="drawing" width="300"/>

O scikit-learn é uma poderosa biblioteca de código aberto (open source) para aprendizado de máquina em Python e pode ser usada comercialmente (licença BSD). Construída sobre as bibliotecas NumPy, SciPy e matplotlib, oferece ferramentas simples e eficientes para mineração e análise de dados, sendo acessível e reutilizável em vários contextos.

#### O scikit-learn requer Python (>= 3.5)

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the housing dataset from CSV and label the row index "house".
house = pd.read_csv('../../scr/datasets/house.csv').rename_axis(index="house")

In [3]:
# Number of rows in the dataset.
len(house)

21613

In [4]:
# Preview the first rows of the dataset.
house.head()

Unnamed: 0_level_0,preco,quarto,banheiro,area_convivio,area_total,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,221900.0,3,1.0,1180,5650,1.0
1,538000.0,3,2.25,2570,7242,2.0
2,180000.0,2,1.0,770,10000,1.0
3,604000.0,4,3.0,1960,5000,1.0
4,510000.0,3,2.0,1680,8080,1.0


## Treino e Teste

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Response variable: the "area_total" column.
resposta = house["area_total"]
# Explanatory variables: every remaining column.
explicativa = house.drop(columns=["area_total"])

In [7]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# No random_state is passed, so the split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(
    explicativa, resposta, test_size=0.3
)

In [8]:
# Preview the training features (the response column is not included).
X_train.head()

Unnamed: 0_level_0,preco,quarto,banheiro,area_convivio,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6584,550000.0,3,2.25,1980,1.0
19713,345000.0,2,2.25,1110,3.0
5760,1100000.0,4,3.25,3190,2.0
9773,355000.0,0,0.0,2460,2.0
13167,530000.0,3,2.5,1950,2.0


In [9]:
# Preview the test features.
X_test.head()

Unnamed: 0_level_0,preco,quarto,banheiro,area_convivio,andar
house,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12246,490000.0,2,1.75,1440,1.0
8436,285000.0,2,1.0,1010,1.0
6061,199000.0,4,1.0,1280,1.5
14322,480000.0,5,1.75,2080,1.0
8236,415000.0,3,2.25,1950,1.0


In [10]:
# Preview the training response values.
y_train.head()

house
6584     40887
19713     1290
5760     11774
9773      8049
13167     9906
Name: area_total, dtype: int64

In [11]:
# Preview the test response values.
y_test.head()

house
12246      6265
8436       7200
6061      10521
14322    217800
8236       8868
Name: area_total, dtype: int64

## Modelos

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [13]:
# Linear regression: fit on the training split, then report the score
# (R²) obtained on the held-out test split.
lr = LinearRegression().fit(X_train, y_train)
print(f"R² = {lr.score(X_test, y_test)}")

R² = 0.020460784659075637


In [14]:
# k-nearest-neighbours regressor with default hyperparameters: fit on the
# training split, then report the score (R²) on the held-out test split.
knnr = KNeighborsRegressor().fit(X_train, y_train)
print(f"R² = {knnr.score(X_test, y_test)}")

R² = -0.19448174889531256


In [None]:
# Random forest with 100 trees: fit on the training split, then report the
# score (R²) on the held-out test split.
rf = RandomForestRegressor(n_estimators=100).fit(X_train, y_train)
print(f"R² = {rf.score(X_test, y_test)}")

## Avaliação do modelo

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [None]:
# Evaluate the linear regression on the held-out test split.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
y_pred = lr.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {lr.score(X_test, y_test)}")

In [None]:
# Evaluate the k-nearest-neighbours regressor on the held-out test split.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
y_pred = knnr.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {knnr.score(X_test, y_test)}")

In [None]:
# Evaluate the random forest on the held-out test split.
# scikit-learn metrics take the ground truth first: metric(y_true, y_pred).
y_pred = rf.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred)}")
print(f"MSQE = {mean_squared_error(y_test, y_pred)}")
print(f"R² = {rf.score(X_test, y_test)}")

## KFold e KFold Repetido

In [None]:
from sklearn.model_selection import KFold
# K-fold splitter configured with 3 splits; the bare name on the last line
# makes the notebook display its configuration.
kf = KFold(n_splits=3)
kf

In [None]:
def get_score(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and return its score on the test data.

    Args:
        model: Object exposing ``fit(X, y)`` and ``score(X, y)``.
        X_train: Features used for fitting.
        X_test: Features used for scoring.
        y_train: Targets used for fitting.
        y_test: Targets used for scoring.

    Returns:
        Whatever ``model.score(X_test, y_test)`` returns.
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    return test_score

In [None]:
from sklearn.model_selection import RepeatedKFold

# Repeated 3-fold cross-validation with 10 repetitions (30 fits per model).
# random_state=None means the fold assignment differs on every run.
folds = RepeatedKFold(n_splits=3, n_repeats=10, random_state=None)

# One score per fold and repetition, for each model.
scores_linear = []
scores_knn = []
scores_rf = []

for train_index, test_index in folds.split(explicativa, resposta):
    # Fold-local names keep the hold-out split produced by train_test_split
    # (X_train, X_test, y_train, y_test) intact, so the cells that use it can
    # still be re-run after this loop.
    X_fold_train = explicativa.iloc[train_index].values
    X_fold_test = explicativa.iloc[test_index].values
    # Features and targets are selected with the same positional indices and
    # no rows are dropped afterwards, so both always have the same length.
    y_fold_train = resposta.iloc[train_index]
    y_fold_test = resposta.iloc[test_index]

    # Fresh, unfitted estimators for every fold.
    linear_regression = LinearRegression()
    knn_regression = KNeighborsRegressor()
    random_forest = RandomForestRegressor(n_estimators=5)

    scores_linear.append(
        get_score(linear_regression, X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    scores_knn.append(
        get_score(knn_regression, X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )
    scores_rf.append(
        get_score(random_forest, X_fold_train, X_fold_test, y_fold_train, y_fold_test)
    )

In [None]:
from statistics import mean

In [None]:
# Average score of the linear regression over all folds and repetitions.
mean(scores_linear)

In [None]:
# Average score of the k-nearest-neighbours regressor over all folds and repetitions.
mean(scores_knn)

In [None]:
# Average score of the random forest over all folds and repetitions.
mean(scores_rf)

## Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 3-fold cross-validated R² of a linear regression; one score per fold.
cross_val_score(LinearRegression(), 
                explicativa, resposta, cv=3, scoring='r2')

In [None]:
# 3-fold cross-validated R² of a k-nearest-neighbours regressor; one score per fold.
cross_val_score(KNeighborsRegressor(), 
                explicativa, resposta, cv=3, scoring='r2')

In [None]:
# 3-fold cross-validated R² of a 100-tree random forest; one score per fold.
cross_val_score(RandomForestRegressor(n_estimators=100), 
                explicativa, resposta, cv=3, scoring='r2')