<a href="https://colab.research.google.com/github/CristianoDataScience/project_regressao_lightgbm/blob/main/project_regressao_lightgbm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Regressão com LightGBM

In [None]:
# Install the LightGBM library.
# Left commented out; uncomment the line below only when the package is not
# already installed in the current environment.
#!pip install lightgbm



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Read the housing CSV from the Drive path below into a DataFrame,
# then preview its first two rows.
caminho_csv = "/content/drive/MyDrive/ml_project/housing.csv"
df = pd.read_csv(caminho_csv)
df.head(n=2)

Unnamed: 0,RM,LSTAT,PTRATIO,MEDV
0,6.575,4.98,15.3,504000.0
1,6.421,9.14,17.8,453600.0


In [None]:
# Feature matrix: pick the three predictor columns by name rather than by
# position, so a reordered or extended CSV cannot silently feed the wrong
# columns (or the target itself) into the model. A missing column now fails
# loudly with a KeyError instead.
colunas_independentes = ["RM", "LSTAT", "PTRATIO"]
independente = df[colunas_independentes].to_numpy()
independente

array([[ 6.575,  4.98 , 15.3  ],
       [ 6.421,  9.14 , 17.8  ],
       [ 7.185,  4.03 , 17.8  ],
       ...,
       [ 6.976,  5.64 , 21.   ],
       [ 6.794,  6.48 , 21.   ],
       [ 6.03 ,  7.88 , 21.   ]])

In [None]:
# (rows, columns) of the feature matrix.
np.shape(independente)

(489, 3)

In [None]:
# Target vector: the MEDV column, selected by name instead of by position so
# a change in the CSV column order cannot silently swap the target.
dependente = df["MEDV"].to_numpy()
# Preview only the first few values instead of printing the whole array.
dependente[:5]

array([ 504000.,  453600.,  728700.,  701400.,  760200.,  602700.,
        480900.,  569100.,  346500.,  396900.,  315000.,  396900.,
        455700.,  428400.,  382200.,  417900.,  485100.,  367500.,
        424200.,  382200.,  285600.,  411600.,  319200.,  304500.,
        327600.,  291900.,  348600.,  310800.,  386400.,  441000.,
        266700.,  304500.,  277200.,  275100.,  283500.,  396900.,
        420000.,  441000.,  518700.,  646800.,  732900.,  558600.,
        531300.,  518700.,  445200.,  405300.,  420000.,  348600.,
        302400.,  407400.,  413700.,  430500.,  525000.,  491400.,
        396900.,  743400.,  518700.,  663600.,  489300.,  411600.,
        392700.,  336000.,  466200.,  525000.,  693000.,  493500.,
        407400.,  462000.,  365400.,  438900.,  508200.,  455700.,
        478800.,  491400.,  506100.,  449400.,  420000.,  436800.,
        445200.,  426300.,  588000.,  501900.,  520800.,  480900.,
        501900.,  558600.,  472500.,  466200.,  495600.,  6027

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
x_treino, x_teste, y_treino, y_teste = train_test_split(
    independente,
    dependente,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Shapes of the training and test feature matrices, in that order.
np.shape(x_treino), np.shape(x_teste)

((342, 3), (147, 3))

In [None]:
# LightGBM regressor with 80 estimators, learning rate 0.1 and depth capped at 3.
# NOTE(review): a depth-3 tree has at most 2**3 = 8 leaves, so num_leaves=50
# presumably never becomes the binding limit here — confirm against the
# LightGBM parameter documentation.
lgbm = lgb.LGBMRegressor(
    num_leaves=50,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=80,
)
# Fit on the training split; as the last expression of the cell, the fitted
# estimator is also displayed.
lgbm.fit(x_treino, y_treino)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 266
[LightGBM] [Info] Number of data points in the train set: 342, number of used features: 3
[LightGBM] [Info] Start training from score 455325.438596


In [None]:
# R² of the fitted model on the training split.
lgbm.score(X=x_treino, y=y_treino)

0.8893163266944558

## Teste

In [None]:
# R² of the fitted model on the held-out test split.
lgbm.score(X=x_teste, y=y_teste)

0.8215549565147177

In [None]:
# Predictions of the fitted model for the held-out test rows.
previsoes_teste = lgbm.predict(x_teste)
# Preview only the first few predictions instead of printing every value.
previsoes_teste[:5]

array([429957.83172558, 658026.10945464, 314361.66536898, 339412.14628855,
       509864.92362807, 254223.62239231, 314361.66536898, 417744.1309617 ,
       473848.39919632, 405816.83288083, 255705.38767794, 327025.39904138,
       463859.22894613, 211385.51753435, 495378.61293465, 326078.43264074,
       462952.07922391, 520669.26216917, 393563.64469714, 624169.82662562,
       613264.46251714, 770133.6164671 , 211385.51753435, 444399.85469015,
       395564.71237076, 876479.44580325, 790493.1590794 , 883874.17523368,
       402744.41141387, 428484.75089403, 264828.44602241, 385002.72533538,
       468671.55645878, 718942.65826556, 463925.57105078, 829322.29802178,
       515511.82671657, 264707.68975331, 498349.69463228, 528786.31174773,
       255705.38767794, 377051.89980474, 424289.2040299 , 490392.21819011,
       350829.31705271, 376110.4682326 , 509864.92362807, 334490.41901664,
       788696.91090459, 426933.53533782, 731946.7906507 , 437296.61150974,
       563005.03257261, 3

## Métricas

In [None]:
# Mean absolute error (MAE) of the test-set predictions.
mean_absolute_error(y_true=y_teste, y_pred=previsoes_teste)

55057.91594046354

In [None]:
# Root mean squared error (RMSE) of the test-set predictions:
# compute the MSE first, then take its square root.
mse_teste = mean_squared_error(y_teste, previsoes_teste)
np.sqrt(mse_teste)

71954.08357587352

## Validação Cruzada

In [None]:
# Fold generator for cross-validation: 12 shuffled folds with a fixed seed
# so the fold assignment is reproducible.
kfold = KFold(
    n_splits=12,
    shuffle=True,
    random_state=5,
)

In [None]:
# Fresh, unfitted regressor with the same hyperparameters as the model trained
# on the train/test split. verbose=-1 silences LightGBM's per-fit info logs,
# which would otherwise be repeated once for each of the cross-validation fits
# and bury the scores.
modelo = lgb.LGBMRegressor(
    num_leaves=50,
    max_depth=3,
    learning_rate=0.1,
    n_estimators=80,
    verbose=-1,
)
# One score per fold, computed on the full dataset with the k-fold splitter
# (the estimator's default score, reported below as R²).
resultado = cross_val_score(modelo, independente, dependente, cv=kfold)
resultado

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 448, number of used features: 3
[LightGBM] [Info] Start training from score 455128.125000
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000056 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 339
[LightGBM] [Info] Number of data points in the train set: 448, number of used features: 3
[LightGBM] [Info] Start training from score 454073.437500
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 338
[LightGBM] [Info] Number of data points in the train set: 448, number of used features: 3
[LightGBM] [Info] Start train

array([0.9042588 , 0.84159254, 0.84949108, 0.82755848, 0.73822486,
       0.87072778, 0.77074808, 0.85316955, 0.81458269, 0.77248802,
       0.84281123, 0.79924832])

In [None]:
# Summarize the per-fold scores by their mean and standard deviation,
# both expressed as percentages.
media_pct = resultado.mean() * 100.0
desvio_pct = resultado.std() * 100.0
print("Coeficiente de Determinação Médio: %.2f%%" % media_pct)
print("Coeficiente de Determinação Desvio Padrão: %.2f%%" % desvio_pct)

Coeficiente de Determinação Médio: 82.37%
Coeficiente de Determinação Desvio Padrão: 4.50%


**REGRESSÃO COM LIGHTGBM:** R² = 0,89 (treino) / 0,82 (teste); MAE (teste) = 55057,92; RMSE (teste) = 71954,08. R² médio na validação cruzada (12 folds): 82,37% (desvio padrão de 4,50%).