<span style='font-family:"Times New Roman"'>

## Bibliotecas

Biblotecas necessárias para a análise

In [1]:
# Ambiente
from dotenv import dotenv_values

# Manipulação de dados
import sqlite3
import pandas as pd
import numpy as np

# Pipeline de processamento
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.model_selection import train_test_split
from ModelUtitls import Preprocessor
from ModelUtitls import SelectModel
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

# Modelos
import pickle
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import ElasticNet
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Métricas
from sklearn.metrics import r2_score

# Desabilitar avisos
import warnings
warnings.filterwarnings('ignore')


<span style='font-family:"Times New Roman"'>

## Configurações
Configurações de execução

In [2]:
# Ler arquivo de configurações
config = dotenv_values("../.env")
# Versão
version = config["VERSION"]
# Aleatoriedade "padrão"
seed = 42
np.random.seed(seed=seed)
# Leitura de dados
pd.set_option('display.max_columns', 20)


<span style='font-family:"Times New Roman"'>

## Conexões

Banco de dados utilizado

In [3]:
#Conectar com o banco de dados
connection = sqlite3.connect(config["DB_CONFIG"])

___
<span style='font-family:"Times New Roman"'>

## 1. Acessar e Juntar Tabelas

Os dados forma armazenados de forma separada (numéricos e categóricos) e de forma versionada, e precisam ser concatenados novamente.

In [4]:
#Acessar features numéricas e categoricas
df_numeric = pd.read_sql_query(f"SELECT * FROM numerical_features_{version}", connection)
df_categor = pd.read_sql_query(f"SELECT * FROM categorical_features_{version}", connection)
df_numeric.drop(columns=["index"],inplace=True)
df_categor.drop(columns=["index"],inplace=True)
#Juntar tabelas
df = df_categor.join(df_numeric)
df.head()

Unnamed: 0,Neighborhood,ExterQual,BsmtQual,HeatingQC,CentralAir,KitchenQual,GarageFinish,OverallQual,YearBuilt,MasVnrArea,...,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,FullBath,TotRmsAbvGrd,GarageYrBlt,GarageCars,GarageArea,SalePrice
0,CollgCr,Gd,Gd,Ex,Y,Gd,RFn,7,2003,196.0,...,856,856,854,1710,2,8,2003.0,2,548,208500
1,Veenker,TA,Gd,Ex,Y,TA,RFn,6,1976,0.0,...,1262,1262,0,1262,2,6,1976.0,2,460,181500
2,CollgCr,Gd,Gd,Ex,Y,Gd,RFn,7,2001,162.0,...,920,920,866,1786,2,6,2001.0,2,608,223500
3,Crawfor,TA,TA,Gd,Y,Gd,Unf,7,1915,0.0,...,756,961,756,1717,1,7,1998.0,3,642,140000
4,NoRidge,Gd,Gd,Ex,Y,Gd,RFn,8,2000,350.0,...,1145,1145,1053,2198,2,9,2000.0,3,836,250000


___
<span style='font-family:"Times New Roman"'>

## 2. Dividir Treino e Teste e Versionar Dados

In [5]:
# Definir variável objetivo e preditores
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)

#Salvar features
try:
    X_train.to_sql(f"train_features_{version}", connection)
    X_test.to_sql( f"test_features_{version}", connection)
    
    y_train.to_sql( f"train_target_{version}", connection)
    y_test.to_sql( f"test_target_{version}", connection)
    
except Exception as e:
    print(e)
 

Table 'train_features_V1' already exists.


___
<span style='font-family:"Times New Roman"'>

## 3. Avaliar Modelos e Selecionar o Melhor

In [6]:
preprocessor = Preprocessor(X).transformer()
models = {"Gradient Boosting": GradientBoostingRegressor(),
          "XG Boost": XGBRegressor(seed=seed),
          "Elastic Net": ElasticNet(),
          "Linear Regression": LinearRegression(),
          "Tree Regressor": DecisionTreeRegressor(random_state=seed),
          "LGBM Regressor": LGBMRegressor(),
          "Kernel Ridge": KernelRidge()}


In [7]:
# Avaliar modelos
avaliation = SelectModel(models, X_train, y_train,
                         seed, preprocessor).guess_best()

# Rankear os modelos
avaliation_table = pd.DataFrame(avaliation)
avaliation_table.sort_values(
    by=['r_square_mean'], ascending=False, inplace=True)
avaliation_table.reset_index(drop=True, inplace=True)
avaliation_table.head(10)


Unnamed: 0,model,r_square_mean,r_square_stdv,models_algor
0,Gradient Boosting,0.850603,0.071686,GradientBoostingRegressor()
1,LGBM Regressor,0.841636,0.052826,LGBMRegressor()
2,XG Boost,0.825126,0.088346,"XGBRegressor(base_score=None, booster=None, ca..."
3,Kernel Ridge,0.807142,0.101357,KernelRidge()
4,Linear Regression,0.805479,0.099232,LinearRegression()
5,Elastic Net,0.7681,0.13636,ElasticNet()
6,Tree Regressor,0.721169,0.096645,DecisionTreeRegressor(random_state=42)


___
<span style='font-family:"Times New Roman"'>

## 4. Otimizar modelo

In [8]:
#Definir algoritimo padrão
model_alg = avaliation_table.models_algor.head(1)[0]

In [9]:
#Avaliar modelo padrão
default_pipe= Pipeline(steps=[('preprocessor', preprocessor), ('model', model_alg )])
default_pipe.fit(X_train,y_train)
pred_default =default_pipe.predict(X_test)
r2_default =r2_score(y_test,pred_default,multioutput='variance_weighted')

In [10]:
# Grade de Parâmetros
parameters = {'learning_rate': [0.01, 0.02],'subsample': [0.9, 0.5],'n_estimators': [100, 500],'max_depth': [4, 6]}
# Condição de divisão
k_fold = KFold(n_splits=5, shuffle=True, random_state=seed)
#Otimizar e avaliar
otm_pipe = make_pipeline(preprocessor, GridSearchCV(model_alg ,param_grid=parameters,cv=k_fold,refit=True))
otm_pipe.fit(X_train,y_train)
pred_otm = otm_pipe.predict(X_test)
r2_otm =r2_score(y_test,pred_otm,multioutput='variance_weighted')

___
<span style='font-family:"Times New Roman"'>

## 5. Selecionar e salvar Melhor Versão do Modelo

In [11]:
# Selecionar o melhor
if r2_otm > r2_default:
    best_pipe = otm_pipe
else:
    best_pipe = default_pipe
    
#Salvar
pickle.dump(best_pipe, open(f'{config["DEPLOY_VOL"]}_{version}.pkl','wb'))