# Modelos de ML

In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor



In [2]:
# Load the dataset that already contains the dummy (one-hot) variables.
# The path is relative to the notebook's working directory.

dados = pd.read_csv('../../dados/dados_OneHotEncoder.csv')
dados.head()

Unnamed: 0,usableAreas,bedrooms,bathrooms,parkingSpaces,suites,yearlyIptu,monthlyCondoFee,price,unitTypes_ALLOTMENT_LAND,unitTypes_APARTMENT,...,unitTypes_RESIDENTIAL_BUILDING,unitTypes_SHED_DEPOSIT_WAREHOUSE,unitTypes_TWO_STORY_HOUSE,unitTypes_VILLAGE_HOUSE,usageTypes_COMMERCIAL,usageTypes_RESIDENTIAL,zone_Zona Central,zone_Zona Norte,zone_Zona Oeste,zone_Zona Sul
0,5.01728,0.0,0.0,0.0,0.0,0.0,0.0,8.160804,0,0,...,0,0,0,0,0,1,0,0,1,0
1,3.7612,1.098612,0.693147,0.0,0.0,4.615121,5.993961,9.82558,0,1,...,0,0,0,0,0,1,1,0,0,0
2,3.871201,1.098612,1.098612,0.0,0.693147,5.247024,6.214608,9.680406,0,1,...,0,0,0,0,0,1,1,0,0,0
3,3.7612,1.098612,0.693147,0.0,0.0,4.615121,5.993961,9.705098,0,1,...,0,0,0,0,0,1,1,0,0,0
4,3.73767,1.098612,0.693147,0.693147,0.0,0.0,0.0,9.852247,0,1,...,0,0,0,0,0,1,0,0,1,0


## Dividindo em treino e teste

In [3]:
# Target is the `price` column; the features are every remaining column.
y = dados['price']
X = dados.drop(columns='price')

In [6]:
# Single seed reused by the split, the CV splitter and the tree-based models.
SEED = 42

# Hold out 30% of the rows as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=SEED,
)

## Regressão linear

In [8]:
# statsmodels' OLS does not add an intercept on its own, so prepend the
# constant column ('const') to the training features before fitting.
# (`sm` is imported in the notebook's top import cell.)
X_train_constante = sm.add_constant(X_train)
X_train_constante

Unnamed: 0,const,usableAreas,bedrooms,bathrooms,parkingSpaces,suites,yearlyIptu,monthlyCondoFee,unitTypes_ALLOTMENT_LAND,unitTypes_APARTMENT,...,unitTypes_RESIDENTIAL_BUILDING,unitTypes_SHED_DEPOSIT_WAREHOUSE,unitTypes_TWO_STORY_HOUSE,unitTypes_VILLAGE_HOUSE,usageTypes_COMMERCIAL,usageTypes_RESIDENTIAL,zone_Zona Central,zone_Zona Norte,zone_Zona Oeste,zone_Zona Sul
30849,1.0,5.351858,1.386294,1.609438,1.098612,1.386294,7.972811,6.720220,0,0,...,0,0,0,0,0,1,0,0,1,0
16468,1.0,4.477337,1.386294,1.098612,0.693147,0.693147,7.003974,6.660575,0,1,...,0,0,0,0,0,1,0,0,1,0
7552,1.0,4.343805,1.386294,1.386294,0.693147,0.693147,5.860786,6.478510,0,1,...,0,0,0,0,0,1,0,0,1,0
53272,1.0,5.811141,1.609438,1.945910,1.386294,1.609438,9.305741,8.268988,0,1,...,0,0,0,0,0,1,0,0,1,0
31454,1.0,4.787492,1.386294,1.791759,1.098612,1.386294,0.000000,0.000000,0,1,...,0,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,1.0,5.874931,1.609438,1.609438,0.693147,1.098612,7.003974,8.160804,0,1,...,0,0,0,0,0,1,0,0,0,1
38158,1.0,4.574711,1.386294,0.693147,0.000000,1.098612,7.591357,6.857514,0,1,...,0,0,0,0,0,1,0,0,0,1
860,1.0,3.806662,1.098612,0.693147,0.693147,0.000000,0.000000,0.000000,0,1,...,0,0,0,0,0,1,0,0,1,0
15795,1.0,4.330733,1.098612,0.693147,0.000000,0.000000,4.744932,3.931826,0,1,...,0,0,0,0,0,1,0,0,0,1


In [9]:
# Ordinary least squares fit on the training split, using all columns in a
# single fit (no stepwise feature selection is performed here).
# hasconst=True tells statsmodels the design matrix already includes the
# intercept column added by sm.add_constant.

modelo_statsmodels = sm.OLS(y_train, X_train_constante, hasconst=True).fit()
print(modelo_statsmodels.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.810
Model:                            OLS   Adj. R-squared:                  0.810
Method:                 Least Squares   F-statistic:                     5198.
Date:                Thu, 26 May 2022   Prob (F-statistic):               0.00
Time:                        00:53:03   Log-Likelihood:                -21362.
No. Observations:               40147   AIC:                         4.279e+04
Df Residuals:                   40113   BIC:                         4.308e+04
Df Model:                          33                                         
Covariance Type:            nonrobust                                         
                                           coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------


In [10]:
# scikit-learn counterpart of the OLS fit, trained on the same training split.
reg_linear = LinearRegression(n_jobs=-1).fit(
    X=X_train,
    y=y_train,
)

In [11]:
# R² on the training data (the default `score` of scikit-learn regressors).
reg_linear.score(X_train, y_train)

0.8104592371422159

Criando uma função para obter as métricas dos modelos de regressão como um DataFrame.

In [12]:
# Collect the regression metrics of a fitted model into a one-row DataFrame

def df_metrics_regression(modelo, X_test, y_test):
    """Evaluate a fitted regressor and return its error metrics as a DataFrame.

    Parameters
    ----------
    modelo
        Any object exposing ``predict(X)``.
    X_test
        Feature set passed to ``modelo.predict``.
    y_test
        True target values compared against the predictions.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'``.
    """
    # Predict once and reuse the result for every metric; predicting again
    # per metric repeats identical work on the same inputs.
    y_pred = modelo.predict(X_test)

    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)  # RMSE is derived from the MSE computed above
    r2 = metrics.r2_score(y_test, y_pred)
    return pd.DataFrame({'MAE': [mae], 'MSE': [mse], 'RMSE': [rmse], 'R2': [r2]})

In [13]:
# Test-set metrics of the linear regression model.
metricas_reg_linear = df_metrics_regression(
    modelo=reg_linear,
    X_test=X_test,
    y_test=y_test,
)
metricas_reg_linear

Unnamed: 0,MAE,MSE,RMSE,R2
0,0.315061,0.181675,0.426233,0.798996


## Regressor da Árvore de Decisão


Vamos utilizar a validação cruzada para obtermos uma estimativa mais generalista do desempenho do nosso modelo.

A utilização do KFold com shuffle foi necessária pois o conjunto de dados está ordenado em ordem crescente de preço; sem embaralhar, a validação cruzada apresenta problemas, pois o modelo não aprende a generalizar para a parte utilizada como teste em cada fold.

A métrica padrão retornada pelo `cross_val_score` para regressores como o `DecisionTreeRegressor` (por meio do método `.score()`) é o R2.

In [14]:
# Shuffled 5-fold splitter; the fixed seed keeps the fold assignment reproducible.
# (`KFold` is imported in the notebook's top import cell.)
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# Shallow decision tree (max_depth=3); the cell shows the mean score across the folds.
regressor = DecisionTreeRegressor(random_state=SEED, max_depth=3)
cross_val_score(regressor, X, y, cv=kf).mean()

0.6812426670866323

Ou seja, dividindo o conjunto de dados em 5 folds e calculando a média dos resultados do R2 obtemos um valor de 0.68.

Vamos analisar como o modelo se comporta para o conjunto de dados separados para teste.

In [15]:
# Fit this estimator on the training split so it can be evaluated on the held-out test set.
regressor.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=3, random_state=42)

In [16]:
# Test-set metrics of the decision tree regressor.
metricas_dt_reg = df_metrics_regression(
    modelo=regressor,
    X_test=X_test,
    y_test=y_test,
)
metricas_dt_reg

Unnamed: 0,MAE,MSE,RMSE,R2
0,0.408237,0.28767,0.536349,0.681724


## Regressor de Floresta Aleatória

Realizando a mesma análise para o modelo de floresta aleatória:

In [17]:
# Reuse the shuffled 5-fold splitter `kf` created in the decision-tree section:
# with an integer seed it yields the same folds on every call, so re-importing
# KFold and re-creating the splitter here was redundant.
rf_reg = RandomForestRegressor(max_depth=3, random_state=SEED)
cross_val_score(rf_reg, X, y, cv=kf).mean()

0.7074251415820962

In [18]:
# Fit this estimator on the training split so it can be evaluated on the held-out test set.
rf_reg.fit(X_train, y_train)

RandomForestRegressor(max_depth=3, random_state=42)

In [19]:
# Test-set metrics of the random forest regressor.
metricas_rf_reg = df_metrics_regression(
    modelo=rf_reg,
    X_test=X_test,
    y_test=y_test,
)
metricas_rf_reg

Unnamed: 0,MAE,MSE,RMSE,R2
0,0.389545,0.264735,0.514524,0.707099


## Analisando os resultados dos modelos:

In [24]:
# Stack the three one-row metric frames and transpose, giving one row per
# metric and one column per model.
metricas_modelos = pd.concat(
    [metricas_reg_linear, metricas_dt_reg, metricas_rf_reg],
    ignore_index=True,
).T
metricas_modelos.columns = ['Linear Regression', 'Decision Tree', 'Random Forest']
metricas_modelos.head()

Unnamed: 0,Linear Regression,Decision Tree,Random Forest
MAE,0.315061,0.408237,0.389545
MSE,0.181675,0.28767,0.264735
RMSE,0.426233,0.536349,0.514524
R2,0.798996,0.681724,0.707099


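Logo, o modelo que apresentou melhor desempenho no conjunto de teste foi o de regressão linear (menores erros e maior R2). Vale notar que os modelos de árvore foram limitados a `max_depth=3`, sem ajuste de hiperparâmetros.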