<a href="https://colab.research.google.com/github/FelipeBuonoEvangelista/Linear_regression_models_with_boston_cahas_practice_test/blob/main/Houses_Boston_Training_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [94]:
#Bibliotecas usadas
import pandas as pd

import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn import datasets

from xgboost import XGBRegressor

# Dataset for the analysis: the Boston housing file hosted by CMU StatLib.
# The raw parse is kept under its own name so that `df` only ever refers to
# the final, correctly shaped table.
raw_df = pd.read_csv(
    filepath_or_buffer="http://lib.stat.cmu.edu/datasets/boston",
    # `delim_whitespace=True` is deprecated since pandas 2.2; the documented
    # equivalent is a whitespace regex separator.
    sep=r"\s+",
    # Skip the descriptive header that precedes the numeric data.
    skiprows=21,
    header=None,
)

# Column names of the final table, in the order the values appear.
columns = [
    'CRIM',
    'ZN',
    'INDUS',
    'CHAS',
    'NOX',
    'RM',
    'AGE',
    'DIS',
    'RAD',
    'TAX',
    'PTRATIO',
    'B',
    'LSTAT',
    'MEDV',
]

# The raw parse is not laid out as one 14-column row per record (presumably
# because records wrap across physical lines - verify against the file), so
# it contains NaN padding. Flatten everything, drop the padding, and regroup
# the remaining values into rows of len(columns).
values_w_nulls = raw_df.to_numpy().flatten()
all_values = values_w_nulls[~np.isnan(values_w_nulls)]

df = pd.DataFrame(
    data=all_values.reshape(-1, len(columns)),
    columns=columns,
)

In [95]:
# Quick look at the first five rows of the assembled table
df.head(n=5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [96]:
# House characteristics used as model inputs
feature_names = [
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
    "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT",
]
X = df[feature_names]

# House price - the regression target, kept as a single-column DataFrame
target_names = ["MEDV"]
y = df[target_names]

# MODELING TECHNIQUES

1. Regressão linear do SKLEARN
<https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression>

2. Support Vector Regression do SKLEARN
<https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html>

3. Gradient Boosted Decision Tree Regression do XGBoost
<https://xgboost.readthedocs.io/en/stable/treemethod.html#>

Modeling Assumptions:

Apenas Variáveis numéricas

# TEST DESIGN

## Dataset split:

Separação padrão do dataset em treino/teste, com 20% dos dados reservados para teste, via método `train_test_split` do SKLEARN
<https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html>

## Métrica de avaliação do modelo:

Avaliação pelas métricas MSE e RMSE, que penalizam grandes erros de previsão.

Utilizando o método do SKLEARN.
<https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html>

In [97]:
# Build the train and test sets: 20% of the rows are held out for testing,
# and the fixed random_state keeps the split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TÉCNICA 1. REGRESSÃO LINEAR



In [98]:
# Fit a linear regression model on the training split
regLinear = LinearRegression().fit(X=X_train, y=y_train)

In [99]:
# Use the fitted linear model to predict the target for the test split
yPredLinear = regLinear.predict(X=X_test)

In [100]:
# Mean squared error of the linear model on the test split
MSELinear = mean_squared_error(y_true=y_test, y_pred=yPredLinear)

In [101]:
# Report the MSE and the RMSE (square root of the MSE) of the linear model
RMSELinear = np.sqrt(MSELinear)
print("MSE Linear:", MSELinear)
print("RMSE Linear:", RMSELinear)

MSE Linear: 24.291119474973485
RMSE Linear: 4.928602182665333


# TÉCNICA 2. SVR

In [None]:
# Train the SVR model with its default hyperparameters.
# y_train is a single-column DataFrame, but SVR expects a 1-D target and
# emits a DataConversionWarning when handed a column vector, so the target
# is flattened explicitly here. The fitted model is unchanged.
# NOTE(review): the features are passed unscaled; SVR is generally sensitive
# to feature scale - consider standardising inputs if its score matters.
regSVR = SVR().fit(X_train, y_train.to_numpy().ravel())

In [103]:
# Use the fitted SVR model to predict the target for the test split
yPredSVR = regSVR.predict(X=X_test)

In [104]:
# Mean squared error of the SVR model on the test split
MSESVR = mean_squared_error(y_true=y_test, y_pred=yPredSVR)

In [105]:
# Report the MSE and the RMSE (square root of the MSE) of the SVR model
RMSESVR = np.sqrt(MSESVR)
print("MSE SVR:", MSESVR)
print("RMSE SVR:", RMSESVR)

MSE SVR: 52.8383657679667
RMSE SVR: 7.269000327965785


# TÉCNICA 3. Gradient Boosted Decision Tree Regression (XGBoost)

In [106]:
# Train the XGBoost regressor (a boosted ensemble of trees, not a single
# decision tree) with its default hyperparameters
regXGB = XGBRegressor().fit(X=X_train, y=y_train)

In [107]:
# Use the fitted XGBoost model to predict the target for the test split
yPredXGB = regXGB.predict(X=X_test)

In [108]:
# Mean squared error of the XGBoost model on the test split
MSEXGB = mean_squared_error(y_true=y_test, y_pred=yPredXGB)

In [109]:
# Report the MSE and the RMSE (square root of the MSE) of the XGBoost model
RMSEXGB = np.sqrt(MSEXGB)
print("MSE XGB:", MSEXGB)
print("RMSE XGB:", RMSEXGB)

MSE XGB: 6.909231565384943
RMSE XGB: 2.62854171840299


# OTIMIZAÇÃO DE HIPERPARÂMETROS


Utilizando o método GridSearchCV do SKLEARN

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html

In [120]:
# Inspect which hyperparameter names the XGB model exposes
xgb_param_map = regXGB.get_params()
xgb_param_map.keys()

dict_keys(['objective', 'base_score', 'booster', 'callbacks', 'colsample_bylevel', 'colsample_bynode', 'colsample_bytree', 'device', 'early_stopping_rounds', 'enable_categorical', 'eval_metric', 'feature_types', 'gamma', 'grow_policy', 'importance_type', 'interaction_constraints', 'learning_rate', 'max_bin', 'max_cat_threshold', 'max_cat_to_onehot', 'max_delta_step', 'max_depth', 'max_leaves', 'min_child_weight', 'missing', 'monotone_constraints', 'multi_strategy', 'n_estimators', 'n_jobs', 'num_parallel_tree', 'random_state', 'reg_alpha', 'reg_lambda', 'sampling_method', 'scale_pos_weight', 'subsample', 'tree_method', 'validate_parameters', 'verbosity'])

In [129]:
# Hyperparameter grid explored by the search; every value is a list of
# candidates, and single-element lists simply pin that setting.
parameters = dict(
    # Maximum depth (number of decision levels) of each tree
    max_depth=[5, 6, 7],
    # Step size controlling how quickly XGBoost learns from the dataset
    learning_rate=[0.1, 0.2, 0.3],
    # Learning task to optimise: regression with squared error
    objective=['reg:squarederror'],
    # Which booster to use: tree-based models
    booster=['gbtree'],
    # Number of parallel threads used to run XGBoost
    n_jobs=[5],
    # Minimum loss reduction required to split a leaf node further
    gamma=[0, 1],
    # Minimum sum of instance weights required in a child node
    min_child_weight=[1, 3],
    # Maximum delta step allowed for each leaf output (0 means no constraint)
    max_delta_step=[0, 1],
    # Fraction of the training rows sampled for each boosting round
    subsample=[0.5, 1],
)

In [131]:
# Configure the grid search over the hyperparameter combinations.
# The metric must be passed through `scoring`: with `scoring` left unset, a
# string given to `refit` is only treated as "truthy" and candidates are
# ranked by the estimator's default score (R^2) instead of the MSE.
# `refit=True` then retrains the best candidate on the whole training set so
# the search object can be used directly for prediction.
xgbGrid = GridSearchCV(
    XGBRegressor(),
    parameters,
    scoring="neg_mean_squared_error",
    refit=True,
    verbose=True,
)

In [132]:
# Run the grid search (cross-validated fits for every candidate) on the
# training split
xgbGridModel = xgbGrid.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


In [133]:
# Show the hyperparameter combination selected by the grid search
best_grid_params = xgbGridModel.best_params_
best_grid_params

{'booster': 'gbtree',
 'gamma': 0,
 'learning_rate': 0.2,
 'max_delta_step': 0,
 'max_depth': 5,
 'min_child_weight': 1,
 'n_jobs': 5,
 'objective': 'reg:squarederror',
 'subsample': 0.5}

In [134]:
# Predict the target for the test split with the refitted best estimator
# from the grid search
yGrid = xgbGridModel.predict(X=X_test)

In [135]:
# Mean squared error of the tuned XGBoost model on the test split
MSEGrid = mean_squared_error(y_true=y_test, y_pred=yGrid)

In [None]:
# Report the MSE and the RMSE (square root of the MSE) of the grid-tuned
# XGBoost model. The labels previously said "SVR", a copy-paste leftover
# that misattributed these numbers to the wrong model.
print("MSE XGB Grid:", MSEGrid)
print("RMSE XGB Grid:", np.sqrt(MSEGrid))