# Imports

In [2]:
import pickle
import pathlib

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score

## Tratamento de dados feito pelo professor

In [3]:
DATA_DIR = pathlib.Path.cwd().parent / 'data'
print(DATA_DIR)

/Users/enzoquental/Desktop/ml/ames/data


In [4]:
clean_data_path = DATA_DIR / 'processed' / 'ames_clean.pkl'

In [5]:
with open(clean_data_path, 'rb') as file:
    data = pickle.load(file)

In [6]:
model_data = data.copy()

In [7]:
categorical_columns = []
ordinal_columns = []
for col in model_data.select_dtypes('category').columns:
    if model_data[col].cat.ordered:
        ordinal_columns.append(col)
    else:
        categorical_columns.append(col)

In [8]:
ordinal_columns

['Lot.Shape',
 'Land.Slope',
 'Overall.Qual',
 'Overall.Cond',
 'Exter.Qual',
 'Exter.Cond',
 'Heating.QC',
 'Electrical',
 'Kitchen.Qual',
 'Functional',
 'Paved.Drive',
 'Fence']

In [9]:
categorical_columns

['MS.SubClass',
 'MS.Zoning',
 'Land.Contour',
 'Lot.Config',
 'Neighborhood',
 'Bldg.Type',
 'House.Style',
 'Roof.Style',
 'Mas.Vnr.Type',
 'Foundation',
 'Bsmt.Qual',
 'Bsmt.Cond',
 'Bsmt.Exposure',
 'BsmtFin.Type.1',
 'BsmtFin.Type.2',
 'Central.Air',
 'Garage.Type',
 'Garage.Finish',
 'Sale.Type',
 'Sale.Condition',
 'Condition',
 'Exterior']

In [10]:
for col in ordinal_columns:
    codes, _ = pd.factorize(data[col], sort=True)
    model_data[col] = codes

In [11]:
model_data[ordinal_columns].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   Lot.Shape     2877 non-null   int64
 1   Land.Slope    2877 non-null   int64
 2   Overall.Qual  2877 non-null   int64
 3   Overall.Cond  2877 non-null   int64
 4   Exter.Qual    2877 non-null   int64
 5   Exter.Cond    2877 non-null   int64
 6   Heating.QC    2877 non-null   int64
 7   Electrical    2877 non-null   int64
 8   Kitchen.Qual  2877 non-null   int64
 9   Functional    2877 non-null   int64
 10  Paved.Drive   2877 non-null   int64
 11  Fence         2877 non-null   int64
dtypes: int64(12)
memory usage: 292.2 KB


In [12]:
data['Lot.Shape'].value_counts()

Reg    1825
IR1     960
IR2      76
IR3      16
Name: Lot.Shape, dtype: int64

In [13]:
model_data['Lot.Shape'].value_counts()

0    1825
1     960
2      76
3      16
Name: Lot.Shape, dtype: int64

In [14]:
model_data['Exterior'].value_counts()

VinylSd    1024
HdBoard     439
MetalSd     432
Wd Sdng     401
Plywood     218
CemntBd     126
BrkFace      86
WdShing      55
Stucco       42
AsbShng      41
Other        13
Name: Exterior, dtype: int64

In [15]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
AsbShng,0,0,0,0,0
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0


In [16]:
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

aux_dataframe = encoded_data
aux_dataframe['Exterior'] = original_data.copy()

aux_dataframe.head().transpose()

Unnamed: 0,0,1,2,3,4
BrkFace,1,0,0,1,0
CemntBd,0,0,0,0,0
HdBoard,0,0,0,0,0
MetalSd,0,0,0,0,0
Plywood,0,0,0,0,0
Stucco,0,0,0,0,0
VinylSd,0,1,0,0,1
Wd Sdng,0,0,1,0,0
WdShing,0,0,0,0,0
Other,0,0,0,0,0


In [17]:
model_data = pd.get_dummies(model_data, drop_first=True)

In [18]:
model_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2877 entries, 0 to 2929
Columns: 165 entries, Lot.Frontage to Exterior_Other
dtypes: bool(2), float64(34), int64(12), uint8(117)
memory usage: 1.4 MB


In [19]:
for cat in categorical_columns:
    dummies = []
    for col in model_data.columns:
        if col.startswith(cat + "_"):
            dummies.append(f'"{col}"')
    dummies_str = ', '.join(dummies)
    print(f'From column "{cat}" we made {dummies_str}\n')

From column "MS.SubClass" we made "MS.SubClass_30", "MS.SubClass_50", "MS.SubClass_60", "MS.SubClass_70", "MS.SubClass_80", "MS.SubClass_85", "MS.SubClass_90", "MS.SubClass_120", "MS.SubClass_160", "MS.SubClass_190", "MS.SubClass_Other"

From column "MS.Zoning" we made "MS.Zoning_RH", "MS.Zoning_RL", "MS.Zoning_RM"

From column "Land.Contour" we made "Land.Contour_HLS", "Land.Contour_Low", "Land.Contour_Lvl"

From column "Lot.Config" we made "Lot.Config_CulDSac", "Lot.Config_FR2", "Lot.Config_FR3", "Lot.Config_Inside"

From column "Neighborhood" we made "Neighborhood_BrDale", "Neighborhood_BrkSide", "Neighborhood_ClearCr", "Neighborhood_CollgCr", "Neighborhood_Crawfor", "Neighborhood_Edwards", "Neighborhood_Gilbert", "Neighborhood_IDOTRR", "Neighborhood_MeadowV", "Neighborhood_Mitchel", "Neighborhood_NAmes", "Neighborhood_NPkVill", "Neighborhood_NWAmes", "Neighborhood_NoRidge", "Neighborhood_NridgHt", "Neighborhood_OldTown", "Neighborhood_SWISU", "Neighborhood_Sawyer", "Neighborhood_Sa

In [20]:
X = model_data.drop(columns=['SalePrice']).copy()
y = model_data['SalePrice'].copy()

In [21]:
X.values, y.values

(array([[141.0, 31770.0, 1, ..., 0, 0, 0],
        [80.0, 11622.0, 0, ..., 0, 0, 0],
        [81.0, 14267.0, 1, ..., 1, 0, 0],
        ...,
        [62.0, 10441.0, 0, ..., 0, 0, 0],
        [77.0, 10010.0, 0, ..., 0, 0, 0],
        [74.0, 9627.0, 0, ..., 0, 0, 0]], dtype=object),
 array([5.33243846, 5.0211893 , 5.23552845, ..., 5.12057393, 5.23044892,
        5.27415785]))

In [22]:
RANDOM_SEED = 42  

In [23]:
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=RANDOM_SEED,
)

## Modelo de regessão linear simples já feito pelo professor

In [24]:
model = LinearRegression()

model.fit(Xtrain, ytrain)
ypred = model.predict(Xtest)

RMSE = np.sqrt(mean_squared_error(ytest, ypred))
error_percent = 100 * (10**RMSE - 1)
print(f'Average error is {error_percent:.2f}%')

Average error is 15.11%


# Feature Engenineering

- StandardScaler: vamos padronizar todos os dados.

- Área: vamos criar uma nova feature equivalente à somatória das áreas do imóvel.

```OBS:``` A partir daqui os codigos não foram mais feitos pelo professor, e em alguns casos foi utilizado o auxilio do ChatGpt.

In [25]:
area_related_features = ['Lot.Area', 'BsmtFin.SF.1', 'BsmtFin.SF.2', 'Bsmt.Unf.SF', 'Total.Bsmt.SF', 
                        'X1st.Flr.SF', 'X2nd.Flr.SF', 'Gr.Liv.Area']
model_data['Total.Area'] = model_data[area_related_features].sum(axis=1)

scaler = StandardScaler()
model_data = scaler.fit_transform(model_data)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [26]:
# Defining a function to calculate the average error
def calculate_error(y_true, y_pred):
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    error_percent = 100 * (10**RMSE - 1)
    return error_percent

# Creating the models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'Elastic Net': ElasticNet()
}

# Training the models and calculating the average error for each one
errors = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    errors[model_name] = calculate_error(y_test, y_pred)

errors


{'Linear Regression': 15.11389759301347,
 'Ridge': 15.100140789075,
 'Lasso': 27.561181012775382,
 'Elastic Net': 25.208200541428315}

# Analisadno o Resultado

A partir dos resultados 

# Achando hipermarametros para os modelos

### ```Ridge```

In [27]:
# Creating a Ridge regression model
ridge_model = Ridge()

# Defining hyperparameters to tune
ridge_params = {'alpha': [0.01, 0.1, 1, 10, 100]}

# Setting up GridSearchCV for Ridge model
ridge_grid = GridSearchCV(ridge_model, ridge_params, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
ridge_grid.fit(X_train, y_train)

# Getting the best hyperparameters
ridge_best_params = ridge_grid.best_params_
ridge_best = ridge_grid.best_estimator_
ridge_best_params

{'alpha': 10}

### ```Lasso```

In [28]:
# Creating a Lasso regression model
lasso_model = Lasso()

# Defining hyperparameters to tune
lasso_params = {'alpha': [0.0001, 0.001, 0.01, 0.1, 1]}

# Setting up GridSearchCV for Lasso model
lasso_grid = GridSearchCV(lasso_model, lasso_params, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
lasso_grid.fit(X_train, y_train)

# Getting the best hyperparameters
lasso_best_params = lasso_grid.best_params_
lasso_best = lasso_grid.best_estimator_
lasso_best_params


{'alpha': 0.0001}

### ```Elastic Net```

In [29]:
# Creating an Elastic Net regression model
elastic_model = ElasticNet()

# Defining hyperparameters to tune
elastic_params = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1],
                  'l1_ratio': [0.2, 0.4, 0.6, 0.8]}
# Setting up GridSearchCV for Elastic Net model
elastic_grid = GridSearchCV(elastic_model, elastic_params, cv=5, scoring='neg_mean_squared_error')

# Fitting the model
elastic_grid.fit(X_train, y_train)

# Getting the best hyperparameters
elastic_best_params = elastic_grid.best_params_
elastic_best = elastic_grid.best_estimator_
elastic_best_params

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


{'alpha': 0.0001, 'l1_ratio': 0.6}

# Testando os modelos com os melhores hiperparametros

In [30]:
ridge_best.fit(X_train, y_train)
y_pred = ridge_best.predict(X_test)
print(f'Erro Médio Ridge: {calculate_error(y_test, y_pred)}%')


Erro Médio Ridge: 15.333634985764121%


In [31]:
lasso_best.fit(X_train, y_train)
y_pred = lasso_best.predict(X_test)
print(f'Erro Médio Lasso: {calculate_error(y_test, y_pred)}%')

Erro Médio Lasso: 15.438146030643596%


In [32]:
elastic_best.fit(X_train, y_train)
y_pred = elastic_best.predict(X_test)
print(f'Erro Médio Elastic Net: {calculate_error(y_test, y_pred)}%')

Erro Médio Elastic Net: 15.272100126746002%


# TTest

In [33]:
from scipy.stats import ttest_rel

# Function to perform paired t-test and return the p-value
def paired_ttest(model_1_preds, model_2_preds, y_test):
    errors_1 = model_1_preds - y_test
    errors_2 = model_2_preds - y_test
    t_stat, p_value = ttest_rel(errors_1, errors_2)
    return p_value

# Getting predictions from each model
predictions = {}
for model_name, model in models.items():
    predictions[model_name] = model.predict(X_test)

# Storing the p-values for each model comparison
p_values = {}

# Comparing each model with each other model
for i, (name_1, preds_1) in enumerate(predictions.items()):
    for name_2, preds_2 in list(predictions.items())[i+1:]:
        p_value = paired_ttest(preds_1, preds_2, y_test)
        comparison_name = f"{name_1} vs {name_2}"
        p_values[comparison_name] = p_value

p_values


{'Linear Regression vs Ridge': 0.19170962930163057,
 'Linear Regression vs Lasso': 0.8254325951188646,
 'Linear Regression vs Elastic Net': 0.7343092941383618,
 'Ridge vs Lasso': 0.788770669013646,
 'Ridge vs Elastic Net': 0.7712548233478899,
 'Lasso vs Elastic Net': 0.042630713029028294}

# Analisando os resultados

Durante o notebook, foram testados 4 modelos: Ridge, Lasso, Regressão Linear Simples, Elastic Net. Realizamos features engenineering para padronizar os dados e criar uma nova feature a respeito da área do imóvel. Por fim, testamos os modelos com os melhores hiperparametros.

A partir dos resultados obtidos, podemos concluir que não há uma diferença significativa nos erros de previsão entre a maioria dos pares de modelos, conforme evidenciado pelos valores-p acima do limiar de significância de ```0.05```. No entanto, ao comparar o modelo Lasso com o Elastic Net, obtemos um valor-p de ```0.0426```, que é menor que ```0.05```, indicando uma diferença estatisticamente significativa nos erros de previsão desses dois modelos.

Acabamos escolhendo o Elastic Net como o melhor modelo.

# Modelo Final

Agora que escolhemos o Elastic Net como melhor modelo, vamos analisa-lo mais a fundo e medir seu desempenho.

In [34]:
from sklearn.linear_model import ElasticNetCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def train_elasticnet(X_train, y_train, alphas, l1_ratios, cv=7):
    """
    Train an ElasticNet model with cross-validation.
    """
    model = ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=cv)
    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model and return RMSE and R^2.
    """
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    return rmse, r2

def print_results(model, rmse, r2):
    """
    Print the evaluation results.
    """
    print(f"Optimal alpha: {model.alpha_}")
    print(f"Optimal l1_ratio: {model.l1_ratio_}")
    print(f"Test RMSE: {rmse}")
    print(f"Test R^2: {r2}")

# Defining possible values for hyperparameters
alphas = np.logspace(-5, 5, 11)
l1_ratios = [0.2, 0.4, 0.6, 0.8]

# Training the model
elasticnet_model = train_elasticnet(X_train, y_train, alphas, l1_ratios)

# Evaluating the model
rmse, r2 = evaluate_model(elasticnet_model, X_test, y_test)

# Printing the results
print_results(elasticnet_model, rmse, r2)


  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descen

Optimal alpha: 0.0001
Optimal l1_ratio: 0.4
Test RMSE: 0.06145973212735336
Test R^2: 0.8791027813953507


O Elastic Net foi escolhido como o melhor modelo para a análise acima por ser uma combinação entre a regressão Ridge e a regressão Lasso. Ele possui a capacidade de lidar com a multicolinearidade dos dados, como a regressão Ridge, e também a capacidade de realizar seleção de variáveis, como a regressão Lasso.