In [1]:
# Standard library
import pathlib
import pickle

# Data manipulation
import numpy as np
import pandas as pd

# Model persistence
import joblib

# Sklearn basics for data preprocessing & model building & ...
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

# Metrics & Cross Validation
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.svm import SVR


In [2]:
# Column groups consumed by the preprocessing pipelines further down.
# (Claude 3.5 Sonnet was asked to format and organize these column lists.)

# Numeric columns, split by the kind of quantity they hold.
NUMERIC_FEATURES = dict(
    continuous=[
        'Lot.Frontage',
        'Lot.Area',
        'Mas.Vnr.Area',
        'BsmtFin.SF.1',
        'BsmtFin.SF.2',
        'Bsmt.Unf.SF',
        'Total.Bsmt.SF',
        'X1st.Flr.SF',
        'X2nd.Flr.SF',
        'Low.Qual.Fin.SF',
        'Gr.Liv.Area',
        'Garage.Area',
        'Wood.Deck.SF',
        'Open.Porch.SF',
        'Enclosed.Porch',
        'X3Ssn.Porch',
        'Screen.Porch',
        'Pool.Area',
        'Misc.Val',
    ],
    discrete=[
        'Bsmt.Full.Bath',
        'Bsmt.Half.Bath',
        'Full.Bath',
        'Half.Bath',
        'Bedroom.AbvGr',
        'Kitchen.AbvGr',
        'TotRms.AbvGrd',
        'Fireplaces',
        'Garage.Cars',
        'Mo.Sold',
    ],
    time_related=[
        'Garage.Age',
    ],
)

# Categorical columns, split into ordered (ordinal) and unordered (nominal).
CATEGORICAL_FEATURES = dict(
    ordinal=[
        'Overall.Qual',
        'Overall.Cond',
        'Exter.Qual',
        'Bsmt.Qual',
        'Kitchen.Qual',
        'Garage.Finish',
        'Paved.Drive',
    ],
    nominal=[
        'MS.SubClass',
        'MS.Zoning',
        'Neighborhood',
        'Bldg.Type',
        'House.Style',
        'Roof.Style',
        'Mas.Vnr.Type',
        'Foundation',
        'Garage.Type',
        'Sale.Type',
        'Sale.Condition',
        'Condition',
        'Exterior',
    ],
)

# 🗂️ Evaluating Models
This notebook performs the final data preparation and then trains and tests different models. It uses the cleaned data produced by the previous notebooks as the input to those models.

---

- I am going to use the following models:
    - 🌲 *Random Forest Regressor*
    - 🚀 *Ridge Regression* (with polynomial features)
    - 🎯 *SVR*
    - 👎 *Linear Regression* 



## 📋 Previous Steps Summary
* 📤 Load the data
* 📊 Divided columns into:
  * **continuous** variables 📈 
  * **discrete** variables 🔢
  * **ordinal** variables 📅
  * **categorical** variables 🏷️
* 🎯 Remove the outliers 
* ⚖️ Remove categories with low representation
* 🔄 Merge unbalanced classes into a new class that unifies them:
  * *Example* 💡: 
    * In the case of `Sale.Type`, the classes `'WD '`, `'CWD'`, `'VWD'` were merged into a new class called **GroupedWD**, making the class more balanced! ✅
    * The classes `'COD'`, `'ConLI'`, `'Con'`, `'ConLD'`, `'Oth'`, `'ConLw'` were merged into a new class called **Other**. 🔀
* 🗑️ Dropped the columns that were not relevant because almost all of their values were the same, for example:
  * **Street**️:
    * **Paved**: 2895 
    * **Gravel**: 6 
* 🤝 Merge ordinal classes that were similar or not relevant to be separated.
* 🔍 Remove redundant quality indicators when count variables were sufficient, example:
  * For Fireplaces, dropped the `Fireplace.QU` (quality) column since:
    * **Distribution showed** 📊:
      * No fireplace: 1392 
      * One fireplace: 1257 
      * Two fireplaces: 219 
      * Three fireplaces: 11 
      * Four fireplaces: 1 
    * The `Fireplaces` count variable already effectively conveyed the presence and quantity information! 📈
    * Quality ratings ("good"/"typical") were less relevant for the analysis 
* 📐 Log transformation for SalePrice to improve distribution 
* 🧩 Missing values imputation 
* ❌ Remove the rows with missing values 
* ⏰ Converted year-based variable to age => More meaningful representation of time
* 🚫 Avoids negative values which are logically impossible, and would cause issues in the regression models 😵
* 🧹 Remove irrelevant columns

---
After handling outliers, balancing classes, merging similar categories, removing irrelevant and redundant features, transforming variables, and imputing missing values, there is not much left to do in the data preparation: the dataset is almost ready to be used in the models. 🫶
<br>
The only remaining step is to prepare the categorical variables for modeling. The approach I am going to use is **One-Hot Encoding**, which converts categorical variables into a numerical format.

### 💾 Load the data

In [3]:
# The data folder is the `data/` directory that sits next to the current
# working directory (i.e. inside the working directory's parent).
DATA_DIR = pathlib.Path.cwd().parent.joinpath('data')
print(DATA_DIR)

/Users/andreoliveira/Documents/INSPER/4º SEMESTRE/MACHINE LEARNING/projeto-final/data


In [4]:
# Location of the pickled, already-cleaned dataset inside the data folder.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')

In [5]:
# Load the cleaned dataset from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source (here: presumably the earlier notebooks of this
# project — verify before pointing this at any other file).
with clean_data_path.open('rb') as pickle_file:
    data = pickle.load(pickle_file)

### 🔪 Splitting the data

In [6]:
# Fraction of the rows held out for the test split.
TEST_SIZE = 0.2
# Seed passed as `random_state` to the train/test split so it is repeatable.
RANDOM_SEED = 42

In [7]:
# Separate the target column (`SalePrice`) from the predictor columns.
# Both are copied so later edits never touch the loaded `data` frame.
y = data['SalePrice'].copy()
X = data.drop(columns='SalePrice').copy()

In [8]:
# `MS.SubClass` is a category even though its values look numeric; casting it
# to `str` stops it from being treated as a number downstream.
X = X.astype({"MS.SubClass": str})

In [9]:
# Hold out TEST_SIZE of the rows for the final check; the seed keeps the
# split identical between runs.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_SEED)
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, **split_options)

### 🧪 Building the pipelines

In [10]:
# Pipeline layout inspired by chapter 2 of the book
# "Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow".

# Numeric columns: fill gaps with the median (just to be safe 😉), then standardize.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode (one binary column per category).
# ref: https://scikit-learn.org/1.5/modules/generated/sklearn.impute.SimpleImputer.html
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown="ignore")),
]
categorical_pipeline = Pipeline(steps=categorical_steps)

# One transformer entry per feature group. A single numeric and a single
# categorical entry would be enough, but keeping the groups separate makes
# the feature organisation explicit.
feature_group_transformers = [
    ('num_continuous', numeric_pipeline, NUMERIC_FEATURES['continuous']),
    ('num_discrete', numeric_pipeline, NUMERIC_FEATURES['discrete']),
    ('time_features', numeric_pipeline, NUMERIC_FEATURES['time_related']),
    ('cat_ordinal', categorical_pipeline, CATEGORICAL_FEATURES['ordinal']),
    ('cat_nominal', categorical_pipeline, CATEGORICAL_FEATURES['nominal']),
]

# Columns not listed in any group are dropped.
preprocessing_pipeline = ColumnTransformer(
    transformers=feature_group_transformers,
    remainder='drop',
)

In [11]:
# Smoke test of the preprocessing step on the training split only.
# Note that `fit_transform` fits `preprocessing_pipeline` itself (not a copy).
test_preprocessing = preprocessing_pipeline.fit_transform(Xtrain)

# Display the transformed matrix as the cell output.
test_preprocessing

array([[-0.04649523,  0.2567838 ,  0.41876064, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33840665, -0.30029636, -0.57211164, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.34271999, -0.06920127,  1.32155539, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [-0.04649523,  0.34480835, -0.57211164, ...,  0.        ,
         0.        ,  0.        ],
       [-0.43571045, -0.50944662, -0.57211164, ...,  0.        ,
         0.        ,  0.        ],
       [-0.04649523, -0.09568219,  2.58216512, ...,  1.        ,
         0.        ,  0.        ]])

### 🧙‍♂️ Evaluating the models, *where the magic happens!*

In [12]:
# Hyper-parameter search spaces, one entry per model name.
# Every key starts with `regressor__` so that it addresses the pipeline step
# named 'regressor'.
param_grids = {}

param_grids['Linear'] = {
    'regressor__fit_intercept': [True, False],
}

param_grids['Random Forest'] = {
    'regressor__max_depth': [3, 5, 7],
    'regressor__min_samples_split': [2, 5, 10],
}

# Seven regularisation strengths, log-spaced between 1e-2 and 1e2.
param_grids['Ridge'] = {
    'regressor__alpha': np.logspace(-2, 2, num=7),
}

param_grids['SVR'] = {
    'regressor__C': [0.1, 1, 10],
    'regressor__kernel': ['linear', 'rbf'],
    'regressor__gamma': ['scale', 'auto'],
}

def run_model_grid_search(model, param_grid, X, y, model_name, X_test=None, y_test=None):
    """Grid-search `model` inside the preprocessing pipeline and report the results.

    The pipeline is `preprocessing_pipeline` followed by the regressor. When
    `model_name` is 'Ridge', a `PolynomialFeatures(include_bias=False)` step is
    placed between the two.

    Parameters
    ----------
    model : estimator
        Unfitted regressor used as the final 'regressor' step.
    param_grid : dict
        Search space; keys must address pipeline steps (e.g. 'regressor__alpha').
    X, y :
        Training features and target; the 5-fold cross-validation runs on these.
    model_name : str
        Label used in the printed report; also selects the 'Ridge' special case.
    X_test, y_test : optional
        Hold-out data for the quick sanity check printed at the end. When either
        is omitted, the notebook-level `Xtest` / `ytest` split is used, which
        keeps existing calls working unchanged.

    Returns
    -------
    GridSearchCV
        The fitted search object.

    Notes
    -----
    The two printed scores are different metrics: "Best score" is the
    cross-validated mean squared error (the sign of `neg_mean_squared_error`
    is flipped before printing), while "Test score" comes from the best
    estimator's `score` method, which for scikit-learn regressors is R².
    """
    print(f"\n🔍 Starting Grid Search for {model_name}")

    # Build the steps in their final order instead of inserting afterwards.
    pipeline_steps = [('preprocessor', preprocessing_pipeline)]
    if model_name == 'Ridge':
        # Polynomial feature expansion before the regressor, in the hope that
        # the extra features improve the model 🫠
        pipeline_steps.append(('poly', PolynomialFeatures(include_bias=False)))
    pipeline_steps.append(('regressor', model))

    pipeline = Pipeline(pipeline_steps)

    grid = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        scoring='neg_mean_squared_error', # Because it's a regression problem!
        n_jobs=-1,
        verbose=1
    )

    grid.fit(X, y)

    if X_test is None or y_test is None:
        # Backward-compatible fallback to the notebook's hold-out split.
        X_test, y_test = Xtest, ytest

    print(f"✨ Best parameters: {grid.best_params_}")
    print(f"📊 Best score: {-grid.best_score_}")
    print(f"👹 Test score (just a quick check): {grid.best_estimator_.score(X_test, y_test)}")

    return grid

In [13]:
# Candidate regressors, keyed by the same names used in `param_grids`.
MODELS = {
    'Linear': LinearRegression(),
    # Seeded so the forest (and therefore the grid-search result) is
    # reproducible between runs, like the train/test split.
    'Random Forest': RandomForestRegressor(random_state=RANDOM_SEED),
    'Ridge': Ridge(),
    'SVR': SVR()
}

In [14]:
# Run the grid search for every candidate on the training split and keep the
# fitted search object under the model's name.
prepared_models = {
    name: run_model_grid_search(estimator, param_grids[name], Xtrain, ytrain, name)
    for name, estimator in MODELS.items()
}


🔍 Starting Grid Search for Linear
Fitting 5 folds for each of 2 candidates, totalling 10 fits
✨ Best parameters: {'regressor__fit_intercept': True}
📊 Best score: 2.218702922199614e+19
👹 Test score (just a quick check): 0.8740021739338963

🔍 Starting Grid Search for Random Forest
Fitting 5 folds for each of 9 candidates, totalling 45 fits
✨ Best parameters: {'regressor__max_depth': 7, 'regressor__min_samples_split': 2}
📊 Best score: 0.004369248909151798
👹 Test score (just a quick check): 0.8664540945563061

🔍 Starting Grid Search for Ridge
Fitting 5 folds for each of 7 candidates, totalling 35 fits
✨ Best parameters: {'regressor__alpha': np.float64(100.0)}
📊 Best score: 0.0039489689414568185
👹 Test score (just a quick check): 0.9139682520768085

🔍 Starting Grid Search for SVR
Fitting 5 folds for each of 12 candidates, totalling 60 fits
✨ Best parameters: {'regressor__C': 1, 'regressor__gamma': 'auto', 'regressor__kernel': 'rbf'}
📊 Best score: 0.003988953074696268
👹 Test score (just a q

In [15]:
# Best performing model: Ridge with alpha = 100, resulting in a score of ~0.91.
# The final pipeline repeats the layout used in the grid search for Ridge:
# preprocessing -> polynomial features -> regressor.
final_steps = [
    ('preprocessor', preprocessing_pipeline),
    ('poly', PolynomialFeatures(include_bias=False)),
    ('regressor', Ridge(alpha=100)),
]
model_pipeline = Pipeline(steps=final_steps)

# Fit on the training split only; the fitted pipeline is the cell output.
model_pipeline.fit(Xtrain, ytrain)

In [16]:
# Score of the fitted pipeline on the held-out test split. For scikit-learn
# regressors `score` is the coefficient of determination (R²).
model_pipeline.score(Xtest, ytest)

0.9139682520768085

In [17]:
# Predictions for the held-out test split, on the same scale as `ytest`.
ypred = model_pipeline.predict(Xtest)

In [18]:
# Root mean squared error between the test targets and the predictions.
RMSE = np.sqrt(mean_squared_error(ytest, ypred))

In [19]:
# Turn the RMSE (measured on the transformed target) into a percentage.
# NOTE(review): `10 ** RMSE` assumes SalePrice was transformed with a base-10
# logarithm in the earlier notebooks — confirm; a natural-log target would
# need `np.exp(RMSE)` instead.
relative_factor = 10 ** RMSE
error_percent = (relative_factor - 1) * 100
print(f'Average error is {error_percent:.2f}%')

Average error is 12.31%


In [21]:
# Persist the fitted pipeline for the API. `joblib` is imported in the
# notebook's import cell at the top, together with every other dependency.
# The path stays relative to the working directory, exactly as before.
MODEL_PATH = pathlib.Path('..') / 'api' / 'model.joblib'

# `joblib.dump` returns the list of files it wrote; that list is the cell output.
joblib.dump(model_pipeline, MODEL_PATH)

['../api/model.joblib']

NameError: name 'data' is not defined