In [52]:
# data manipulation
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

# modeling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import root_mean_squared_log_error, mean_absolute_error, r2_score
import optuna
from xgboost import XGBRegressor

#### Importing the data

In [2]:
df_train = pd.read_csv("data/train_preprocessed.csv", low_memory=False)

Since the dataset is very big (over 400k instances), we are going to use only 1/4 of it, for choosing the hyperparameters

In [9]:
df_train_subset = df_train.sample(frac=0.25, random_state=42)
len(df_train_subset)

103174

In [10]:
# splitting into X and Y
df_t = df_train_subset[df_train_subset['saleYear'] < 2012]
df_v = df_train_subset[df_train_subset['saleYear'] == 2012]

X_train, X_val = df_t.drop(columns="SalePrice"), df_v.drop(columns="SalePrice")
y_train, y_val = df_t['SalePrice'], df_v['SalePrice']

#### Evaluation Function

In [59]:
def score_rmsle(model):
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    scores = {
        "train MAE": mean_absolute_error(y_train, train_preds),
        "val MAE": mean_absolute_error(y_val, val_preds),
        "train r2": r2_score(y_train, train_preds),
        "val r2": r2_score(y_val, val_preds),
        "train rmsle": root_mean_squared_log_error(y_train, train_preds),
        "val rmsle": root_mean_squared_log_error(y_val, val_preds)
    }
    return scores

### Training the models
We are going to be training a random forest model

In [16]:
models = {
    'RandomForest': []
}

#### Random Forest

In [14]:
clf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

0.8571974230974703


In [27]:
base_scores = score_rmsle(clf)
base_scores

{'train MAE': 1846.564844402788,
 'val MAE': 6475.541203607353,
 'train r2': 0.983550519486161,
 'val r2': 0.8571974230974703,
 'train rmsle': 0.09684217127535479,
 'val rmsle': 0.25893849947720776}

In [28]:
models['RandomForest'].append(base_scores)

In [46]:
def objective_RF(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_features = trial.suggest_categorical('max_features', ['log2', 'sqrt', None])
    max_depth = trial.suggest_int('max_depth', 10, 100)
    min_samples_split = trial.suggest_int('min_samples_slit', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 4)

    clf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_features=max_features,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        min_samples_split=min_samples_split,
        random_state=42,
        n_jobs=-1
    )

    clf.fit(X_train, y_train)
    score = clf.score(X_val, y_val)

    return score

In [47]:
study = optuna.create_study(direction="maximize")
study.optimize(objective_RF, n_trials=20)

trial = study.best_trial

[I 2024-10-04 07:25:25,705] A new study created in memory with name: no-name-2fa59df4-9eec-4ae9-8d87-d5132e429209
[I 2024-10-04 07:25:30,199] Trial 0 finished with value: 0.772571611730743 and parameters: {'n_estimators': 265, 'max_features': 'log2', 'max_depth': 48, 'min_samples_slit': 10, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.772571611730743.
[I 2024-10-04 07:25:39,356] Trial 1 finished with value: 0.8361143364329204 and parameters: {'n_estimators': 229, 'max_features': 'sqrt', 'max_depth': 69, 'min_samples_slit': 2, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.8361143364329204.
[I 2024-10-04 07:26:58,340] Trial 2 finished with value: 0.8562261830855342 and parameters: {'n_estimators': 533, 'max_features': None, 'max_depth': 92, 'min_samples_slit': 5, 'min_samples_leaf': 4}. Best is trial 2 with value: 0.8562261830855342.
[I 2024-10-04 07:27:17,785] Trial 3 finished with value: 0.8109388296321162 and parameters: {'n_estimators': 920, 'max_features': 'sqrt', '

In [48]:
print(f"Highest accuracy: {trial.value}")
for k, v in trial.params.items():
    print(f"{k}: {v}")

Highest accuracy: 0.8592110576984001
n_estimators: 742
max_features: None
max_depth: 35
min_samples_slit: 3
min_samples_leaf: 2


Training the model with the entire dataset, using the parameters we found

In [61]:
train = df_train[df_train['saleYear'] < 2012]
val = df_train[df_train['saleYear'] == 2012]

X_train_c, X_val_c = train.drop(columns="SalePrice"), val.drop(columns="SalePrice")
y_train_c, y_val_c = train['SalePrice'], val['SalePrice']

In [62]:
def score_rmsle_c(model):
    train_preds = model.predict(X_train_c)
    val_preds = model.predict(X_val_c)
    scores = {
        "train MAE": mean_absolute_error(y_train_c, train_preds),
        "val MAE": mean_absolute_error(y_val_c, val_preds),
        "train r2": r2_score(y_train_c, train_preds),
        "val r2": r2_score(y_val_c, val_preds),
        "train rmsle": root_mean_squared_log_error(y_train_c, train_preds),
        "val rmsle": root_mean_squared_log_error(y_val_c, val_preds)
    }
    return scores

In [63]:
best_params = trial.params
rf = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_slit'],
    min_samples_leaf=best_params['min_samples_leaf'],
    random_state=42,
    n_jobs=-1
).fit(X_train_c, y_train_c)

score = score_rmsle_c(rf)

In [64]:
score

{'train MAE': 1948.2014523406506,
 'val MAE': 6042.512938396712,
 'train r2': 0.9795558906388644,
 'val r2': 0.8744567343774503,
 'train rmsle': 0.10392332243580103,
 'val rmsle': 0.2501525859447095}

In [74]:
df_test = pd.read_csv("data/test_preprovessed.csv", low_memory=False)
df_test = df_test.reindex(columns=X_train_c.columns)

In [75]:
test_preds = rf.predict(df_test)