In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

from preprocessing import make_preprocessor
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from the libraries used below are hidden as well — consider narrowing it.
warnings.filterwarnings(action="ignore")

In [2]:
# Load the training data and append the log1p-transformed sale price,
# which is the column later used as the regression target.
df = pd.read_csv("train.csv").assign(
    LogSalePrice=lambda frame: np.log1p(frame['SalePrice'])
)

In [3]:
# Columns that must not be used as predictors: the raw price, its log
# transform (the target), and the Id column.
non_feature_cols = ['SalePrice', 'LogSalePrice', 'Id']

X = df.drop(columns=non_feature_cols)
# Target vector as a plain NumPy array.
y = df['LogSalePrice'].to_numpy()

In [12]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
X_train_raw, X_valid_raw, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the preprocessing pipeline on the training split only, then apply the
# already-fitted pipeline to the validation split.
preprocessor = make_preprocessor(X_train_raw, drop_engineered_cols=True)
X_train = preprocessor.fit_transform(X_train_raw)
X_valid = preprocessor.transform(X_valid_raw)

# Output column names as reported by the pipeline's "encode" step.
feature_names = preprocessor.named_steps["encode"].get_feature_names_out()

print(X_train.shape)

(1168, 166)


## Modelling

In [21]:
def rmse(model, X=None, y=None):
    """Print the root-mean-squared error of ``model`` on an evaluation set.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X : array-like, optional
        Features to predict on. Defaults to the notebook-level ``X_valid``.
    y : array-like, optional
        True targets matching ``X``. Defaults to the notebook-level ``y_val``.

    Returns
    -------
    None
        The score is printed between two dashed rules, formatted to four
        decimal places.
    """
    # Fall back to the validation split defined earlier in the notebook so
    # that existing calls of the form ``rmse(model)`` keep working unchanged.
    features = X_valid if X is None else X
    targets = y_val if y is None else y

    preds = model.predict(features)
    # Named ``score`` rather than ``rmse`` so the local does not shadow this function.
    score = np.sqrt(mean_squared_error(targets, preds))

    rule = '-' * 25
    print(rule)
    print(f'RMSE: {score:.4f}')
    print(rule)

### Linear Regression

In [22]:
# Ordinary least-squares baseline. ``fit`` returns the estimator itself, so
# construction and training can be chained.
linear_model = LinearRegression().fit(X_train, y_train)

rmse(linear_model)

-------------------------
RMSE: 0.1379
-------------------------


### Ridge Regression

In [23]:
# Ridge regression with regularisation strength alpha=2, trained on the same
# preprocessed training split as the linear baseline.
ridge_model = Ridge(alpha=2).fit(X_train, y_train)

rmse(ridge_model)

-------------------------
RMSE: 0.1392
-------------------------


### XGBoost

In [24]:
# Hyperparameters for the gradient-boosted tree regressor, kept in one place.
xgb_params = dict(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.6,
    colsample_bytree=0.5,
    random_state=42,
)
xgb_model = XGBRegressor(**xgb_params)

# The validation split is supplied as eval_set and verbose=False silences the
# per-round log.
# NOTE(review): no early stopping is configured here, so eval_set presumably
# only records evaluation metrics — confirm that is intended.
xgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_val)], verbose=False)

rmse(xgb_model)

-------------------------
RMSE: 0.1304
-------------------------
