In [12]:
import pickle
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np
from sklearn.decomposition import PCA

In [2]:
# Load the modelling dataset from a path relative to the notebook's working directory.
# Later cells expect a 'price' column in this frame (used as the regression target).
df = pd.read_csv('./datasets/final_dataset.csv')

In [3]:
# Feature matrix: every column except the target.
X = df.drop(columns=['price'])
# Regression target.
y = df['price']

In [4]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=42,
)
# Display the four shapes as a quick sanity check on the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((208630, 25), (89413, 25), (208630,), (89413,))

In [5]:
def save_model(model, filename):
    """Serialize ``model`` to ``filename`` with pickle.

    Args:
        model: Any picklable object.
        filename: Destination path; an existing file is overwritten.

    Returns:
        None.
    """
    with open(filename, 'wb') as sink:
        pickle.dump(model, sink)

def load_model(filename):
    """Read back an object previously pickled to ``filename``.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
    with open(filename, 'rb') as source:
        restored = pickle.load(source)
    return restored

In [6]:
def adjusted_r2_score(r2, n, k):
    """Return R² adjusted for the number of predictors.

    Computes ``1 - (1 - r2) * (n - 1) / (n - k - 1)``.

    Args:
        r2: Plain coefficient of determination.
        n: Number of samples.
        k: Number of predictors (features).

    Returns:
        The adjusted R² value.
    """
    unexplained = 1 - r2
    total_dof = n - 1
    residual_dof = n - k - 1
    return 1 - unexplained * total_dof / residual_dof

In [7]:
def evaluate_metrics(X_train, X_test, y_train_pred, y_test_pred,
                     y_train_true=None, y_test_true=None):
    """Print train/test regression metrics for a set of predictions.

    Reports MSE, RMSE, R2, adjusted R2 and MAE for both splits.

    Args:
        X_train: Training feature matrix; only its ``.shape`` (rows, features)
            is used, for the adjusted R2 computation.
        X_test: Test feature matrix; only its ``.shape`` is used.
        y_train_pred: Predictions for the training rows.
        y_test_pred: Predictions for the test rows.
        y_train_true: Ground-truth training targets. When omitted, the
            notebook-level ``y_train`` is used (the original behaviour).
        y_test_true: Ground-truth test targets. When omitted, the
            notebook-level ``y_test`` is used (the original behaviour).

    Returns:
        None. Results are printed.
    """
    # The original implementation read `y_train` / `y_test` implicitly from the
    # notebook namespace, so the result depended on whichever split had run
    # last. Passing the targets explicitly removes that hidden coupling; the
    # fallback keeps existing four-argument calls working.
    if y_train_true is None:
        y_train_true = y_train
    if y_test_true is None:
        y_test_true = y_test

    n_train, k_train = X_train.shape
    n_test, k_test = X_test.shape

    # MSE and RMSE
    mse_train = mean_squared_error(y_train_true, y_train_pred)
    mse_test = mean_squared_error(y_test_true, y_test_pred)

    rmse_train = np.sqrt(mse_train)
    rmse_test = np.sqrt(mse_test)

    # R2 and Adjusted R2 (adjusted for the feature count of each matrix)
    r2_train = r2_score(y_train_true, y_train_pred)
    r2_test = r2_score(y_test_true, y_test_pred)

    adj_r2_train = adjusted_r2_score(r2_train, n_train, k_train)
    adj_r2_test = adjusted_r2_score(r2_test, n_test, k_test)

    # MAE
    mae_train = mean_absolute_error(y_train_true, y_train_pred)
    mae_test = mean_absolute_error(y_test_true, y_test_pred)

    # Print results
    print(f'Train MSE: {mse_train}, Test MSE: {mse_test}')
    print(f'Train RMSE: {rmse_train}, Test RMSE: {rmse_test}')
    print(f'Train R2: {r2_train}, Test R2: {r2_test}')
    print(f'Train Adjusted R2: {adj_r2_train}, Test Adjusted R2: {adj_r2_test}')
    print(f'Train MAE: {mae_train}, Test MAE: {mae_test}')

In [11]:
# Load the first pickled checkpoint (presumably an XGBoost regressor, judging by
# the file name) and score it on the raw-feature split created above.
model = load_model('./checkpoint/xgboost_1.pkl')
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# evaluate_metrics prints the metrics; the true targets are taken from the
# notebook-level y_train / y_test, so this cell must run before the PCA split
# rebinds those names.
evaluate_metrics(X_train,X_test,y_train_pred,y_pred)

Train MSE: 12243257.0714403, Test MSE: 12630506.834383842
Train RMSE: 3499.0365918978755, Test RMSE: 3553.9424354347443
Train R2: 0.9762985450916396, Test R2: 0.9755727621847804
Train Adjusted R2: 0.9762957046074077, Test Adjusted R2: 0.9755659303082729
Train MAE: 1970.5494623681268, Test MAE: 2003.4320262517153


In [13]:
# Project the feature matrix down to 7 principal components.
n_components = 7
pca = PCA(n_components=n_components)

# NOTE(review): PCA is fitted on the full X, i.e. before the train/test split in
# the next cell, so test rows influence the projection. Confirm this matches how
# the xgboost_2.pkl checkpoint was trained before changing it.
X_pca = pca.fit_transform(X)

In [14]:
# Split the PCA-projected features with the same ratio and seed as the raw split.
# This rebinds X_train / X_test / y_train / y_test, replacing the raw-feature split.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.30,
    random_state=42,
)
# Display the four shapes as a quick sanity check on the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((208630, 7), (89413, 7), (208630,), (89413,))

In [15]:
# Load the second pickled checkpoint and score it on the PCA-projected split.
# X_train / X_test here are the 7-component matrices from the cell above;
# presumably this checkpoint was trained on those PCA features (verify).
model = load_model('./checkpoint/xgboost_2.pkl')
y_pred = model.predict(X_test)
y_train_pred = model.predict(X_train)

# Prints the same metric set as for the first checkpoint, for comparison.
evaluate_metrics(X_train,X_test,y_train_pred,y_pred)

Train MSE: 15276383.876703791, Test MSE: 15890663.30826332
Train RMSE: 3908.501487361082, Test RMSE: 3986.309484756963
Train R2: 0.9704267809208139, Test R2: 0.9692676614832402
Train Adjusted R2: 0.9704257886355633, Test Adjusted R2: 0.9692652552825846
Train MAE: 2263.5966047445604, Test MAE: 2294.724770404177
