In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Location of the used-cars CSV; it is fetched over the network each time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/usedcars_with_missing_values.csv"
df = pd.read_csv(DATA_URL)

In [None]:
# Store 'year' with a categorical dtype instead of its parsed dtype.
df = df.astype({'year': 'category'})

In [None]:
# Show the loaded frame as this cell's output.
df

## Clean data

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, root_mean_squared_error
import math

In [None]:
# Encode categorical columns as indicator columns, discard rows that still contain
# missing values, and renumber the remaining rows from 0.
# NOTE(review): dropna() runs after get_dummies(), so it only sees NaNs left in the
# non-encoded columns — confirm that this is the intended handling of missing categories.
df_dummy = (
    pd.get_dummies(df)
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# Name of the target column; every other column of df_dummy is used as a feature.
outcomename = 'price'
Y = df_dummy.loc[:, outcomename]
X = df_dummy.drop(columns=[outcomename])

In [None]:
class create_model:
    """Shuffled K-fold evaluation wrapper around a scikit-learn style regressor.

    Parameters
    ----------
    model_name : estimator
        The estimator object itself (not a string). It must expose ``fit``,
        ``predict`` and ``score``. The same instance is refitted on every fold.
    test_size : float
        Stored on the instance but not used by any method of this class.
    X : object exposing ``.values`` (e.g. a pandas DataFrame)
        Feature matrix.
    Y : object exposing ``.values`` (e.g. a pandas Series)
        Target values.
    n_splits : int, default 5
        Number of folds passed to ``KFold``.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``KFold``. ``None`` keeps the original unseeded behaviour.
        An integer makes every call to ``model()`` use the same folds.

    Attributes set by ``model()``
    -----------------------------
    X_train, X_test, Y_train, Y_test : arrays for the LAST fold only.
    mean_score : mean of the per-fold ``model_name.score`` values.

    Attributes set by ``score()``
    -----------------------------
    Y_pred : predictions for all rows of ``X``.
    r2 : R^2 of ``Y_pred`` against ``Y``. The full data includes the rows the
        last-fold model was fitted on, so this is not an out-of-sample score.
    Y_predict : predictions for the last fold's test rows.
    rmse : root mean squared error on the last fold's test rows.
    """

    def __init__(self, model_name, test_size, X, Y, n_splits=5, random_state=None):
        self.model_name = model_name
        self.test_size = test_size
        self.X = X
        self.Y = Y
        self.n_splits = n_splits
        self.random_state = random_state

    def model(self):
        """Run one shuffled K-fold pass and return the estimator.

        The estimator is refitted on each fold's training rows and scored on that
        fold's test rows. After the loop it holds the fit from the last fold, and
        the ``*_train`` / ``*_test`` attributes describe that last fold.
        """
        kf = KFold(n_splits=self.n_splits, shuffle=True, random_state=self.random_state)
        # Convert once instead of on every fold; the arrays do not change in the loop.
        X_values, Y_values = self.X.values, self.Y.values
        scores = []
        for train_index, test_index in kf.split(X_values):
            self.X_train, self.X_test = X_values[train_index], X_values[test_index]
            self.Y_train, self.Y_test = Y_values[train_index], Y_values[test_index]
            self.model_name.fit(self.X_train, self.Y_train)
            scores.append(self.model_name.score(self.X_test, self.Y_test))
        self.mean_score = sum(scores) / len(scores)
        return self.model_name

    def score(self):
        """Compute ``r2`` and ``rmse`` from the most recent ``model()`` run.

        Returns
        -------
        tuple
            ``(mean_score, r2, rmse)``; the same values are stored as attributes.

        Raises
        ------
        RuntimeError
            If ``model()`` has not been called yet.
        """
        if not hasattr(self, "X_test"):
            raise RuntimeError("call model() before score()")
        # Predict on the ndarray form, matching what the estimator was fitted on.
        self.Y_pred = self.model_name.predict(self.X.values)
        self.r2 = r2_score(self.Y, self.Y_pred)
        self.Y_predict = self.model_name.predict(self.X_test)
        self.rmse = root_mean_squared_error(self.Y_test, self.Y_predict)
        return self.mean_score, self.r2, self.rmse

In [None]:
# Wrap an ordinary least-squares regressor for 5-fold evaluation on X / Y.
linear_model = create_model(
    model_name=LinearRegression(),
    test_size=0.1,
    X=X,
    Y=Y,
    n_splits=5,
)

In [None]:
def mean_score(num: int, model: "create_model"):
    """Lazily repeat the evaluation ``num`` times, yielding the metrics of each run.

    Parameters
    ----------
    num : int
        Number of repetitions. Zero or a negative value yields nothing.
    model : create_model
        Evaluation wrapper (not a string). Each repetition calls ``model.model()``
        followed by ``model.score()`` and then reads the resulting attributes.

    Yields
    ------
    tuple
        ``(model.mean_score, model.r2, model.rmse)`` for each repetition.

    Notes
    -----
    This is a generator: nothing is fitted until the result is iterated, and it can
    be consumed only once.
    """
    for _ in range(num):
        model.model()
        model.score()
        yield model.mean_score, model.r2, model.rmse

In [None]:
# Build the generator for 100 repetitions; no fitting happens until it is consumed.
linear_yield = mean_score(num=100, model=linear_model)

In [None]:
# Consume the generator (this is where the repetitions actually run) and label the
# three metrics produced per repetition.
df_linear = pd.DataFrame(
    linear_yield,
    columns=['Mean_K-fold_R^2_score', 'Overall_R^2_score', 'root_mean_squared_error'],
)

In [None]:
# Peek at the first repetitions of the linear-model metrics table.
df_linear.head()

In [None]:
# Summary statistics of each metric across the repetitions.
df_linear.describe()

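The Overall R^2 score is slightly higher than the Mean K-fold R^2 score, which suggests that the model performs about as well on the held-out folds as on the data it was fitted on. This indicates that the model generalizes reasonably well to new data, which is a desirable quality in a model. Note, however, that the Overall R^2 score is computed on the full dataset, most of which the final-fold model was trained on, so it is expected to be somewhat optimistic; the Mean K-fold R^2 score is the more reliable estimate of out-of-sample performance.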

In [None]:
def hist_kfold_r2score(kfold, r2score, w,
                       labels=('Mean K-fold R^2 score', 'Overall R^2 score'),
                       xlim=(0, 1)):
    """Draw two side-by-side histograms with fixed-width bins.

    Parameters
    ----------
    kfold : sequence of numbers
        Values for the left panel.
    r2score : sequence of numbers
        Values for the right panel.
    w : float
        Bin width; must be positive. Bin edges start at each series' minimum and
        extend in steps of ``w`` past its maximum.
    labels : pair of str, optional
        X-axis labels for the left and right panels. The defaults match the R^2
        use case.
    xlim : pair of numbers or None, optional
        X-axis limits applied to both panels. The default ``(0, 1)`` suits R^2
        scores; pass ``None`` to let matplotlib autoscale (for example when plotting
        errors measured in the target's units).

    Raises
    ------
    ValueError
        If ``w`` is not positive.
    """
    if w <= 0:
        raise ValueError("bin width w must be positive")

    fig, axs = plt.subplots(1, 2, tight_layout=True)

    for ax, values, label in zip(axs, (kfold, r2score), labels):
        edges = np.arange(min(values), max(values) + w, w)
        ax.hist(values, edgecolor='black', bins=edges)
        ax.set_xlabel(label)
        ax.set_ylabel('Frequency')
        if xlim is not None:
            ax.set_xlim(xlim)

In [None]:
# Distributions of the two R^2 metrics for the linear model, in bins of width 0.05.
kfold, r2score = df_linear['Mean_K-fold_R^2_score'], df_linear['Overall_R^2_score']
hist_kfold_r2score(kfold=kfold, r2score=r2score, w=0.05)

In [None]:
# df_linear has no 'mean_error' column; the only error metric recorded per repetition
# is the RMSE. It is measured in units of the target, so it gets its own histogram
# with automatic bin edges rather than the R^2-labelled, [0, 1]-limited panels of
# hist_kfold_r2score.
mean_squared_error_score = df_linear['root_mean_squared_error']
fig, ax = plt.subplots(tight_layout=True)
ax.hist(mean_squared_error_score, edgecolor='black', bins=20)
ax.set_xlabel('Root mean squared error')
ax.set_ylabel('Frequency');

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
decision_tree_model = create_model(DecisionTreeRegressor(), 0.1, X, Y, n_splits=5)
tree_yield = mean_score(100, decision_tree_model)
# mean_score yields three metrics per repetition, so three column names are needed;
# assigning only two raises a length-mismatch ValueError.
df_tree = pd.DataFrame(
    tree_yield,
    columns=['Mean_K-fold_R^2_score', 'Overall_R^2_score', 'root_mean_squared_error'],
)

In [None]:
# Peek at the first repetitions of the decision-tree metrics table.
df_tree.head()

In [None]:
# Summary statistics of each decision-tree metric across the repetitions.
df_tree.describe()

In [None]:
# Distributions of the two R^2 metrics for the decision tree, in bins of width 0.05.
kfold, r2score = df_tree['Mean_K-fold_R^2_score'], df_tree['Overall_R^2_score']
hist_kfold_r2score(kfold=kfold, r2score=r2score, w=0.05)

In [None]:
# Neither 'mean_squared_error' nor 'mean_error' is produced by mean_score; its third
# metric is the RMSE, expected here under the same 'root_mean_squared_error' column
# name used for the linear model. RMSE is in units of the target, so it is plotted
# on its own axis with automatic bin edges instead of the [0, 1] R^2 panels.
mean_squared_error_score = df_tree['root_mean_squared_error']
fig, ax = plt.subplots(tight_layout=True)
ax.hist(mean_squared_error_score, edgecolor='black', bins=20)
ax.set_xlabel('Root mean squared error')
ax.set_ylabel('Frequency');

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
forest_model = create_model(RandomForestRegressor(), 0.1, X, Y, n_splits=5)
forest_yield = mean_score(100, forest_model)
# mean_score yields three metrics per repetition, so three column names are needed;
# assigning only two raises a length-mismatch ValueError.
df_forest = pd.DataFrame(
    forest_yield,
    columns=['Mean_K-fold_R^2_score', 'Overall_R^2_score', 'root_mean_squared_error'],
)

In [None]:
# Peek at the first repetitions of the random-forest metrics table.
df_forest.head()

In [None]:
# Summary statistics of each random-forest metric across the repetitions.
df_forest.describe()

In [None]:
# Distributions of the two R^2 metrics for the random forest, in bins of width 0.03.
kfold, r2score = df_forest['Mean_K-fold_R^2_score'], df_forest['Overall_R^2_score']
hist_kfold_r2score(kfold=kfold, r2score=r2score, w=0.03)

In [None]:
# Neither 'mean_squared_error' nor 'mean_error' is produced by mean_score; its third
# metric is the RMSE, expected here under the same 'root_mean_squared_error' column
# name used for the linear model. RMSE is in units of the target, so it is plotted
# on its own axis with automatic bin edges instead of the [0, 1] R^2 panels.
mean_squared_error_score = df_forest['root_mean_squared_error']
fig, ax = plt.subplots(tight_layout=True)
ax.hist(mean_squared_error_score, edgecolor='black', bins=20)
ax.set_xlabel('Root mean squared error')
ax.set_ylabel('Frequency');

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
xg_model = create_model(GradientBoostingRegressor(), 0.1, X, Y, n_splits = 5)
xg_yield = mean_score(100, xg_model)
# mean_score yields three metrics per repetition, so three column names are needed;
# assigning only two raises a length-mismatch ValueError.
df_xg = pd.DataFrame(
    xg_yield,
    columns=['Mean_K-fold_R^2_score', 'Overall_R^2_score', 'root_mean_squared_error'],
)

In [None]:
# Peek at the first repetitions of the gradient-boosting metrics table.
df_xg.head()

In [None]:
# describe must be called; without parentheses the cell shows the bound-method repr
# instead of the summary statistics.
df_xg.describe()

In [None]:
# Distributions of the two R^2 metrics for gradient boosting, in bins of width 0.05.
kfold, r2score = df_xg['Mean_K-fold_R^2_score'], df_xg['Overall_R^2_score']
hist_kfold_r2score(kfold=kfold, r2score=r2score, w=0.05)

In [None]:
# Neither 'mean_squared_error' nor 'mean_error' is produced by mean_score; its third
# metric is the RMSE, expected here under the same 'root_mean_squared_error' column
# name used for the linear model. RMSE is in units of the target, so it is plotted
# on its own axis with automatic bin edges instead of the [0, 1] R^2 panels.
mean_squared_error_score = df_xg['root_mean_squared_error']
fig, ax = plt.subplots(tight_layout=True)
ax.hist(mean_squared_error_score, edgecolor='black', bins=20)
ax.set_xlabel('Root mean squared error')
ax.set_ylabel('Frequency');