In [1]:
import pandas as pd
import numpy as np
import math
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("https://github.com/prasertcbs/basic-dataset/raw/master/usedcars_with_missing_values.csv")
raw_data = df.dropna()
raw_data.reset_index(inplace = True)

In [3]:
raw_data

Unnamed: 0,index,year,model,price,mileage,color,transmission
0,1,2011,SEL,20995.0,10926.0,Gray,AUTO
1,2,2011,SEL,19995.0,7351.0,Silver,AUTO
2,3,2011,SEL,17809.0,11613.0,Gray,AUTO
3,4,2012,SE,17500.0,8367.0,White,AUTO
4,5,2010,SEL,17495.0,25125.0,Silver,AUTO
...,...,...,...,...,...,...,...
125,145,2006,SES,6200.0,95000.0,Silver,AUTO
126,146,2002,SE,5995.0,87003.0,Red,AUTO
127,147,2000,SE,5980.0,96841.0,Red,AUTO
128,148,2001,SE,4899.0,151479.0,Yellow,AUTO


In [4]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         130 non-null    int64  
 1   year          130 non-null    int64  
 2   model         130 non-null    object 
 3   price         130 non-null    float64
 4   mileage       130 non-null    float64
 5   color         130 non-null    object 
 6   transmission  130 non-null    object 
dtypes: float64(2), int64(2), object(3)
memory usage: 7.2+ KB


In [5]:
data = raw_data.copy()
category_columns = ['color','model','transmission']
data[category_columns] = data[category_columns].astype('category')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130 entries, 0 to 129
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   index         130 non-null    int64   
 1   year          130 non-null    int64   
 2   model         130 non-null    category
 3   price         130 non-null    float64 
 4   mileage       130 non-null    float64 
 5   color         130 non-null    category
 6   transmission  130 non-null    category
dtypes: category(3), float64(2), int64(2)
memory usage: 5.2 KB


In [7]:
dummies_data = pd.get_dummies(data)

In [8]:
dummies_data

Unnamed: 0,index,year,price,mileage,model_SE,model_SEL,model_SES,color_Black,color_Blue,color_Gold,color_Gray,color_Green,color_Red,color_Silver,color_White,color_Yellow,transmission_AUTO,transmission_MANUAL
0,1,2011,20995.0,10926.0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
1,2,2011,19995.0,7351.0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
2,3,2011,17809.0,11613.0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
3,4,2012,17500.0,8367.0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
4,5,2010,17495.0,25125.0,0,1,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,145,2006,6200.0,95000.0,0,0,1,0,0,0,0,0,0,1,0,0,1,0
126,146,2002,5995.0,87003.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
127,147,2000,5980.0,96841.0,1,0,0,0,0,0,0,0,1,0,0,0,1,0
128,148,2001,4899.0,151479.0,1,0,0,0,0,0,0,0,0,0,0,1,1,0


In [9]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor

In [10]:
def split(spilt_size, X, Y):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = spilt_size, shuffle = True)
    return X_train, X_test, y_train, y_test

In [11]:
def find_param(model_name, X_train, y_train, param_grids):
        grid_search = GridSearchCV(estimator = model_name, param_grid = param_grids, cv = 5, n_jobs = -1)
        grid_search.fit(X_train, y_train)
        for param_grid in param_grids.keys():
            yield param_grid, grid_search.best_params_[param_grid]

In [12]:
def score(model, X_train, y_train, X_test, y_test):
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        rmse = math.sqrt(mse)
        train_score = model.score(X_train, y_train)
        test_score = model.score(X_test, y_test)
        print(f"train score : {train_score}")
        print(f"test  score : {test_score}")
        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {rmse}")

In [13]:
def kfold(models, kfold_n, X, Y):
    kf = KFold(n_splits = kfold_n, shuffle = True, random_state = 42)
    result = {}
    for model in models:
        cv_scores = cross_val_score(model, X, Y,cv = kf)
        result[model.__class__.__name__] = np.mean(cv_scores)
    return result

In [14]:
target = 'price'
X = dummies_data.drop(columns = target)
Y = dummies_data[target]

In [15]:
X_train, X_test, y_train, y_test = split( 0.1, X, Y)

In [16]:
linear_model = LinearRegression()
linear_model.fit(X_train, y_train)
print('Linear Model')
score(linear_model, X_train, y_train, X_test, y_test)

Linear Model
train score : 0.9553396571006799
test  score : 0.8577874410342037
Mean Squared Error: 453582.636588995
Root Mean Squared Error: 673.4854390326453


In [17]:
tree_model = DecisionTreeRegressor()
param_grid = {'max_depth': list(range(1,11))}
param = dict(find_param(tree_model, X_train, y_train, param_grid))
tree_model = DecisionTreeRegressor(max_depth = param['max_depth'])
tree_model.fit(X_train, y_train)
print('Tree Model')
score(tree_model, X_train, y_train, X_test, y_test)

Tree Model
train score : 0.9999950810840436
test  score : 0.9921973262242559
Mean Squared Error: 24886.3909726637
Root Mean Squared Error: 157.75421063370607


In [18]:
Forest_model = RandomForestRegressor()
param_grid = {'max_depth': list(range(1,10)), 'max_features': list(range(1,15))}
param = dict(find_param(tree_model, X_train, y_train, param_grid))
Forest_model = RandomForestRegressor(max_depth = param['max_depth'], max_features = param['max_features'], n_estimators = 100)
Forest_model.fit(X_train, y_train)
print('Forest Model')
score(Forest_model, X_train, y_train, X_test, y_test)

Forest Model
train score : 0.997168877141902
test  score : 0.9831156988701931
Mean Squared Error: 53851.96553043025
Root Mean Squared Error: 232.06026271300792


In [19]:
xgb_model = XGBRegressor()
param_grid = {'learning_rate': list(i/10 for i in range(1,10)), 'max_depth': list(range(2,15)), 'n_estimators' : list(range(10,100,10))}
param = dict(find_param(xgb_model, X_train, y_train, param_grid))
xgb_model = XGBRegressor(learning_rate = param['learning_rate'], max_depth = param['max_depth'], n_estimators = param['n_estimators'])
xgb_model.fit(X_train, y_train)
print('XGB Model')
score(xgb_model, X_train, y_train, X_test, y_test)

XGB Model
train score : 0.9999766216227293
test  score : 0.9984810768018852
Mean Squared Error: 4844.55939747737
Root Mean Squared Error: 69.60286917561208


In [20]:
model_list = [linear_model, tree_model, Forest_model, xgb_model]
result = kfold(model_list, 5, X, Y)
result

{'LinearRegression': 0.9384454103604287,
 'DecisionTreeRegressor': 0.9855270239504698,
 'RandomForestRegressor': 0.9800414522508559,
 'XGBRegressor': 0.9908117767626867}

In [21]:
# class model():
#     def __init__(self, model_name, X, Y, split_seed, spilt_size, kfold_seed, **kwargs):
#         self.model_name = model_name
#         self.X = X
#         self.Y = Y
#         self.split_seed = split_seed
#         self.spilt_size = spilt_size
#         self.kfold_seed = kfold_seed
#         self.kwargs = kwargs
    
#     def split(self):
#         self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.Y, test_size = self.spilt_size, random_state = self.spilt_size, shuffle = False)

#     @property
#     def create_model(self):
#         self.model = self.model_name()
#         self.model.fit(self.X_train, self.y_train)

#     def grid_search(self):
#         for kwarg in self.kwargs:
#             yield kwarg
#         grid_search = GridSearchCV(estimator = self.model_name, param_grid= , cv=5, n_jobs=-1)
#         grid_search.fit(self.X_train, self.y_train)

#     @property
#     def score(self):
#         self.y_pred = self.model.predict(self.X_test)
#         self.mse = mean_squared_error(self.y_test, self.y_pred)
#         self.rmse = math.sqrt(self.mse)
#         self.train_model_score = self.model.score(self.X_train, self.y_train)
#         self.test_mdoel_score = self.model.score(self.X_test, self.y_test)