Kornkanok Klinsumalee 6341005826

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Location of the used-cars CSV (the file name says it contains missing values).
url = 'https://github.com/prasertcbs/basic-dataset/raw/master/usedcars_with_missing_values.csv'
# Read the file straight from the URL; raw_df is kept untouched as the original copy.
raw_df = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Summarise dtypes and non-null counts to see which columns have missing values.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          150 non-null    int64  
 1   model         146 non-null    object 
 2   price         146 non-null    float64
 3   mileage       146 non-null    float64
 4   color         146 non-null    object 
 5   transmission  146 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 7.2+ KB


# Missing Values

In [4]:
# Keep only complete rows: a row is discarded if any of its fields is missing.
df = raw_df.dropna(axis=0, how='any')

In [5]:
# Confirm that every column is fully populated after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130 entries, 1 to 149
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          130 non-null    int64  
 1   model         130 non-null    object 
 2   price         130 non-null    float64
 3   mileage       130 non-null    float64
 4   color         130 non-null    object 
 5   transmission  130 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 7.1+ KB


# Categorical -> Numeric

In [6]:
# Text-valued columns that must be one-hot encoded before modelling.
category_column = [
    'color',
    'model',
    'transmission',
]

In [7]:
# One-hot encode every categorical column and append the indicator columns
# to the numeric ones.
#
# The frame is re-indexed to 0..n-1 first so that the encoder output (which is
# a plain positional array) lines up row-for-row with the remaining columns.
# This replaces the earlier merge on an index array, which needed a synthetic
# 'key_0' column and only worked because the key happened to be positional.
df1 = df.reset_index(drop=True)

# NOTE: `sparse_output` is the scikit-learn >= 1.2 name of the former `sparse`
# argument; which one to use depends on the scikit-learn version, not on the
# Python version. On scikit-learn < 1.2 use OneHotEncoder(sparse=False).
ohenc = OneHotEncoder(sparse_output=False)

# A single fit handles all categorical columns; categories_ holds one array
# of labels per input column, in the same order as category_column.
m = ohenc.fit_transform(df1[category_column])
dummy_names = [label for labels in ohenc.categories_ for label in labels]
df_dummy = pd.DataFrame(m, columns=dummy_names, index=df1.index)

# Swap the original text columns for their indicator columns.
df1 = pd.concat([df1.drop(columns=category_column), df_dummy], axis=1)

# Encoding must never add or lose rows.
assert len(df1) == len(df)

In [8]:
# Show the encoded frame: the categorical columns are now 0/1 indicator columns.
df1

Unnamed: 0,year,price,mileage,Black,Blue,Gold,Gray,Green,Red,Silver,White,Yellow,SE,SEL,SES,AUTO,MANUAL
0,2011,20995.0,10926.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,2011,19995.0,7351.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,2011,17809.0,11613.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
3,2012,17500.0,8367.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
4,2010,17495.0,25125.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,2006,6200.0,95000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
126,2002,5995.0,87003.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
127,2000,5980.0,96841.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
128,2001,4899.0,151479.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0


# Split data 
Split the data into training and test sets.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# The target is the listing price; every other column is a predictor.
y = df1['price']
X = df1.drop('price', axis=1)

In [11]:
# Hold out 20% of the rows for testing. The split is seeded so that the same
# rows land in each partition on every run; without a seed all scores below
# change each time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error

from math import sqrt
from statistics import mode

## Linear

In [13]:
# Ordinary least-squares baseline, fitted on the training split only.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [14]:
# Score the linear model on both splits, then measure the error on the test split.
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)

y_pred = lr.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)

print(f"train score : {train_score}")
print(f"test  score : {test_score}")
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

train score : 0.837089963921001
test  score : 0.8096755188185504
Mean Squared Error: 2614591.7003099825
Root Mean Squared Error: 1616.9699132358594


## Decision Tree

In [15]:
# Tune the tree depth with a 5-fold cross-validated grid search on the training split.
# The estimator is seeded: a decision tree breaks ties between equally good
# splits at random, so without a seed the selected depth can differ between runs.
dt = DecisionTreeRegressor(random_state=42)

# Candidate depths 1..10.
param_grid = {'max_depth': list(range(1,11))}

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the winning depth for the refit in the next cell.
dt_max_depth = grid_search.best_params_['max_depth']

print('Best Hyperparameters:', grid_search.best_params_)

Best Hyperparameters: {'max_depth': 6}


In [16]:
# Refit a tree with the tuned depth. The seed makes the fitted tree, and
# therefore the scores printed below, reproducible across runs.
dt = DecisionTreeRegressor(max_depth=dt_max_depth, random_state=42)
dt.fit(X_train, y_train)

print(f"train score : {dt.score(X_train, y_train)}")
print(f"test  score : {dt.score(X_test, y_test)}")

# Error on the held-out test split.
y_pred = dt.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", sqrt(mse))

train score : 0.9412594885391893
test  score : 0.8104326155715817
Mean Squared Error: 2604191.04730665
Root Mean Squared Error: 1613.7506149670864


## Random Forest

In [17]:
# Search tree depth (1..9) and features considered per split (1..14)
# with a 5-fold cross-validated grid search on the training split.
param_grid = {'max_depth': list(range(1,10)),
            'max_features': list(range(1,15))}

# Seeded: a random forest draws bootstrap samples and feature subsets at
# random, so an unseeded search can pick different "best" parameters each run.
rf = RandomForestRegressor(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the winning settings for the refit in the next cell.
rf_max_depth = grid_search.best_params_['max_depth']
rf_max_features = grid_search.best_params_['max_features']

print('Best Hyperparameters:', grid_search.best_params_)

Best Hyperparameters: {'max_depth': 7, 'max_features': 6}


In [18]:
# Refit a 100-tree forest with the tuned settings. The seed fixes the
# bootstrap and feature sampling so the scores below are reproducible.
rf = RandomForestRegressor(max_depth=rf_max_depth, max_features=rf_max_features,
                           n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

print(f"train score : {rf.score(X_train, y_train)}")
print(f"test  score : {rf.score(X_test, y_test)}")

# Error on the held-out test split.
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", sqrt(mse))

train score : 0.9420569442926501
test  score : 0.8622414355209462
Mean Squared Error: 1892464.895202654
Root Mean Squared Error: 1375.6688901049752


## XGBoost

In [19]:
# Grid for the boosted model:
#   learning_rate 0.1 .. 0.9, max_depth 2 .. 14, n_estimators 10 .. 90 (step 10).
param_grid = {
    'learning_rate': [tenth / 10 for tenth in range(1, 10)],
    'max_depth': [*range(2, 15)],
    'n_estimators': [*range(10, 100, 10)],
}

xgb = XGBRegressor()

# 5-fold cross-validated search on the training split, using all CPU cores.
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Keep the winning settings for the refit in the next cell.
best_params = grid_search.best_params_
xgb_learning_rate = best_params['learning_rate']
xgb_max_depth = best_params['max_depth']
xgb_n_estimators = best_params['n_estimators']

print('Best Hyperparameters:', grid_search.best_params_)

Best Hyperparameters: {'learning_rate': 0.5, 'max_depth': 2, 'n_estimators': 10}


In [20]:
# Refit the boosted model with the tuned settings.
xgb = XGBRegressor(
    learning_rate=xgb_learning_rate,
    max_depth=xgb_max_depth,
    n_estimators=xgb_n_estimators,
)
xgb.fit(X_train, y_train)

train_score = xgb.score(X_train, y_train)
test_score = xgb.score(X_test, y_test)
print(f"train score : {train_score}")
print(f"test  score : {test_score}")

# Error on the held-out test split.
y_pred = xgb.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", rmse)

train score : 0.8971755984721294
test  score : 0.835806469707697
Mean Squared Error: 2255616.5075662043
Root Mean Squared Error: 1501.8710023055257


# Model Evaluation

In [21]:
from sklearn.model_selection import cross_val_score, KFold

In [22]:
# Compare the four tuned models with the same shuffled, seeded 5-fold split
# over the full data set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

model_list = [lr, dt, rf, xgb]

# Estimator class name -> mean of its per-fold cross-validation scores.
result = {
    type(model).__name__: cross_val_score(model, X, y, cv=kf).mean()
    for model in model_list
}

result

{'LinearRegression': 0.7596046638944937,
 'DecisionTreeRegressor': 0.6039092536493117,
 'RandomForestRegressor': 0.7890734579901322,
 'XGBRegressor': 0.7589293611907687}