In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score, ShuffleSplit, StratifiedKFold, KFold

from sklearn.metrics import r2_score, make_scorer, mean_squared_error

from matplotlib import pyplot as plt

In [3]:
data = pd.read_csv('tidy_data.csv')
data = data.drop(['name', 'CPU', 'GPU', 'old_price', 'Date First Available'], axis=1)
data.head()

Unnamed: 0,Brand,Best Sellers Rank,weight,Laptop type,Laptop purpose,Screen Size,Hard Drive Size,Hard Drive Type,RAM speed,RAM size,RAM type,CPU rank,GPU performance,Optical Drive Type,Operating System,Number of USB 3.0 Ports,Number of USB 2.0 Ports,Customer Reviews,price
0,samsung,293.0,1.17934,traditional,general,13.3,32.0,other,2.133,4.0,ddr4,2521.0,1.8,No,chrome os,0.0,0.0,4.2,100.0
1,lenovo,256.0,1.587573,2 in 1,general,14.0,32.0,other,2.133,32.0,ddr,2867.0,1.0,Yes,chrome os,2.0,0.0,4.5,103.0
2,hp,86.0,1.356241,traditional,general,14.0,32.0,hybrid,1.866,4.0,ddr,2867.0,1.7,No,chrome os,1.0,0.0,4.2,106.0
3,samsung,328.0,1.814369,traditional,general,13.3,32.0,ssd,2.5,4.0,ddr4,2521.0,0.9,No,chrome os,2.0,1.0,4.4,109.0
4,acer,15.0,1.4016,traditional,general,14.0,512.0,ssd,2.8,4.0,lpddr4,2521.0,1.7,No,chrome os,4.0,1.0,4.5,115.0


In [3]:
X = pd.get_dummies(data[['weight','CPU rank','GPU performance','Brand','Laptop type','Laptop purpose','Hard Drive Type','RAM type','Operating System']], 
                   columns=['Brand', 'Laptop type', 'Laptop purpose', 'Hard Drive Type', 'RAM type', 'Operating System'])
y = data['price']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=1)

In [5]:
scale = StandardScaler().fit(X_train)
X_train = scale.fit_transform(X_train)
X_test = scale.transform(X_test)

# Grid Search

In [6]:
score = make_scorer(r2_score)

## Ridge

In [7]:
rb = Ridge()
grid_rb = GridSearchCV(estimator=rb, param_grid={'alpha':[1,2,3,4,5,6]}, cv=5, scoring=score)
re = grid_rb.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.638803193472148
Best params: {'alpha': 6}


In [8]:
ridge = Ridge(alpha=6)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(ridge, X, y, cv=cv, scoring=score)

array([0.6227397 , 0.59610649, 0.56322946, 0.65814102, 0.63323297])

In [9]:
ridge = Ridge(alpha=6).fit(X_train, y_train)
ridge_r2 = r2_score(y_test, ridge.predict(X_test))
ridge_rmse = mean_squared_error(y_test, ridge.predict(X_test), squared = False)
print(f'R2 score: {ridge_r2}')
print(f'RMSE: {ridge_rmse}')

R2 score: 0.5786732249762574
RMSE: 519.7246639439443


In [10]:
with open('C:/Users/trinh/OneDrive/Máy tính/DS105 _Final project/Code source/Demo/ridge.pkl', 'wb') as file:
    pickle.dump(ridge, file)

## SVM

In [11]:
svr1 = SVR(kernel='rbf')
grid_svr = GridSearchCV(estimator=svr1, 
                        param_grid={'C':[10, 100], 
                                    'gamma':[.01, .001, .0001]}, 
                        cv=5, scoring=score)
re = grid_svr.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.513327933722786
Best params: {'C': 100, 'gamma': 0.01}


In [12]:
svr2 = SVR(kernel='poly')
grid_svr = GridSearchCV(estimator=svr2, 
                        param_grid={'C':[10, 100], 
                                    'gamma':[.01, .001, .0001]}, 
                        cv=5, scoring=score)
re = grid_svr.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.08577952445376917
Best params: {'C': 100, 'gamma': 0.01}


In [13]:
svr = SVR(kernel='linear')
grid_svr = GridSearchCV(estimator=svr, 
                        param_grid={'C':[10, 100], 
                                    'gamma':[.01, .001, .0001]}, 
                        cv=5, scoring=score)
re = grid_svr.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.6117759659358769
Best params: {'C': 100, 'gamma': 0.01}


In [14]:
svm = SVR(kernel='linear', C=100, gamma=.01).fit(X_train, y_train)
svm_r2 = r2_score(y_test, svm.predict(X_test))
svm_rmse = mean_squared_error(y_test, svm.predict(X_test), squared = False)

print(f'R2 score: {svm_r2}')
print(f'RMSE: {svm_rmse}')

R2 score: 0.5816978890523938
RMSE: 517.8557772722206


In [15]:
with open('C:/Users/trinh/OneDrive/Máy tính/DS105 _Final project/Code source/Demo/svm.pkl', 'wb') as file:
    pickle.dump(svm, file)

## Decision Tree

In [16]:
tree = DecisionTreeRegressor()
grid_tr = GridSearchCV(estimator=tree, param_grid={'max_depth':[1,10,20,50,100,150,200]}, cv=5, scoring=score)
re = grid_tr.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.5976651785935114
Best params: {'max_depth': 100}


In [17]:
tree = DecisionTreeRegressor(max_depth=10)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(tree, X, y, cv=cv, scoring=score)

array([0.57459179, 0.58811338, 0.60855427, 0.59050855, 0.66783454])

In [18]:
tree = DecisionTreeRegressor(max_depth=10).fit(X_train, y_train)
tree_r2 = r2_score(y_test, tree.predict(X_test))
tree_rmse = mean_squared_error(y_test, tree.predict(X_test), squared = False)

print(f'R2 score: {tree_r2}')
print(f'RMSE: {tree_rmse}')

R2 score: 0.5668686044208091
RMSE: 526.9551216502458


In [19]:
with open('C:/Users/trinh/OneDrive/Máy tính/DS105 _Final project/Code source/Demo/tree.pkl', 'wb') as file:
    pickle.dump(tree, file)

## Random Forest

In [20]:
forest = RandomForestRegressor()
grid_fr = GridSearchCV(estimator=forest, param_grid={'n_estimators':[10,20,50,100,200]}, cv=5, scoring=score)
re = grid_fr.fit(X_train, y_train)
print(f"Best score: {re.best_score_}")
print(f"Best params: {re.best_params_}")

Best score: 0.7444424080171215
Best params: {'n_estimators': 200}


In [21]:
forest = RandomForestRegressor(n_estimators=20)
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(forest, X, y, cv=cv, scoring=score)

array([0.70045734, 0.67424011, 0.64623563, 0.75233008, 0.76912682])

In [22]:
forest = RandomForestRegressor(n_estimators=20).fit(X_train, y_train)
forest_r2 = r2_score(y_test, forest.predict(X_test))
forest_rmse = mean_squared_error(y_test, forest.predict(X_test), squared = False)

print(f'R2 score: {forest_r2}')
print(f'RMSE: {forest_rmse}')

R2 score: 0.7563863538441802
RMSE: 395.19787753880485


In [23]:
with open('C:/Users/trinh/OneDrive/Máy tính/DS105 _Final project/Code source/Demo/forest.pkl', 'wb') as file:
    pickle.dump(forest, file)

# Result

In [24]:
compare = pd.DataFrame({'R2 score':[ridge_r2, svm_r2, tree_r2, forest_r2]},
                       index=['Ridge', 'SVM', 'Decision Tree', 'Random Forest'])
compare

Unnamed: 0,R2 score
Ridge,0.578673
SVM,0.581698
Decision Tree,0.566869
Random Forest,0.756386
