In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras import layers, regularizers, callbacks
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings 
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("../Output/Data/processed_data.csv")
df

Unnamed: 0,Price,CPU Name,CPU Achitecture,CPU Core,CPU Thread,CPU Base Clock,CPU Max Clock,RAM,Memory Type,Max DDR Support,Storage,Storage Type,GPU VRAM,Display Type,Display Size,Display Frequency,OS,Warrant,Display Width,Display Height
0,23499000,i7-1355U,7,10.0,12.0,5.0,5.0,32,4,4.0,512,1,0.0,3,14.0,60,11,12.0,1920,1080
1,21499000,i7-1355U,7,10.0,12.0,5.0,5.0,16,4,4.0,512,1,0.0,3,15.6,60,11,12.0,1920,1080
2,25489000,i5-1135G7,10,4.0,8.0,0.9,4.2,32,5,4.0,512,1,0.0,3,14.0,60,10,12.0,1920,1080
3,85999000,i9-13950HX,7,24.0,32.0,5.5,5.5,64,5,4.0,1024,1,0.0,3,16.0,60,11,36.0,1920,1200
4,21999000,i7-1355U,7,10.0,12.0,5.0,5.0,16,4,4.0,1024,1,0.0,3,15.0,120,11,12.0,1920,1080
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,49490000,155H,7,16.0,22.0,0.9,4.8,16,5,4.0,1024,1,8.0,1,19.9,60,11,24.0,2880,1800
907,26490000,155H,7,16.0,22.0,0.9,4.8,16,5,4.0,512,1,0.0,3,21.7,60,11,12.0,2880,1800
908,89990000,i9-12900,7,16.0,24.0,1.8,5.0,0,5,4.0,2048,1,12.0,3,25.9,60,11,24.0,3840,2160
909,20990000,i5-12500,7,6.0,12.0,4.6,4.6,8,4,4.0,512,1,0.0,3,25.1,60,11,24.0,1920,1080


In [3]:
df.shape

(911, 20)

In [4]:
df.isnull().sum()

Price                0
CPU Name             0
CPU Achitecture      0
CPU Core             0
CPU Thread           0
CPU Base Clock       0
CPU Max Clock        0
RAM                  0
Memory Type          0
Max DDR Support      0
Storage              0
Storage Type         0
GPU VRAM             0
Display Type         0
Display Size         0
Display Frequency    0
OS                   0
Warrant              0
Display Width        0
Display Height       0
dtype: int64

In [5]:
df['Price'] = df['Price'] / 25500 # chuyển sang đô la
df['Price']

0       921.529412
1       843.098039
2       999.568627
3      3372.509804
4       862.705882
          ...     
906    1940.784314
907    1038.823529
908    3529.019608
909     823.137255
910     685.882353
Name: Price, Length: 911, dtype: float64

In [8]:
df['CPU Name'].value_counts()

CPU Name
i5-1335U     122
i7-1355U      82
155H          36
i9-14900      35
i5-12450H     33
            ... 
i5-1035G4      1
i7-12800H      1
i7-1270P       1
i7-10875H      1
i5-11500H      1
Name: count, Length: 88, dtype: int64

# Model Selection, Training, Evaluation

In [6]:
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

r2 = r2_score(y_test,y_pred)
print(f'R2 Score: {r2}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error: {mae}')

ValueError: could not convert string to float: 'i5-1235U'

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
para = 0.015
# Lasso Regression (L1 regularization)
lasso = Lasso(alpha=para)  # Alpha is the regularization parameter (lambda)
lasso.fit(X_train_scaled, y_train)
lasso_pred = lasso.predict(X_test_scaled)
lasso_mse = mean_squared_error(y_test, lasso_pred)
print("Lasso MSE:", lasso_mse)

# Ridge Regression (L2 regularization)
ridge = Ridge(alpha=para)  # Alpha is the regularization parameter (lambda)
ridge.fit(X_train_scaled, y_train)
ridge_pred = ridge.predict(X_test_scaled)
ridge_mse = mean_squared_error(y_test, ridge_pred)
print("Ridge MSE:", ridge_mse)

In [None]:
forest = RandomForestRegressor()
forest.fit(X_train_scaled, y_train)

forest.score(X_test_scaled, y_test)

In [None]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(random_state=45, max_samples=0.22),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(),
    'CatBoost': cb.CatBoostRegressor(silent=True)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate mean absolute error scores
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    
    # Print the name of the model and its mean absolute error scores
    print(f'{name}: Train MAE = {train_mae}') 
    print(f'{name}: Test MAE = {test_mae}')
    print('***********************')