In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error, mean_absolute_error, r2_score
# import tensorflow as tf
# from tensorflow.keras import layers, regularizers, callbacks
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
import warnings 
warnings.filterwarnings("ignore")

In [2]:
loaded_df = pd.read_csv("../Output/Data/processed_data.csv")
loaded_df.head(1)
loaded_df['Price'] = loaded_df['Price'] / 1000000 #/ 25500 # chuyển sang đô la
loaded_df = loaded_df.drop(columns=['CPU Name','GPU Name','Storage Type'])
loaded_df = loaded_df.sample(frac=1,random_state=42).reset_index(drop=True)
split_index = int(len(loaded_df)*0.85)
train_val_df = loaded_df.iloc[:split_index]
test_df = loaded_df.iloc[split_index:]
X_test = test_df.drop('Price', axis=1)
y_test = test_df['Price']

In [22]:
def grid_search_polynomial(pow : float = 2,cross_val : bool = True,random_state : int = 42):
    cols = train_val_df.columns
    result = {}
    def inner_grid_search(df_):
        X_ = df_.drop('Price', axis=1)
        y_ = df_['Price']
        X_train, X_val, y_train, y_val = train_test_split(X_, y_, test_size=0.2, random_state=random_state)
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)
        mse = mean_squared_error(y_val, y_val)
        r2 = r2_score(y_val,y_pred)
        mae = mean_absolute_error(y_val, y_val)
        inner_result = [r2,mae,mse]
        return inner_result
    def inner_grid_search_cross_val(df_):
        X_ = df_.drop('Price', axis=1)
        y_ = df_['Price']
        model = LinearRegression()
        mse = -cross_val_score(model,X_,y_,cv=5,scoring='neg_mean_squared_error').mean()
        r2 = cross_val_score(model,X_,y_,cv=5,scoring='r2').mean()
        mae = -cross_val_score(model,X_,y_,cv=5,scoring='neg_mean_absolute_error').mean()
        inner_result = [r2,mae,mse]
        return inner_result
    for col in cols:
        df = train_val_df.copy()
        df[col+' S'] = df[col] ** pow
        if (cross_val == True):
            result[col] = inner_grid_search_cross_val(df)
        else:
            result[col] = inner_grid_search(df)
    result['Base'] = inner_grid_search_cross_val(train_val_df.copy())
    return result

In [None]:
def grid_search_product(cross_val : bool = True,random_sate : int = 42):
    cols = raw_df.columns
    result = {}
    def inner_grid_search(df_):
        X_ = df_.drop('Price', axis=1)
        y_ = df_['Price']
        X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=random_sate)
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test,y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        inner_result = [r2,mae,mse]
        return inner_result
    def inner_grid_search_cross_val(df_):
        X_ = df_.drop('Price', axis=1)
        y_ = df_['Price']
        model = LinearRegression()
        mse = -cross_val_score(model,X_,y_,cv=5,scoring='neg_mean_squared_error').mean()
        r2 = cross_val_score(model,X_,y_,cv=5,scoring='r2').mean()
        mae = -cross_val_score(model,X_,y_,cv=5,scoring='neg_mean_absolute_error').mean()
        inner_result = [r2,mae,mse]
        return inner_result
    for i in range(len(cols)):
        for j in range(i+1,len(cols)):
            df = raw_df.copy()
            df['Product Column'] = df[cols[i]] * df[cols[j]]
            if (cross_val == True):
                result[cols[i] + '|' + cols[j]] = inner_grid_search_cross_val(df)
            else:
                result[cols[i] + '|' + cols[j]] = inner_grid_search(df)
    result['Base'] = inner_grid_search_cross_val(raw_df.copy())
    return result

In [13]:
def format_result(result_ : dict):
    base = result_['Base']
    for key in result_:
        if (key != 'Base'):
            for i in range(len(result_[key])):
                result_[key][i] -= base[i]
def sorted_print(result_ : dict):
    local_result = result_.copy()
    while(len(local_result) > 0):
        max = -1
        max_key = None
        for key in local_result:
            if (local_result[key][0] > max):
                max = local_result[key][0]
                max_key = key
        print(local_result[max_key],max_key)
        local_result.pop(max_key)


In [23]:
result = grid_search_polynomial(cross_val=False)
format_result(result)
sorted_print(result)
# for col in result:
#     print(result[col],col)

[0.7689934027938298, 7.094943824387819, 114.96112662181508] Base
[0.19156261376523276, -7.094943824387819, -114.96112662181508] Price
[0.0592992363377306, -7.094943824387819, -114.96112662181508] CPU Lithography
[0.059275658353095895, -7.094943824387819, -114.96112662181508] RAM
[0.05849049103012838, -7.094943824387819, -114.96112662181508] Memory Type
[0.05823958152399755, -7.094943824387819, -114.96112662181508] CPU Max Clock
[0.05823877812134237, -7.094943824387819, -114.96112662181508] GPU NVIDIA
[0.05823877812134148, -7.094943824387819, -114.96112662181508] Display Type
[0.05823877812134115, -7.094943824387819, -114.96112662181508] GPU Onboard
[0.05823877812134115, -7.094943824387819, -114.96112662181508] GPU AMD
[0.05823877812134104, -7.094943824387819, -114.96112662181508] Max DDR Support
[0.05823877812134093, -7.094943824387819, -114.96112662181508] CPU Intel
[0.05823877812134093, -7.094943824387819, -114.96112662181508] OS
[0.05819637861091187, -7.094943824387819, -114.9611266

In [None]:
raw_df.info()

In [None]:
correlation_matrix = raw_df.corr()
# print(correlation_matrix)

In [None]:
# Plot the correlation matrix
plt.figure(figsize=(16, 16),dpi=320)
plt.matshow(correlation_matrix, cmap='coolwarm', fignum=1)
plt.colorbar()

# Set axis labels
plt.xticks(range(len(correlation_matrix.columns)), correlation_matrix.columns, rotation=90)
plt.yticks(range(len(correlation_matrix.columns)), correlation_matrix.columns)
for i in range(len(correlation_matrix.columns)):
    for j in range(len(correlation_matrix.columns)):
        plt.text(i, j, f"{correlation_matrix.iloc[i, j]:.2f}", ha='center', va='center', color='black')
# Show plot
plt.show()

In [34]:
df = train_val_df.copy()
# df['Display Resolution'] = raw_df['Display Height'] * raw_df['Display Width']
# df['Display Ratio'] = raw_df['Display Size'] / raw_df['Display Resolution']
# df = df.drop(columns=['Display Height','Display Width'])
df['GPU VRAM S'] = df['GPU VRAM']**2
# df['Warrant S'] = df['Warrant']**2

# df['Display Width 2'] = df['Display Width']**2
# df['Display Height 2'] = df['Display Height']**2
# df['Display Size'] = 1/df['Display Size']
# df['Product 1'] = df['RAM'] * df['Display Size']
# df['Product 2'] = df['RAM'] * df['GPU VRAM']
# df['Product 3'] = df['RAM'] * df['CPU Thread']
# df['Product 4'] = df['RAM'] * df['CPU Core']
# df['Product 5'] = df['RAM'] * df['CPU Cache']
# df['Product 6'] = df['RAM'] * df['CPU Base Clock']
# df['Product 7'] = df['RAM'] * df['CPU Max Clock']
# df['Product 8'] = df['RAM'] * df['Base Power']
# df['Product 9'] = df['RAM'] * df['Max Power']
# df['Storage Ratio'] = df['Storage'] / df['RAM']
# df['CPU Effecient'] = df['Base Power'] / df['CPU Core']
# df['CPU Max Effecient'] = df['Base Power'] / df['CPU Core']
# df['Display Frequency'] = df['Display Frequency']**2
# df = df.drop(columns=['Display Size'])
test_df = loaded_df[split_index:]
test_df['GPU VRAM S'] = test_df['GPU VRAM']**2
# test_df['Warrant S'] = test_df['Warrant']**2
# test_df['Product 1'] = test_df['RAM'] * test_df['Display Size']
# test_df['Product 2'] = test_df['RAM'] * test_df['GPU VRAM']
X_test = test_df.drop('Price', axis=1)
y_test = test_df['Price']

In [39]:
df = train_val_df.copy()
test_df = loaded_df[split_index:]
X_test = test_df.drop('Price', axis=1)
y_test = test_df['Price']

In [40]:
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2,random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

r2 = r2_score(y_test,y_pred)
print(f'R2 Score: {r2}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error: {mae}')

Mean squared error: 125.97907982012181
R2 Score: 0.8204951455796452
Mean absolute error: 7.704596090127005


In [None]:
model = LinearRegression()
mse = -cross_val_score(model,X,y,cv=5,scoring='neg_mean_squared_error').mean()
r2 = cross_val_score(model,X,y,cv=5,scoring='r2').mean()
mae = -cross_val_score(model,X,y,cv=5,scoring='neg_mean_absolute_error').mean()
print(f'Mean squared error: {mse}')
print(f'R2 Score: {r2}')
print(f'Mean absolute error: {mae}')

# Model Selection, Training, Evaluation

In [None]:
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [None]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
print(f'Mean squared error: {mse}')

r2 = r2_score(y_test,y_pred)
print(f'R2 Score: {r2}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean absolute error: {mae}')

In [None]:
def grid_search_single_axis(from_ : float,to_ : float,step : float,model,set_func : callable,
                            X_train_,X_test_,y_train_,y_test_,result = None):
    if (result == None):
        result = []
    set_func(model,from_)
    if (from_ > to_):
        return result
    model.fit(X_train_,y_train_)
    y_pred_ = model.predict(X_test_)
    r2_ =  r2_score(y_test_,y_pred_)
    result.append([from_,r2_])
    from_ += step
    return grid_search_single_axis(from_,to_,step,model,set_func,X_train_,X_test_,y_train_,y_test_,result)
    

In [None]:
para = 1
lasso = Ridge(alpha=para)
def set_function(model,alpha):
    model.alpha = alpha
result = grid_search_single_axis(0,1,0.01,lasso,set_function,X_train,X_test,y_train,y_test)
for step in result:
    print('R2 : {} , Alpha : {}'.format(step[1],step[0]))

In [None]:
# scaler = StandardScaler()
# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)
para = 1
# Lasso Regression (L1 regularization)
lasso = Lasso(alpha=para)  # Alpha is the regularization parameter (lambda)
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)
lasso_mse = mean_squared_error(y_test, lasso_pred)
lasso_r2 = r2_score(y_test,lasso_pred)
print("Lasso MSE:", lasso_mse)
print("Lasso R2:", lasso_r2)

# Ridge Regression (L2 regularization)
ridge = Ridge(alpha=para)  # Alpha is the regularization parameter (lambda)
ridge.fit(X_train, y_train)
ridge_pred = ridge.predict(X_test)
ridge_mse = mean_squared_error(y_test, ridge_pred)
ridge_r2 = r2_score(y_test,ridge_pred)
print("Ridge MSE:", ridge_mse)
print("Ridge R2:", ridge_r2)

In [None]:
forest = RandomForestRegressor()
forest.fit(X_train, y_train)

forest.score(X_test, y_test)

In [None]:
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(random_state=45, max_samples=0.22),
    'Support Vector Machine': SVR(),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'Ridge Regression': Ridge(),
    'Lasso Regression': Lasso(),
    'ElasticNet Regression': ElasticNet(),
    'XGBoost': xgb.XGBRegressor(),
    'LightGBM': lgb.LGBMRegressor(),
    'CatBoost': cb.CatBoostRegressor(silent=True)
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(X_train, y_train)
    
    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Calculate mean absolute error scores
    train_mae = mean_absolute_error(y_train, y_train_pred)
    test_mae = mean_absolute_error(y_test, y_test_pred)
    r2 = r2_score(y_test,y_test_pred)
    
    # Print the name of the model and its mean absolute error scores
    print(f'{name}: Train MAE = {train_mae}') 
    print(f'{name}: Test MAE = {test_mae}')
    print(f'{name}: R2 = {r2}')
    print('***********************')