In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

%matplotlib inline
import warnings
# Installs a global "ignore" filter with no category restriction, so every
# warning raised after this line is suppressed.
# NOTE(review): this also hides warnings emitted by the libraries used in later
# cells (for example during model fitting); consider narrowing the filter to
# specific categories — TODO confirm which warnings are actually expected.
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

from sklearn.metrics import r2_score, mean_absolute_error

In [3]:
# Load the prepared dataset from disk and preview the first rows.
csv_path = '/content/df_final.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,company,type_name,os,gpu_company,cpu_model,screen_type,touchscreen,ips_panel,ppi,ssd,hdd,flash_storage,hybrid,ram,weight,cpu_freq,price
0,Apple,Ultrabook,Mac,Intel,Intel Core i5,Standard,0,1,202.372769,128,0,0,0,8,1.37,2.3,11.175773
1,Apple,Ultrabook,Mac,Intel,Intel Core i5,Standard,0,0,127.67794,0,0,128,0,8,1.34,1.8,10.776808
2,HP,Notebook,Other/Linux/No,Intel,Intel Core i5,Full HD,0,0,141.211998,256,0,0,0,8,1.86,2.5,10.329964
3,Apple,Ultrabook,Mac,AMD,Intel Core i7,Standard,0,1,202.372769,512,0,0,0,16,1.83,2.7,11.814481
4,Apple,Ultrabook,Mac,Intel,Intel Core i5,Standard,0,1,202.372769,256,0,0,0,8,1.37,3.1,11.473113


In [4]:
# Separate the regression target from the predictor columns.
target_col = 'price'
y = df[[target_col]].copy()          # kept as a one-column DataFrame
X = df.drop(columns=[target_col])    # everything except the target

# Report the resulting shapes.
print(f"Shape of X: {X.shape}\n")
print(f"Shape of y: {y.shape}")

Shape of X: (1273, 16)

Shape of y: (1273, 1)


In [5]:
# Two-stage split: hold out 20% of the rows, then cut that hold-out in half
# to obtain the validation and test partitions. Both stages use the same seed.
split_seed = 1
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=split_seed
)

# Print the shape of each feature partition, a blank line, then each target partition.
feature_parts = {"X_train": X_train, "X_val": X_val, "X_test": X_test}
target_parts = {"y_train": y_train, "y_val": y_val, "y_test": y_test}

for part_name, part in feature_parts.items():
    print(f"{part_name} shape: {part.shape}")
print()
for part_name, part in target_parts.items():
    print(f"{part_name} shape: {part.shape}")

X_train shape: (1018, 16)
X_val shape: (127, 16)
X_test shape: (128, 16)

y_train shape: (1018, 1)
y_val shape: (127, 1)
y_test shape: (128, 1)


In [6]:
# This cell replaces X_train / X_val / X_test / X_temp with their transformed
# arrays. Running it a second time would fit the preprocessor on data that is
# already encoded, so fail up front instead of clobbering the splits.
if not isinstance(X_train, pd.DataFrame):
    raise RuntimeError(
        "X_train has already been transformed; re-run the train/validation/test "
        "split cell before running the preprocessing cell again."
    )

# Columns are selected by position: the first six are one-hot encoded and the
# remaining ten are min-max scaled.
categorical_idx = [0, 1, 2, 3, 4, 5]
numerical_idx = [6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

# NOTE(review): with drop='first' and handle_unknown='ignore', a category not
# seen during fit is encoded as all zeros, i.e. identically to the dropped
# baseline category — confirm this is acceptable for the models below.
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical',
         OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
         categorical_idx),
        ('numerical', MinMaxScaler(), numerical_idx),
    ],
    remainder='passthrough',
    # Keep plain output names (no "<transformer>__" prefix) so they match the
    # names produced by the individual transformers.
    verbose_feature_names_out=False,
)

# Fit on the training split only; the other splits are transformed with the
# statistics and categories learned from training data.
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)
X_temp = preprocessor.transform(X_temp)

# Per-transformer output names, looked up by transformer name rather than by
# position in the fitted transformer list.
nominal_col = preprocessor.named_transformers_['categorical'].get_feature_names_out()
numerical_col = preprocessor.named_transformers_['numerical'].get_feature_names_out()

# Full output column list straight from the ColumnTransformer, so it stays
# aligned with the transformed arrays even if passthrough columns are present.
cols = list(preprocessor.get_feature_names_out())

# Labelled DataFrame views of the transformed splits.
X_train_df = pd.DataFrame(data=X_train, columns=cols)
X_val_df = pd.DataFrame(data=X_val, columns=cols)
X_test_df = pd.DataFrame(data=X_test, columns=cols)
X_temp_df = pd.DataFrame(data=X_temp, columns=cols)

In [7]:
# Sanity check: show the shape of each transformed split.
for transformed_frame in (X_train_df, X_val_df, X_test_df):
    print(transformed_frame.shape)

(1018, 45)
(127, 45)
(128, 45)


In [22]:
# Define model. The forest is seeded so that repeated runs of the search
# select the same hyperparameters and report the same scores.
rf = RandomForestRegressor(random_state=42)

# Define parameter grid (4 x 4 x 3 = 48 candidates)
param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [6, 8, 10, 12],
    'max_features': [0.25, 0.5, 0.75],
}

# Apply GridSearchCV: exhaustive search scored by R^2 with 5-fold cross-validation
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=1)

# y_train is a one-column DataFrame; flatten it to 1-D, which is the target
# shape the regressor expects, instead of relying on an implicit conversion.
grid_search.fit(X_train, y_train.to_numpy().ravel())

print("Best Score:", grid_search.best_score_)

# Get the best parameters
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Score: 0.8801595987324937
Best parameters: {'max_depth': 12, 'max_features': 0.25, 'n_estimators': 150}


In [23]:
# Score the best random-forest estimator found by the grid search on every split.
best_rf = grid_search.best_estimator_

# Predictions per split (X_temp is the hold-out that was split into validation and test).
y_train_pred = best_rf.predict(X_train)
y_val_pred = best_rf.predict(X_val)
y_test_pred = best_rf.predict(X_test)
y_temp_pred = best_rf.predict(X_temp)

# NOTE(review): test metrics are printed next to the validation metrics here;
# make sure the test split is not used to choose between models.
rf_split_results = [
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
    ("Temp", y_temp, y_temp_pred),
]

# Print R2 and MAE for each split, with a blank line between consecutive splits.
for position, (split_name, y_true, y_hat) in enumerate(rf_split_results):
    separator = "\n" if position else ""
    print(f"{separator}{split_name} R2:", r2_score(y_true, y_hat))
    print(f"{split_name} MAE:", mean_absolute_error(y_true, y_hat))

Train R2: 0.9665867225222144
Train MAE: 0.0890633868888859

Validation R2: 0.872625737691971
Validation MAE: 0.14542655954588443

Test R2: 0.9066146291072595
Test MAE: 0.1422100537123761

Temp R2: 0.8923808230242928
Temp MAE: 0.1438119997549469


In [27]:
# Define model
gb = GradientBoostingRegressor(random_state=42)

# Hyperparameter distributions: each entry is a discrete set of candidate
# values from which the randomized search samples.
param_dist = {
    'learning_rate': np.linspace(0.05, 0.2, 5),
    'n_estimators': [100, 300, 500, 700],
    'max_depth': np.arange(3, 8),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 5),
    'subsample': np.linspace(0.6, 1.0, 5),
    'max_features': ['sqrt', 'log2']
}

# Run RandomizedSearchCV: 50 sampled configurations, 5-fold CV, scored by R^2.
# Both the model and the sampler are seeded for reproducibility.
random_search = RandomizedSearchCV(
    gb, param_distributions=param_dist, n_iter=50, cv=5, scoring='r2', n_jobs=-1, random_state=42
)

# y_train is a one-column DataFrame; flatten it to 1-D, which is the target
# shape the regressor expects, instead of relying on an implicit conversion.
random_search.fit(X_train, y_train.to_numpy().ravel())

# Best parameters
print("Best parameters:", random_search.best_params_)

Best parameters: {'subsample': np.float64(0.8), 'n_estimators': 300, 'min_samples_split': np.int64(3), 'min_samples_leaf': np.int64(2), 'max_features': 'sqrt', 'max_depth': np.int64(5), 'learning_rate': np.float64(0.05)}


In [30]:
# Score the best gradient-boosting estimator found by the randomized search on every split.
best_gb = random_search.best_estimator_

# Predictions per split (X_temp is the hold-out that was split into validation and test).
y_train_pred = best_gb.predict(X_train)
y_val_pred = best_gb.predict(X_val)
y_test_pred = best_gb.predict(X_test)
y_temp_pred = best_gb.predict(X_temp)

# NOTE(review): test metrics are printed next to the validation metrics here;
# make sure the test split is not used to choose between models.
gb_split_results = [
    ("Train", y_train, y_train_pred),
    ("Validation", y_val, y_val_pred),
    ("Test", y_test, y_test_pred),
    ("Temp", y_temp, y_temp_pred),
]

# Print R2 and MAE for each split, with a blank line between consecutive splits.
for position, (split_name, y_true, y_hat) in enumerate(gb_split_results):
    separator = "\n" if position else ""
    print(f"{separator}{split_name} R2:", r2_score(y_true, y_hat))
    print(f"{split_name} MAE:", mean_absolute_error(y_true, y_hat))

Train R2: 0.9565372236768408
Train MAE: 0.1006777928629836

Validation R2: 0.8856913861318199
Validation MAE: 0.13212584785399328

Test R2: 0.9153664964616468
Test MAE: 0.13473137887148334

Temp R2: 0.9029501362118799
Temp MAE: 0.13343372224708636
