In [135]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import  LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from xgboost import XGBRegressor

In [136]:
df = pd.read_csv('cleaned_laptop_dataset.csv')

In [137]:
df.head()

Unnamed: 0,Brand,Price,Processor_Brand,RAM_Expandable,RAM_Size (GB),RAM_TYPE,Processer_Speed(Ghz),Display_type,GPU_Brand,SSD(GB),HDD(GB),Operating_System,Display_Tier,GPU,GPU_Tier,Processor_Tier
0,Lenovo,127011.5,AMD,Yes,8,DDR4,4.0,LCD,AMD,512.0,0,Windows 11,Large,Radeon Graphics,Entry-level,Mid-End
1,Dell,274750.0,Intel,Yes,16,DDR5,3.3,LCD,NVIDIA,512.0,0,Windows 11,Large,RTX 3050,Mid-end,Mid-End
2,Hp,194215.0,Intel,Yes,8,DDR4,4.2,LCD,Intel,512.0,0,Windows 11,Large,Iris Xe,Low-end,Mid-End
3,Infinix,76965.0,Intel,No,8,LPDDR4X,1.7,LCD,Intel,512.0,0,Windows 11,Large,UHD Graphics,Entry-level,Low-End
4,Acer,122465.0,Intel,Yes,16,DDR4,0.0,LCD,Intel,512.0,0,Windows 11,Small,Iris Xe,Low-end,Mid-End


In [138]:
le_dict = {}
for col in ['Brand', 'Processor_Brand', 'RAM_TYPE', 'Display_type', 'Operating_System', 'GPU', 'GPU_Brand']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    le_dict[col] = le  # store encoder if you need to inverse transform later

In [139]:
df['RAM_Expandable'] = df['RAM_Expandable'].map({'Yes': 1, 'No': 0})

In [140]:
processor_tier_map = {
    'Low-End': 1,
    'Mid-End': 2,
    'Upper Mid-End': 3,
    'High-End': 4,
    'Apple M-Series': 5
}
df['Processor_Tier'] = df['Processor_Tier'].map(processor_tier_map)

In [141]:
gpu_tier_map = {
    'Entry-Level': 1,
    'Low-End': 2,
    'Mid-End': 3,
    'High-End': 4
}
df['GPU_Tier'] = df['GPU_Tier'].map(gpu_tier_map)

In [142]:
display_map = {
    'Small': 1,
    'Medium': 2,
    'Large': 3
}

df['Display_Tier'] = df['Display_Tier'].map(display_map)

In [143]:
print(X_train.dtypes[X_train.dtypes == 'object'])

Series([], dtype: object)


In [144]:
X = df.drop(columns=['Price'])
y = df['Price']

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [146]:
xgb_model = XGBRegressor(
    n_estimators=500,     # number of trees
    learning_rate=0.05,   # shrinkage step
    max_depth=8,          # tree depth
    subsample=0.8,        # row sampling
    colsample_bytree=0.8, # feature sampling
    random_state=42,
    objective='reg:squarederror'
)

xgb_model.fit(X_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [147]:
y_pred = xgb_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("RMSE:", rmse)
print("R² Score:", r2)

RMSE: 83555.41331229653
R² Score: 0.770413152298593


In [148]:
y_log = np.log1p(y)   # log(Price + 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_log, test_size=0.2, random_state=42)

In [149]:

xgb = XGBRegressor(
    objective="reg:squarederror",
    random_state=42,
    n_jobs=-1
)

In [150]:
param_dist = {
    "n_estimators": [300, 500, 800, 1200],
    "learning_rate": [0.01, 0.05, 0.1],
    "max_depth": [4, 6, 8, 10],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0],
    "min_child_weight": [1, 3, 5],
    "gamma": [0, 0.1, 0.3]
}
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,              # try 20 random combinations
    scoring="neg_root_mean_squared_error",
    cv=3,                   # 3-fold cross validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

print("Best Parameters:", search.best_params_)


Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best Parameters: {'subsample': 0.6, 'n_estimators': 800, 'min_child_weight': 1, 'max_depth': 4, 'learning_rate': 0.05, 'gamma': 0.1, 'colsample_bytree': 1.0}


In [151]:
best_model = search.best_estimator_

y_pred_log = best_model.predict(X_test)
y_pred = np.expm1(y_pred_log)   # inverse log1p
y_true = np.expm1(y_test)

mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)   # take square root manually
r2 = r2_score(y_true, y_pred)

print("RMSE:", rmse)
print("R² Score:", r2)

RMSE: 79995.38644579858
R² Score: 0.7895602859639991
