# Laptop Price Prediction

In this project, I aim to predict laptop prices using Linear Regression by analyzing various features.

### Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### Setting Display Options

In [2]:
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

## Exploratory Data Analysis and Feature Engineering

In [3]:
df = pd.read_csv("laptop_data.csv")

In [4]:
df = df.sort_values(by="Price")
df.head(10)

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price
1215,1215,Acer,Netbook,11.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,32GB SSD,Intel HD Graphics,Chrome OS,1.3kg,9270.72
20,20,Asus,Netbook,11.6,1366x768,Intel Atom x5-Z8350 1.44GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,0.98kg,10224.432
1120,1120,Vero,Notebook,13.3,Full HD 1920x1080,Intel Atom X5-Z8350 1.44GHz,4GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.35kg,10442.88
31,31,Asus,Notebook,14.0,1366x768,AMD E-Series E2-6110 1.5GHz,2GB,32GB Flash Storage,AMD Radeon R2,Windows 10,1.65kg,10602.72
290,290,Acer,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,2GB,16GB SSD,Intel HD Graphics,Chrome OS,2.19kg,10602.72
791,791,Vero,Notebook,14.0,1920x1080,Intel Celeron Dual Core N3350 1.1GHz,4GB,32GB Flash Storage,Intel HD Graphics 500,Windows 10,1.22kg,10810.512
1268,1268,HP,Netbook,11.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.17kg,11135.52
1296,1296,HP,Netbook,11.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.17kg,11135.52
1282,1282,HP,Netbook,11.6,1366x768,Intel Celeron Dual Core N3060 1.6GHz,2GB,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.17kg,11135.52
1102,1102,Acer,Notebook,15.6,1366x768,Intel Celeron Dual Core 3205U 1.5GHz,4GB,16GB SSD,Intel HD Graphics,Chrome OS,2.20kg,11135.52


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1303 entries, 1215 to 196
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        1303 non-null   int64  
 1   Company           1303 non-null   object 
 2   TypeName          1303 non-null   object 
 3   Inches            1303 non-null   float64
 4   ScreenResolution  1303 non-null   object 
 5   Cpu               1303 non-null   object 
 6   Ram               1303 non-null   object 
 7   Memory            1303 non-null   object 
 8   Gpu               1303 non-null   object 
 9   OpSys             1303 non-null   object 
 10  Weight            1303 non-null   object 
 11  Price             1303 non-null   float64
dtypes: float64(2), int64(1), object(9)
memory usage: 132.3+ KB


In [6]:
df.isnull().sum()

Unnamed: 0          0
Company             0
TypeName            0
Inches              0
ScreenResolution    0
Cpu                 0
Ram                 0
Memory              0
Gpu                 0
OpSys               0
Weight              0
Price               0
dtype: int64

In [7]:
for column in df.columns:
    print(column, df[column].nunique())

Unnamed: 0 1303
Company 19
TypeName 6
Inches 18
ScreenResolution 40
Cpu 118
Ram 9
Memory 39
Gpu 110
OpSys 9
Weight 179
Price 791


### Checking Consistency of Data Units in Columns

In [8]:
print(df["Weight"].str.endswith("kg").all())
print(df["Ram"].str.endswith("GB").all())
print(df["Cpu"].str.endswith("GHz").all())

True
True
True


### Preprocessing Column Data for Numerical Analysis

In [9]:
df["Cpu"] = df["Cpu"].apply(lambda x: float(x.split(" ")[-1][:-3]))
df["Ram"] = df["Ram"].str[:-2].astype(int)
df["Weight"] = df["Weight"].str[:-2].astype(float)

In [10]:
df["Touch Screen"] = df["ScreenResolution"].apply(lambda x: 1 if "Touchscreen" in x else 0)
df["IPS"] = df["ScreenResolution"].apply(lambda x: 1 if "IPS" in x else 0)
df["Full HD"] = df["ScreenResolution"].apply(lambda x: 1 if "Full HD" in x else 0)
df["4K"] = df["ScreenResolution"].apply(lambda x: 1 if "4K" in x else 0)

In [11]:
df["ScreenResolution"].str[-1].str.isdigit().all()
df["ScreenResolution"] = df["ScreenResolution"].apply(lambda x: int((x.split(" ")[-1]).split("x")[0]) * int((x.split(" ")[-1]).split("x")[1]))
df.head()

Unnamed: 0.1,Unnamed: 0,Company,TypeName,Inches,ScreenResolution,Cpu,Ram,Memory,Gpu,OpSys,Weight,Price,Touch Screen,IPS,Full HD,4K
1215,1215,Acer,Netbook,11.6,1049088,1.5,2,32GB SSD,Intel HD Graphics,Chrome OS,1.3,9270.72,0,0,0,0
20,20,Asus,Netbook,11.6,1049088,1.44,2,32GB Flash Storage,Intel HD Graphics 400,Windows 10,0.98,10224.432,0,0,0,0
1120,1120,Vero,Notebook,13.3,2073600,1.44,4,32GB Flash Storage,Intel HD Graphics 400,Windows 10,1.35,10442.88,0,0,1,0
31,31,Asus,Notebook,14.0,1049088,1.5,2,32GB Flash Storage,AMD Radeon R2,Windows 10,1.65,10602.72,0,0,0,0
290,290,Acer,Notebook,15.6,1049088,1.5,2,16GB SSD,Intel HD Graphics,Chrome OS,2.19,10602.72,0,0,0,0


In [12]:
df["Memory1"] = df["Memory"].apply(lambda x: x.split(" ")[x.split(" ").index("SSD") - 1] if "SSD" in x else "0GB")
df["Memory2"] = df["Memory"].apply(lambda x: x.split(" ")[x.split(" ").index("HDD") - 1] if "HDD" in x else "0GB")
df["Memory3"] = df["Memory"].apply(lambda x: x.split(" ")[x.split(" ").index("Flash") - 1] if "Flash Storage" in x else "0GB")

df["Memory1"] = df["Memory1"].apply(lambda x: float(x[:-2]) if "GB" in x else 1024 * float(x[:-2]))
df["Memory2"] = df["Memory2"].apply(lambda x: float(x[:-2]) if "GB" in x else 1024 * float(x[:-2]))
df["Memory3"] = df["Memory3"].apply(lambda x: float(x[:-2]) if "GB" in x else 1024 * float(x[:-2]))


### Dropping Irrelevant Columns and Encoding Categorical Data

In [13]:
df = df.drop(["Company", "Gpu", "Unnamed: 0", "Memory"], axis=1)
df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,Inches,ScreenResolution,Cpu,Ram,Weight,Price,Touch Screen,IPS,Full HD,4K,Memory1,Memory2,Memory3,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
1215,11.6,1049088,1.5,2,1.3,9270.72,0,0,0,0,32.0,0.0,0.0,False,True,False,False,False,True,False,False,False,False,False,False,False
20,11.6,1049088,1.44,2,0.98,10224.432,0,0,0,0,0.0,0.0,32.0,False,True,False,False,False,False,False,False,False,True,False,False,False
1120,13.3,2073600,1.44,4,1.35,10442.88,0,0,1,0,0.0,0.0,32.0,False,False,True,False,False,False,False,False,False,True,False,False,False
31,14.0,1049088,1.5,2,1.65,10602.72,0,0,0,0,0.0,0.0,32.0,False,False,True,False,False,False,False,False,False,True,False,False,False
290,15.6,1049088,1.5,2,2.19,10602.72,0,0,0,0,16.0,0.0,0.0,False,False,True,False,False,True,False,False,False,False,False,False,False


## Model Training and Evaluation
### Preparing Data for Model Training with Feature Scaling

In [14]:
df = df.select_dtypes(include=[np.number, "bool"])
x = df.drop("Price", axis=1)
y = df[["Price"]]
scaler = MinMaxScaler()
scaled_x = scaler.fit_transform(x)
scaled_x = pd.DataFrame(scaled_x)
scaled_x.columns = x.columns
scaled_x.head()

Unnamed: 0,Inches,ScreenResolution,Cpu,Ram,Weight,Touch Screen,IPS,Full HD,4K,Memory1,Memory2,Memory3,TypeName_Gaming,TypeName_Netbook,TypeName_Notebook,TypeName_Ultrabook,TypeName_Workstation,OpSys_Chrome OS,OpSys_Linux,OpSys_Mac OS X,OpSys_No OS,OpSys_Windows 10,OpSys_Windows 10 S,OpSys_Windows 7,OpSys_macOS
0,0.180723,0.0,0.222222,0.0,0.15212,0.0,0.0,0.0,0.0,0.03125,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.180723,0.0,0.2,0.0,0.072319,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.385542,0.141403,0.2,0.032258,0.164589,0.0,0.0,1.0,0.0,0.0,0.0,0.0625,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.46988,0.0,0.222222,0.0,0.239401,0.0,0.0,0.0,0.0,0.0,0.0,0.0625,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.662651,0.0,0.222222,0.0,0.374065,0.0,0.0,0.0,0.0,0.015625,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, train_size=.20, random_state=42)

### Building a Linear Regression Model

In [16]:
lr = LinearRegression()
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

### Testing Linear Regression Model

In [17]:
mse = mean_squared_error(y_test, y_pred) ** .5
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)

19068.505054787427
0.7324156776169715


### Building a Decision Tree Regression Model

In [18]:
dtr = DecisionTreeRegressor()
dtr.fit(x_train, y_train)
y_pred = dtr.predict(x_test)

### Testing Decision Tree Regression Model

In [19]:
mse = mean_squared_error(y_test, y_pred) ** .5
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)

23128.831682390195
0.6063277853362824


### Building a Random Forest Regression Model

In [20]:
rfr = RandomForestRegressor()
rfr.fit(x_train, y_train)
y_pred = rfr.predict(x_test)

  return fit_method(estimator, *args, **kwargs)


### Testing Random Forest Regression Model

In [21]:
mse = mean_squared_error(y_test, y_pred) ** .5
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)

19101.090363319083
0.7315003706537402


### Building an XGB Regression Model

In [22]:
xg = XGBRegressor()
xg.fit(x_train, y_train)
y_pred = xg.predict(x_test)

### Testing XGB Regression Model

In [23]:
mse = mean_squared_error(y_test, y_pred) ** .5
print(mse)
r2 = r2_score(y_test, y_pred)
print(r2)

19577.21598848963
0.7179479598999023


### Optimizing XGB Regression Model

In [24]:
xgb = XGBRegressor(random_state=42, eval_metric='rmse')

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
}

grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                           scoring='neg_mean_squared_error', 
                           cv=5, verbose=2, n_jobs=-1)

grid_search.fit(x_train, y_train)

print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score : {grid_search.best_score_}")

best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(x_test)
print(f"Test MSE: {mean_squared_error(y_test, y_pred) ** .5}")
print(f"Test R2: {r2_score(y_test, y_pred)}")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Score : -408082491.81158936
Test MSE: 17571.346727015873
Test R2: 0.7727847695350647


After optimization, I was unable to surpass the R² score limit of 0.80, but I achieved a value very close to it. The best result obtained after optimization with XGBoost was an R² score of 0.77.
