# Laptop Price Prediction

Dataset: Laptop Price Personal Datasets (Ref: https://www.kaggle.com/datasets/anubhavgoyal10/laptop-prices-dataset)

Features:
> brand: The brand of the laptop (e.g., ASUS, MSI, Dell, HP, Lenovo, etc.) <br>
> model: The specific model or series of the laptop <br>
> processor: The type and specifications of the processor (e.g., Intel i5, Ryzen 7) <br>
> ram: The amount of RAM (in GB) <br>
> storage: The type and capacity of storage (e.g., 512GB SSD, 1TB HDD) <br>
> gpu: The graphics processing unit (GPU) details if available <br>
> os: The operating system (e.g., Windows, MacOS, Linux) <br>
> warranty: The warranty period for the laptop (e.g., 1 year, 2 years) <br>
> display: Indicates whether the laptop has a touchscreen feature (Yes/No) <br>
> weight: The weight category of the laptop (Casual, ThinNlight, Gaming) <br>
> price: The price of the laptop <br>

### Data Loading

In [3]:
# Install the necessary libraries

!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# Load the necessary libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

 # Importing the dataset

In [5]:
# Load data as a pandas dataframe

df = pd.read_csv('https://raw.githubusercontent.com/DilukshanA/Laptop-Price-Predictor/refs/heads/main/laptopPrice.csv')

In [10]:
df.head(12)

Unnamed: 0,brand,processor_brand,processor_name,processor_gnrtn,ram_gb,ram_type,ssd,hdd,os,os_bit,graphic_card_gb,weight,warranty,Touchscreen,msoffice,Price,rating,Number of Ratings,Number of Reviews
0,ASUS,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,34649,2 stars,3,0
1,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,38999,3 stars,65,5
2,Lenovo,Intel,Core i3,10th,4 GB,DDR4,0 GB,1024 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,39999,3 stars,8,1
3,ASUS,Intel,Core i5,10th,8 GB,DDR4,512 GB,0 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,69990,3 stars,0,0
4,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,26990,3 stars,0,0
5,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,64-bit,0 GB,Casual,No warranty,No,No,22990,3 stars,0,0
6,ASUS,Intel,Celeron Dual,Not Available,4 GB,DDR4,0 GB,512 GB,Windows,32-bit,0 GB,Casual,No warranty,No,No,21990,3 stars,31,3
7,ASUS,Intel,Core i5,10th,8 GB,DDR4,0 GB,1024 GB,Windows,32-bit,2 GB,Casual,No warranty,No,No,58799,3 stars,0,0
8,Lenovo,Intel,Core i5,10th,4 GB,DDR4,0 GB,1024 GB,Windows,32-bit,0 GB,Casual,No warranty,No,No,49999,3 stars,0,0
9,acer,AMD,Ryzen 5,10th,4 GB,DDR4,0 GB,512 GB,Windows,32-bit,4 GB,Casual,No warranty,No,No,59990,4 stars,1946,240


In [9]:
# Print the shape of the dataframe

df.shape

(823, 19)

In [None]:
# Print a concise summary of the pandas dataframe

df.info()

### Data Preprocessing and EDA

In [None]:
# Rename specific columns in the DataFrame for better readability

df.rename(columns = {'processor_gnrtn':'Generation', 'ram_gb':'RAM'}, inplace = True)

In [None]:
# Convert the 'Price' column to numeric first, coercing values that cannot be
# parsed to NaN. This has to happen before the numeric columns are selected:
# if 'Price' were still stored as text it would be left out of numeric_df and
# the ['Price'] lookup below would raise a KeyError.

df['Price'] = pd.to_numeric(df['Price'], errors='coerce')

# Select only the numeric columns from the DataFrame

numeric_df = df.select_dtypes(include='number')

# Calculate the correlation of each numeric column with 'Price'

correlation = numeric_df.corr()['Price']

print(correlation)

In [None]:
# Count the number of occurrences of each brand in the 'brand' column

df.brand.value_counts()

In [None]:
# Create a bar plot showing the average price for each brand

sns.barplot(data = df, x = 'brand', y = 'Price', palette = 'magma')

# Rotate the x-axis labels vertically for better visibility

plt.xticks(rotation = 'vertical')

# Display the plot

plt.show()

In [None]:
# Count the number of occurrences of each processor brand in the 'processor_brand' column

df.processor_brand.value_counts()

In [None]:
sns.barplot(data = df, x = 'processor_brand', y = 'Price')
plt.show()

In [None]:
df.processor_name.value_counts()

In [None]:
sns.barplot(data = df, x = 'processor_name', y = 'Price')
plt.xticks(rotation ='vertical')
plt.show()

In [None]:
df.Generation.value_counts()

In [None]:
sns.barplot(data = df, x = 'Generation', y = 'Price', palette = 'husl')
plt.xticks(rotation = 'vertical')
plt.show()

Prices don't vary much with generation, and many of the values are missing ('Not Available'), so I am dropping this column.

In [None]:
df.drop('Generation', axis= 1, inplace = True)

In [None]:
df.columns

In [None]:
df.RAM.value_counts()

In [None]:
sns.barplot(data = df, x = 'RAM', y = 'Price')
plt.show()

In [None]:
# Strip the ' GB' unit and store RAM as an integer.
# Casting to str first keeps this cell re-runnable: once RAM is already an
# integer column, the .str accessor would otherwise raise an AttributeError.
df['RAM'] = df['RAM'].astype(str).str.replace(' GB', '', regex=False).astype(int)

In [None]:
df.ram_type.value_counts()

In [None]:
sns.barplot(data = df, x = 'ram_type', y = 'Price')
plt.xticks(rotation = 'vertical')
plt.show()

In [None]:
df.ssd.value_counts()

In [None]:
sns.barplot(data = df, x = 'ssd', y = 'Price', palette = 'hls')
plt.xticks(rotation = 'vertical')
plt.show()

In [None]:
# Strip the ' GB' unit from ssd (the integer conversion happens in the next cell).
# Casting to str first lets this cell be re-run after ssd has already been
# converted to int, where the .str accessor would otherwise raise.
df['ssd'] = df['ssd'].astype(str).str.replace(' GB', '', regex=False)

In [None]:
df['ssd'] = df.ssd.astype(int)

In [None]:
df.hdd.value_counts()

In [None]:
sns.barplot(data = df, x = 'hdd', y = 'Price')
plt.show()

hdd will be changed to a boolean column with values 0 and 1. A laptop either has hdd or not.

In [None]:
# Collapse hdd to a 0/1 flag: 0 when the laptop has no HDD ('0 GB'), 1 otherwise.
# The integer 0 is also treated as "no HDD" so that re-running this cell on the
# already-converted column keeps the flags instead of turning every row into 1.
df['hdd'] = df['hdd'].apply(lambda x: 0 if x in ('0 GB', 0) else 1)

In [None]:
df['hdd'].value_counts()

In [None]:
df.os.value_counts()

In [None]:
sns.barplot(data =df, x = 'os', y = 'Price')
plt.show()

In [None]:
df.os_bit.value_counts()

In [None]:
sns.barplot(data =df, x = 'os_bit', y = 'Price')
plt.show()

Price doesn't vary with os_bit at all,
so I am removing this column.

In [None]:
df.drop('os_bit', axis = 1, inplace = True)

In [None]:
df.graphic_card_gb.value_counts()

In [None]:
sns.barplot(data =df, x = 'graphic_card_gb', y = 'Price', palette = 'viridis')
plt.show()

In [None]:
# Strip the ' GB' unit and store the graphics memory size as an integer.
# Casting to str first keeps this cell re-runnable: once the column is already
# an integer column, the .str accessor would otherwise raise an AttributeError.
df['graphic_card_gb'] = (
    df['graphic_card_gb'].astype(str).str.replace(' GB', '', regex=False).astype(int)
)

In [None]:
df.weight.value_counts()

In [None]:
sns.barplot(data = df, x = 'weight', y = 'Price')
plt.show()

In [None]:
df.warranty.value_counts()

In [None]:
def fetch_warranty(text):
    """Translate a warranty label into a number of years.

    '1 year', '2 years' and '3 years' give 1, 2 and 3 respectively; any other
    value (for example 'No warranty') gives 0.
    """
    years_by_label = (('1 year', 1), ('2 years', 2), ('3 years', 3))
    for label, years in years_by_label:
        if text == label:
            return years
    # Anything that is not one of the known labels counts as no warranty.
    return 0

In [None]:
df['warranty'] = df['warranty'].apply(fetch_warranty)

In [None]:
df.warranty.dtype

In [None]:
df.Touchscreen.value_counts()

In [None]:
sns.barplot(data =df, x = 'Touchscreen', y = 'Price')
plt.show()

In [None]:
df.msoffice.value_counts()

In [None]:
sns.barplot(data =df, x = 'msoffice', y = 'Price')
plt.show()

In [None]:
sns.kdeplot(df['Price'])
plt.show()

In [None]:
df['Price'].skew()

Price is skewed towards the right, so I am applying log transformation

In [None]:
sns.kdeplot(np.log(df['Price']))
plt.show()

In [None]:
np.log(df['Price']).skew()

In [None]:
# Log transformation of the price
df['Price'] = np.log(df['Price'])

In [None]:
sns.boxplot(df['Price'])
plt.show()

In [None]:
df['rating'].value_counts()

In [None]:
df[df['rating'] == '1 star']

In [None]:
def fetch_rating(input):
    """Translate a star-rating label into its integer value.

    '1 star', '2 stars', '3 stars' and '4 stars' give 1 to 4; every other
    value gives 5.
    """
    lower_labels = ('1 star', '2 stars', '3 stars', '4 stars')
    for stars, label in enumerate(lower_labels, start=1):
        if input == label:
            return stars
    # NOTE: the fallback is 5 for anything unmatched, not only for '5 stars'.
    return 5

In [None]:
df['rating'] = df['rating'].apply(fetch_rating)

In [None]:
sns.kdeplot(df['Number of Ratings'])
plt.show()

In [None]:
sns.kdeplot(df['Number of Reviews'])
plt.show()

In [None]:
df.describe()

In [None]:
# Numeric columns used for the correlation matrix below. 'ssd' is included
# because it was converted to an integer column earlier in the notebook.
df_num = df.loc[:, ['RAM', 'ssd', 'hdd', 'graphic_card_gb', 'warranty', 'Price', 'rating', 'Number of Ratings', 'Number of Reviews']]

In [None]:
df_num.corr()

The last 2 columns are highly skewed and more than half of the values are 0. They are highly correlated to each other. Anyway reviews don't affect the prices of laptops in practical applications.

Rating has a negligible correlation with Price. So these 3 columns will be removed.

In [None]:
df.drop(columns = ['rating', 'Number of Reviews', 'Number of Ratings'], inplace = True)

In [None]:
df

In [None]:
# Target: the (log-transformed) Price column; features: every other column.
y = df['Price']
X = df.drop(columns=['Price'])

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.26, random_state = 4)

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# One-hot encode the categorical columns and pass the remaining (numeric)
# columns through unchanged. The columns are selected by name rather than by
# position so the encoder cannot silently land on the wrong features if the
# column order of X ever changes.
categorical_cols = ['brand', 'processor_brand', 'processor_name', 'ram_type',
                    'os', 'weight', 'Touchscreen', 'msoffice']

trans = ColumnTransformer(transformers = [
    # handle_unknown='ignore': a category that only appears in the test split
    # is encoded as all zeros instead of making transform() raise.
    ('ohe',
     OneHotEncoder(sparse_output = False, drop = 'first', handle_unknown = 'ignore'),
     categorical_cols)
], remainder = 'passthrough')

In [None]:
pipe = Pipeline([
    ('transformation', trans)
])

In [None]:
X_train_transformed = pipe.fit_transform(X_train)
X_test_transformed = pipe.transform(X_test)

In [None]:
# Wrap the encoded arrays in DataFrames labelled with the generated feature
# names. The feature names are looked up once, and the original row index is
# kept so the rows stay aligned with y_train / y_test by label as well as by
# position.
feature_names = pipe.named_steps['transformation'].get_feature_names_out()
X_train = pd.DataFrame(X_train_transformed, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(X_test_transformed, columns=feature_names, index=X_test.index)

In [None]:
X_train

## Model building

In [None]:
def model_acc(model):
    """Fit *model* on the training split and print its score on the test split.

    Relies on the notebook-level X_train, y_train, X_test and y_test. The
    printed number is whatever ``model.score`` returns (for scikit-learn
    regressors that is R^2, not a classification accuracy).
    """
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    # print() joins its arguments with single spaces: "<model> --> <score>"
    print(model, '-->', test_score)

In [None]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
model_acc(lr)

from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
model_acc(rf)

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest: 3 forest sizes x 3 split
# criteria, i.e. 9 candidate configurations.
parameters = {'n_estimators':[10, 50, 100],
              'criterion':['squared_error','absolute_error','poisson']}

# Search the grid using `rf` as the base estimator; all other GridSearchCV
# settings (cross-validation scheme, scoring, ...) are left at their defaults.
grid_obj = GridSearchCV(estimator=rf, param_grid=parameters)

# Run the search on the training split only
grid_fit = grid_obj.fit(X_train, y_train)

# Keep the estimator the search selected and display it as the cell output
best_model = grid_fit.best_estimator_
best_model

In [None]:
best_model.score(X_test, y_test)

In [None]:
import pickle
# Persist the tuned model to 'predictor.pickle' in the working directory.
# Only best_model is written; the ColumnTransformer held in `pipe` is not part
# of this file, so whoever loads it has to supply already-encoded feature rows.
with open('predictor.pickle', 'wb') as file:
    pickle.dump(best_model, file)

In [None]:
X_test.columns

In [None]:
num_columns = X_test.shape[1]
print("Number of columns:", num_columns)

In [None]:
pred_value = best_model.predict([[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]])
pred_value

In [None]:
pred_value = best_model.predict([[1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0]])
pred_value

In [None]:
df.head()

In [None]:
# Predict for a single hand-encoded laptop. The row is wrapped in a DataFrame
# carrying the X_test column names so the model receives the same labelled
# feature layout it was fitted on instead of a bare list.
# NOTE(review): the last five entries are presumably the passthrough columns
# RAM, ssd, hdd, graphic_card_gb and warranty - confirm against X_test.columns.
# hdd was reduced to a 0/1 flag during preprocessing, so it is passed as 1
# here rather than as a size in GB.
sample = pd.DataFrame(
    [[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 32, 256, 1, 2, 3]],
    columns=X_test.columns,
)
pred_value = best_model.predict(sample)

# The model was trained on log(Price), so undo the log to show the price itself
np.exp(pred_value)