# **Analysing the factors of Laptop pricing**

## **Import Packages**

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import mean_absolute_error

## **Load Data**

In [39]:
# Read the laptop listing CSV; the path is relative to the notebook's location.
csv_path = "../data/laptop_price - dataset.csv"
df = pd.read_csv(csv_path)

# Show the last rows as a quick check that the file parsed into the expected columns.
df.tail()

Unnamed: 0,Company,Product,TypeName,Inches,ScreenResolution,CPU_Company,CPU_Type,CPU_Frequency (GHz),RAM (GB),Memory,GPU_Company,GPU_Type,OpSys,Weight (kg),Price (Euro)
1270,Lenovo,Yoga 500-14ISK,2 in 1 Convertible,14.0,IPS Panel Full HD / Touchscreen 1920x1080,Intel,Core i7 6500U,2.5,4,128GB SSD,Intel,HD Graphics 520,Windows 10,1.8,638.0
1271,Lenovo,Yoga 900-13ISK,2 in 1 Convertible,13.3,IPS Panel Quad HD+ / Touchscreen 3200x1800,Intel,Core i7 6500U,2.5,16,512GB SSD,Intel,HD Graphics 520,Windows 10,1.3,1499.0
1272,Lenovo,IdeaPad 100S-14IBR,Notebook,14.0,1366x768,Intel,Celeron Dual Core N3050,1.6,2,64GB Flash Storage,Intel,HD Graphics,Windows 10,1.5,229.0
1273,HP,15-AC110nv (i7-6500U/6GB/1TB/Radeon,Notebook,15.6,1366x768,Intel,Core i7 6500U,2.5,6,1TB HDD,AMD,Radeon R5 M330,Windows 10,2.19,764.0
1274,Asus,X553SA-XX031T (N3050/4GB/500GB/W10),Notebook,15.6,1366x768,Intel,Celeron Dual Core N3050,1.6,4,500GB HDD,Intel,HD Graphics,Windows 10,2.2,369.0


## **Data Preparation**

### **Checking Null Values**

In [None]:
# Count missing entries in each column before any feature engineering.
missing_per_column = df.isna().sum()
print(missing_per_column)

### **Simplify 'Memory' to numeric**

In [40]:
def _memory_to_gb(memory: pd.Series) -> pd.Series:
    """Convert free-text storage descriptions to total capacity in gigabytes.

    Every ``<number>GB`` / ``<number>TB`` token in a string is parsed (decimal
    values included), terabytes are scaled by 1000, and the capacities of all
    size tokens found in one entry are added together.

    Parameters
    ----------
    memory : pandas.Series of str
        Values such as ``"128GB SSD"``, ``"1TB HDD"`` or ``"64GB Flash Storage"``.

    Returns
    -------
    pandas.Series of float
        Capacity in GB aligned to ``memory.index``; NaN where no size token
        was found.
    """
    # Parse number and unit together instead of textually appending "000" for
    # TB: that trick turns a decimal value like "1.0TB" into "1.0000", which
    # then reads as 1 GB.
    tokens = memory.str.extractall(r'(?P<size>\d+(?:\.\d+)?)\s*(?P<unit>TB|GB)')
    gb_per_unit = {'GB': 1.0, 'TB': 1000.0}
    capacity_gb = tokens['size'].astype(float) * tokens['unit'].map(gb_per_unit)

    # extractall yields one row per match, indexed by (original row, match no.);
    # summing over the first level gives the total per laptop. Rows without any
    # match are absent from that result, so reindex restores them as NaN.
    return capacity_gb.groupby(level=0).sum().reindex(memory.index)


# Guard so the cell can be re-run: once converted the column is numeric and
# has no .str accessor to parse again.
if not pd.api.types.is_numeric_dtype(df['Memory']):
    df['Memory'] = _memory_to_gb(df['Memory'])

### **Convert screen resolution into pixels**

In [41]:
# The resolution text ends in a "<width>x<height>" token (e.g. "1920x1080");
# pull each side out separately and store it as a float column.
resolution_text = df['ScreenResolution']
width_px = resolution_text.str.extract(r'(\d+)x\d+', expand=False)
height_px = resolution_text.str.extract(r'\d+x(\d+)', expand=False)

df['Resolution_Width'] = width_px.astype(float)
df['Resolution_Height'] = height_px.astype(float)

### **Drop unused columns**

In [42]:
# Free-text / high-detail columns that are not fed to the model.
# ScreenResolution has already been turned into the two numeric
# Resolution_* columns at this point.
unused_columns = ['Product', 'ScreenResolution', 'GPU_Type', 'CPU_Type']
df.drop(columns=unused_columns, inplace=True)

### **Encode categorical data**

In [43]:
# Replace each categorical column with integer codes.
#
# A separate fitted encoder is kept per column in `label_encoders`, so codes
# can be mapped back to their labels later with
# `label_encoders[col].inverse_transform(...)`. Re-fitting one shared encoder
# would keep only the mapping of the last column processed.
#
# NOTE(review): LabelEncoder assigns codes by sorted label order, so the linear
# model will treat these nominal categories as if they had a numeric ordering;
# consider one-hot encoding if that is not intended.
categorical_cols = ['Company', 'TypeName', 'CPU_Company', 'GPU_Company', 'OpSys']
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
# `le` is left bound to the encoder of the last column ('OpSys').

### **Data Separation As X and Y**

In [44]:
# The price is the regression target; every remaining column is a feature.
target_col = "Price (Euro)"
y = df[target_col]
x = df.drop(columns=[target_col])

### **Data splitting**

In [45]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

## **Model Building**

### **Linear Regression**

In [46]:
# Fit ordinary least squares on the training rows (fit() returns the fitted
# estimator), then predict prices for the held-out rows.
lr = LinearRegression().fit(x_train, y_train)
y_pred_lr = lr.predict(x_test)

### **Linear Regression metrics**

In [None]:
# Evaluate the held-out predictions: R^2, root-mean-squared error and
# mean absolute error.
r2_lr = r2_score(y_test, y_pred_lr)
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred_lr))
mae_lr = mean_absolute_error(y_test, y_pred_lr)

print("Linear Regression:")
for label, value in (("R^2 Score:", r2_lr), ("RMSE:", rmse_lr), ("MAE:", mae_lr)):
    print(label, value)

## **Plotting**

In [None]:
# Scatter of predicted against actual test prices.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_lr, color='orange', label='Predicted Points', ax=ax)

# Reference diagonal: a point on this line was predicted exactly.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, color='red', linestyle='--', label='Perfect Prediction')

ax.set_title("Actual vs Predicted Laptop Prices")
ax.set_xlabel("Actual Price (EUR)")
ax.set_ylabel("Predicted Price (EUR)")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Residual = actual price minus predicted price for each test row.
residuals = y_test - y_pred_lr

# Residuals plotted against the prediction, with a zero line for reference.
fig, ax = plt.subplots()
ax.scatter(y_pred_lr, residuals, alpha=0.5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set_title("Residuals vs Predicted Price")
ax.set_xlabel("Predicted Price (EUR)")
ax.set_ylabel("Residuals")
ax.grid(True)
plt.show()
