In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn import metrics

import tensorflow as tf
import keras
from keras import layers

In [20]:
# read the dataset
df = pd.read_csv("laptop_cleaned2.csv")

# show loaded dataset
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Brand,Price,Rating,Processor_brand,Processor_name,Processor_variant,Processor_gen,Core_per_processor,...,Graphics_name,Graphics_brand,Graphics_GB,Graphics_integreted,Display_size_inches,Horizontal_pixel,Vertical_pixel,ppi,Touch_screen,Operating_system
0,0,HP Victus 15-fb0157AX Gaming Laptop (AMD Ryzen...,HP,50399,4.3,AMD,AMD Ryzen 5,5600H,5.0,6.0,...,AMD Radeon RX 6500M,AMD,4.0,False,15.6,1920,1080,141.21,True,Windows 11 OS
1,1,Lenovo V15 G4 ‎82YU00W7IN Laptop (AMD Ryzen 3 ...,Lenovo,26690,4.45,AMD,AMD Ryzen 3,7320U,7.0,4.0,...,AMD Radeon Graphics,AMD,,False,15.6,1920,1080,141.21,False,Windows 11 OS
2,2,HP 15s-fq5007TU Laptop (12th Gen Core i3/ 8GB/...,HP,37012,4.65,Intel,Intel Core i3,1215U,12.0,6.0,...,Intel UHD Graphics,Intel,,False,15.6,1920,1080,141.21,False,Windows 11 OS
3,3,Samsung Galaxy Book2 Pro 13 Laptop (12th Gen C...,Samsung,69990,4.75,Intel,Intel Core i5,1240P,12.0,12.0,...,Intel Iris Xe Graphics,Intel,,False,13.3,1080,1920,165.63,False,Windows 11 OS
4,4,Tecno Megabook T1 Laptop (11th Gen Core i3/ 8G...,Tecno,23990,4.25,Intel,Intel Core i3,1115G4,11.0,2.0,...,Intel UHD Graphics,Intel,,False,15.6,1920,1080,141.21,False,Windows 11 OS


In [21]:
df.isnull().sum()

Unnamed: 0                  0
Name                        0
Brand                       0
Price                       0
Rating                      0
Processor_brand             0
Processor_name              0
Processor_variant          24
Processor_gen             128
Core_per_processor         10
Total_processor           447
Execution_units           447
Low_Power_Cores             0
Energy_Efficient_Units      0
Threads                    46
RAM_GB                      0
RAM_type                   22
Storage_capacity_GB         0
Storage_type                0
Graphics_name               2
Graphics_brand              2
Graphics_GB               652
Graphics_integreted         2
Display_size_inches         0
Horizontal_pixel            0
Vertical_pixel              0
ppi                         0
Touch_screen                0
Operating_system            0
dtype: int64

In [None]:
df.shape

(1015, 29)

In [33]:
df.duplicated().sum()

np.int64(0)

I will impute missing values for core_per_processor, threads, ram_type, graphics_name, graphics_brand and graphics_integrated and will drop total_processor, Execution_units and graphics_gb due to almost half of the rows being empty for the first two and more than 60 percent missing for the last.

In [None]:
df["Brand"].nunique()

31

In [45]:
df["Brand"].value_counts()

Brand
Lenovo       217
HP           213
Asus         157
Dell         116
MSI           97
Acer          69
Samsung       28
Apple         20
Infinix       20
Chuwi          8
Zebronics      7
Microsoft      7
LG             7
Honor          6
Xiaomi         6
Gigabyte       6
Avita          6
Ultimus        5
Wings          3
Primebook      3
Fujitsu        3
AXL            2
Tecno          1
Jio            1
ASUS           1
iBall          1
Walker         1
Colorful       1
Ninkear        1
Huawei         1
Razer          1
Name: count, dtype: int64

In [46]:
brand_counts = df["Brand"].value_counts()
rare_brands = brand_counts[brand_counts < 10].index  # brands with <10 rows
df["Brand"] = df["Brand"].replace(rare_brands, "Other")

In [48]:
df["Brand"].unique()

array(['HP', 'Lenovo', 'Samsung', 'Other', 'Dell', 'Asus', 'Apple',
       'Acer', 'MSI', 'Infinix'], dtype=object)

In [49]:
df["Brand"].value_counts()

Brand
Lenovo     217
HP         213
Asus       157
Dell       116
MSI         97
Other       78
Acer        69
Samsung     28
Apple       20
Infinix     20
Name: count, dtype: int64

In [51]:
df["Brand"].isna().sum()

np.int64(0)

In [52]:
df["Processor_brand"].nunique()

5

In [53]:
df["Processor_brand"].value_counts()

Processor_brand
Intel        739
AMD          250
Apple         18
MediaTek       7
Microsoft      1
Name: count, dtype: int64

In [23]:
missing_core = df[df["Core_per_processor"].isna()]
missing_core[["Processor_name", "Processor_variant"]].drop_duplicates()



Unnamed: 0,Processor_name,Processor_variant
80,Intel Core i9,14900HX
140,eration Intel Core,L3…
194,Qualcomm X Elite,Elite
253,Intel Core i7,i7
501,Intel Celeron,N4500
531,Intel Core i5,i5
648,HiSilicon Kirin 9006C 9006C,9006C
778,Intel Core i7,12700H
902,Intel Core i3,N305


In [24]:
for name, variant in missing_core[["Processor_name", "Processor_variant"]].drop_duplicates().values:
    subset = df[
        (df["Processor_name"] == name) &
        (df["Processor_variant"] == variant) &
        (df["Core_per_processor"].notna())
    ]
    if not subset.empty:
        print(f"Processor: {name}, Variant: {variant}")
        print(subset["Core_per_processor"].value_counts())
        print("----")


Processor: Intel Core i9, Variant: 14900HX
Core_per_processor
24.0    8
Name: count, dtype: int64
----
Processor: Intel Core i7, Variant: i7
Core_per_processor
4.0    1
Name: count, dtype: int64
----
Processor: Intel Celeron , Variant: N4500
Core_per_processor
2.0    14
Name: count, dtype: int64
----
Processor: Intel Core i7, Variant: 12700H
Core_per_processor
14.0    6
Name: count, dtype: int64
----
Processor: Intel Core i3, Variant: N305
Core_per_processor
8.0    5
Name: count, dtype: int64
----


In [25]:
# Step 1: Build a dictionary of (Processor_name, Processor_variant) → most common core value
core_map = (
    df.groupby(["Processor_name", "Processor_variant"])["Core_per_processor"]
      .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else None)
      .dropna()
      .to_dict()
)

# Step 2: Function to fill missing Core_per_processor using the dictionary
def fill_core(row):
    if pd.isna(row["Core_per_processor"]):
        return core_map.get((row["Processor_name"], row["Processor_variant"]), row["Core_per_processor"])
    return row["Core_per_processor"]

# Step 3: Apply the function to the DataFrame
df["Core_per_processor"] = df.apply(fill_core, axis=1)

# Step 4: Check if there are still missing values
print("Remaining NaN values:", df["Core_per_processor"].isna().sum())


Remaining NaN values: 5


In [26]:
missing_core = df[df["Core_per_processor"].isna()]
missing_core[["Processor_name", "Processor_variant"]].drop_duplicates()

Unnamed: 0,Processor_name,Processor_variant
140,eration Intel Core,L3…
194,Qualcomm X Elite,Elite
531,Intel Core i5,i5
648,HiSilicon Kirin 9006C 9006C,9006C


I am going to drop the remaining 4 rows because I don't know which kind of processor they are and I couldn't find any specs for the HiSilicon Kirin 9006C

In [27]:
# Drop rows where Core_per_processor is still NaN
df = df.dropna(subset=["Core_per_processor"])

# Reset index (optional, to keep things tidy)
df = df.reset_index(drop=True)

print("Remaining NaN values:", df["Core_per_processor"].isna().sum())
print("New dataset shape:", df.shape)


Remaining NaN values: 0
New dataset shape: (1015, 29)


In [39]:
df["Processor_brand"].value_counts()

Processor_brand
Intel        739
AMD          250
Apple         18
MediaTek       7
Microsoft      1
Name: count, dtype: int64

In [42]:
df["Brand"].value_counts()

Brand
Lenovo       217
HP           213
Asus         157
Dell         116
MSI           97
Acer          69
Samsung       28
Apple         20
Infinix       20
Chuwi          8
Zebronics      7
Microsoft      7
LG             7
Honor          6
Xiaomi         6
Gigabyte       6
Avita          6
Ultimus        5
Wings          3
Primebook      3
Fujitsu        3
AXL            2
Tecno          1
Jio            1
ASUS           1
iBall          1
Walker         1
Colorful       1
Ninkear        1
Huawei         1
Razer          1
Name: count, dtype: int64

In [43]:
# How many unique brands?
df["Brand"].nunique()


31

In [38]:
missing_generation = df[df["Processor_gen"].isna()]
missing_generation[["Processor_name","Processor_variant"]]

Unnamed: 0,Processor_name,Processor_variant
13,Apple M1,
17,Apple M3,
23,Apple M2,
24,Intel Core 5,120U
26,Intel Core Ultra,155H
...,...,...
897,Intel Core i3,N305
947,Intel Core i3,N305
977,Intel Celeron,N4020
979,Intel Celeron,N4020
