In [5]:
import os

# Check where Jupyter thinks you are
print(os.getcwd())


G:\Projects\laptop-spec-tracker\scripts


In [7]:
import os
os.listdir("../data")

['.ipynb_checkpoints',
 'smartprix_laptops.csv',
 'smartprix_laptops_20250617-102604.csv']

In [8]:
import pandas as pd

df = pd.read_csv("../data/smartprix_laptops_20250617-102604.csv")
df.head()

Unnamed: 0,Name,Specs,Price
0,Huawei MateBook Fold Laptop (32GB/ 2TB SSD/ Ha...,"32 GB RAM\n2 TB SSD\n18 inches, 3296 x 2472 pi...","₹3,19,930"
1,Lenovo V15 G4 83CR000VIN Laptop (AMD Ryzen 7 7...,"7th Gen AMD Ryzen 7 7730U\nOcta Core, 16 Threa...","₹33,999"
2,Dell Latitude 13 7390 Laptop (8th Gen Ci7/ 8GB...,"8th Gen Intel Core i7 8650u\nQuad Core, 8 Thre...","₹21,995"
3,HP Victus 15-fa1319TX Gaming Laptop (13th Gen ...,13th Gen Intel Core i5 13420H\nOcta Core (4P +...,"₹68,841"
4,Lenovo LOQ 15IAX9 83LK0068IN Gaming Laptop (12...,12th Gen Intel Core i7 12650HX\n14 Cores (6P +...,"₹68,990"


In [9]:
import unicodedata

# Clean specs column
def clean_specs(text):
    if pd.isna(text):
        return ""
    # Normalize any unicode inconsistencies
    text = unicodedata.normalize("NFKC", text)
    # Replace thin spaces (U+2009) with regular space
    text = text.replace("\u2009", " ")
    # Replace line breaks with commas for easier parsing
    text = text.replace("\n", ", ")
    # Remove extra spaces
    return ' '.join(text.split())

# Create new column
df["CleanSpecs"] = df["Specs"].apply(clean_specs)

# Preview changes
df[["Specs", "CleanSpecs"]].head()

Unnamed: 0,Specs,CleanSpecs
0,"32 GB RAM\n2 TB SSD\n18 inches, 3296 x 2472 pi...","32 GB RAM, 2 TB SSD, 18 inches, 3296 x 2472 pi..."
1,"7th Gen AMD Ryzen 7 7730U\nOcta Core, 16 Threa...","7th Gen AMD Ryzen 7 7730U, Octa Core, 16 Threa..."
2,"8th Gen Intel Core i7 8650u\nQuad Core, 8 Thre...","8th Gen Intel Core i7 8650u, Quad Core, 8 Thre..."
3,13th Gen Intel Core i5 13420H\nOcta Core (4P +...,"13th Gen Intel Core i5 13420H, Octa Core (4P +..."
4,12th Gen Intel Core i7 12650HX\n14 Cores (6P +...,"12th Gen Intel Core i7 12650HX, 14 Cores (6P +..."


In [17]:
import re

def extract_ram(text):
    match = re.search(r"(\d+\s*GB)\s+RAM", text, flags=re.IGNORECASE)
    return match.group(1) if match else None

def extract_ssd(text):
    match = re.search(r"(\d+(?:\s*TB|\s*GB))\s+SSD", text, flags=re.IGNORECASE)
    return match.group(1) if match else None

df["RAM"] = df["CleanSpecs"].apply(extract_ram)
df["SSD"] = df["CleanSpecs"].apply(extract_ssd)

df[["CleanSpecs", "RAM", "SSD"]].head(10)

Unnamed: 0,CleanSpecs,RAM,SSD
0,"32 GB RAM, 2 TB SSD, 18 inches, 3296 x 2472 pi...",32 GB,2 TB
1,"7th Gen AMD Ryzen 7 7730U, Octa Core, 16 Threa...",,512 GB
2,"8th Gen Intel Core i7 8650u, Quad Core, 8 Thre...",,256 GB
3,"13th Gen Intel Core i5 13420H, Octa Core (4P +...",,512 GB
4,"12th Gen Intel Core i7 12650HX, 14 Cores (6P +...",,512 GB
5,"13th Gen Intel Core i5 1334U, 10 Cores (2P + 8...",,512 GB
6,"Apple M4, 10 Cores (4P + 6E), 16 GB RAM, 256 G...",16 GB,256 GB
7,"7th Gen AMD Ryzen 5 7520U, Quad Core, 8 Thread...",,512 GB
8,"12th Gen Intel Core i5 12500H, 12 Cores (4P + ...",,512 GB
9,"12th Gen Intel Core i5 12500H, 12 Cores (4P + ...",,512 GB


In [19]:
# Show rows where RAM is not None
df[df["RAM"].notna()][["CleanSpecs", "RAM", "SSD"]].head(25)

Unnamed: 0,CleanSpecs,RAM,SSD
0,"32 GB RAM, 2 TB SSD, 18 inches, 3296 x 2472 pi...",32 GB,2 TB
6,"Apple M4, 10 Cores (4P + 6E), 16 GB RAM, 256 G...",16 GB,256 GB
61,"14th Gen Intel Core i7 14650HX, 16 Cores (8P +...",16 GB,1 TB
62,"7th Gen AMD Ryzen 5 7530U, Hexa Core, 12 Threa...",16 GB,512 GB
69,"5th Gen AMD Ryzen 7 5825U, Octa Core, 16 Threa...",16 GB,512 GB
71,"Apple M4, 10 Cores (4P + 6E), 16 GB RAM, 512 G...",16 GB,512 GB
77,"13th Gen Intel Core i3 1315U, Hexa Core (2P + ...",8 GB,256 GB
81,"13th Gen Intel Core i3 1315U, Hexa Core (2P + ...",8 GB,512 GB
102,"13th Gen Intel Core i3, 16 GB RAM, 512 GB SSD,...",16 GB,512 GB
106,"Apple M4, 10 Cores (4P + 6E), 24 GB RAM, 512 G...",24 GB,512 GB


In [20]:
def extract_ram_size(text):
    match = re.search(r"(\d+\s*GB)\s+RAM", text, flags=re.IGNORECASE)
    return match.group(1).strip() if match else None

def extract_ram_type(text):
    match = re.search(r"\b(LP?DDR\d(?:X)?)\b", text, flags=re.IGNORECASE)
    return match.group(1).upper() if match else None

# Apply to CleanSpecs
df["RAM_Size"] = df["CleanSpecs"].apply(extract_ram_size)
df["RAM_Type"] = df["CleanSpecs"].apply(extract_ram_type)

# Check top RAM type frequencies
df["RAM_Type"].value_counts().head(10)

RAM_Type
LPDDR5X    178
LPDDR5      91
LPDDR4X     39
LPDDR4       7
Name: count, dtype: int64