In [1]:
#Importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np

In [5]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'}


### Scraping Fridges data

In this process, we are scraping fridge product data from the Kilimall site. The data is then cleaned by handling missing values, removing duplicates, and standardizing the format for consistency. This ensures that the dataset is accurate, structured, and ready for analysis or further processing.


In [6]:
# getting data with product link
fridges_links = []

for x in range(1,17):
    result = requests.get(f"https://www.kilimall.co.ke/search?q=FRIDGE&page={x}&source=search|enterSearch|FRIDGE",headers=headers)
    soup = BeautifulSoup(result.content, "html.parser")
    fridges_list = soup.find_all("div", class_ = "product-item")

    for fridges in fridges_list:
        for link in fridges.find_all("a", href = True):
            fridges_links.append("https://www.kilimall.co.ke" + link["href"])


In [8]:
# Initialize a list to store all fridges data
fridges = []

# Loop through all 16 pages
for x in range(1, 17):  # Pages 1 to 16
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=FRIDGE&page={x}&source=search|enterSearch|FRIDGE', headers=headers)
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract fridge details from divs with the class "info-box".
        fridges_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for fridge_info in fridges_info:
            # Safely extract data, handle cases where tags are missing
            fridge_name = fridge_info.find('p', class_='product-title')
            fridge_price = fridge_info.find('div', class_='product-price')
            fridge_reviews = fridge_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            fridges.append({
                "Name": fridge_name.text.strip() if fridge_name else "N/A",
                "Price": fridge_price.text.strip() if fridge_price else "N/A",
                "Reviews": fridge_reviews.text.strip() if fridge_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")

            

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...


In [9]:
#Save results as a DataFrame
df_fridges = pd.DataFrame(fridges)
df_fridges["Links"] = fridges_links
df_fridges.sample(20)

Unnamed: 0,Name,Price,Reviews,Links
566,Heavy Duty Fridge Stand Washing machine Cooker...,"KSh 3,698",(0),https://www.kilimall.co.ke/listing/1000153670-...
557,"7.5L Car Refrigerator, Hot And Cold Mini Car F...","KSh 5,159",(0),https://www.kilimall.co.ke/listing/1000282976-...
435,Hisense Fridge 91 Liters Double Door+2 YEARS W...,"KSh 26,599",(0),https://www.kilimall.co.ke/listing/1000080955-...
86,Ramtons RF/244 -2 Door Direct Cool Fridge- 213...,"KSh 47,999",(1),https://www.kilimall.co.ke/listing/2563201-ram...
530,Portable Mini Car Fridge Cooler Warmer 7.5L,"KSh 4,499",(0),https://www.kilimall.co.ke/listing/1001017095-...
399,Mika 211 Litres Double Door Fridge,"KSh 56,025",(0),https://www.kilimall.co.ke/listing/1000276509-...
154,"Mika MRDCS92DS,92L - Single Door Mini Fridge","KSh 24,500",(0),https://www.kilimall.co.ke/listing/1001032104-...
164,Ramtons RF/130- 213L 2 Door Direct Cool Fridge...,"KSh 51,300",(0),https://www.kilimall.co.ke/listing/1000422672-...
95,"Mika Fridge, 202L, 2 Door Top Mount Freezer, N...","KSh 59,999",(0),https://www.kilimall.co.ke/listing/1001015266-...
400,NUNIX 138L BCD-138 Fridge Double Door Direct C...,"KSh 33,999",(0),https://www.kilimall.co.ke/listing/1000074739-...


In [5]:
df_fridges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566 entries, 0 to 565
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     566 non-null    object
 1   Price    566 non-null    object
 2   Reviews  566 non-null    object
 3   Links    566 non-null    object
dtypes: object(4)
memory usage: 17.8+ KB


##### Data cleaning

In [13]:
# Function to clean the product name
def clean_name(name):
    
    # Remove words in parentheses or curly brackets if they contain "offer", "offers", "sale", or "sales"
    name = re.sub(r'\(([^)]*?(OFFER|OFFERS|SALE|SALES)[^)]*?)\)', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\{([^}]*?(OFFER|OFFERS|SALE|SALES)[^}]*?)\}', '', name, flags=re.IGNORECASE)

    # Remove variations of "offer" and "sale" (including "offers", "sales")
    name = re.sub(r'\b(\w+)\s+(OFFER|OFFERS|SALE|SALES)\b', '', name, flags=re.IGNORECASE)
    
    # Remove unnecessary marketing phrases
    name = re.sub(r'\b(BLACK FRIDAY|BLACK FRIDAY OFFERS|BEST DEALS|LIMITED|LIMITED TIME|TECH WEEK|OFFER|BEST WHOLESALE PRICE|SPECIAL OFFERS)\b', '', name, flags=re.IGNORECASE)
    
    # Remove all remaining parentheses, curly braces, brackets, and clean extra spaces
    name = re.sub(r'[\(\)\{\}\[\]]', '', name)  # Remove parentheses, braces, and brackets
    name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
    
    # Remove special characters like '!', '+' if they appear as the first word
    name = re.sub(r'^[!+\[\]]+', '', name).strip()  # Strip unwanted characters at the start
    
    # Remove emojis using a regex for unicode emoji ranges
    name = re.sub(r'[^\w\s,.-]', '', name)  # Remove non-alphanumeric characters (including emojis)

    # Final trim to remove leading/trailing spaces
    name = name.strip()
    
    return name

# Apply the cleaning function to the 'Name' column in the DataFrame

df_fridges['Name'] = df_fridges['Name'].apply(clean_name)

In [7]:
#Remove commas and any text from Price column
df_fridges["Price"] = df_fridges['Price'].str.replace(r'[^\d]', '', regex=True)
# Rename the Price column
df_fridges = df_fridges.rename(columns={'Price': 'Price(kshs)'})



#Remove brackets from Reviews column
df_fridges['Reviews'] = df_fridges['Reviews'].str.extract(r'(\d+)')
df_fridges.sample(20)

Unnamed: 0,Name,Price(kshs),Reviews,Links
475,Original Low energy consumption Rebune Fridge ...,51500,0,https://www.kilimall.co.ke/listing/2594830-ori...
130,Volsmart 108L Double Door Fridge With A Larger...,26055,0,https://www.kilimall.co.ke/listing/1000448749-...
211,Roch RFR-190S-I Single Door Fridge - 150 Litresr,35999,1,https://www.kilimall.co.ke/listing/1000196641-...
41,SmartPro 138 Litres Refrigerator SFR-175-DT-I ...,31399,2,https://www.kilimall.co.ke/listing/1000360077-...
40,Ramtons 90 LITRES DOUBLE DOOR DIRECT COOL FRID...,26999,34,https://www.kilimall.co.ke/listing/2541784-ram...
273,"RAMTONS 128 LITERS 2 DOOR DIRECT COOL FRIDGE, ...",37800,0,https://www.kilimall.co.ke/listing/3029960-ram...
112,RESTOCKED Efficient Cooling for Your Kitchen N...,24200,0,https://www.kilimall.co.ke/listing/1001037989-...
93,"Mika Fridge, 202L, 2 Door Top Mount Freezer, N...",59999,0,https://www.kilimall.co.ke/listing/1001015266-...
305,Hisense RS-12DR4SA 92L Single Door Direct Cool...,25999,0,https://www.kilimall.co.ke/listing/2760888-his...
522,1 Mika fridge of 202L No frost FREE 1 JSB Ext...,60000,0,https://www.kilimall.co.ke/listing/1000398910-...


##### Remove duplicates

In [8]:
#Checking how many duplicates
duplicate_count = int(df_fridges.duplicated( keep=False).sum())
print(f"There are {duplicate_count} duplicates")

#Find all duplicates
duplicates = df_fridges[df_fridges.duplicated( keep=False)]
duplicates


There are 0 duplicates


Unnamed: 0,Name,Price(kshs),Reviews,Links


In [9]:
# Remove duplicates, keeping the first occurrence
df_fridges = df_fridges.drop_duplicates()



##### Feature Extraction

In [10]:
# Extract the number of doors
def extract_doors(description):
    # Define the regex pattern to match numbers/keywords before "Door" or "Doors"
    pattern = r'\b(1|one|2|two|3|three|4|four|Single|Double)\b(?:\s*Doors)?'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    # Map matches to corresponding numeric values
    door_mapping = {
        "1": 1,
        "one": 1,
        "single": 1,
        "2": 2,
        "two": 2,
        "double": 2,
        "4": 4,
        "four": 4}
    if match:
        door_type = match.group(1).lower()  # Convert the match to lowercase
        return door_mapping.get(door_type, "Unknown")  # Map to the number of doors
    return "Unknown"  # If no match is found



# Extract capacity in litres
def extract_capacity(description):
    # Define the regex pattern
    pattern = r'(\d+(\.\d+)?)\s*(L|litres|ltrs|lt)'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    if match:
        return float(match.group(1))  # Return the number as float
    return None  # Return None if no match is found




#Extract brand names
brands = ['Volsmart','Hisense','Roch','Nunnix','Smartpro','Nunix','Ecomax','Ramtons','Mika','Von','Haier','Exzel','GLD','Vitron','Smartpro','Bruhm','Premier','Samsung', 'Ailyons', 'LG', 'Solstar', 'Royal','Beko','Syinix','ICECOOL','Rebune','Legacy','FK','Smart pro']
# Function to extract the brand name
def extract_brand(product_name):
    for brand in brands:
        if brand.lower() in product_name.lower():  # Case insensitive match
            return brand
    return 'Unknown'  # Return 'Unknown' if no brand is found





# Apply the extraction functions to the DataFrame
df_fridges["Doors"] = df_fridges["Name"].apply(extract_doors)
df_fridges['Capacity(ltrs)'] = df_fridges['Name'].apply(extract_capacity)
df_fridges['Brand'] = df_fridges['Name'].apply(extract_brand)


# Display the updated DataFrame
df_fridges.sample(20)

Unnamed: 0,Name,Price(kshs),Reviews,Links,Doors,Capacity(ltrs),Brand
24,"MIKA Fridge, 138L, 2 Door Top Mount Freezer, D...",35500,0,https://www.kilimall.co.ke/listing/1000339776-...,2,138.0,Mika
290,VOLSMART 118L Fridge Freezer Energy Saving Dou...,29430,0,https://www.kilimall.co.ke/listing/1000349863-...,2,118.0,Volsmart
145,NUNIX ModelBC-92 Direct Cool Fridge 92L,21000,0,https://www.kilimall.co.ke/listing/1001015971-...,Unknown,92.0,Nunix
298,"Exzel Fridge 515L No Frost ERFF-515SL, Inverte...",101978,0,https://www.kilimall.co.ke/listing/1000348165-...,2,515.0,Exzel
379,Nunix 138L Double Door Fridge Energy Efficient...,30499,0,https://www.kilimall.co.ke/listing/1000257887-...,2,138.0,Nunix
71,Nunix 92L Single Door Fridge Energy Efficient ...,23999,0,https://www.kilimall.co.ke/listing/1001027581-...,1,92.0,Nunix
335,DEAL Hisense Fridge 91 Liters Double Door,24999,0,https://www.kilimall.co.ke/listing/2979451-bla...,2,91.0,Hisense
172,LG Freezer Fridge 287L smart inverter with 1 y...,96699,0,https://www.kilimall.co.ke/listing/3036626-lg-...,1,287.0,LG
103,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, D...",45800,0,https://www.kilimall.co.ke/listing/1000447214-...,2,168.0,Mika
442,Royal - RF-195D Double Door Fridge - 181L,49995,0,https://www.kilimall.co.ke/listing/1000221955-...,2,181.0,Royal


In [11]:
# Standardize brand names (replace "Smart Pro" with "Smartpro")
df_fridges['Brand'] = df_fridges['Brand'].replace({'Smart pro': 'Smartpro','Nunnix': 'Nunix'})
df_fridges['Brand'].value_counts()

Brand
Ramtons     91
Mika        84
Hisense     67
Unknown     58
Von         47
Nunix       46
Volsmart    27
Roch        26
Premier     18
Haier       18
LG          13
Exzel       10
Rebune       8
Ecomax       7
Royal        7
Smartpro     6
Vitron       6
Beko         5
Solstar      5
Syinix       5
GLD          3
Ailyons      3
FK           3
Samsung      1
ICECOOL      1
Legacy       1
Name: count, dtype: int64

In [12]:
# Save to CSV
#df_fridges.to_csv('csv_files/kilimall_clean_fridges.csv', index=True)

### Scraping Laptops data

In [10]:
# Initialize a list to store all laptops' data
laptops = []

# Loop through all 90 pages
for x in range(1, 91):  # Pages 1 to 90
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=laptop&page={x}&source=search|enterSearch|laptop',headers=headers)
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract laptop details from divs with the class "info-box".
        laptops_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for laptop_info in laptops_info:
            # Safely extract data, handle cases where tags are missing
            laptop_name = laptop_info.find('p', class_='product-title')
            laptop_price = laptop_info.find('div', class_='product-price')
            laptop_reviews = laptop_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            laptops.append({
                "Name": laptop_name.text.strip() if laptop_name else "N/A",
                "Price": laptop_price.text.strip() if laptop_price else "N/A",
                "Reviews": laptop_reviews.text.strip() if laptop_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")



Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [24]:
# Set the maximum number of rows and columns to display
pd.set_option('display.max_rows', None)  # Show all rows
pd.set_option('display.max_columns', None)  # Show all columns

In [11]:
#Save results as a DataFrame
df_laptops = pd.DataFrame(laptops)
df_laptops.sample(20)

Unnamed: 0,Name,Price,Reviews
232,REFURBISHED Dell Latitude 7300 Business Laptop...,"KSh 24,999",(0)
2083,(LIMITED SPECIAL OFFER) REFURBISHED LENOVO THI...,"KSh 14,499",(0)
2833,HP EliteBook 830 G5 Intel Core i5 8th Gen 16GB...,"KSh 44,000",(0)
1688,HP ELITEBOOK 820 G3 6TH GEN COREi5 8GB RAM 256...,"KSh 20,999",(0)
1354,Lenovo x260 Core i5 8GB 256GB SSD Laptop,"KSh 25,500",(0)
2038,"Lenovo ThinkPad X260 Intel Core I5, 8GB RAM, 2...","KSh 28,500",(2)
2103,(SPECIAL OFFER) DELL LATITUDE E7470| CORE I5| ...,"KSh 25,499",(1)
1979,"All In One Imac 17,1 Intel Core i5 6th Gen 3.2...","KSh 65,250",(0)
2486,(RBEST OFFER) DELL LATITUDE 5570 INTEL CORE i5...,"KSh 23,000",(0)
1258,(special offer) Refurbished laptop Computer hp...,"KSh 18,599",(0)


#### Data Cleaning and feature extraction

In [29]:
# Extract the screen size (integer or float)
def extract_screen_size(description):
    # Regex to capture floats/integers before "inch", "inches", or `"`
    match = re.search(r"(\d+\.\d+|\d+)(?=\s*(?:''|\"|inch|inches?))", description, re.IGNORECASE)
    # If a match is found, return the captured number
    # Regex to capture floats/integers directly before `"`
    match_quote = re.search(r'(\d+\.?\d*)\s*(?=")', description)
    if match:
        return float(match.group(1))  # Return match before "inch" or "inches"
    elif match_quote:
        return match_quote.group(1)  # Return match with `"`
    else:
        return np.nan  # Return NaN if no match is found
    


# Extract RAM
def extract_ram(name):
    match = re.search(r'(\d+)\s*GB\s*RAM', name, re.IGNORECASE)  # Adjust regex to allow flexible spacing
    return f"{match.group(1)}GB" if match else 'Unknown'  # Return the RAM size with "GB" appended




# extract ROM with or without SSD/HDD
def extract_rom(name):
    # Define the regex pattern to capture storage sizes (with or without space) and optional SSD/HDD
    pattern = r'\b(128\s*GB|256\s*GB|250\s*GB|320\s*GB|500\s*GB|512\s*GB|750\s*GB|1000\s*GB|1\s*TB|2\s*TB)\b(?:\s*(HDD|SSD|BROM|Storage))?'
    # Search for a match
    match = re.search(pattern, name, re.IGNORECASE) 
    if match:
        # Extract the size and optional type
        size = match.group(1).replace(' ', '').upper()  
        storage_type = match.group(2).upper() if match.group(2) else ""  # Get the type if it exists
        return f"{size} {storage_type}".strip()  # Combine size and type, removing extra spaces
    else:
        return 'Unknown'  # Default if no match found


# Extract processor model and append "Intel Core" before it
def extract_processor(name):
    # Define a regex pattern to capture only the processor model (i3, i5, i7, i9)
    pattern = r'\b(i[3-9])\b'  # Match i3, i5, i7, i9 with word boundaries
    match = re.search(pattern, name, re.IGNORECASE)
    if match:
        processor_model = match.group(0).lower()  # Extract the processor model (i3, i5, i7, i9)
        return f"Intel Core {processor_model}"  # Prepend "Intel Core" to the processor model
    else:
        return 'Unknown'  # Return 'Unknown' if no valid processor is found



# Extract brand names
brands = [
    "Lenovo", "HP", "MacBook", "NEC", "Panasonic", "Asus", "Dell", "FUJITSU", "Toshiba", "Infinix", "Microsoft Surface","Chuwi","Sony","Acer","GPD"]
# Create a regex pattern to match the brand names
pattern = r'\b(?:' + '|'.join(re.escape(brand) for brand in brands) + r')\b'
# Find all brand names 
def find_brand(name):
    match = re.search(pattern, name, re.IGNORECASE)
    return next((brand for brand in brands if brand.lower() == match.group(0).lower()), "Unknown") if match else "Unknown"



# Apply the extraction/cleaning functions to the DataFrame
df_laptops['Name'] = df_laptops['Name'].apply(clean_name)
df_laptops['Screen_size'] = df_laptops['Name'].apply(extract_screen_size)
df_laptops['RAM']=df_laptops['Name'].apply(extract_ram)
df_laptops['ROM'] =df_laptops['Name'].apply(extract_rom)
df_laptops['Processor'] = df_laptops['Name'].apply(extract_processor)
df_laptops['Brand'] = df_laptops['Name'].apply(find_brand)

In [17]:
# Define the combined function to clean and extract RAM
def clean_and_extract_ram(row):
    # Step 1: Clean the RAM value if it contains 58GB, 78GB, 116GB, 716GB, or 516GB
    row['RAM'] = re.sub(r'\b(58GB|78GB)\b', '8GB', row['RAM'])  # Replace 58 GB and 78 GB with 8GB
    row['RAM'] = re.sub(r'\b(116GB|716GB|516GB)\b', '16GB', row['RAM'])  # Replace 116gb, 716gb, 516gb with 16gb

    # Step 2: If RAM is unknown, extract the valid RAM value from the 'Name' column
    if row['RAM'] == "Unknown":  # Only process if the RAM is unknown
        # Define the RAM pattern (limited to 4GB, 8GB, 16GB, 32GB, 64GB)
        ram_pattern = r'\b(4|8|16|32|64)\s*GB'
        # Attempt to find the RAM in the 'Name' field
        match = re.findall(ram_pattern, row['Name'])
        if match:
            # Return the first match found (assuming we want just the first match)
            row['RAM'] = match[0] + "GB"  # Append 'GB' to match

    return row['RAM']

# Apply the combined function across the DataFrame
df_laptops['RAM'] = df_laptops.apply(clean_and_extract_ram, axis=1)

In [32]:
#Extract brand name (special cases)
special_cases = {"yoga": "Lenovo", 
                 "elitebook": "HP", 
                 "probook": "HP", 
                 "spectre": "HP",
                 "iMAC":"Macbook",
                 "Mackbook":"Macbook",
                 "Thinkpad":"Lenovo",
                 "Latitude":"Dell"}

def find_brand(name, current_value):
    if current_value != "Unknown":
        return current_value
    return next((brand for keyword, brand in special_cases.items() if keyword.lower() in name.lower()), "Unknown")
# Apply the function only to rows with 'Unknown' in the 'Brand' column
df_laptops['Brand'] = df_laptops.apply(lambda row: find_brand(row['Name'], row['Brand']), axis=1)

In [33]:
df_laptops["Brand"].value_counts()

Brand
HP                   1605
Lenovo                786
Dell                  378
MacBook                53
Unknown                17
Asus                   16
Toshiba                12
Microsoft Surface       9
NEC                     7
FUJITSU                 7
Macbook                 6
Acer                    6
Panasonic               4
 Chuwi                  4
Sony                    2
Infinix                 2
GPD                     1
Name: count, dtype: int64

In [19]:
# using pd.set_option() to widen the output display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

In [20]:
df_laptops[df_laptops['RAM']=="64GB"]

Unnamed: 0,Name,Price,Reviews,Screen_size,RAM,ROM,Processor,Brand
385,"GAMING LAPTOP DELL PRECISION 15 5550 10TH GE INTEL CORE i9 64GB RAM 512GB SSD 4GB DEDICATED GRAPHICS CARD NVIDIA TOUCHSCREEN LAPTOP,15INCHES NOTEBOOK ,REFURBISHED COMPUTERFREE BAGMOUSE","KSh 105,999",(0),15.0,64GB,512GB SSD,Intel Core i9,DELL


In [21]:
df_laptops["ROM"].value_counts()

ROM
256GB SSD        1062
500GB HDD         555
Unknown           504
128GB SSD         210
512GB SSD         188
500GB             137
256GB              86
1TB HDD            28
128GB              23
512GB              21
320GB HDD          19
1TB SSD            16
500GB SSD          15
1TB                 9
750GB HDD           7
500GB STORAGE       6
320GB               5
256GB HDD           5
256GB STORAGE       4
1000GB              2
512GB STORAGE       2
250GB HDD           2
512GB HDD           2
128GB HDD           2
1000GB HDD          2
1TB STORAGE         2
128GB STORAGE       1
Name: count, dtype: int64

In [28]:
df_laptops[df_laptops['Brand']=="Unknown"]

Unnamed: 0,Name,Price,Reviews,Screen_size,RAM,ROM,Processor,Brand
137,"NEC VK25 Refurbished 2-in-1 Laptop Intel Core i5-7200U, 2.7GHz, 8GB RAM, 256GB SSD, Touchscreen, 360 Convertible",16999,0,,8GB,256GB SSD,Intel Core i5,Unknown
149,BRAND NEW 15s Laptop PC Intel Core i5 1235U 12th Generation CPU 8GB DDR4 RAM 256GB SSD Intel Iris Xe 15.6 Inch Full HD Windows 11 Home Natural Silver,68999,0,15.6,8GB,256GB SSD,Intel Core i5,Unknown
220,Spectre x360 14-ef2013dx Intel Core i7 13th Gen 16GB RAM 512GB SSD 13.5 WUXGA Multi-Touch IPS Display Intel Iris Xe GPU Windows 11 High-End Laptop - Nightfall Black,176000,0,,16GB,512GB SSD,Intel Core i7,Unknown
251,"SONY VJPG11 Laptop - Intel Core i5-7200U 2.50GHz 4 CPUs, 8GB RAM, 256GB SSD, Windows 10 - Sleek and Powerful Performance Notebook",16999,0,,8GB,256GB SSD,Intel Core i5,Unknown
292,NEC VKT13 Refurbished laptop Intel Core i5 8th gen 1.30GHz 4 CPUs Boost to 1.60GHz 8GB RAM 128GB ROM SSD 12 inch Display Windows 10 Pro,16999,0,12.0,8GB,128GB,Intel Core i5,Unknown
325,"Refurbished LenovoThinkPad L390 Laptop Intel Core i5-8365U Processor, 8th Generation, 16GB RAM, 256GB SSD, 13.3 Inch Display, Windows 10 FREE BAG",25999,0,13.3,16GB,256GB SSD,Intel Core i5,Unknown
337,"Sony VJPG11C11N Laptop - 8GB RAM, 256GB SSD, Windows 10",22040,0,,8GB,256GB SSD,Unknown,Unknown
357,Infinix Laptop X2 Core i5 Windows 11 USBCUSB3.0HDMISD Card3.5mm Headset 8GB Ram 512GB SSD 14.1 inch,49999,0,14.1,8GB,512GB SSD,Intel Core i5,Unknown
375,"REFURBISHED EliteBook 840 G3 14-inch Laptop, Intel i5 6300U 2.4GHz, 8GB DDR4 RAM, 256GB M.2 SSD Hard Drive, USB Type C, Webcam, Windows 10",25000,0,,8GB,256GB,Intel Core i5,Unknown
499,Infinix Laptop X2 Core i5 Windows 11 USBCUSB3.0HDMISD Card3.5mm Headset 8GB Ram 512GB SSD 14.1 inch,48000,3,14.1,8GB,512GB SSD,Intel Core i5,Unknown


In [22]:
#Remove commas and any text from Price column
df_laptops["Price"] = df_laptops['Price'].str.replace(r'[^\d]', '', regex=True)


#Remove brackets from Reviews column
df_laptops['Reviews'] = df_laptops['Reviews'].str.extract(r'(\d+)')
df_laptops.sample(20)

Unnamed: 0,Name,Price,Reviews,Screen_size,RAM,ROM,Processor,Brand
996,i58GB512GB14Free Mouse HP Elitebook 1040 G3 6th Intel Core i5 8GB 512GB SSD 14 FHD Display Laptop Backlit Windows 11 intel HD Graphics Bluetooth Refurbished Laptop,27999,0,,8GB,512GB SSD,Intel Core i5,HP
2321,REFURBISHED DELL LATITUDE 7390 INTEL CORE i5 8GB RAM MEMORY 512GB SSD 13.5INCHES BLACK COLOR,30000,0,13.5,8GB,512GB SSD,Intel Core i5,DELL
735,i58GB256GB SSDFree MouseRefurbished HP Probook 640 G1 Laptop Intel Core i5 8GB RAM 256GB SSD ROM 14 Windows 11 6 months warranty Notebook Bluetooth Webcam Laptops,21999,0,,8GB,256GB SSD,Intel Core i5,HP
2066,Hp 830 g5 corei5 8gb 256ssd 8th Gen,35000,0,,Unknown,Unknown,Unknown,Hp
97,Refurbished Laptop Lenovo Thinkpad T430 Intel Core i5 3th Gen 4GB320GB12.5 Windows 10 Refurbished Lenovo Laptops Notebook Black 12 inch,13299,2,12.0,4GB,Unknown,Intel Core i5,Lenovo
2221,Dell Latitude E7240 Intel Core i5 2.1GHz 8GB 128GB SSD 12.5 GHz Intel Core i5-4300U 1.9ghz Processor 4GB DDR3 memory. 128GB Solid State Drive 12.5-inch,19999,0,,8GB,128GB SSD,Intel Core i5,Dell
1482,Hp Elite x2 1012 G2 2IN1 Intel core i5 7th GEN 8GB RAM 256 GB SSD BACKLIT KEYBOARD DETACHABLE KEYBOARD TOUCHSCREEN 12.5 INCHES TYPE C CHARGER THUNDERBOLT,39700,1,12.5,8GB,256GB SSD,Intel Core i5,Hp
127,Refurbished Dell Latitude E7440 Intel Core i5 5th Generation 8GB RAM 500 GB HDD Storage 14 Inches HD Display Refurbished Laptops Notebook Computers,16499,0,14.0,8GB,500GB HDD,Intel Core i5,Dell
2307,Dell Latitude E7240 Intel Core i5 2.1GHz 8GB 256GB SSD 12.5 GHz Intel Core i5-4300U 1.9ghz Processor 4GB DDR3 memory. 128GB Solid State Drive 12.5-inch,36000,0,,8GB,256GB SSD,Intel Core i5,Dell
1428,Refurbished HP X360 Chromebook Touchscreen 4GB RAM 32GB Storage Intel Celeron N4020 11.5 Laptops Computers Notebook,11999,3,,4GB,Unknown,Unknown,HP


In [23]:
df_laptops['Processor'].value_counts()

Processor
Intel Core i5    1753
Unknown           696
Intel Core i7     399
Intel Core i3      64
Intel Core i9       3
Name: count, dtype: int64

In [None]:
# Save to CSV
#df_laptops.to_csv('laptops_clean.csv', index=True)