In [11]:
#Importing libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import numpy as np

### Scraping Fridges data

In this process, we are scraping fridge product data from the Kilimall site. The data is then cleaned by handling missing values, removing duplicates, and standardizing the format for consistency. This ensures that the dataset is accurate, structured, and ready for analysis or further processing.


In [12]:
# Initialize a list to store all fridges data
fridges = []

# Loop through all 16 pages
for x in range(1, 17):  # Pages 1 to 16
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=FRIDGE&page={x}&source=search|enterSearch|FRIDGE')
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract fridge details from divs with the class "info-box".
        fridges_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for fridge_info in fridges_info:
            # Safely extract data, handle cases where tags are missing
            fridge_name = fridge_info.find('p', class_='product-title')
            fridge_price = fridge_info.find('div', class_='product-price')
            fridge_reviews = fridge_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            fridges.append({
                "Name": fridge_name.text.strip() if fridge_name else "N/A",
                "Price": fridge_price.text.strip() if fridge_price else "N/A",
                "Reviews": fridge_reviews.text.strip() if fridge_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")

# Print results
for fridge in fridges:
    print(fridge)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
{'Name': 'Volsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator', 'Price': 'KSh 25,299', 'Reviews': '(155)'}
{'Name': 'Hisense 94  Liters fridge single door Energy Saving  REFO94DR Refrigerator', 'Price': 'KSh 19,299', 'Reviews': '(486)'}
{'Name': '【New Year Sale】Volsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator 138L fridge double door Direct Cool fridge Refrigerator fridges and freezers fridge138ltrs', 'Price': 'KSh 25,555', 'Reviews': '(28)'}
{'Name': 'Volsmart 108L Double Doors VL-BCD108 Fridge Freezer Energy Saving Refrigerator with Lock and Keys', 'Price': 'KSh 20,299', 'Reviews': '(94)'}
{'Name': 'ROCH Single Door Mini Re

In [13]:
#Save results as a DataFrame
df_fridges = pd.DataFrame(fridges)
df_fridges.sample(20)

Unnamed: 0,Name,Price,Reviews
149,"MIKA, MRDCD138DS, 138L, Double Door Top Mount Freezer, Defrost (Direct Cool) Fridge","KSh 37,999",(0)
558,Fridge organizer containers now available Size 20cmx40cm,KSh 999,(0)
388,Mika 202L No Frost 2 Door Fridge- MRNF202LSD,"KSh 67,950",(0)
543,Mika 202Ltr No Frost Double Door Refrigerator Top Mounted Freezer Frost Free 3D Cooling Deodorizer-Smell Buster Dark Matt Energy Saving 2Yr Warranty+Fridge Guard+10M 6 Way JSB Extension,"KSh 65,000",(0)
453,Nunix 90L Single Door Fridge Energy Efficient BC-92,"KSh 20,499",(0)
384,Hisense REF094DR - 94L Single Door Fridge,"KSh 32,900",(0)
564,Big Adjustable Washing Machine / Fridge Mover Stand/Trolley,"KSh 3,200",(0)
121,"Mika Fridge, 138L, 2 Door Top Mount Freezer, Defrost (Direct Cool), Dark Silver","KSh 39,999",(0)
113,"Ramtons, 2 Door Direct Cool Fridge, 128 Litres - Silver (1YR WRTY)","KSh 38,000",(0)
95,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, No Frost (Frost Free), Inox Line Brush","KSh 66,500",(0)


##### Data cleaning

In [14]:
# Function to clean the product name
def clean_name(name):
    
    # Remove words in parentheses or curly brackets if they contain "offer", "offers", "sale", or "sales"
    name = re.sub(r'\(([^)]*?(OFFER|OFFERS|SALE|SALES)[^)]*?)\)', '', name, flags=re.IGNORECASE)
    name = re.sub(r'\{([^}]*?(OFFER|OFFERS|SALE|SALES)[^}]*?)\}', '', name, flags=re.IGNORECASE)

    # Remove variations of "offer" and "sale" (including "offers", "sales")
    name = re.sub(r'\b(\w+)\s+(OFFER|OFFERS|SALE|SALES)\b', '', name, flags=re.IGNORECASE)
    
    # Remove unnecessary marketing phrases
    name = re.sub(r'\b(BLACK FRIDAY|BLACK FRIDAY OFFERS|BEST DEALS|LIMITED|LIMITED TIME|TECH WEEK|OFFER|BEST WHOLESALE PRICE|SPECIAL OFFERS)\b', '', name, flags=re.IGNORECASE)
    
    # Remove all remaining parentheses, curly braces, brackets, and clean extra spaces
    name = re.sub(r'[\(\)\{\}\[\]]', '', name)  # Remove parentheses, braces, and brackets
    name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
    
    # Remove special characters like '!', '+' if they appear as the first word
    name = re.sub(r'^[!+\[\]]+', '', name).strip()  # Strip unwanted characters at the start
    
    # Remove emojis using a regex for unicode emoji ranges
    name = re.sub(r'[^\w\s,.-]', '', name)  # Remove non-alphanumeric characters (including emojis)

    # Final trim to remove leading/trailing spaces
    name = name.strip()
    
    return name

# Apply the cleaning function to the 'Name' column in the DataFrame

df_fridges['Name'] = df_fridges['Name'].apply(clean_name)

In [15]:
#Remove commas and any text from Price column
df_fridges["Price"] = df_fridges['Price'].str.replace(r'[^\d]', '', regex=True)
# Rename the Price column
df_fridges = df_fridges.rename(columns={'Price': 'Price(kshs)'})



#Remove brackets from Reviews column
df_fridges['Reviews'] = df_fridges['Reviews'].str.extract(r'(\d+)')
df_fridges.sample(20)

Unnamed: 0,Name,Price(kshs),Reviews
62,Ailyons Mini Double Door Direct Cool Fridge- 115 Litres,26000,0
148,NUNIX ModelBC-92 Direct Cool Fridge 92L,21000,0
108,Premier 128 Litres Double Door Fridge - Silver 1 YR WRTY,29500,0
73,Von Tropicalised compressor VARM-11DHS Mini Fridge Freezer 90L - Silver 90L Single Door Lock and key Recessed handle,26199,5
441,Ramtons RF319- 430 L Ramtons RF319 430 LITERS SIDE BY SIDE LED NO FROST FRIDGE is a great choice for those who want a spacious and energy-efficient refrigerator.,121500,0
83,"MIKA MRDCD168DS 168L, 2 Door Top Mount Freezer, Defrost Direct Cool Fridge, Dark Silver",36999,0
34,Mika 92L Mini Fridge Single Door-MRDCS92DS -SILVER.,19999,0
98,"Mika Fridge, 202L, 2 Door Top Mount Freezer, No Frost Frost Free, Dark Silver",59999,0
381,NUNIX 138L BCD-138 Fridge Double Door Direct Cool Fridge Refrigerator Fridges,34999,0
153,Haier 333L Double Door No-Frost Fridge HRF-355BS,97200,0


##### Remove duplicates

In [16]:
#Checking how many duplicates
duplicate_count = int(df_fridges.duplicated( keep=False).sum())
print(f"There are {duplicate_count} duplicates")

#Find all duplicates
duplicates = df_fridges[df_fridges.duplicated( keep=False)]
duplicates


There are 41 duplicates


Unnamed: 0,Name,Price(kshs),Reviews
21,"RAMTONS 138 LITERS 2 DOOR DIRECT COOL 3 STAR FRIDGE, SILVER- RF339",36990,1
65,"Mika Fridge, Double Door- 138 Litres",40000,0
84,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, Defrost Direct Cool, Dark Silver",36999,0
89,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, Defrost Direct Cool, Dark Silver",36999,0
95,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, No Frost Frost Free, Inox Line Brush",66500,0
96,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, No Frost Frost Free, Inox Line Brush",66500,0
97,"MIKA Fridge, 197L, 2 Door Top Mount Freezer, No Frost Frost Free, Inox Line Brush",66500,0
112,"MIKA Fridge, 168L, 2 Door Top Mount Freezer, Defrost Direct Cool, Dark Silver",38999,0
122,"AILYONS Double Door RefrigeratorFridge Top Mounted Freezer, 168L",35000,0
123,"AILYONS Double Door RefrigeratorFridge Top Mounted Freezer, 168L",35000,0


In [17]:
# Remove duplicates, keeping the first occurrence
df_fridges = df_fridges.drop_duplicates()



##### Feature Extraction

In [18]:
# Extract the number of doors
def extract_doors(description):
    # Define the regex pattern to match numbers/keywords before "Door" or "Doors"
    pattern = r'\b(1|one|2|two|3|three|4|four|Single|Double)\b(?:\s*Doors)?'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    # Map matches to corresponding numeric values
    door_mapping = {
        "1": 1,
        "one": 1,
        "single": 1,
        "2": 2,
        "two": 2,
        "double": 2,
        "4": 4,
        "four": 4}
    if match:
        door_type = match.group(1).lower()  # Convert the match to lowercase
        return door_mapping.get(door_type, "Unknown")  # Map to the number of doors
    return "Unknown"  # If no match is found



# Extract capacity in litres
def extract_capacity(description):
    # Define the regex pattern
    pattern = r'(\d+(\.\d+)?)\s*(L|litres|ltrs|lt)'
    # Search for the pattern in the description
    match = re.search(pattern, description, re.IGNORECASE)
    if match:
        return float(match.group(1))  # Return the number as float
    return None  # Return None if no match is found




#Extract brand names
brands = ['Volsmart','Hisense','Roch','Nunnix','Smartpro','Nunix','Ecomax','Ramtons','Mika','Von','Haier','Exzel','GLD','Vitron','Smartpro','Bruhm','Premier','Samsung', 'Ailyons', 'LG', 'Solstar', 'Royal','Beko','Syinix','ICECOOL','Rebune','Legacy','FK','Smart pro']
# Function to extract the brand name
def extract_brand(product_name):
    for brand in brands:
        if brand.lower() in product_name.lower():  # Case insensitive match
            return brand
    return 'Unknown'  # Return 'Unknown' if no brand is found





# Apply the extraction functions to the DataFrame
df_fridges["Doors"] = df_fridges["Name"].apply(extract_doors)
df_fridges['Capacity(ltrs)'] = df_fridges['Name'].apply(extract_capacity)
df_fridges['Brand'] = df_fridges['Name'].apply(extract_brand)


# Display the updated DataFrame
df_fridges 

Unnamed: 0,Name,Price(kshs),Reviews,Doors,Capacity(ltrs),Brand
0,Volsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator,25299,155,2,138.0,Volsmart
1,Hisense 94 Liters fridge single door Energy Saving REFO94DR Refrigerator,19299,486,1,94.0,Hisense
2,New Volsmart 138L Fridge Freezer VL-BCD138 Energy Saving Double Doors Refrigerator 138L fridge double door Direct Cool fridge Refrigerator fridges and freezers fridge138ltrs,25555,28,2,138.0,Volsmart
3,Volsmart 108L Double Doors VL-BCD108 Fridge Freezer Energy Saving Refrigerator with Lock and Keys,20299,94,2,108.0,Volsmart
4,ROCH Single Door Mini Refrigerator 90Ltrs Fridge,19349,105,1,90.0,Roch
5,Volsmart 118L Fridge Freezer VL-BCD118 Energy Saving Double Doors Refrigerator,23199,33,2,118.0,Volsmart
6,Nunix 138L BCD-138 fridge double door Direct Cool fridge Refrigerator fridges and freezers,28899,104,2,138.0,Nunix
7,SmartPro SFR-120S-I Mini Fridge 90L Refrigerator Single Door 90Litres High Quality Fridge,17799,8,1,90.0,Smartpro
8,ECOMAX 90L Single Door Refrigerator Home Improvement Energy Saving Fridge with Lock and Keys BCD-90,16999,32,1,90.0,Ecomax
9,Hisense 176 Liters fridge single Door fridges and freezers Refrigerator,35699,17,1,176.0,Hisense


In [19]:
# Standardize brand names (replace "Smart Pro" with "Smartpro")
df_fridges['Brand'] = df_fridges['Brand'].replace({'Smart pro': 'Smartpro','Nunnix': 'Nunix'})
df_fridges['Brand'].value_counts()

Brand
Ramtons     90
Mika        78
Hisense     65
Unknown     48
Von         46
Nunix       46
Volsmart    28
Roch        27
Premier     22
Haier       18
LG          13
Exzel       10
Ecomax       7
Royal        7
Smartpro     6
Rebune       6
Vitron       6
Syinix       5
Solstar      5
Beko         5
Ailyons      3
Bruhm        3
GLD          3
FK           3
Samsung      1
ICECOOL      1
Legacy       1
Name: count, dtype: int64

In [20]:
# Save to CSV
df_fridges.to_csv('fridges_clean.csv', index=True)

### Scraping Laptops data

In [21]:
# Initialize a list to store all laptops' data
laptops = []

# Loop through all 90 pages
for x in range(1, 91):  # Pages 1 to 90
    print(f"Scraping page {x}...")
    
    # Send a GET request to the page
    result = requests.get(f'https://www.kilimall.co.ke/search?q=laptop&page={x}&source=search|enterSearch|laptop')
    
    # Check if the request was successful
    if result.status_code == 200:
        soup = BeautifulSoup(result.text, 'html.parser')  # Parse the HTML content
        
        # Extract laptop details from divs with the class "info-box".
        laptops_info = soup.find_all('div', class_="info-box")
        
        # Extract relevant details
        for laptop_info in laptops_info:
            # Safely extract data, handle cases where tags are missing
            laptop_name = laptop_info.find('p', class_='product-title')
            laptop_price = laptop_info.find('div', class_='product-price')
            laptop_reviews = laptop_info.find('span', class_='reviews')
            
            # Clean and append extracted data
            laptops.append({
                "Name": laptop_name.text.strip() if laptop_name else "N/A",
                "Price": laptop_price.text.strip() if laptop_price else "N/A",
                "Reviews": laptop_reviews.text.strip() if laptop_reviews else "N/A"
            })
    else:
        print(f"Failed to fetch page {x}, Status code: {result.status_code}")

# Print results
for laptop in laptops:
    print(laptop)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping 

In [22]:
#Save results as a DataFrame
df_laptops = pd.DataFrame(laptops)
df_laptops.sample(20)

Unnamed: 0,Name,Price,Reviews
2175,HP EliteBook x360 1030 G3 Intel Core i7 8th Gen 8GB RAM 512GB,"KSh 59,500",(0)
542,"Brand New Dell Vostro 3520 Laptop Notebook 12th Gen Intel Core i5-1235U 15.6"" Display 8GB DDR4- SDRAM RAM 512GB SSD Ubuntu","KSh 87,999",(0)
107,(i5+8GB+500GB+Free Mouse)Refurbished HP Probook 640 G1 Laptop Intel Core i5 8GB RAM / 750GB HDD ROM 14'' Windows 11 6 months warranty Notebook Bluetooth Webcam Laptops,"KSh 19,999",(0)
1438,"(Free Mouse)Refurbished Laptop Lenovo Thinkpad T470 Core i5 6th 8GB+512GB+14"" Windows 10 14"" FHD Display Bluetooth Webcam WiFi Intel Graphics 6 Months Warranty laptops","KSh 24,999",(1)
2783,(SPECIAL OFFER) HP ELITEBOOK 1040 G2 CORE I7 8GB RAM 256GB SSD 5TH GENERATION INSTALLED WINDOWS 10 PRO 64 BIT+ 6 MONTH WARRANTY,"KSh 32,500",(0)
123,"BLACK FRIDAY OFFERS HP EliteBook x360 1040 G7 10th Generation Intel Core i7-10710U 16GB RAM 512GB SSD 14-In FHD Touchscreen ,laptop computer,sleek +FREE BAG |WINDOS |14''|REFURBISHED","KSh 58,000",(0)
1476,Special Limited Time Offer Refurbished Lenovo Thinkpad T14s Intel Core i7 10th Gen 16GB RAM 256GB SSD 14 Inch FHD Touchscreen Display With Windows and Office Installed,"KSh 42,999",(0)
327,(NEW YEAR SALE!) Refurbished Laptop Dell Latitude 7280 Notebook Computer | Intel Core i5 6th Generation | 8G RAM | 256GB SSD Storage | 12.5” HD Display | Installed with Windows 10Pro| MS Office 2019,"KSh 17,999",(0)
143,Lenovo ThinkPad X1 Yoga X360 Intel Core i7 8th Gen 8GB RAM 512GB SSD 14 Inch Full HD Touchscreen 1.8GHz up to 4.2GHz Quad Core Processor 2 in 1 Convertible Refurbished Laptop + Stylus Pen,"KSh 52,000",(0)
108,"[Core i5+8gb+256gb+14''] Refurbished Hp Elitebook 840 G3 Laptop Intel Core i5 6th Gen 8GB 256GB SSD 14"" Backlit Keyboard Notebook Laptop Computer Windows 11 Office","KSh 21,299",(1)


#### Data Cleaning and feature extraction

In [45]:
# Extract the screen size (integer or float)
def extract_screen_size(description):
    # Regex to capture floats/integers before "inch", "inches", or `"`
    match = re.search(r"(\d+\.\d+|\d+)(?=\s*(?:''|\"|inch|inches?))", description, re.IGNORECASE)
    # If a match is found, return the captured number
    # Regex to capture floats/integers directly before `"`
    match_quote = re.search(r'(\d+\.?\d*)\s*(?=")', description)
    if match:
        return float(match.group(1))  # Return match before "inch" or "inches"
    elif match_quote:
        return match_quote.group(1)  # Return match with `"`
    else:
        return np.nan  # Return NaN if no match is found
    



# Extract RAM
def extract_ram(name):
    match = re.search(r'(\d+)\s*GB\s*RAM', name, re.IGNORECASE)  # Adjust regex to allow flexible spacing
    return f"{match.group(1)}GB" if match else 'Unknown'  # Return the RAM size with "GB" appended



# Extract ROM (HDD/SSD/BROM) with GB or TB
def extract_rom(name):
    # Search for number along with "HDD" or "SSD", with or without space
    match = re.search(r'(\d+\s*(?:GB|TB)?\s*(HDD|SSD|BROM|Storage))', name, re.IGNORECASE)
    return match.group(1) if match else 'Unknown'




# Apply the extraction/cleaning functions to the DataFrame
df_laptops['Name'] = df_laptops['Name'].apply(clean_name)
df_laptops['Screen_size'] = df_laptops['Name'].apply(extract_screen_size)
df_laptops['RAM']=df_laptops['Name'].apply(extract_ram)
df_laptops['ROM'] =df_laptops['Name'].apply(extract_rom)

In [None]:
#Further extraction of ROM
# # Define the storage pattern
storage_pattern = r'\b(16|32|64|128|256)\s*GB'

# Apply the pattern to the column
df_laptops['ROM'] = df_laptops['Name'].apply(lambda x: re.findall(storage_pattern, x)[0] if re.findall(storage_pattern, x) else None)

In [24]:
df_laptops['Name'] = df_laptops['Name'].apply(clean_name)

In [46]:
# using pd.set_option() to widen the output display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df_laptops

Unnamed: 0,Name,Price(kshs),Reviews,Screen_size,RAM,ROM
0,Lenovo Thinkpad Yoga 11e X360 Celeron 4gb Ram 500gb HDD Refurbished Laptop Touchscreen 12.5 Inch Screen Size With Windows and Office Installed Black in Color,10999,6,12.5,4GB,500gb HDD
1,"LENOVO YOGA 11E 4GB RAM 128GB SSD 2 IN 1 TOUCHSCREEN X360 LAPTOP REFURBISHED LAPTOP , BLACK , 12 INCH , INSTALLED WINDOWS 10 PRO AND FREE MOUSE",11499,152,12.0,4GB,128GB SSD
2,HP ProBook X360 11 G2 EE Core i5 7th Gen 8GB RAM 256GB SSD 11.6 Inches HD Touchscreen 1.2GHz up to 3.2GHz Dual Core Processor Windows 10 Pro 2 in 1 Convertible Slim Refurbished Laptop - 6 MNTHS WRNTY,21500,0,11.6,8GB,256GB SSD
3,HP ProBook X360 11 G5 EE 9th Generation 4GB RAM 192GB SSD 11.6 Inches Touchscreen 1.1GHz up to 2.7GHz Dual Core Processor Windows 10 Pro 2 - in - 1 Convertible Refurbished Laptop,17000,0,11.6,4GB,192GB SSD
4,"Lenovo ThinkPad L380 Yoga x360 Core I5 8th generation Quad core, 8GB RAM 256GB SSD 13.3 Inches FHD Touchscreen with a stylus Pen 2 in 1 Refurbished Laptop",29999,1,13.3,8GB,256GB SSD
5,Refurbished Macbook Air 2015 Silver 13 inch apple laptop,32500,1,13.0,Unknown,Unknown
6,"Apple MacBook Pro 13.3 Core I5 2.5GHz 8GB RAM, 256GB SSD Early 2011 Laptop",25000,0,,8GB,256GB SSD
7,"REFURBISHED HP ELITEBOOK 8460P CORE INTEL I5 8GB RAM 500GB HDD COMPUTER LAPTOP ,WINDOWS 10 PRO FREE MOUSE",13499,70,,8GB,500GB HDD
8,Brand NEW Lenovo Ideapad 1 Celeron N4020 8GB RAM 256GB SSD 14 Inch HD display School Business Laptop Computer Notebook Windows 10 New Laptops Lenovo Laptop Computers,31999,41,14.0,8GB,256GB SSD
9,Core i716gb512gb13.3 Touch Refurbished Hp 1040 G6 Laptop Core i7 Touchscreen X360 16GB RAM 512GB SSD Laptops 13.3 inch Computer Notebook,42999,0,13.3,16GB,512GB SSD


In [26]:
#Remove commas and any text from Price column
df_laptops["Price"] = df_laptops['Price'].str.replace(r'[^\d]', '', regex=True)
# Rename the Price column
df_laptops = df_laptops.rename(columns={'Price': 'Price(kshs)'})


#Remove brackets from Reviews column
df_laptops['Reviews'] = df_laptops['Reviews'].str.extract(r'(\d+)')
df_laptops.sample(20)

Unnamed: 0,Name,Price(kshs),Reviews,Screen_size,RAM,ROM
2750,Lenovo ThinkPad X380 Yoga CoreI5-8250U 16GB RAM 256GB SSD Win11,44000,0,,16GB,16GB
164,FREE MOUSE HP EliteBook 745 G6 AMD Ryzen 5 PRO 3500U 16GB RAM 512GB SSD 14 Inches Full HD Touchscreen 2GB AMD Radeon Graphics 2.1 GHz- 4.0GHz Quad Core Windows 11 Pro Ultra Slim Refurbished Laptop,41000,0,14.0,16GB,16GB
1010,"FREE BAG Lenovo ThinkPad X131E 11.6in Laptop, AMD, 4GB DDR3, 500GB SATA, 802.11n, Webcam, HDMI, Windows 10",15999,0,,Unknown,4GB
507,"Refurbished HP EliteBook Folio 9470 Intel Core i5 8GB Ram 500GB HDD 14 Inch Screen Size LED Ultrabook Laptop Backlit Keyboard Windows 10, Microsoft Office 2016.",18999,4,14.0,8GB,8GB
1321,8GB256GB13.3 Refurbished Laptop HP Elitebook 830 G5 Core i5 8th gen 8GB Ram 256GB SSD 13.3 Windows 10 Notebook with 1 Free Mouse 6 Month Warranty Free Office 2019,28499,0,,8GB,8GB
2733,Brand New HP PAVILION X360 14-DY1031NIA Intel Core i5 1155G7 8GB DDR4 512SSD Windows 11 Home FP 14 NATURAL SILVER Silver 14 inch,98999,0,14.0,Unknown,8GB
1018,Touchscreen Refurbished Hp Elitebook 840 G6 Laptop Intel Core i5 8th Gen - 8GB RAM 256GB SSD ROM - 14 Windows 10 Refurbished Laptop Computer Notebook,29999,0,,8GB,8GB
527,Refurbished HP EliteBook 840 g2 Intel core i5 8GB Ram 500GB HDD ROM 14 Refurbished Laptop Hp Laptops Black Notebook,17999,0,,8GB,8GB
1028,MEGA Dell Latitude E7270 6th Gen Intel Core i7 Thin Light FHD Laptop 8 GB DDR4 RAM256 GB SSD12.5 31.8 cm FHDWindows 11MS OfficeWiFiBluetoothWebcamIntegrated Graphics,37800,0,,Unknown,8 GB
2627,"8GB 256GB SSD-x360 Lenovo Refurbished Yoga 11e Touch Screen HDMI PORT - Intel Celeron WINDOWS 11 PRO ACTIVATED, Office 2021Activated, Basic SoftWares Installed FREE WIRELESS MOUSE.",20500,0,,Unknown,8GB


##### Checking duplicates

In [27]:
#Checking how many duplicates
duplicate_count = int(df_fridges.duplicated( keep=False).sum())
print(f"There are {duplicate_count} duplicates")

#Find all duplicates
duplicates = df_fridges[df_fridges.duplicated( keep=False)]
duplicates


There are 0 duplicates


Unnamed: 0,Name,Price(kshs),Reviews,Doors,Capacity(ltrs),Brand


In [28]:
# using pd.set_option() to widen the output display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)
df

NameError: name 'df' is not defined

In [61]:
df_laptops.isnull().sum()

Name              0
Price             0
Reviews           0
Screen_size    1962
RAM               0
dtype: int64

In [64]:
df_laptops["RAM"].value_counts()

RAM
8GB        1444
Unknown     740
4GB         390
16GB        189
8gb          86
4gb          21
16gb         12
8Gb           8
12GB          7
58GB          5
32GB          3
2GB           3
6GB           2
16Gb          2
516GB         1
64GB          1
78GB          1
716GB         1
18GB          1
4Gb           1
Name: count, dtype: int64

In [7]:
# Extract ROM (HDD/SSD) with GB
def extract_rom(name):
    # Search for GB/number along with "HDD" or "SSD", with or without space
    match = re.search(r'(\d+\s*GB|\d+TB|\d+SSD|\d+HDD)', name, re.IGNORECASE)
    return match.group(0) if match else 'Unknown'
df_laptops['ROM'] =df_laptops['Name'].apply(extract_rom)

In [40]:
# Extract ROM (HDD/SSD/BROM) with GB or TB
def extract_rom(name):
    # Search for GB/number along with "HDD", "SSD", or "BROM", with or without space
    match = re.search(r'(\d+\s*(GB|TB))\s*(HDD|SSD|BROM)', name, re.IGNORECASE)
    return match.group(0) if match else 'Unknown'

# Apply the harmonization function
df_laptops['RAM'] = df_laptops['RAM'].apply(extract_rom)

In [47]:
df_laptops["ROM"].value_counts()

ROM
256GB SSD     945
500GB HDD     517
Unknown       516
128GB SSD     182
512GB SSD     169
256 GB SSD     69
256 SSD        50
256gb ssd      50
256SSD         40
256ssd         25
1TB HDD        23
500 GB HDD     21
128 SSD        20
320GB HDD      19
128gb ssd      18
500GB SSD      16
512SSD         15
512 SSD        14
1TB SSD        13
500gb HDD      12
256gb SSD      12
500gb hdd      11
500gb Hdd      10
128 GB SSD     10
500HDD          9
500GBHDD        9
256 ssd         8
256GBSSD        7
750GB HDD       7
256GB ssd       6
512 GB SSD      6
500 HDD         6
256 HDD         5
256GB HDD       5
192GB SSD       4
128SSD          4
128ssd          3
128gb SSD       3
1 TB SSD        3
512ssd          3
512gb SSD       2
500 gb hdd      2
256gbssd        2
256 gb ssd      2
16GB SSD        2
32GB HDD        2
256 GB ssd      2
250GB HDD       2
512GB HDD       2
512gb ssd       2
500Gb HDD       2
512GB ssd       2
512GBSSD        2
4128GB SSD      1
128Gb ssd       1
1000GB

In [48]:
df_laptops[df_laptops["ROM"] == "2 SSD"]

Unnamed: 0,Name,Price(kshs),Reviews,Screen_size,RAM,ROM
359,"REFURBISHED EliteBook 840 G3 14-inch Laptop, Intel i5 6300U 2.4GHz, 8GB DDR4 RAM, 256GB M.2 SSD Hard Drive, USB Type C, Webcam, Windows 10",25000,0,,Unknown,2 SSD


In [None]:
# Save to CSV
df.to_csv('laptops_cleann.csv', index=True)

In [None]:
I need to filter using variations of offer and sale then remove the word
row 1..remove  (logic that removes every word in the brackets normal or curly brackets)2, 5,6,29,31, 33, 39, 40,45
Also logic that accomodates different letters whether small or capital-different variation of offers eg offers 
Add black friday, black friday offers, best deals,limited
Speciual case 36
