## Laptops Data.

In [1]:
# Import libraries
import pandas as pd
import re

In [2]:
# Data loading
laptops_df = pd.read_csv("./csv_files/laptops.csv")
laptops_df.head()

Unnamed: 0,Name,Price,Reviews,Ratings
0,"HP Refurbished EliteBook 640GB HDD, 8GB RAM, ...","KSh 20,300",3.7 out of 5(13),3.7 out of 5
1,"HP Refurbished EliteBook 820 Core I5, 8GB RAM ...","KSh 16,499",5 out of 5(1),5 out of 5
2,Lenovo ThinkPad T490 Touchscreen Core I5 -8th ...,"KSh 27,999",4 out of 5(9),4 out of 5
3,HP Refurbished EliteBook 840 G3 Intel Core I5 ...,"KSh 23,500",4 out of 5(1),4 out of 5
4,HP GAMING-LAPTOP -HP ELITEBOOK 755 AMD RYZEN 7...,"KSh 35,000",4.4 out of 5(9),4.4 out of 5


In [3]:
# Cleaning laptops data
# Extract brand name
def extract_brand(name):
    match = re.search(r'(HP|Lenovo|Dell|Acer|)', name, re.IGNORECASE)
    return match.group(0) if match else 'Unknown'

# Extract RAM
def extract_ram(name):
    match = re.search(r'(\d+GB)\s*RAM', name)
    return match.group(1) if match else 'Unknown'

# Extract ROM (HDD/SSD)
def extract_rom(name):
    match = re.search(r'(\d+GB|TB)\s*(HDD|SSD)', name)
    return match.group(0) if match else 'Uknown'

# Extract processor type
def extract_processor(name):
    match = re.search(r'Intel\s*(Core\s*I\d)', name)
    return match.group(1) if match else 'Unknown'
                    
def extract_screen_size(name):
    match = re.search(r'(\d+\.?\d*)"\s*', name)
    return match.group(1) if match else 'Unknown'

# Extract the price from the 'Price' column
def extract_price(price):
    match = re.search(r'KSh\s*(\d+([,]\d{3})*)', price)
    if match:
        return float(match.group(1).replace(',', ''))
    return None

# Extract reviews 
def extract_reviews(reviews):
    match = re.search(r'\((\d+)\)', reviews)
    if match:
        return int(match.group(1))
    return None

# Extract ratings (the number before "out of 5")
def extract_ratings(ratings):
    match = re.search(r'(\d+\.\d+)', ratings)
    if match:
        return float(match.group(1))
    return None

# Apply extraction functions to the DataFrame
laptops_df['Brand'] = laptops_df['Name'].apply(extract_brand)
laptops_df['RAM'] = laptops_df['Name'].apply(extract_ram)
laptops_df['ROM'] = laptops_df['Name'].apply(extract_rom)
laptops_df['Processor'] = laptops_df['Name'].apply(extract_processor)
laptops_df['Screen_Size'] = laptops_df['Name'].apply(extract_screen_size)
laptops_df['Price'] = laptops_df['Price'].apply(extract_price)
laptops_df['Reviews'] = laptops_df['Reviews'].apply(extract_reviews)
laptops_df['Ratings'] = laptops_df['Ratings'].apply(extract_ratings)

# Create the new DataFrame with the extracted data
cleaned_data = laptops_df[['Name','Brand', 'RAM', 'ROM', 'Processor', 'Screen_Size', 'Price','Reviews','Ratings']]

# Save the cleaned data to a new CSV file
cleaned_data.to_csv('./csv_files/laptops_clean.csv', index=False)

In [4]:
laptops = pd.read_csv("./csv_files/laptops_clean.csv")
laptops.head(50)

Unnamed: 0,Name,Brand,RAM,ROM,Processor,Screen_Size,Price,Reviews,Ratings
0,"HP Refurbished EliteBook 640GB HDD, 8GB RAM, ...",HP,8GB,640GB HDD,Core I5,Unknown,20300.0,13,3.7
1,"HP Refurbished EliteBook 820 Core I5, 8GB RAM ...",HP,8GB,500GB HDD,Unknown,Unknown,16499.0,1,
2,Lenovo ThinkPad T490 Touchscreen Core I5 -8th ...,Lenovo,8GB,256GB SSD,Unknown,14,27999.0,9,
3,HP Refurbished EliteBook 840 G3 Intel Core I5 ...,HP,Unknown,500GB HDD,Core I5,Unknown,23500.0,1,
4,HP GAMING-LAPTOP -HP ELITEBOOK 755 AMD RYZEN 7...,HP,16GB,256GB SSD,Unknown,Unknown,35000.0,9,4.4
5,"HP Refurbished EliteBook 8460p Core I5, 4GB Ra...",HP,Unknown,500GB HDD,Unknown,Unknown,13999.0,2,
6,"Lenovo Refurbished 8GB RAM, 500GB Hard Disk, X...",Lenovo,8GB,Uknown,Unknown,Unknown,14999.0,1,
7,"HP EliteBook 840 G4 Intel Core I7 7Th Gen,16GB...",HP,Unknown,256GB SSD,Core I7,Unknown,39449.0,7,4.3
8,Lenovo Refurbished Thinkpad X250 Intel Core I5...,Lenovo,Unknown,Uknown,Core I5,Unknown,14999.0,11,3.7
9,Lenovo Refurbished Thinkpad X240 Intel Core I5...,Lenovo,Unknown,Uknown,Core I5,Unknown,14999.0,2,4.5


## Fridges Data

In [6]:
# Fridges data
df = pd.read_csv("./csv_files/fridges.csv")
df.head(5)

Unnamed: 0,Name,Price,Reviews,Ratings
0,Ramtons RF/335 - 85L Single Door Refrigerator ...,"KSh 15,999",4.1 out of 5(66),4.1 out of 5
1,Hisense 94 Liters Single Door Fridge REF094DR ...,"KSh 16,799",4.4 out of 5(87),4.4 out of 5
2,"Ramtons RF/203, 2 Door Direct Cool Fridge, 128...","KSh 28,099",4.2 out of 5(69),4.2 out of 5
3,Roch RFR-120S-I Single Door Refrigerator - 90 ...,"KSh 16,299",4.2 out of 5(477),4.2 out of 5
4,Nunix 138L Double Door Fridge Energy Efficient...,"KSh 27,500",3.9 out of 5(149),3.9 out of 5


In [7]:
# Extract brand name and model.
def extract_brand_and_model(name):
    match = re.match(r"([A-Za-z]+(?: [A-Za-z]+)*)(?:\s[RF|REF|FM|DF|D|]{2,4}[\d]+)?", name)
    if match:
        return match.group(1).strip()
    return ''

# Extract size in litres
def extract_size(name):
    match = re.search(r'(\d+)\s*Litres?', name)
    if match:
        return int(match.group(1))
    return None

# Extract number of doors
def extract_doors(name):
    match = re.search(r'(\d+)\s*Door', name)
    if match:
        return int(match.group(1))
    return None

# Extract color
def extract_color(name):
    color_keywords = ['Silver', 'White', 'Black', 'Grey', 'Red', 'Blue', 'Green', 'Beige', 'Stainless', 'Chrome']
    for color in color_keywords:
        if color.lower() in name.lower():
            return color
    return None

# Extract warranty
def extract_warranty(name):
    match = re.search(r'(\d+)\s*YRs?\s*WRTY', name)
    if match:
        return int(match.group(1))
    return None

# Extract the price from the 'Price' column
def extract_price(price):
    match = re.search(r'KSh\s*(\d+([,]\d{3})*)', price)
    if match:
        return float(match.group(1).replace(',', ''))
    return None

# Extract reviews 
def extract_reviews(reviews):
    match = re.search(r'\((\d+)\)', reviews)
    if match:
        return int(match.group(1))
    return None

# Extract ratings (the number before "out of 5")
def extract_ratings(ratings):
    match = re.search(r'(\d+\.\d+)', ratings)
    if match:
        return float(match.group(1))
    return None

# Apply the extraction functions to the DataFrame
df['Brand'] = df['Name'].apply(extract_brand_and_model)
df['Size(Litres)'] = df['Name'].apply(extract_size)
df['Doors'] = df['Name'].apply(extract_doors)
df['Color'] = df['Name'].apply(extract_color)
df['Warranty(Years)'] = df['Name'].apply(extract_warranty)
df['Price'] = df['Price'].apply(extract_price)
df['Reviews'] = df['Reviews'].apply(extract_reviews)
df['Ratings'] = df['Ratings'].apply(extract_ratings)

data = df[['Name', 'Brand', 'Size(Litres)','Doors', 'Color', 'Warranty(Years)', 'Price', 'Reviews', 'Ratings']]

# Save the modified DataFrame to a new CSV file
data.to_csv('./csv_files/fridges_clean.csv', index=False)

print("Data extraction and CSV creation completed successfully!")

Data extraction and CSV creation completed successfully!


In [8]:
clean_df = pd.read_csv("./csv_files/fridges_clean.csv")
clean_df.head(100)

Unnamed: 0,Name,Brand,Size(Litres),Doors,Color,Warranty(Years),Price,Reviews,Ratings
0,Ramtons RF/335 - 85L Single Door Refrigerator ...,Ramtons RF,,,Silver,1.0,15999.0,66,4.1
1,Hisense 94 Liters Single Door Fridge REF094DR ...,Hisense,,,,2.0,16799.0,87,4.4
2,"Ramtons RF/203, 2 Door Direct Cool Fridge, 128...",Ramtons RF,128.0,2.0,Silver,1.0,28099.0,69,4.2
3,Roch RFR-120S-I Single Door Refrigerator - 90 ...,Roch RFR,90.0,,Silver,,16299.0,477,4.2
4,Nunix 138L Double Door Fridge Energy Efficient...,Nunix,,,,1.0,27500.0,149,3.9
...,...,...,...,...,...,...,...,...,...
95,Smart Pro SFR 175-DT-I Double Door Refrigerato...,Smart Pro SFR,,,Silver,1.0,27299.0,1,
96,Ramtons RF/257- 2 Door Direct Cool Fridge - 21...,Ramtons RF,213.0,2.0,Silver,1.0,46900.0,41,4.8
97,Hisense RS-12DR4SA Single Door Direct Cool Fri...,Hisense RS,,,Silver,2.0,21239.0,50,4.6
98,Roch RFR-150DT-I Top-Mounted Defrost Fridge - ...,Roch RFR,,,,1.0,27699.0,39,4.1


In [7]:
df.isnull().sum()

Name                 0
Price                0
Reviews              0
Ratings            230
Brand                0
Size(Liters)       926
Doors              868
Color              433
Warranty(Years)    201
dtype: int64