In [6]:
import pandas  as pd

In [24]:
# Reloading the CSV using the new 'on_bad_lines' parameter to skip bad lines
data = pd.read_csv('cleaned_product_data.csv', encoding='ISO-8859-1', on_bad_lines='skip')

# Display the first few rows to confirm the data is loaded correctly


In [8]:
# Checking for missing values in the dataset
missing_values = data.isnull().sum()

# Display columns with missing values and their counts
missing_values[missing_values > 0]


Total Price (No Discount)      91
List Price                   4071
Current Price                2646
You Save                     3443
Stock Availability             95
Product Description          2635
Rating(out of 5)               32
Total number of Rating         32
About Item                     45
Manufacturer                    1
Whats in the box              263
Battery Power Rating          457
Other camera features         813
Other display features         42
OS                             97
Batteries                     363
Item model number             151
Item Weight                    10
Product Dimensions            898
Top Comment                   103
Brand                           4
Screen Size                   124
Model Name                    135
Wireless Carrier              204
Color                          26
Memory Storage Capacity        40
Operating System               75
Ram Memory Installed Size     490
CPU Model                    3155
CPU Speed     

In [9]:
# Identifying non-numeric values in price columns to remove or correct them

# Helper function to identify non-numeric rows
def find_non_numeric(data_column):
    return data_column[~data_column.apply(lambda x: isinstance(x, (int, float)))]

# Identifying non-numeric values in price columns
non_numeric_total_price = find_non_numeric(data['Total Price (No Discount)'])
non_numeric_list_price = find_non_numeric(data['List Price'])
non_numeric_current_price = find_non_numeric(data['Current Price'])
non_numeric_you_save = find_non_numeric(data['You Save'])

# Displaying the non-numeric values to inspect
non_numeric_total_price, non_numeric_list_price, non_numeric_current_price, non_numeric_you_save


(0       139
 1        88
 2       189
 3       192
 4        99
        ... 
 4487     77
 4488     19
 4489     88
 4490    115
 4491    188
 Name: Total Price (No Discount), Length: 4401, dtype: object,
 Series([], Name: List Price, dtype: float64),
 Series([], Name: Current Price, dtype: float64),
 Series([], Name: You Save, dtype: float64))

In [None]:
# Updating the fillna operations to avoid chained assignments using the recommended approach

# Replacing the inplace=True with direct column assignment
data['Total Price (No Discount)'] = data['Total Price (No Discount)'].fillna(data['Total Price (No Discount)'].median())
data['List Price'] = data['List Price'].fillna(data['List Price'].median())
data['Current Price'] = data['Current Price'].fillna(data['Current Price'].median())
data['You Save'] = data['You Save'].fillna(data['You Save'].median())

# Check the final status of missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary


In [15]:
data['Product Description'] = data['Product Description'].fillna('Not Available')
data['About Item'] = data['About Item'].fillna('Not Available')
data['Whats in the box'] = data['Whats in the box'].fillna('Not Available')


In [16]:
data['Rating(out of 5)'] = data['Rating(out of 5)'].fillna(data['Rating(out of 5)'].mean())
data['Total number of Rating'] = data['Total number of Rating'].fillna(data['Total number of Rating'].mean())
data['Battery Power Rating'] = data['Battery Power Rating'].fillna(data['Battery Power Rating'].mode()[0])
data['Screen Size'] = data['Screen Size'].fillna(data['Screen Size'].mode()[0])
data['Ram Memory Installed Size'] = data['Ram Memory Installed Size'].fillna(data['Ram Memory Installed Size'].mode()[0])
data['CPU Model'] = data['CPU Model'].fillna('Unknown')
data['CPU Speed'] = data['CPU Speed'].fillna('Unknown')
data['Resolution'] = data['Resolution'].fillna('Unknown')


In [17]:
# Check the final status of missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary

Total Price (No Discount)        0
List Price                       0
Current Price                    0
You Save                         0
Stock Availability              95
Product Description              0
Title Content                    0
Rating(out of 5)                 0
Total number of Rating           0
About Item                       0
Service Provider                 0
Form Factor                      0
Manufacturer                     1
Whats in the box                 0
Battery Power Rating             0
Other camera features          813
Human Interface Input            0
Other display features          42
GPS                              0
OS                              97
Batteries                      363
Item model number              151
ASIN                             0
Item Weight                     10
Product Dimensions             898
Top Comment                    103
Brand                            4
Screen Size                      0
Model Name          

In [22]:
import pandas as pd
from IPython.display import display

# Filling remaining missing categorical fields with "Unknown"
data['Stock Availability'] = data['Stock Availability'].fillna('Unknown')
data['OS'] = data['OS'].fillna('Unknown')
data['Brand'] = data['Brand'].fillna('Unknown')
data['Model Name'] = data['Model Name'].fillna('Unknown')
data['Wireless Carrier'] = data['Wireless Carrier'].fillna('Unknown')
data['Color'] = data['Color'].fillna('Unknown')
data['Operating System'] = data['Operating System'].fillna('Unknown')

# Filling remaining text fields with "Not Available"
data['Other camera features'] = data['Other camera features'].fillna('Not Available')
data['Other display features'] = data['Other display features'].fillna('Not Available')
data['Top Comment'] = data['Top Comment'].fillna('Not Available')
data['Image URLs'] = data['Image URLs'].fillna('Not Available')

# Filling missing numeric fields with mean or mode
data['Batteries'] = data['Batteries'].fillna(data['Batteries'].mode()[0])
data['Item model number'] = data['Item model number'].fillna('Unknown')
data['Product Dimensions'] = data['Product Dimensions'].fillna('Unknown')

# Final check on missing values
final_cleaned_data_summary = data.isnull().sum()

# Display the DataFrame
display(data)

# Display the summary of missing values
#print(final_cleaned_data_summary)

Unnamed: 0,Total Price (No Discount),List Price,Current Price,You Save,Stock Availability,Product Description,Title Content,Rating(out of 5),Total number of Rating,About Item,...,Connectivity Technology,Color,Memory Storage Capacity,Wireless network technology,Operating System,Ram Memory Installed Size,CPU Model,CPU Speed,Resolution,Image URLs
0,139.0,379.99,219.00,36.86,Yes,Not Available,"FOSSIBOT F105 Rugged Smartphone Unlocked, Andr...",4.4,13.0,About this item 12GB RAM+64GB ROM Memory: Equi...,...,Wireless,Green,64 GB,"Mobile Hotspot, Wifi",Android,12 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
1,88.0,379.99,219.00,36.86,Yes,Not Available,KXD A07 Smartphone | 6.52-Inch Screen Cell Pho...,4.5,2.0,"About this item ã6.52"" Waterdrop ScreenãTh...",...,"Wifi, Bluetooth",Black,128 GB,"Mobile Hotspot, Wifi",Android,4 GB,Others,1.6 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
2,189.0,379.99,219.00,36.86,Unknown,Not Available,"I23 Ultra Unlocked Cell Phone,Built in Pen,Sma...",4.4,9.0,About this item ãOcta-Core CPU + 128GB Expan...,...,"Wi-Fi, USB",Purple,256 GB,"Mobile Hotspot, Wifi",Android 13.0,6 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
3,192.0,379.99,218.99,36.86,Only 10 left in stock - order soon.,"This product is inspected, tested, and refurbi...","Apple iPhone 11, 64GB, Black - Unlocked (Renewed)",4.2,52624.0,About this item This phone is unlocked and com...,...,"Bluetooth, Wi-Fi, USB, NFC",Black,64 GB,"Mobile Hotspot, Wifi",iOS 16,4 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4,99.0,379.99,219.00,36.86,Yes,Not Available,Moto G Play 2023 3-Day Battery Unlocked Made f...,4.0,1263.0,About this item Carrier compatibility: ATandT:...,...,"Wifi, Bluetooth",Navy Blue,32 GB,"Mobile Hotspot, Wifi",Android 12.0,3 GB,Unknown,Unknown,1600 x 720,https://m.media-amazon.com/images/G/01/gno/spr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4487,77.0,379.99,89.99,36.86,Yes,"This product is inspected, tested, and refurbi...","SAMSUNG Galaxy A13 LTE, Factory Unlocked Smart...",4.0,549.0,"About this item 6.6"" 1080 x 2408 (FHD+) PLS LC...",...,"Wifi, Bluetooth",Black,32 GB,"Mobile Hotspot, Wifi",Android,3 GB,1.2GHz Cortex A8 Processor,Unknown,1080 x 2408,https://m.media-amazon.com/images/G/01/gno/spr...
4488,19.0,379.99,219.00,36.86,Yes,Not Available,"TracFone TCL Flip 2, 8GB, Black - Prepaid Flip...",4.0,823.0,About this item Large 2.8 Inch Internal Screen...,...,"Bluetooth, Wi-Fi, USB",BLACK,16 GB,"Mobile Hotspot, Wifi",Android 11.0,1 GB,Unknown,1.5 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4489,88.0,379.99,219.00,36.86,Yes,Not Available,KXD A07 Smartphone | 6.52-Inch Screen Cell Pho...,4.7,3.0,"About this item ã6.52"" Waterdrop ScreenãTh...",...,"Wifi, Bluetooth",Black,128 GB,"Mobile Hotspot, Wifi",Android,4 GB,Others,1.6 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4490,115.0,379.99,219.00,36.86,Only 11 left in stock - order soon.,Not Available,"HOTWAV Note 12 Cell Phone Unlocked Android 13,...",3.7,230.0,About this item ã2023 Updated Android 13 Cel...,...,"Wifi, Bluetooth",Orange,128 GB,"Mobile Hotspot, Wifi",Android 13.0,8 GB,Unknown,Unknown,720 x 1600,https://m.media-amazon.com/images/G/01/gno/spr...


In [23]:
# Check the final status of missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary

Total Price (No Discount)       0
List Price                      0
Current Price                   0
You Save                        0
Stock Availability              0
Product Description             0
Title Content                   0
Rating(out of 5)                0
Total number of Rating          0
About Item                      0
Service Provider                0
Form Factor                     0
Manufacturer                    1
Whats in the box                0
Battery Power Rating            0
Other camera features           0
Human Interface Input           0
Other display features          0
GPS                             0
OS                              0
Batteries                       0
Item model number               0
ASIN                            0
Item Weight                    10
Product Dimensions              0
Top Comment                     0
Brand                           0
Screen Size                     0
Model Name                      0
Wireless Carri

In [24]:
# Cleaning the 'Item Weight' column by extracting the numeric part and converting to float
data['Item Weight'] = data['Item Weight'].str.extract('([0-9.]+)').astype(float)

# Now, we can fill the missing values with the median weight
data['Item Weight'] = data['Item Weight'].fillna(data['Item Weight'].median())

# Filling the remaining missing value for 'Memory Storage Capacity' using the mode
data['Memory Storage Capacity'] = data['Memory Storage Capacity'].fillna(data['Memory Storage Capacity'].mode()[0])

# Final check on missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary


Total Price (No Discount)      0
List Price                     0
Current Price                  0
You Save                       0
Stock Availability             0
Product Description            0
Title Content                  0
Rating(out of 5)               0
Total number of Rating         0
About Item                     0
Service Provider               0
Form Factor                    0
Manufacturer                   1
Whats in the box               0
Battery Power Rating           0
Other camera features          0
Human Interface Input          0
Other display features         0
GPS                            0
OS                             0
Batteries                      0
Item model number              0
ASIN                           0
Item Weight                    0
Product Dimensions             0
Top Comment                    0
Brand                          0
Screen Size                    0
Model Name                     0
Wireless Carrier               0
Cellular T

In [25]:
# Filling remaining missing values
data['Manufacturer'] = data['Manufacturer'].fillna('Unknown')
data['Item Weight'] = data['Item Weight'].fillna(data['Item Weight'].median())
data['Memory Storage Capacity'] = data['Memory Storage Capacity'].fillna(data['Memory Storage Capacity'].mode()[0])

# Final check on missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary


Total Price (No Discount)      0
List Price                     0
Current Price                  0
You Save                       0
Stock Availability             0
Product Description            0
Title Content                  0
Rating(out of 5)               0
Total number of Rating         0
About Item                     0
Service Provider               0
Form Factor                    0
Manufacturer                   0
Whats in the box               0
Battery Power Rating           0
Other camera features          0
Human Interface Input          0
Other display features         0
GPS                            0
OS                             0
Batteries                      0
Item model number              0
ASIN                           0
Item Weight                    0
Product Dimensions             0
Top Comment                    0
Brand                          0
Screen Size                    0
Model Name                     0
Wireless Carrier               0
Cellular T

In [26]:
display(data)


Unnamed: 0,Total Price (No Discount),List Price,Current Price,You Save,Stock Availability,Product Description,Title Content,Rating(out of 5),Total number of Rating,About Item,...,Connectivity Technology,Color,Memory Storage Capacity,Wireless network technology,Operating System,Ram Memory Installed Size,CPU Model,CPU Speed,Resolution,Image URLs
0,139.0,379.99,219.00,36.86,Yes,Not Available,"FOSSIBOT F105 Rugged Smartphone Unlocked, Andr...",4.4,13.0,About this item 12GB RAM+64GB ROM Memory: Equi...,...,Wireless,Green,64 GB,"Mobile Hotspot, Wifi",Android,12 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
1,88.0,379.99,219.00,36.86,Yes,Not Available,KXD A07 Smartphone | 6.52-Inch Screen Cell Pho...,4.5,2.0,"About this item ã6.52"" Waterdrop ScreenãTh...",...,"Wifi, Bluetooth",Black,128 GB,"Mobile Hotspot, Wifi",Android,4 GB,Others,1.6 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
2,189.0,379.99,219.00,36.86,Unknown,Not Available,"I23 Ultra Unlocked Cell Phone,Built in Pen,Sma...",4.4,9.0,About this item ãOcta-Core CPU + 128GB Expan...,...,"Wi-Fi, USB",Purple,256 GB,"Mobile Hotspot, Wifi",Android 13.0,6 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
3,192.0,379.99,218.99,36.86,Only 10 left in stock - order soon.,"This product is inspected, tested, and refurbi...","Apple iPhone 11, 64GB, Black - Unlocked (Renewed)",4.2,52624.0,About this item This phone is unlocked and com...,...,"Bluetooth, Wi-Fi, USB, NFC",Black,64 GB,"Mobile Hotspot, Wifi",iOS 16,4 GB,Unknown,Unknown,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4,99.0,379.99,219.00,36.86,Yes,Not Available,Moto G Play 2023 3-Day Battery Unlocked Made f...,4.0,1263.0,About this item Carrier compatibility: ATandT:...,...,"Wifi, Bluetooth",Navy Blue,32 GB,"Mobile Hotspot, Wifi",Android 12.0,3 GB,Unknown,Unknown,1600 x 720,https://m.media-amazon.com/images/G/01/gno/spr...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4487,77.0,379.99,89.99,36.86,Yes,"This product is inspected, tested, and refurbi...","SAMSUNG Galaxy A13 LTE, Factory Unlocked Smart...",4.0,549.0,"About this item 6.6"" 1080 x 2408 (FHD+) PLS LC...",...,"Wifi, Bluetooth",Black,32 GB,"Mobile Hotspot, Wifi",Android,3 GB,1.2GHz Cortex A8 Processor,Unknown,1080 x 2408,https://m.media-amazon.com/images/G/01/gno/spr...
4488,19.0,379.99,219.00,36.86,Yes,Not Available,"TracFone TCL Flip 2, 8GB, Black - Prepaid Flip...",4.0,823.0,About this item Large 2.8 Inch Internal Screen...,...,"Bluetooth, Wi-Fi, USB",BLACK,16 GB,"Mobile Hotspot, Wifi",Android 11.0,1 GB,Unknown,1.5 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4489,88.0,379.99,219.00,36.86,Yes,Not Available,KXD A07 Smartphone | 6.52-Inch Screen Cell Pho...,4.7,3.0,"About this item ã6.52"" Waterdrop ScreenãTh...",...,"Wifi, Bluetooth",Black,128 GB,"Mobile Hotspot, Wifi",Android,4 GB,Others,1.6 GHz,Unknown,https://m.media-amazon.com/images/G/01/gno/spr...
4490,115.0,379.99,219.00,36.86,Only 11 left in stock - order soon.,Not Available,"HOTWAV Note 12 Cell Phone Unlocked Android 13,...",3.7,230.0,About this item ã2023 Updated Android 13 Cel...,...,"Wifi, Bluetooth",Orange,128 GB,"Mobile Hotspot, Wifi",Android 13.0,8 GB,Unknown,Unknown,720 x 1600,https://m.media-amazon.com/images/G/01/gno/spr...


In [27]:
data['Memory Storage Capacity'].unique()

array(['64 GB', '128 GB', '256 GB', '32 GB', '4 GB', '512 GB', '16 GB',
       '6 GB', '3 GB', '128 MB', '8 GB', '124 GB', '12 GB', '0.12 GB',
       '0.3 GB', '2 GB', '1 GB', '265 GB', '1000 GB', '20 GB', '0.5 GB',
       '63.98 GB', '7.8 GB', '32 MB', '300 Milliliters',
       '128 GB, Screen Size 6.74 Inches'], dtype=object)

In [31]:
# Converting the values in 'Memory Storage Capacity' to integers for readability
data['Memory Storage Capacity'] = data['Memory Storage Capacity'].apply(lambda x: int(x) if pd.notnull(x) else x)

# Checking the unique values again to confirm the format
memory_capacity_unique_values_int = data['Memory Storage Capacity'].unique()
memory_capacity_unique_values_int


array([  64.,  128.,  256.,   32.,    4.,  512.,   16.,    6.,    3.,
          0.,    8.,  124.,   12.,    2.,    1.,  265., 1000.,   20.,
         63.,    7.,   nan])

In [36]:
# Function to convert RAM sizes to a numeric value in GB
def convert_ram_to_gb(value):
    # Check if it's already in GB
    if 'GB' in value:
        try:
            return float(re.search(r'[\d.]+', value).group())
        except:
            return None
    # Check if it's in MB and convert to GB (1 GB = 1024 MB)
    elif 'MB' in value:
        try:
            return float(re.search(r'[\d.]+', value).group()) / 1024
        except:
            return None
    # If it's an invalid entry like '8 Gigabytes Per Second', return None
    else:
        return None

# Apply the conversion function to the 'Ram Memory Installed Size' column
data['Ram Memory Installed Size'] = data['Ram Memory Installed Size'].apply(lambda x: convert_ram_to_gb(str(x)))

# Convert the RAM values to integers for readability
data['Ram Memory Installed Size'] = data['Ram Memory Installed Size'].apply(lambda x: int(x) if pd.notnull(x) else x)

# Check the unique values after conversion
ram_memory_unique_values_int = data['Ram Memory Installed Size'].unique()
ram_memory_unique_values_int


array([ 12.,   4.,   6.,   3.,   8.,   2.,  64.,   1.,   0.,  16.,  32.,
       128., 256.,  24.,   9.,  15.,  nan,  20.])

In [37]:
ram_memory_unique_values_int

array([ 12.,   4.,   6.,   3.,   8.,   2.,  64.,   1.,   0.,  16.,  32.,
       128., 256.,  24.,   9.,  15.,  nan,  20.])

In [38]:
data['Ram Memory Installed Size']

0       12.0
1        4.0
2        6.0
3        4.0
4        3.0
        ... 
4487     3.0
4488     1.0
4489     4.0
4490     8.0
4491     8.0
Name: Ram Memory Installed Size, Length: 4492, dtype: float64

In [39]:
data['Product Dimensions'].unique()

array(['6.69 x 7.09 x 0.31 inches', '6.5 x 2.99 x 0.33 inches', 'Unknown',
       '7 x 5 x 4 inches', '6.58 x 3.01 x 0.37 inches',
       '3.02 x 0.33 x 6.3 inches', '7 x 4 x 5 inches',
       '6.4 x 3.01 x 0.35 inches', '6 x 3 x 1 inches',
       '6.7 x 3.4 x 2.44 inches', '6.1 x 2.3 x 0.4 inches',
       '2.98 x 0.34 x 6.61 inches', '6.45 x 2.95 x 0.33 inches',
       '6.3 x 3.02 x 0.33 inches', '6.46 x 2.99 x 0.36 inches',
       '6.6 x 3.07 x 0.36 inches', '0.32 x 3.07 x 6.37 inches',
       '5.97 x 0.31 x 2.72 inches', '2.53 x 0.29 x 5.18 inches',
       '5.75 x 2.78 x 0.3 inches', '6 x 2.8 x 0.4 inches',
       '5.78 x 2.82 x 0.31 inches', '6.59 x 2.98 x 0.35 inches',
       '6.5 x 3.01 x 0.35 inches', '0.28 x 2.64 x 5.43 inches',
       '6.57 x 3.03 x 0.31 inches', '1 x 1 x 1 inches',
       '1.43 x 6.31 x 8.25 inches', '6.39 x 3.11 x 0.34 inches',
       '6 x 4 x 2 inches', '2.99 x 0.31 x 6.54 inches',
       '5.42 x 0.39 x 2.59 inches', '2.04 x 6.31 x 8.25 inches',
       '7.9

In [41]:
# Removing "inches" from the 'Product Dimensions' and 'Screen Size' columns

# For 'Product Dimensions', remove "inches" and keep the numeric part
data['Product Dimensions'] = data['Product Dimensions'].str.replace(' inches', '')

# For 'Screen Size', remove "inches" and keep the numeric part
data['Screen Size'] = data['Screen Size'].str.replace(' inches', '')
data['Screen Size'] = data['Screen Size'].str.replace(' Inches', '')

# Checking the unique values in both columns to confirm the changes
product_dimensions_unique_values = data['Product Dimensions'].unique()
screen_size_unique_values = data['Screen Size'].unique()

product_dimensions_unique_values, screen_size_unique_values


(array(['6.69 x 7.09 x 0.31', '6.5 x 2.99 x 0.33', 'Unknown', '7 x 5 x 4',
        '6.58 x 3.01 x 0.37', '3.02 x 0.33 x 6.3', '7 x 4 x 5',
        '6.4 x 3.01 x 0.35', '6 x 3 x 1', '6.7 x 3.4 x 2.44',
        '6.1 x 2.3 x 0.4', '2.98 x 0.34 x 6.61', '6.45 x 2.95 x 0.33',
        '6.3 x 3.02 x 0.33', '6.46 x 2.99 x 0.36', '6.6 x 3.07 x 0.36',
        '0.32 x 3.07 x 6.37', '5.97 x 0.31 x 2.72', '2.53 x 0.29 x 5.18',
        '5.75 x 2.78 x 0.3', '6 x 2.8 x 0.4', '5.78 x 2.82 x 0.31',
        '6.59 x 2.98 x 0.35', '6.5 x 3.01 x 0.35', '0.28 x 2.64 x 5.43',
        '6.57 x 3.03 x 0.31', '1 x 1 x 1', '1.43 x 6.31 x 8.25',
        '6.39 x 3.11 x 0.34', '6 x 4 x 2', '2.99 x 0.31 x 6.54',
        '5.42 x 0.39 x 2.59', '2.04 x 6.31 x 8.25', '7.96 x 2.28 x 0.45',
        '2.5 x 6.75 x 10.38', '2.95 x 6.5 x 0.34', '6.46 x 0.35 x 0.35',
        '6.41 x 2.9 x 0.37', '6 x 4 x 8', '6.06 x 2.99 x 0.39',
        '6.36 x 2.98 x 0.31', '6.24 x 2.99 x 0.3', '6.53 x 3.06 x 0.43',
        '7.09 x 0.67 x 0.31

In [44]:
data['Battery Power Rating'].unique()

array(['10300 Milliamp Hours', '4500', '6800 Milliamp Hours',
       '3600 Amp Hours', '5000', '5000 Milliamp Hours', '5025',
       '4000 Milliamp Hours', '1821 Amp Hours', '3600', '4000', '5150',
       '1960', '19', '2227 Milliampere Hour (mAh)', '3700 Milliamp Hours',
       '4300', '2018 Milliamp Hours', '6000 Milliamp Hours',
       '8000 Amp Hours', '6800', '2815 Milliampere Hour (mAh)',
       '2000 Milliamp Hours', '2000', '1850', '1450', '1500',
       '4500 Milliamp Hours', '21', '3400 Milliamp Hours', '2400',
       '2406 Milliamp Hours', '4800 Milliampere Hour (mAh)', '4900',
       '4850', '2658 Milliamp Hours', '3200 Milliamp Hours',
       '5000 Milliampere Hour (mAh)', '10 Watts', '4352 Milliamp Hours',
       '10200 Milliamp Hours', '4323 Milliamp Hours',
       '3240 Milliamp Hours', '3700', '4614 Milliamp Hours', '3900',
       '3000 Milliamp Hours', '4926', '3000', '9', '4385 Milliamp Hours',
       '4700', '4800 Milliamp Hours', '5500',
       '3969 Milliampere Ho

In [45]:
# Function to convert battery ratings to milliamp hours (mAh)
def convert_battery_to_mah(value):
    value = str(value).lower()
    
    # Check for milliamp hour (mAh) values
    if 'millamp hours' in value or 'mah' in value:
        try:
            return float(re.search(r'[\d.]+', value).group())
        except:
            return None
    # Check for amp hour (Ah) values and convert to mAh (1 Ah = 1000 mAh)
    elif 'amp hours' in value:
        try:
            return float(re.search(r'[\d.]+', value).group()) * 1000
        except:
            return None
    # For Watt Hours, we'll keep as is (or potentially convert if voltage is known)
    elif 'watt hours' in value:
        try:
            return None  # Placeholder for now; we could convert if voltage is known
        except:
            return None
    # If none of the above, return None
    else:
        return None

# Apply the conversion function to the 'Battery Power Rating' column
data['Battery Power Rating'] = data['Battery Power Rating'].apply(lambda x: convert_battery_to_mah(x))

# Check the unique values after cleaning
battery_power_unique_values = data['Battery Power Rating'].unique()
battery_power_unique_values


array([1.030e+07,       nan, 6.800e+06, 3.600e+06, 5.000e+06, 4.000e+06,
       1.821e+06, 2.227e+03, 3.700e+06, 2.018e+06, 6.000e+06, 8.000e+06,
       2.815e+03, 2.000e+06, 4.500e+06, 3.400e+06, 2.406e+06, 4.800e+03,
       2.658e+06, 3.200e+06, 5.000e+03, 4.352e+06, 1.020e+07, 4.323e+06,
       3.240e+06, 4.614e+06, 3.000e+06, 4.385e+06, 4.800e+06, 3.969e+03,
       6.300e+06, 3.100e+06, 4.355e+06, 1.100e+07, 4.300e+06, 3.174e+06,
       4.400e+06, 5.500e+06, 6.500e+06, 4.180e+06, 1.050e+06, 4.492e+06,
       2.000e+03, 1.770e+06, 5.050e+06, 1.060e+07, 5.025e+06, 3.900e+06,
       6.000e+03, 1.200e+07, 5.100e+06, 5.010e+06, 1.600e+03, 1.000e+06,
       1.500e+06, 6.000e+00, 4.500e+03, 3.550e+06, 4.080e+06, 4.610e+06,
       6.150e+06, 6.150e+03, 3.500e+06, 6.320e+06, 4.780e+06, 5.500e+03,
       6.180e+06])

In [54]:

# Check the final status of missing values
final_cleaned_data_summary = data.isnull().sum()
final_cleaned_data_summary

Total Price (No Discount)         0
List Price                        0
Current Price                     0
You Save                          0
Stock Availability                0
Product Description               0
Title Content                     0
Rating(out of 5)                  0
Total number of Rating            0
About Item                        0
Service Provider                  0
Form Factor                       0
Manufacturer                      0
Whats in the box                  0
Battery Power Rating           1946
Other camera features             0
Human Interface Input             0
Other display features            0
GPS                               0
OS                                0
Batteries                         0
Item model number                 0
ASIN                              0
Item Weight                       0
Product Dimensions                0
Top Comment                       0
Brand                             0
Screen Size                 

In [55]:
# Filling missing values in 'Battery Power Rating' with 'Unknown'
data['Battery Power Rating'] = data['Battery Power Rating'].fillna('Unknown')

# Checking the updated missing values in the dataset
missing_values_summary = data.isnull().sum()
missing_values_summary


Total Price (No Discount)      0
List Price                     0
Current Price                  0
You Save                       0
Stock Availability             0
Product Description            0
Title Content                  0
Rating(out of 5)               0
Total number of Rating         0
About Item                     0
Service Provider               0
Form Factor                    0
Manufacturer                   0
Whats in the box               0
Battery Power Rating           0
Other camera features          0
Human Interface Input          0
Other display features         0
GPS                            0
OS                             0
Batteries                      0
Item model number              0
ASIN                           0
Item Weight                    0
Product Dimensions             0
Top Comment                    0
Brand                          0
Screen Size                    0
Model Name                     0
Wireless Carrier               0
Cellular T

In [65]:
# Converting MHz values to GHz in the 'CPU Speed' column
def convert_mhz_to_ghz(value):
    if 'mhz' in value.lower():
        try:
            return str(float(re.search(r'[\d.]+', value).group()) / 1000)  # Convert MHz to GHz
        except:
            return 'Unknown'
    else:
        return value

# Function to clean anomalies in CPU Speed values
def clean_cpu_speed(value):
    # If the value contains multiple speeds, extract the highest value
    if ',' in value:
        try:
            # Split the values and take the max after converting them to float
            speeds = [float(v.strip()) for v in value.split(',')]
            return f"{max(speeds)} GHz"
        except:
            return "Unknown"
    
    # Handle specific invalid units (like KHz or Nanometers)
    if 'khz' in value.lower() or 'nanometer' in value.lower():
        return "Unknown"
    
    # Return the valid value, appending GHz if necessary
    try:
        speed = float(value)
        return f"{speed} GHz" if speed > 0 else "Unknown"
    except:
        return value

# Apply the cleaning function to the 'CPU Speed' column
data['CPU Speed'] = data['CPU Speed'].apply(lambda x: clean_cpu_speed(x))

# Check the unique values after cleaning
cpu_speed_cleaned_values = data['CPU Speed'].unique()
cpu_speed_cleaned_values

data['CPU Speed'] = data['CPU Speed'].str.replace(' GHz', '')

# Check the unique values after conversion
data['CPU Speed'].unique()

array(['Unknown', '1.6', '2.0', '2.8', '2.84', '2.1', '2.2', '2.35',
       '3.4', '3.0', '0.0034', '2.3 Hz', '2.3', '2.6', '1.3', '1.28',
       '1.5', '2.73', '3.23', '2.4', '0.0028399999999999996', '2.26',
       '2.34', '3.2', '2.9', '2.5', '2.76', '3.36', '1.8', '3.35', '1.1',
       '1.2', '2.45', '0.003', '1.9', '2.85', '3.78', '0.0031', '0.0025',
       '2.75', '0.002', '1.84', '4.0', '1.4', '0.0022', '3.05', '3 Hz',
       '0.0012', '3.19', '0.0016'], dtype=object)

In [48]:
data.isna().sum()

Total Price (No Discount)      0
List Price                     0
Current Price                  0
You Save                       0
Stock Availability             0
Product Description            0
Title Content                  0
Rating(out of 5)               0
Total number of Rating         0
About Item                     0
Service Provider               0
Form Factor                    0
Manufacturer                   0
Whats in the box               0
Battery Power Rating           0
Other camera features          0
Human Interface Input          0
Other display features         0
GPS                            0
OS                             0
Batteries                      0
Item model number              0
ASIN                           0
Item Weight                    0
Product Dimensions             0
Top Comment                    0
Brand                          0
Screen Size                    0
Model Name                     0
Wireless Carrier               0
Cellular T

In [84]:
# Remove the specific link from the 'Image URLs' column
data['Image URLs'] = data['Image URLs'].str.replace(
    'https://m.media-amazon.com/images/G/01/gno/sprites/nav-sprite-global-1x-reorg-privacy._CB587940754_.png', 
    '', 
    regex=False
)
data['Image URLs'] = data['Image URLs'].str.replace(
    'https://m.media-amazon.com/images/I/21DX0E62GJL.png', 
    '', 
    regex=False
)
data['Image URLs'] = data['Image URLs'].str.replace(
    ' https://m.media-amazon.com/images/G/01/digital/video/merch/Other/TNFL_24_SWM_700x78_POST_Final_en-US_GSS00009642_GraphicalCountdown_Logos._CB564570695_.gif', 
    '', 
    regex=False
)
data['Image URLs'] = data['Image URLs'].str.replace(
    'https://m.media-amazon.com/images/G/01/consumerelectronics/zhenzhe/NavX/amazon-CE-store-new-v2._CB565923538_.png', 
    '', 
    regex=False
)
data['Image URLs'] = data['Image URLs'].str.replace(
    'https://m.media-amazon.com/images/G/01/product_insurance/images/warranty-short-bullet-point-coverage._CB630304460_.png', 
    '', 
    regex=False
)



# Check the final status of unique values in 'Image URLs'
final_cleaned_data_summary = data['Image URLs'].notnull
final_cleaned_data_summary

<bound method Series.notnull of 0                                                 ,, , , 
1                                                 ,, , , 
2                                                 ,, , , 
3                                                 ,, , , 
4                                                 ,, , , 
                              ...                        
4487                                             , , , , 
4488    , , , , https://m.media-amazon.com/images/I/31...
4489                                             , , , , 
4490                                             , , , , 
4491                                             , , , , 
Name: Image URLs, Length: 4492, dtype: object>

In [78]:
data.to_csv('cleaned_product_data.csv', index=False)
