In [1]:
import pandas as pd

In [16]:
# Read the scraped listings; `data` holds what read_csv returned and `df` is
# the frame the cleaning cells below work on.
data = pd.read_csv(filepath_or_buffer="./Data/scraped_mobile_data.csv")
df = pd.DataFrame(data=data)

In [60]:
# Peek at the first five raw rows to see what each column needs cleaned.
print(df.head(n=5))

                                       name                    price  \
0                POCO F6 5G (Black, 256 GB)  ₹31,999\n₹35,99911% off   
1             POCO F6 5G (Titanium, 256 GB)  ₹29,999\n₹33,99911% off   
2     Motorola G34 5G (Ocean Green, 128 GB)  ₹11,999\n₹14,99920% off   
3  Motorola G34 5G (Charcoal Black, 128 GB)  ₹11,999\n₹14,99920% off   
4      POCO M6 Pro 5G (Power Black, 128 GB)   ₹9,999\n₹16,99941% off   

                                            sizeInCm  \
0  12 GB RAM | 256 GB ROM\n16.94 cm (6.67 inch) D...   
1  8 GB RAM | 256 GB ROM\n16.94 cm (6.67 inch) Di...   
2  8 GB RAM | 128 GB ROM\n16.51 cm (6.5 inch) HD+...   
3  8 GB RAM | 128 GB ROM\n16.51 cm (6.5 inch) HD+...   
4  6 GB RAM | 128 GB ROM | Expandable Upto 1 TB\n...   

                                          sizeInInch  
0  12 GB RAM | 256 GB ROM\n16.94 cm (6.67 inch) D...  
1  8 GB RAM | 256 GB ROM\n16.94 cm (6.67 inch) Di...  
2  8 GB RAM | 128 GB ROM\n16.51 cm (6.5 inch) HD+...  
3  8 GB RA

## Cleaning the Price column

In [62]:
# A raw price cell looks like "₹31,999\n₹35,99911% off": the current price on
# the first line, then the old price fused with the discount text.
# astype(str) plus the vectorised .str accessor replaces the per-element list
# comprehensions, so a missing value no longer raises AttributeError and the
# cell can be re-run after the column has already been converted to int.
price_text = (
    df["price"]
    .astype(str)
    .str.split("\n").str[0]               # keep only the current price
    .str.replace("₹", "", regex=False)    # drop the currency symbol
    .str.replace(",", "", regex=False)    # drop the thousands separators
    .str.strip()
)

# NOTE: anything that cannot be parsed becomes 0 so the column can stay a plain
# int; treat a price of 0 as "unknown" downstream.
df["price"] = pd.to_numeric(price_text, errors="coerce").fillna(0).astype(int)

print(df["price"])

0       31999
1       29999
2       11999
3       11999
4        9999
        ...  
1363     7999
1364     7499
1365     7999
1366    58999
1367     9099
Name: price, Length: 1368, dtype: int64


## Transforming the sizeInCm column

In [139]:
# The raw cell holds the whole spec blurb, e.g.
# "12 GB RAM | 256 GB ROM\n16.94 cm (6.67 inch) D...".
# Pull the number that sits directly in front of the "cm" unit instead of
# relying on it being the first token of the second line: same result for the
# layout above, but a blurb with a different line order is no longer mis-read.
cm_text = df["sizeInCm"].astype(str).str.extract(
    r"(\d+(?:\.\d+)?)\s*cm\b", expand=False
)

# converting to float value; rows without a "<number> cm" fragment become NaN.
# The fillna fallback keeps values that are already numeric, so re-running this
# cell after the conversion does not blank the column.
df["sizeInCm"] = (
    pd.to_numeric(cm_text, errors="coerce")
    .fillna(pd.to_numeric(df["sizeInCm"], errors="coerce"))
    .astype(float)
)

print(df["sizeInCm"])

0       16.94
1       16.94
2       16.51
3       16.51
4       17.25
        ...  
1363    16.51
1364    16.76
1365    16.51
1366    15.49
1367    17.22
Name: sizeInCm, Length: 1368, dtype: float64


## Transforming the sizeInInch Column

In [142]:
# Same spec blurb as sizeInCm; the inch figure is the number in front of the
# "inch" unit, e.g. "(6.67 inch)". Matching on the unit avoids both the
# positional token lookup and the .str.replace("(", "") call, whose
# literal-vs-regex interpretation depends on the pandas version.
# astype(str) mirrors the sizeInCm cell so the .str accessor also works when
# the column is not plain text.
inch_text = df["sizeInInch"].astype(str).str.extract(
    r"(\d+(?:\.\d+)?)\s*inch", expand=False
)

# converting to float dataType; rows without a "<number> inch" fragment become
# NaN. The fillna fallback keeps values that are already numeric, so re-running
# this cell after the conversion does not blank the column.
df["sizeInInch"] = (
    pd.to_numeric(inch_text, errors="coerce")
    .fillna(pd.to_numeric(df["sizeInInch"], errors="coerce"))
    .astype(float)
)

print(df["sizeInInch"])

0       6.67
1       6.67
2       6.50
3       6.50
4       6.79
        ... 
1363    6.50
1364    6.60
1365    6.50
1366    6.10
1367    6.78
Name: sizeInInch, Length: 1368, dtype: float64


In [143]:
# Check the converted columns on the first five rows.
df.head(n=5)

Unnamed: 0,name,price,sizeInCm,sizeInInch
0,"POCO F6 5G (Black, 256 GB)",31999,16.94,6.67
1,"POCO F6 5G (Titanium, 256 GB)",29999,16.94,6.67
2,"Motorola G34 5G (Ocean Green, 128 GB)",11999,16.51,6.5
3,"Motorola G34 5G (Charcoal Black, 128 GB)",11999,16.51,6.5
4,"POCO M6 Pro 5G (Power Black, 128 GB)",9999,17.25,6.79


## Removing the duplicate data

Our data contains a large number of duplicate rows: the scraper got stuck on a single page and recorded the same listings from that page over and over.

In [9]:
# Flag every row that is an exact repeat of an earlier row
# (the first occurrence stays False).
df.duplicated(keep="first")

0       False
1       False
2       False
3       False
4       False
        ...  
1363     True
1364     True
1365     True
1366     True
1367     True
Length: 1368, dtype: bool

In [10]:
# removing all the duplicate data: keep the first occurrence of every fully
# identical row. Reassigning `df` instead of using inplace=True avoids mutating
# the frame object in place; surviving rows keep their original index labels.
df = df.drop_duplicates()

## Saving the cleaned data to a CSV file

In [13]:
# Write the cleaned frame into the same ./Data folder as the raw scrape;
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf="./Data/cleaned_mobile_data.csv", index=False)