In [22]:
import pandas as pd
import numpy as np

import os

# Extract Data

In [23]:
data_product_path = "/content/drive/MyDrive/Purwadhika/data_sources/data_products"

files = os.listdir(data_product_path)
print(files[:5])

['All Video Games.csv', 'Amazon Fashion.csv', 'All Books.csv', 'All Car and Motorbike Products.csv', 'All Hindi.csv']


In [24]:
df_dicts = dict()

# loop through the file lists
for file in files:
  df_dicts[file] = pd.read_csv(data_product_path + '/' + file)
  df_dicts[file]['file_name'] = file

  # remove 0 rows data from list of data
  if len(df_dicts[file]) == 0:
    print(file + " no rows recorded")
    df_dicts.pop(file)

All Video Games.csv no rows recorded
All Books.csv no rows recorded
All Hindi.csv no rows recorded
All English.csv no rows recorded
All Music.csv no rows recorded
Amazon Pharmacy.csv no rows recorded
All Movies and TV Shows.csv no rows recorded
Indian Language Books.csv no rows recorded
International Music.csv no rows recorded
Entertainment Collectibles.csv no rows recorded
Gaming Consoles.csv no rows recorded
Subscribe and Save.csv no rows recorded
Fine Art.csv no rows recorded
PC Games.csv no rows recorded
School Textbooks.csv no rows recorded
Gaming Accessories.csv no rows recorded
Exam Central.csv no rows recorded
Childrens Books.csv no rows recorded
Fiction Books.csv no rows recorded
Video Games Deals.csv no rows recorded
Sports Collectibles.csv no rows recorded
Textbooks.csv no rows recorded
Kindle eBooks.csv no rows recorded
Indian Classical.csv no rows recorded
Blu-ray.csv no rows recorded
Pantry.csv no rows recorded
Film Songs.csv no rows recorded


In [25]:
# check if all the data column is all the same

active_set = set()
reference_set = set(df_dicts['Amazon Fashion.csv'].columns)
error_file_list = []

for key, df in df_dicts.items():
  active_set = set(df.columns)
  if active_set != reference_set:
    error_file_list.append(key)

print(error_file_list)

['Amazon-Products.csv']


In [26]:
# check the anomaly data columns

print("Amazon Fashion.csv")
display(df_dicts["Amazon Fashion.csv"].head(1))
print("Amazon-Products.csv")
display(df_dicts["Amazon-Products.csv"].head(1))

Amazon Fashion.csv


Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,file_name
0,Aqualogica Glow+ Dewy Sunscreen SPF 50 PA+++ F...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/51TSC6Uogx...,https://www.amazon.in/Aqualogica-Sunscreen-Pro...,4.2,3628,₹351,₹399,Amazon Fashion.csv


Amazon-Products.csv


Unnamed: 0.1,Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,file_name
0,0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sY...,https://www.amazon.in/Lloyd-Inverter-Convertib...,4.2,2255,"₹32,999","₹58,990",Amazon-Products.csv


In [27]:
df_dicts["Amazon-Products.csv"].drop(columns=['Unnamed: 0'], inplace=True)

df_dicts["Amazon-Products.csv"].head(1)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,file_name
0,Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1...,appliances,Air Conditioners,https://m.media-amazon.com/images/I/31UISB90sY...,https://www.amazon.in/Lloyd-Inverter-Convertib...,4.2,2255,"₹32,999","₹58,990",Amazon-Products.csv


In [28]:
# combine all the dataframes

combined_df = pd.concat(df_dicts.values(), ignore_index=True)
combined_df = combined_df.astype(str)
combined_df.head(1)

Unnamed: 0,name,main_category,sub_category,image,link,ratings,no_of_ratings,discount_price,actual_price,file_name
0,Aqualogica Glow+ Dewy Sunscreen SPF 50 PA+++ F...,stores,Amazon Fashion,https://m.media-amazon.com/images/I/51TSC6Uogx...,https://www.amazon.in/Aqualogica-Sunscreen-Pro...,4.2,3628,₹351,₹399,Amazon Fashion.csv


# Transformation Data

In [29]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103170 entries, 0 to 1103169
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   name            1103170 non-null  object
 1   main_category   1103170 non-null  object
 2   sub_category    1103170 non-null  object
 3   image           1103170 non-null  object
 4   link            1103170 non-null  object
 5   ratings         1103170 non-null  object
 6   no_of_ratings   1103170 non-null  object
 7   discount_price  1103170 non-null  object
 8   actual_price    1103170 non-null  object
 9   file_name       1103170 non-null  object
dtypes: object(10)
memory usage: 84.2+ MB


## Configure DataType

In [30]:
# DATA TYPE CONFIG

# name            string
# main_category   string
# sub_category    string
# image           string
# link            string
# ratings         float
# no_of_ratings   float
# discount_price  float
# actual_price    float
# file_name       string

In [31]:
def convert_column_type_to_numeric(df, column_name):
  # boolean filter for unconvertible columns
  unconvertible_filter  = pd.to_numeric(df[column_name].str.replace(r'[,\s]', ''), errors='coerce').isna()
  unconvertible_df = df[unconvertible_filter]
  unconvertible_df = unconvertible_df[unconvertible_df[column_name].notna()]
  print(f"{len(unconvertible_df)} unconvertible data for column {column_name}")

  df[column_name] = pd.to_numeric(df[column_name].str.replace(r'[,\s]', ''), errors='coerce')
  return df, unconvertible_df

def extract_currency_symbol(df, column_name):
  return df[column_name].str.extract(r'(^[^\d.]+)')

def remove_non_numeric(df, column_name):
  df[column_name] = df[column_name].str.replace(r'[^\da-zA-Z\s.]', '', regex=True)

In [32]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103170 entries, 0 to 1103169
Data columns (total 10 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   name            1103170 non-null  object
 1   main_category   1103170 non-null  object
 2   sub_category    1103170 non-null  object
 3   image           1103170 non-null  object
 4   link            1103170 non-null  object
 5   ratings         1103170 non-null  object
 6   no_of_ratings   1103170 non-null  object
 7   discount_price  1103170 non-null  object
 8   actual_price    1103170 non-null  object
 9   file_name       1103170 non-null  object
dtypes: object(10)
memory usage: 84.2+ MB


In [33]:
combined_df['discount_price_currency_symbol'] = extract_currency_symbol(combined_df, 'discount_price')
combined_df['actual_price_currency_symbol'] = extract_currency_symbol(combined_df, 'actual_price')

remove_non_numeric(combined_df, 'no_of_ratings')
remove_non_numeric(combined_df, 'discount_price')
remove_non_numeric(combined_df, 'actual_price')

combined_df, unconvertible_ratings_df = convert_column_type_to_numeric(combined_df, 'ratings')
combined_df, unconvertible_no_of_ratings_df = convert_column_type_to_numeric(combined_df, 'no_of_ratings')
combined_df, unconvertible_discount_price_df = convert_column_type_to_numeric(combined_df, 'discount_price')
combined_df, unconvertible_actual_price_df = convert_column_type_to_numeric(combined_df, 'actual_price')

combined_df.info()

364054 unconvertible data for column ratings
364054 unconvertible data for column no_of_ratings
122326 unconvertible data for column discount_price
35626 unconvertible data for column actual_price
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103170 entries, 0 to 1103169
Data columns (total 12 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   name                            1103170 non-null  object 
 1   main_category                   1103170 non-null  object 
 2   sub_category                    1103170 non-null  object 
 3   image                           1103170 non-null  object 
 4   link                            1103170 non-null  object 
 5   ratings                         739116 non-null   float64
 6   no_of_ratings                   739116 non-null   float64
 7   discount_price                  980844 non-null   float64
 8   actual_price                    1067544 non-null  float

In [34]:
duplicate_rows = combined_df[combined_df.duplicated()]
num_duplicates_removed = len(duplicate_rows)
combined_df.drop_duplicates(inplace=True)

print(f"duplicate rows removed: {num_duplicates_removed}")


duplicate rows removed: 0


## Check Unconvertible Data

In [35]:
print('========== ratings: ==========')
print(unconvertible_ratings_df['ratings'].unique())
print('\n========== no_of_ratings: ==========')
print(unconvertible_no_of_ratings_df['no_of_ratings'].unique())
print('\n========== discount_price: ==========')
print(unconvertible_discount_price_df['discount_price'].unique())
print('\n========== actual_price: ==========')
print(unconvertible_actual_price_df['actual_price'].unique())

['nan' 'Get' 'FREE' '₹65' '₹70' '₹2.99' '₹99' '₹100' '₹68.99']

['nan' 'FREE Delivery by Amazon' 'Only 2 left in stock.'
 'Only 1 left in stock.' 'Usually dispatched in 2 to 3 days.'
 'Usually dispatched in 6 to 7 days.' 'Usually dispatched in 4 to 5 days.'
 'Usually dispatched in 3 to 4 weeks.'
 'Usually dispatched in 3 to 5 days.'
 'Usually dispatched in 4 to 5 weeks.'
 'Usually dispatched in 5 to 6 days.'
 'Usually dispatched in 11 to 12 days.'
 'Usually dispatched in 7 to 8 days.' 'Only 4 left in stock.'
 'Only 3 left in stock.' 'Only 5 left in stock.'
 'Usually dispatched in 1 to 2 months.'
 'This item will be released on August 14 2023.'
 'Usually dispatched in 2 to 3 weeks.'
 'Usually dispatched in 8 to 9 days.'
 'Usually dispatched in 9 to 10 days.']

['nan']

['nan']


In [36]:
combined_df[['ratings', 'no_of_ratings', 'discount_price', 'actual_price']].agg(['min', 'max'])

Unnamed: 0,ratings,no_of_ratings,discount_price,actual_price
min,1.0,1.0,8.0,0.0
max,5.0,589547.0,1249990.0,9900000000.0


# Data Demography

## categorical

In [37]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1103170 entries, 0 to 1103169
Data columns (total 12 columns):
 #   Column                          Non-Null Count    Dtype  
---  ------                          --------------    -----  
 0   name                            1103170 non-null  object 
 1   main_category                   1103170 non-null  object 
 2   sub_category                    1103170 non-null  object 
 3   image                           1103170 non-null  object 
 4   link                            1103170 non-null  object 
 5   ratings                         739116 non-null   float64
 6   no_of_ratings                   739116 non-null   float64
 7   discount_price                  980844 non-null   float64
 8   actual_price                    1067544 non-null  float64
 9   file_name                       1103170 non-null  object 
 10  discount_price_currency_symbol  1103170 non-null  object 
 11  actual_price_currency_symbol    1103170 non-null  object 
dtype

In [38]:
categorical_columns = combined_df.select_dtypes(include=['object']).columns

for column in categorical_columns:
  print(f"\n========= Data Demography for: {column} =========")
  print(f"Unique Values: {combined_df[column].nunique()}")
  print(f"Value Counts:")
  display(combined_df[column].value_counts())


Unique Values: 396210
Value Counts:


Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
Zeya Yellow Gold Ring,1436
PC Jeweller 22k (916) Yellow Gold Ring for Women,1138
Avsar 18k (750) Yellow Gold and Diamond Ring for Women,862
Arrow Men Shirt,846
Peter England Men Shirt,622
...,...
Nike Men's Black & White Running Shoes - 10 UK (10.5 US),2
Adidas Mens Daroga Plus Mid Lea Shoes,2
Under Armour Men's Curry 2.5 Basketball Shoes Size 9.5 D(M) US,2
"Nike Air Max 90 Ultra SE (GS) Running Trainers 844599 Sneakers Shoes (36 EU, Anthracite Wolf Grey Gym red 005)",2



Unique Values: 20
Value Counts:


Unnamed: 0_level_0,count
main_category,Unnamed: 1_level_1
accessories,232282
men's clothing,153312
women's clothing,153024
"tv, audio & cameras",137318
men's shoes,114912
appliances,66192
stores,65806
home & kitchen,29136
kids' fashion,26976
sports & fitness,25296



Unique Values: 112
Value Counts:


Unnamed: 0_level_0,count
sub_category,Unnamed: 1_level_1
Men's Fashion,38400
Shirts,38400
Sports Shoes,38400
Western Wear,38400
Formal Shoes,38400
...,...
STEM Toys Store,96
Fashion Sales & Deals,88
International Toy Store,48
Refurbished & Open Box,48



Unique Values: 462414
Value Counts:


Unnamed: 0_level_0,count
image,Unnamed: 1_level_1
https://m.media-amazon.com/images/I/51uEPldT42L._AC_UL320_.jpg,6088
https://m.media-amazon.com/images/W/IMAGERENDERING_521856-T2/images/I/51uEPldT42L._AC_UL320_.jpg,1310
https://m.media-amazon.com/images/W/IMAGERENDERING_521856-T1/images/I/51uEPldT42L._AC_UL320_.jpg,1102
https://m.media-amazon.com/images/I/61ijir6ozPL._AC_UL320_.jpg,776
https://m.media-amazon.com/images/I/41gmezKJxCL._AC_UL320_.jpg,272
...,...
https://m.media-amazon.com/images/I/71o3eYT1BEL._AC_UL320_.jpg,2
https://m.media-amazon.com/images/I/61VIyYrI3rL._AC_UL320_.jpg,2
https://m.media-amazon.com/images/I/81hNtJpQYFL._AC_UL320_.jpg,2
https://m.media-amazon.com/images/I/71of-dCwCIL._AC_UL320_.jpg,2



Unique Values: 551585
Value Counts:


Unnamed: 0_level_0,count
link,Unnamed: 1_level_1
https://www.amazon.in/Aqualogica-Sunscreen-Protection-Glowing-Protected/dp/B09TPFTJNN/ref=sr_1_49?qid=1679212359&s=apparel&sr=1-49,2
https://www.amazon.in/RANGOLI-ART-Womens-Beautiful-sequence/dp/B0BDFYM18B/ref=sr_1_6892?qid=1679154377&s=apparel&sr=1-6892,2
https://www.amazon.in/Riyashree-Unstitched-Dupatta-Material-Quality/dp/B0BVKD8B43/ref=sr_1_6906?qid=1679154377&s=apparel&sr=1-6906,2
https://www.amazon.in/Comfort-Lady-Kurti-Mobile-Pocket/dp/B08TWW8MFR/ref=sr_1_6905?qid=1679154377&s=apparel&sr=1-6905,2
https://www.amazon.in/Max-NOOSNYD21LPLIGHT-PINK-Solid-Dupatta/dp/B09K3VZP3G/ref=sr_1_6904?qid=1679154377&s=apparel&sr=1-6904,2
...,...
https://www.amazon.in/Max-Girls-Top-M22AMD15_Peach-Apricot_6-12M/dp/B09RZN4XTW/ref=sr_1_6872?qid=1679147433&s=shoes&sr=1-6872,2
https://www.amazon.in/Campus-Child-Charm-L-Gry-Running/dp/B09SY9M2N7/ref=sr_1_6871?qid=1679147433&s=shoes&sr=1-6871,2
https://www.amazon.in/SWIGGY-Lightweight-Multicolour-AL-1658-3105/dp/B09ZPNCH2V/ref=sr_1_6870?qid=1679147433&s=shoes&sr=1-6870,2
https://www.amazon.in/Puma-Mizar-Running-Castlerock-Vibrant-Orange/dp/B0B2JYN83S/ref=sr_1_6869?qid=1679147433&s=shoes&sr=1-6869,2



Unique Values: 113
Value Counts:


Unnamed: 0_level_0,count
file_name,Unnamed: 1_level_1
Amazon-Products.csv,551585
Formal Shoes.csv,19200
Western Wear.csv,19200
Shirts.csv,19200
Jeans.csv,19200
...,...
STEM Toys Store.csv,48
Fashion Sales and Deals.csv,44
Toys Gifting Store.csv,24
Refurbished and Open Box.csv,24



Unique Values: 2
Value Counts:


Unnamed: 0_level_0,count
discount_price_currency_symbol,Unnamed: 1_level_1
₹,980844
,122326



Unique Values: 2
Value Counts:


Unnamed: 0_level_0,count
actual_price_currency_symbol,Unnamed: 1_level_1
₹,1067544
,35626


## numerical

In [39]:
numerical_columns = combined_df.select_dtypes(include=['float64']).columns

for column in numerical_columns:
  print(f"\n========= Data Demography for: {column} =========")
  print(f"Minimum Value: {combined_df[column].min()}")
  print(f"Maximum Value: {combined_df[column].max()}")
  print(f"Mean Value: {combined_df[column].mean()}")
  print(f"Median Value: {combined_df[column].median()}")
  print(f"Number of Unique Values: {combined_df[column].nunique()}")



Minimum Value: 1.0
Maximum Value: 5.0
Mean Value: 3.8323113016089505
Median Value: 3.9
Number of Unique Values: 41

Minimum Value: 1.0
Maximum Value: 589547.0
Mean Value: 840.7786977957452
Median Value: 20.0
Number of Unique Values: 8284

Minimum Value: 8.0
Maximum Value: 1249990.0
Mean Value: 2623.160741076052
Median Value: 679.0
Number of Unique Values: 27511

Minimum Value: 0.0
Maximum Value: 9899999999.0
Mean Value: 23111.28346923406
Median Value: 1599.0
Number of Unique Values: 23170
