In [1]:
!pip install openpyxl




In [2]:
import requests
import zipfile
import io
import os
import numpy as np
import pandas as pd

In [None]:
####    Genaral Functions   ####

# 1- Function for showing the dataframe characteristics

def data_details(df, n=5):

     print("\n")
     print("Shape:")
     print(df.shape)
     print("\n")
    
     print("\n The Head")
     display(df.head(n))  
     print("\n")

     print("\n Info:")
     print(df.info(memory_usage="deep"))
     print("\n") 

     print("\n The Null Values:")
     print(df.isnull().sum())
     print("\n") 

     print("\nSummary statistics (categorical):")  # As our needed features are categorical 
     categorical_cols = df.select_dtypes(include=[object]).columns
     if len(categorical_cols) > 0:
         display(df.describe(include=[object]))
     else:
         print("No categorical columns found.")
         print("\n")    

# 2- Function for Normalize the final DataFrame

def normalize_all(df):
    # normalize column names
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(r"[\s\-]+", "_", regex=True)   # spaces/hyphens .. underscore
        .str.replace(r"[^\w_]", "", regex=True)     # remove punctuation
    )

    # normalize values inside object (string) columns
    for col in df.select_dtypes(include=["object"]):
        df[col] = (
            df[col]
            .astype(str)
            .str.strip()
            .str.lower()
            .str.replace(r"\s+", " ", regex=True)
            .str.replace(r"[^\w_]", "", regex=True)     # remove punctuation
        )
    return df             

In [4]:
### Download Data From Github

zip_url = 'https://raw.githubusercontent.com/Eng-Shady-Hub/Generative_AI_Project_Round3/refs/heads/main/All_Datasets.zip'
response = requests.get(zip_url)
response.raise_for_status()

In [5]:
## Reading the CSV files from the Zip and save them individually

dataframes = {}

with zipfile.ZipFile(io.BytesIO(response.content)) as z:
    csv_files = [f for f in z.namelist() if f.endswith(".csv")]

    if not csv_files:
        print("No CSV_Files")
    else:
        for i, file_name in enumerate(csv_files, start=1):
            with z.open(file_name) as f:
                key = f"df{i}"
                dataframes[key] = pd.read_csv(f)
                print(f'DataFrame "{key}" created from file {file_name}')

DataFrame "df1" created from file 1-BigBasket Products.csv
DataFrame "df2" created from file 2-Adidas Vs Nike.csv
DataFrame "df3" created from file 3-amazon.csv
DataFrame "df4" created from file 4-flipkart_fashion_products_dataset.csv
DataFrame "df5" created from file 5-adidas_usa.csv
DataFrame "df6" created from file 6-electronics_products_full_edit.csv


In [6]:
# DataFrame 1

basket_data = dataframes["df1"]
data_details(basket_data)



Shape:
(27555, 10)



 The Head


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   index         27555 non-null  int64  
 1   product       27554 non-null  object 
 2   category      27555 non-null  object 
 3   sub_category  27555 non-null  object 
 4   brand         27554 non-null  object 
 5   sale_price    27555 non-null  float64
 6   market_price  27555 non-null  float64
 7   type          27555 non-null  object 
 8   rating        18929 non-null  float64
 9   description   27440 non-null  object 
dtypes: float64(3), int64(1), object(6)
memory usage: 28.2 MB
None



 The Null Values:
index              0
product            1
category           0
sub_category       0
brand              1
sale_price         0
market_price       0
type               0
rating          8626
description      115
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product,category,sub_category,brand,type,description
count,27554,27555,27555,27554,27555,27440
unique,23540,11,90,2313,426,21944
top,Turmeric Powder/Arisina Pudi,Beauty & Hygiene,Skin Care,Fresho,Face Care,A brand inspired by the Greek goddess of victo...
freq,26,7867,2294,638,1508,47


In [7]:
# target = [description]
# features = [product_name ,brand ,category, sub_category]

basket_data = basket_data[["product","brand","category" , "sub_category","description"]]
basket_data= basket_data.rename(columns={"product": "product_name"})

print(basket_data.columns)

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


In [8]:
# there are null values in product_name , brand & description columns

basket_data=basket_data.dropna(subset=["description","product_name","brand"])
data_details(basket_data)



Shape:
(27439, 5)



 The Head


Unnamed: 0,product_name,brand,category,sub_category,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Sri Sri Ayurveda,Beauty & Hygiene,Hair Care,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,Mastercook,"Kitchen, Garden & Pets",Storage & Accessories,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Trm,Cleaning & Household,Pooja Needs,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Nakoda,Cleaning & Household,Bins & Bathroom Ware,Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Nivea,Beauty & Hygiene,Bath & Hand Wash,Nivea Creme Soft Soap gives your skin the best...





 Info:
<class 'pandas.core.frame.DataFrame'>
Index: 27439 entries, 0 to 27554
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  27439 non-null  object
 1   brand         27439 non-null  object
 2   category      27439 non-null  object
 3   sub_category  27439 non-null  object
 4   description   27439 non-null  object
dtypes: object(5)
memory usage: 25.6 MB
None



 The Null Values:
product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product_name,brand,category,sub_category,description
count,27439,27439,27439,27439,27439
unique,23449,2309,11,90,21943
top,Turmeric Powder/Arisina Pudi,Fresho,Beauty & Hygiene,Skin Care,A brand inspired by the Greek goddess of victo...
freq,26,637,7856,2291,47


In [9]:
# DataFrame 2

adidas_data = dataframes["df2"]
data_details(adidas_data)



Shape:
(3268, 10)



 The Head


Unnamed: 0,Product Name,Product ID,Listing Price,Sale Price,Discount,Brand,Description,Rating,Reviews,Last Visited
0,Women's adidas Originals NMD_Racer Primeknit S...,AH2430,14999,7499,50,Adidas Adidas ORIGINALS,Channeling the streamlined look of an '80s rac...,4.8,41,2020-04-13T15:06:14
1,Women's adidas Originals Sleek Shoes,G27341,7599,3799,50,Adidas ORIGINALS,"A modern take on adidas sport heritage, tailor...",3.3,24,2020-04-13T15:06:15
2,Women's adidas Swim Puka Slippers,CM0081,999,599,40,Adidas CORE / NEO,These adidas Puka slippers for women's come wi...,2.6,37,2020-04-13T15:06:15
3,Women's adidas Sport Inspired Questar Ride Shoes,B44832,6999,3499,50,Adidas CORE / NEO,"Inspired by modern tech runners, these women's...",4.1,35,2020-04-13T15:06:15
4,Women's adidas Originals Taekwondo Shoes,D98205,7999,3999,50,Adidas ORIGINALS,This design is inspired by vintage Taekwondo s...,3.5,72,2020-04-13T15:06:15





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3268 entries, 0 to 3267
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Product Name   3268 non-null   object 
 1   Product ID     3268 non-null   object 
 2   Listing Price  3268 non-null   int64  
 3   Sale Price     3268 non-null   int64  
 4   Discount       3268 non-null   int64  
 5   Brand          3268 non-null   object 
 6   Description    3265 non-null   object 
 7   Rating         3268 non-null   float64
 8   Reviews        3268 non-null   int64  
 9   Last Visited   3268 non-null   object 
dtypes: float64(1), int64(4), object(5)
memory usage: 2.1 MB
None



 The Null Values:
Product Name     0
Product ID       0
Listing Price    0
Sale Price       0
Discount         0
Brand            0
Description      3
Rating           0
Reviews          0
Last Visited     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,Product Name,Product ID,Brand,Description,Last Visited
count,3268,3268,3268,3265,3268
unique,1531,3179,5,1762,318
top,Women's adidas Originals Sambarose Shoes,CD6720-808,Adidas CORE / NEO,A well cushioned shoe with a fresher look that...,2020-04-13T15:06:32
freq,16,3,1111,15,38


In [10]:
# Unify columns names
adidas_data = adidas_data[["Product Name", "Brand", "Description"]].rename(columns={"Product Name": "product_name", "Brand": "brand" , "Description":"description"})
adidas_data.info()

# there are null values(only 3) in description column

adidas_data=adidas_data.dropna(subset=["description"])
adidas_data =adidas_data[adidas_data['description'] != 'No description']
adidas_data.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3268 entries, 0 to 3267
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  3268 non-null   object
 1   brand         3268 non-null   object
 2   description   3265 non-null   object
dtypes: object(3)
memory usage: 76.7+ KB


product_name    0
brand           0
description     0
dtype: int64

In [11]:
# Dataset has agood description to our model but not have category & sub_category columns
# So we map category & sub_category columns with respect to the product_name column

category_map = {
    # cayrgory = footwear 
  "shoe": ("footwear", "shoes"),"sneaker": ("footwear", "shoes"),"running": ("footwear", "running shoes"),"trainer": ("footwear", "trainers"),"cleat": ("footwear", "cleats"),
    "slipper": ("footwear", "slippers"),"flip flop": ("footwear", "flip flops"),"jordan": ("footwear", "basketball shoes"),"retro": ("footwear", "shoes"),
    "phantom": ("footwear", "cleats"),"venom": ("footwear", "cleats"),"mercurial": ("footwear", "soccer shoes"),"superfly": ("footwear", "soccer shoes"),
    "tf": ("footwear", "turf soccer shoes"),"air max": ("footwear", "sneakers"),"p-6000": ("footwear", "running shoes"),"sandal": ("footwear", "sandals"),
    "slide": ("footwear", "slides"),"adilette": ("footwear", "slides"),"flipflop": ("footwear", "flip flops"),"sb": ("footwear", "skate shoes"),"skate": ("footwear", "skate shoes"),
    "chron": ("footwear", "skate shoes"),"kd": ("footwear", "basketball shoes"),"kyrie": ("footwear", "basketball shoes"),"iconclash": ("footwear", "running shoes"),
    "daybreak": ("footwear", "sneakers"),"blazer": ("footwear", "sneakers"),"prelove": ("footwear", "sneakers"),"pegasus": ("footwear", "running shoes"),
    "vaporfly": ("footwear", "running shoes"),"zoomx": ("footwear", "running shoes"),"slipon": ("footwear", "slip-ons"),"airforce": ("footwear", "sneakers"),
    "airmax": ("footwear", "sneakers"),"metcon": ("footwear", "training shoes"),"court": ("footwear", "tennis shoes"),"pg": ("footwear", "basketball shoes"),
    "m2k": ("footwear", "sneakers"),"winflo": ("footwear", "running shoes"),"vomero": ("footwear", "running shoes"),"vapormax": ("footwear", "lifestyle sneakers"),
    "flip-flop": ("footwear", "flip flops"),"flip-flops": ("footwear", "flip flops"),"slip-on": ("footwear", "slip-ons"), "slip-ons": ("footwear", "slip-ons"),
    "odyssey react": ("footwear", "running shoes"),"legend react": ("footwear", "running shoes"),"pre-love": ("footwear", "sneakers"),"air force": ("footwear", "sneakers"),
    "drop-type": ("footwear", "running shoes"),"zoom rival fly": ("footwear", "running shoes"),"mx-720-818": ("footwear", "running shoes"),"tanjun": ("footwear", "running shoes"),
    "superstar": ("footwear", "sneakers"),"slip on": ("footwear", "slip-ons"),"lebron soldier": ("footwear", "basketball shoes"),"react element": ("footwear", "running shoes"),
    "free rn": ("footwear", "running shoes"),"zoom fly": ("footwear", "running shoes"),"zoom rise": ("footwear", "running shoes"),"tiempo legend": ("footwear", "soccer shoes"),
    "flex rn": ("footwear", "running shoes"),"air zoom structure": ("footwear", "running shoes"),"sfb gen 2": ("footwear", "boots"),"air huarache": ("footwear", "sneakers"),
    "wildhorse": ("footwear", "running shoes"),"benassi": ("footwear", "slides"),"terra kiger": ("footwear", "running shoes"),"classic cortez": ("footwear", "sneakers"),
    "renew run": ("footwear", "running shoes"),"free tr": ("footwear", "training shoes"),"lebron": ("footwear", "basketball shoes"),"mowabb": ("footwear", "sneakers"),
    "revolution": ("footwear", "running shoes"),"precision": ("footwear", "basketball shoes"),"shox": ("footwear", "running shoes"),"potential": ("footwear", "basketball shoes"),
    "epic react": ("footwear", "running shoes"), "react city": ("footwear", "running shoes"),"kawa": ("footwear", "slides"),"joyride run": ("footwear", "running shoes"),
    "joyride optik": ("footwear", "running shoes"),"flex contact": ("footwear", "running shoes"),"football": ("footwear", "Football Shoes"),"predator": ("footwear", "Football Shoes"),
    "vandalised": ("footwear", "Casual Shoes"),"canyon": ("footwear", "Casual Shoes"),"react": ("footwear", "Running Shoes"),"acg": ("footwear", "Outdoor Shoes"),
    "flex": ("footwear", "Training Shoes"),"signal": ("footwear", "Running Shoes"),"joyride": ("footwear", "Running Shoes"),"cortez": ("footwear", "Casual Shoes"),
    "hawkins": ("footwear", "Casual Shoes"),"nemeziz": ("footwear", "Football Shoes"),"indoor": ("footwear", "Indoor Shoes"),"outdoor": ("footwear", "Outdoor Shoes"),
    "trail": ("footwear", "Outdoor Shoes"),"superrep": ("footwear", "Training Shoes"),"zoom": ("footwear", "Running Shoes"),"tr": ("footwear", "Training Shoes"),
    "renew": ("footwear", "Running Shoes"),"ghost": ("footwear", "Running Shoes"),"racer": ("footwear", "Running Shoes"),"alphadunk": ("footwear", "Basketball Shoes"),
    "monarch": ("footwear", "Walking Shoes"),"af-1": ("footwear", "Casual Shoes"),"bella": ("footwear", "Casual Shoes"), "huarache": ("footwear", "Lifestyle Shoes"),
    "solarsoft": ("footwear", "Training Shoes"),"exp-x14": ("footwear", "Running Shoes"),"fly.by": ("footwear", "Basketball Shoes"),"xarr": ("footwear", "Training Shoes"),
    "skarn": ("footwear", "Casual Shoes"),"tailwind": ("footwear", "Running Shoes"), "air dsvm": ("footwear", "Running Shoes"),
    # category = accessories
    "sock": ("accessories", "socks"), "cap": ("accessories", "cap"),"hat": ("accessories", "cap"),"bag": ("accessories", "bag"),"backpack": ("accessories", "bag"),
    "watch": ("accessories", "watch")
    }
    
def categorize_product(name):
    name = str(name).lower()
    for keyword, (cat, subcat) in category_map.items():
        if keyword in name:
            return cat, subcat
    return "Other", "Other"  # fallback if no keyword found

adidas_data[["category", "sub_category"]] = adidas_data["product_name"].apply(lambda x: pd.Series(categorize_product(x)))

In [12]:
# arranging the columns to be the same in all datasets

adidas_data = adidas_data[["product_name", "brand","category", "sub_category", "description"]]
print(adidas_data.columns)
data_details(adidas_data , n=20)

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


Shape:
(3264, 5)



 The Head


Unnamed: 0,product_name,brand,category,sub_category,description
0,Women's adidas Originals NMD_Racer Primeknit S...,Adidas Adidas ORIGINALS,footwear,shoes,Channeling the streamlined look of an '80s rac...
1,Women's adidas Originals Sleek Shoes,Adidas ORIGINALS,footwear,shoes,"A modern take on adidas sport heritage, tailor..."
2,Women's adidas Swim Puka Slippers,Adidas CORE / NEO,footwear,slippers,These adidas Puka slippers for women's come wi...
3,Women's adidas Sport Inspired Questar Ride Shoes,Adidas CORE / NEO,footwear,shoes,"Inspired by modern tech runners, these women's..."
4,Women's adidas Originals Taekwondo Shoes,Adidas ORIGINALS,footwear,shoes,This design is inspired by vintage Taekwondo s...
5,Women's adidas Sport Inspired Duramo Lite 2.0 ...,Adidas CORE / NEO,footwear,shoes,Refine your interval training in these women's...
6,Women's adidas Sport Inspired Duramo Lite 2.0 ...,Adidas CORE / NEO,footwear,shoes,Refine your interval training in these women's...
7,Women's adidas Swim Puka Slippers,Adidas CORE / NEO,footwear,slippers,These adidas Puka slippers for women's come wi...
8,WOMEN'S ADIDAS RUNNING DURAMO 9 SHOES,Adidas CORE / NEO,footwear,shoes,These women's neutral running shoes will get y...
9,Men's adidas Originals Forest Grove Shoes,Adidas ORIGINALS,footwear,shoes,The Forest Grove brings back the look of the a...





 Info:
<class 'pandas.core.frame.DataFrame'>
Index: 3264 entries, 0 to 3267
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  3264 non-null   object
 1   brand         3264 non-null   object
 2   category      3264 non-null   object
 3   sub_category  3264 non-null   object
 4   description   3264 non-null   object
dtypes: object(5)
memory usage: 2.0 MB
None



 The Null Values:
product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product_name,brand,category,sub_category,description
count,3264,3264,3264,3264,3264
unique,1529,5,2,27,1761
top,Women's adidas Originals Sambarose Shoes,Adidas CORE / NEO,footwear,shoes,A well cushioned shoe with a fresher look that...
freq,16,1111,3263,2306,15


In [13]:
# DataFrame 3

amazon_data =dataframes["df3"] 
data_details(amazon_data)



Shape:
(1465, 16)



 The Head


Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,43%,4.0,43994,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,₹199,"₹1,899",90%,3.9,7928,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Kunal,Himanshu,viswanath,sai niharka,saqib mal...","R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,₹329,₹699,53%,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Omkar dhale,JD,HEMALATHA,Ajwadh a.,amar singh ...","R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",https://m.media-amazon.com/images/I/41V5FtEWPk...,https://www.amazon.in/Deuce-300-Resistant-Tang...
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,₹154,₹399,61%,4.2,16905,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","rahuls6099,Swasat Borah,Ajay Wadke,Pranali,RVK...","R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Portronics-Konnect-POR-1...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   product_id           1465 non-null   object
 1   product_name         1465 non-null   object
 2   category             1465 non-null   object
 3   discounted_price     1465 non-null   object
 4   actual_price         1465 non-null   object
 5   discount_percentage  1465 non-null   object
 6   rating               1465 non-null   object
 7   rating_count         1463 non-null   object
 8   about_product        1465 non-null   object
 9   user_id              1465 non-null   object
 10  user_name            1465 non-null   object
 11  review_id            1465 non-null   object
 12  review_title         1465 non-null   object
 13  review_content       1465 non-null   object
 14  img_link             1465 non-null   object
 15  product_link         1465 non-null   object
d

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link
count,1465,1465,1465,1465,1465,1465,1465.0,1463,1465,1465,1465,1465,1465,1465,1465,1465
unique,1351,1337,211,550,449,92,28.0,1143,1293,1194,1194,1194,1194,1212,1412,1465
top,B08WRWPM22,"Fire-Boltt Ninja Call Pro Plus 1.83"" Smart Wat...",Computers&Accessories|Accessories&Peripherals|...,₹199,₹999,50%,4.1,9378,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AHIKJUDTVJ4T6DV6IUGFYZ5LXMPA,AE55KTFVNXYFD5FPY...","$@|\|TO$|-|,Sethu madhav,Akash Thakur,Burger P...","R3F4T5TRYPTMIG,R3DQIEC603E7AY,R1O4Z15FD40PV5,R...","Worked on iPhone 7 and didn’t work on XR,Good ...","I am not big on camera usage, personally. I wa...",https://m.media-amazon.com/images/I/413sCRKobN...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...
freq,3,5,233,53,120,56,244.0,9,6,10,10,10,10,8,3,1


In [14]:
# there is no null values in the prefered dataset features
#  Amazon dataset don't contain brand , we note the first name in the product_name is the brand
# So creating a function to map the brand column with respect to product_name column

def map_brand(name):
    return name.split()[0]

# Apply function
amazon_data['brand'] =amazon_data['product_name'].apply(map_brand)

#  Amazon dataset don't contain sub_category , we note the values in category colums are diveded by | 
# So creating it by map sub_category column with respect to category column by extracting the most specific level(last part)

amazon_data['sub_category'] = amazon_data['category'].apply(lambda x: x.split('|')[-1])

amazon_data.head()

Unnamed: 0,product_id,product_name,category,discounted_price,actual_price,discount_percentage,rating,rating_count,about_product,user_id,user_name,review_id,review_title,review_content,img_link,product_link,brand,sub_category
0,B07JW9H4J1,Wayona Nylon Braided USB to Lightning Fast Cha...,Computers&Accessories|Accessories&Peripherals|...,₹399,"₹1,099",64%,4.2,24269,High Compatibility : Compatible With iPhone 12...,"AG3D6O4STAQKAY2UVGEUV46KN35Q,AHMY5CWJMMK5BJRBB...","Manav,Adarsh gupta,Sundeep,S.Sayeed Ahmed,jasp...","R3HXWT0LRP0NMF,R2AJM3LFTLZHFO,R6AQJGUP6P86,R1K...","Satisfied,Charging is really fast,Value for mo...",Looks durable Charging is fine tooNo complains...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Wayona-Braided-WN3LG1-Sy...,Wayona,USBCables
1,B098NS6PVG,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Computers&Accessories|Accessories&Peripherals|...,₹199,₹349,43%,4.0,43994,"Compatible with all Type C enabled devices, be...","AECPFYFQVRUWC3KGNLJIOREFP5LQ,AGYYVPDD7YG7FYNBX...","ArdKn,Nirbhay kumar,Sagar Viswanathan,Asp,Plac...","RGIQEG07R9HS2,R1SMWZQ86XIN8U,R2J3Y1WL29GWDE,RY...","A Good Braided Cable for Your Type C Device,Go...",I ordered this cable to connect my phone to An...,https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Ambrane-Unbreakable-Char...,Ambrane,USBCables
2,B096MSW6CT,Sounce Fast Phone Charging Cable & Data Sync U...,Computers&Accessories|Accessories&Peripherals|...,₹199,"₹1,899",90%,3.9,7928,【 Fast Charger& Data Sync】-With built-in safet...,"AGU3BBQ2V2DDAMOAKGFAWDDQ6QHA,AESFLDV2PT363T2AQ...","Kunal,Himanshu,viswanath,sai niharka,saqib mal...","R3J3EQQ9TZI5ZJ,R3E7WBGK7ID0KV,RWU79XKQ6I1QF,R2...","Good speed for earlier versions,Good Product,W...","Not quite durable and sturdy,https://m.media-a...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Sounce-iPhone-Charging-C...,Sounce,USBCables
3,B08HDJ86NZ,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,Computers&Accessories|Accessories&Peripherals|...,₹329,₹699,53%,4.2,94363,The boAt Deuce USB 300 2 in 1 cable is compati...,"AEWAZDZZJLQUYVOVGBEUKSLXHQ5A,AG5HTSFRRE6NL3M5S...","Omkar dhale,JD,HEMALATHA,Ajwadh a.,amar singh ...","R3EEUZKKK9J36I,R3HJVYCLYOY554,REDECAZ7AMPQC,R1...","Good product,Good one,Nice,Really nice product...","Good product,long wire,Charges good,Nice,I bou...",https://m.media-amazon.com/images/I/41V5FtEWPk...,https://www.amazon.in/Deuce-300-Resistant-Tang...,boAt,USBCables
4,B08CF3B7N1,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Computers&Accessories|Accessories&Peripherals|...,₹154,₹399,61%,4.2,16905,[CHARGE & SYNC FUNCTION]- This cable comes wit...,"AE3Q6KSUK5P75D5HFYHCRAOLODSA,AFUGIFH5ZAFXRDSZH...","rahuls6099,Swasat Borah,Ajay Wadke,Pranali,RVK...","R1BP4L2HH9TFUP,R16PVJEXKV6QZS,R2UPDB81N66T4P,R...","As good as original,Decent,Good one for second...","Bought this instead of original apple, does th...",https://m.media-amazon.com/images/W/WEBP_40237...,https://www.amazon.in/Portronics-Konnect-POR-1...,Portronics,USBCables


In [15]:
# arranging the columns to be the same in all datasets

amazon_data = amazon_data[["product_name", "brand","category", "sub_category", "about_product"]]
amazon_data= amazon_data.rename(columns={"about_product": "description"})
print(amazon_data.columns)
data_details(amazon_data)

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


Shape:
(1465, 5)



 The Head


Unnamed: 0,product_name,brand,category,sub_category,description
0,Wayona Nylon Braided USB to Lightning Fast Cha...,Wayona,Computers&Accessories|Accessories&Peripherals|...,USBCables,High Compatibility : Compatible With iPhone 12...
1,Ambrane Unbreakable 60W / 3A Fast Charging 1.5...,Ambrane,Computers&Accessories|Accessories&Peripherals|...,USBCables,"Compatible with all Type C enabled devices, be..."
2,Sounce Fast Phone Charging Cable & Data Sync U...,Sounce,Computers&Accessories|Accessories&Peripherals|...,USBCables,【 Fast Charger& Data Sync】-With built-in safet...
3,boAt Deuce USB 300 2 in 1 Type-C & Micro USB S...,boAt,Computers&Accessories|Accessories&Peripherals|...,USBCables,The boAt Deuce USB 300 2 in 1 cable is compati...
4,Portronics Konnect L 1.2M Fast Charging 3A 8 P...,Portronics,Computers&Accessories|Accessories&Peripherals|...,USBCables,[CHARGE & SYNC FUNCTION]- This cable comes wit...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1465 entries, 0 to 1464
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  1465 non-null   object
 1   brand         1465 non-null   object
 2   category      1465 non-null   object
 3   sub_category  1465 non-null   object
 4   description   1465 non-null   object
dtypes: object(5)
memory usage: 2.1 MB
None



 The Null Values:
product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product_name,brand,category,sub_category,description
count,1465,1465,1465,1465,1465
unique,1337,437,211,207,1293
top,"Fire-Boltt Ninja Call Pro Plus 1.83"" Smart Wat...",boAt,Computers&Accessories|Accessories&Peripherals|...,USBCables,[CHARGE & SYNC FUNCTION]- This cable comes wit...
freq,5,67,233,233,6


In [16]:
# DataFrame 4
 
flipkart_data = dataframes["df4"]
data_details(flipkart_data)



Shape:
(30000, 17)



 The Head


Unnamed: 0,_id,actual_price,average_rating,brand,category,crawled_at,description,discount,images,out_of_stock,pid,product_details,seller,selling_price,sub_category,title,url
0,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:51",Yorker trackpants made from 100% rich combed c...,69% off,['https://rukminim1.flixcart.com/image/128/128...,False,TKPFCZ9EA7H5FYZH,"[{'Style Code': '1005COMBO2'}, {'Closure': 'El...",Shyam Enterprises,921,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
1,893e6980-f2a0-531f-b056-34dd63fe912c,1499,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:52",Yorker trackpants made from 100% rich combed c...,66% off,['https://rukminim1.flixcart.com/image/128/128...,False,TKPFCZ9EJZV2UVRZ,"[{'Style Code': '1005BLUE'}, {'Closure': 'Draw...",Shyam Enterprises,499,Bottomwear,Solid Men Blue Track Pants,https://www.flipkart.com/yorker-solid-men-blue...
2,eb4c8eab-8206-59d0-bcd1-a724d96bf74f,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:52",Yorker trackpants made from 100% rich combed c...,68% off,['https://rukminim1.flixcart.com/image/128/128...,False,TKPFCZ9EHFCY5Z4Y,"[{'Style Code': '1005COMBO4'}, {'Closure': 'El...",Shyam Enterprises,931,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
3,3f3f97bb-5faf-57df-a9ff-1af24e2b1045,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:53",Yorker trackpants made from 100% rich combed c...,69% off,['https://rukminim1.flixcart.com/image/128/128...,False,TKPFCZ9ESZZ7YWEF,"[{'Style Code': '1005COMBO3'}, {'Closure': 'El...",Shyam Enterprises,911,Bottomwear,Solid Men Multicolor Track Pants,https://www.flipkart.com/yorker-solid-men-mult...
4,750caa3d-6264-53ca-8ce1-94118a1d8951,2999,3.9,York,Clothing and Accessories,"02/10/2021, 20:11:53",Yorker trackpants made from 100% rich combed c...,68% off,['https://rukminim1.flixcart.com/image/128/128...,False,TKPFCZ9EVXKBSUD7,"[{'Style Code': '1005COMBO1'}, {'Closure': 'Dr...",Shyam Enterprises,943,Bottomwear,"Solid Men Brown, Grey Track Pants",https://www.flipkart.com/yorker-solid-men-brow...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   _id              30000 non-null  object 
 1   actual_price     29137 non-null  object 
 2   average_rating   27554 non-null  float64
 3   brand            27932 non-null  object 
 4   category         30000 non-null  object 
 5   crawled_at       30000 non-null  object 
 6   description      18017 non-null  object 
 7   discount         29059 non-null  object 
 8   images           30000 non-null  object 
 9   out_of_stock     30000 non-null  bool   
 10  pid              30000 non-null  object 
 11  product_details  30000 non-null  object 
 12  seller           28259 non-null  object 
 13  selling_price    29998 non-null  object 
 14  sub_category     30000 non-null  object 
 15  title            30000 non-null  object 
 16  url              30000 non-null  object 
dtypes:

Unnamed: 0,_id,actual_price,brand,category,crawled_at,description,discount,images,pid,product_details,seller,selling_price,sub_category,title,url
count,30000,29137,27932,30000,30000,18017,29059,30000,30000,30000,28259,29998,30000,30000,30000
unique,30000,728,324,4,17188,5192,87,24146,28080,26783,534,1512,24,4579,30000
top,fa8e22d6-c0b6-5229-bb9e-ad52eda39a0a,999,ARBO,Clothing and Accessories,"02/11/2021, 01:31:26",Refresh your clothing with this latest new Pri...,60% off,[],TSHFUHWA7E7JKKUY,"[{'Color': 'Black'}, {'Care instructions': 'Du...",RetailNet,399,Topwear,Printed Men Round Neck Black T-Shirt,https://www.flipkart.com/yorker-solid-men-mult...
freq,1,3671,999,28971,3,377,2335,778,25,112,1615,1571,16575,856,1


In [17]:
# there are nulls in description an brand columns
# clearing "discription" rows with missed values
 
flipkart_data=flipkart_data.dropna(subset=["description"])
flipkart_data =flipkart_data[flipkart_data['description'] != 'No description']
flipkart_data.isnull().sum()



_id                   0
actual_price        578
average_rating     1474
brand              1369
category              0
crawled_at            0
description           0
discount            620
images                0
out_of_stock          0
pid                   0
product_details       0
seller             1210
selling_price         1
sub_category          0
title                 0
url                   0
dtype: int64

In [18]:
# display the most common brand to fill the missing value 

most_common = flipkart_data['brand'].mode()[0]
print(most_common)

REEB


In [19]:
# filling the missed value of brand By common brand in our dataset "REEB"

flipkart_data['brand'].fillna("REEB", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  flipkart_data['brand'].fillna("REEB", inplace=True)


In [20]:
# arranging the columns to be the same in all datasets

flipkart_data = flipkart_data[["title", "brand","category", "sub_category", "description"]]
flipkart_data= flipkart_data.rename(columns={"title": "product_name"})
print(flipkart_data.columns)
flipkart_data.isnull().sum()

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64

In [21]:
# DataFrame 5

adidas2_data =dataframes["df5"] 
data_details(adidas2_data)



Shape:
(845, 21)



 The Head


Unnamed: 0,index,url,name,sku,selling_price,original_price,currency,availability,color,category,...,source_website,breadcrumbs,description,brand,images,country,language,average_rating,reviews_count,crawled_at
0,0,https://www.adidas.com/us/beach-shorts/FJ5089....,Beach Shorts,FJ5089,40,,USD,InStock,Black,Clothing,...,https://www.adidas.com,Women/Clothing,Splashing in the surf. Making memories with yo...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.5,35,2021-10-23 17:50:17.331255
1,1,https://www.adidas.com/us/five-ten-kestrel-lac...,Five Ten Kestrel Lace Mountain Bike Shoes,BC0770,150,,USD,InStock,Grey,Shoes,...,https://www.adidas.com,Women/Shoes,Lace up and get after it. The Five Ten Kestrel...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.8,4,2021-10-23 17:50:17.423830
2,2,https://www.adidas.com/us/mexico-away-jersey/G...,Mexico Away Jersey,GC7946,70,,USD,InStock,White,Clothing,...,https://www.adidas.com,Kids/Clothing,"Clean and crisp, this adidas Mexico Away Jerse...",adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.9,42,2021-10-23 17:50:17.530834
3,3,https://www.adidas.com/us/five-ten-hiangle-pro...,Five Ten Hiangle Pro Competition Climbing Shoes,FV4744,160,,USD,InStock,Black,Shoes,...,https://www.adidas.com,Five Ten/Shoes,The Hiangle Pro takes on the classic shape of ...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,3.7,7,2021-10-23 17:50:17.615054
4,4,https://www.adidas.com/us/mesh-broken-stripe-p...,Mesh Broken-Stripe Polo Shirt,GM0239,65,,USD,InStock,Blue,Clothing,...,https://www.adidas.com,Men/Clothing,Step up to the tee relaxed. This adidas golf p...,adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,4.7,11,2021-10-23 17:50:17.702680





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845 entries, 0 to 844
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           845 non-null    int64  
 1   url             845 non-null    object 
 2   name            845 non-null    object 
 3   sku             845 non-null    object 
 4   selling_price   845 non-null    int64  
 5   original_price  829 non-null    object 
 6   currency        845 non-null    object 
 7   availability    845 non-null    object 
 8   color           845 non-null    object 
 9   category        845 non-null    object 
 10  source          845 non-null    object 
 11  source_website  845 non-null    object 
 12  breadcrumbs     845 non-null    object 
 13  description     845 non-null    object 
 14  brand           845 non-null    object 
 15  images          845 non-null    object 
 16  country         845 non-null    object 
 17  language        845 non-n

Unnamed: 0,url,name,sku,original_price,currency,availability,color,category,source,source_website,breadcrumbs,description,brand,images,country,language,crawled_at
count,845,845,845,829,845,845,845,845,845,845,845,845,845,845,845,845,845
unique,845,431,845,42,1,2,18,3,1,1,22,560,1,845,1,1,845
top,https://www.adidas.com/us/beach-shorts/FJ5089....,ZX 1K Boost Shoes,FJ5089,$65,USD,InStock,White,Shoes,adidas United States,https://www.adidas.com,Women/Clothing,"When running is your sport, there's nothing yo...",adidas,"https://assets.adidas.com/images/w_600,f_auto,...",USA,en,2021-10-23 17:50:17.331255
freq,1,24,1,68,845,842,222,426,845,845,176,8,845,1,845,845,1


In [22]:
# adidas2 dataset not have null values
#  Noting the breadcrumbs colums contains sub_category

adidas2_data = adidas2_data[["name", "brand","category", "breadcrumbs", "description"]]
adidas2_data= adidas2_data.rename(columns={"name": "product_name" , "breadcrumbs":"sub_category"})
print(adidas2_data.columns)
data_details(adidas2_data)

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


Shape:
(845, 5)



 The Head


Unnamed: 0,product_name,brand,category,sub_category,description
0,Beach Shorts,adidas,Clothing,Women/Clothing,Splashing in the surf. Making memories with yo...
1,Five Ten Kestrel Lace Mountain Bike Shoes,adidas,Shoes,Women/Shoes,Lace up and get after it. The Five Ten Kestrel...
2,Mexico Away Jersey,adidas,Clothing,Kids/Clothing,"Clean and crisp, this adidas Mexico Away Jerse..."
3,Five Ten Hiangle Pro Competition Climbing Shoes,adidas,Shoes,Five Ten/Shoes,The Hiangle Pro takes on the classic shape of ...
4,Mesh Broken-Stripe Polo Shirt,adidas,Clothing,Men/Clothing,Step up to the tee relaxed. This adidas golf p...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 845 entries, 0 to 844
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  845 non-null    object
 1   brand         845 non-null    object
 2   category      845 non-null    object
 3   sub_category  845 non-null    object
 4   description   845 non-null    object
dtypes: object(5)
memory usage: 598.8 KB
None



 The Null Values:
product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product_name,brand,category,sub_category,description
count,845,845,845,845,845
unique,431,1,3,22,560
top,ZX 1K Boost Shoes,adidas,Shoes,Women/Clothing,"When running is your sport, there's nothing yo..."
freq,24,845,426,176,8


In [23]:
# DataFrame 6

elec_data = dataframes["df6"]
data_details(elec_data)




Shape:
(5000, 5)



 The Head


Unnamed: 0,Product_name,brand,category,sub_category,description
0,Dell Drawing Tablet,Dell,Tablet,Drawing Tablet,The Dell Drawing Tablet is designed for users ...
1,Bose Compact Camera,Bose,Camera,Compact Camera,The Bose Compact Camera is designed for users ...
2,LG Wireless,LG,Headphones,Wireless,The LG Wireless is designed for users who need...
3,Sony Curved Monitor,Sony,Monitor,Curved Monitor,The Sony Curved Monitor is designed for users ...
4,Dell Luxury Smartwatch,Dell,Smartwatch,Luxury Smartwatch,The Dell Luxury Smartwatch is designed for use...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Product_name  5000 non-null   object
 1   brand         5000 non-null   object
 2   category      5000 non-null   object
 3   sub_category  5000 non-null   object
 4   description   5000 non-null   object
dtypes: object(5)
memory usage: 2.7 MB
None



 The Null Values:
Product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,Product_name,brand,category,sub_category,description
count,5000,5000,5000,5000,5000
unique,400,10,10,40,4651
top,LG In-Ear,LG,Speaker,Compact Camera,The Lenovo 4K Monitor is designed for users wh...
freq,24,552,527,144,4


In [24]:
# The elec_data dataSet is clear 

elec_data= elec_data.rename(columns={"Product_name": "product_name"})
print(elec_data.columns)

Index(['product_name', 'brand', 'category', 'sub_category', 'description'], dtype='object')


In [25]:
# Combining all datasets 

data_all = pd.concat([basket_data, adidas_data, amazon_data ,adidas2_data , flipkart_data , elec_data], ignore_index=True)

In [26]:
data_details(data_all)



Shape:
(56030, 5)



 The Head


Unnamed: 0,product_name,brand,category,sub_category,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Sri Sri Ayurveda,Beauty & Hygiene,Hair Care,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,Mastercook,"Kitchen, Garden & Pets",Storage & Accessories,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Trm,Cleaning & Household,Pooja Needs,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Nakoda,Cleaning & Household,Bins & Bathroom Ware,Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Nivea,Beauty & Hygiene,Bath & Hand Wash,Nivea Creme Soft Soap gives your skin the best...





 Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56030 entries, 0 to 56029
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product_name  56030 non-null  object
 1   brand         56030 non-null  object
 2   category      56030 non-null  object
 3   sub_category  56030 non-null  object
 4   description   56030 non-null  object
dtypes: object(5)
memory usage: 44.5 MB
None



 The Null Values:
product_name    0
brand           0
category        0
sub_category    0
description     0
dtype: int64



Summary statistics (categorical):


Unnamed: 0,product_name,brand,category,sub_category,description
count,56030,56030,56030,56030,56030
unique,30414,2982,241,405,35396
top,Printed Men Round Neck Black T-Shirt,REEB,Clothing and Accessories,Topwear,Refresh your clothing with this latest new Pri...
freq,506,2175,17695,10342,377


In [27]:
save_path = os.path.expanduser("~/Documents/data_all.csv")
data_all.to_csv(save_path, index=False, encoding="utf-8")
print("Dataset Saved to : ", save_path)

Dataset Saved to :  C:\Users\PC/Documents/data_all.csv
