# Import packages

In [22]:
import pandas as pd
from teradataml import *
import ast
import json


from utils import clean_text
from constants import (

SHOPMAINA_DATASET, SKROUTZ_DATASET, PRICE_RUNNER_DATASET, JIO_MART_DATASET, JIO_MART_DATASET_MAPPED, JIO_MART_DATASET_MAPPING, BIG_BASKET_DATASET, FLIP_KART_DATASET
)


## Jio Mart

In [23]:
jio_mart_df = pd.read_csv(JIO_MART_DATASET)
jio_mart_df = jio_mart_df.loc[~jio_mart_df['items'].isna(), :]
jio_mart_df = jio_mart_df.drop(columns=['href', 'price'])
jio_mart_df.rename(columns={'items': 'product name'}, inplace=True)
jio_mart_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 162280 entries, 0 to 162312
Data columns (total 3 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   category      162280 non-null  object
 1   sub_category  162280 non-null  object
 2   product name  162280 non-null  object
dtypes: object(3)
memory usage: 5.0+ MB


In [24]:
jio_mart_df.head()

Unnamed: 0,category,sub_category,product name
0,Groceries,Fruits & Vegetables,Fresh Dates (Pack) (Approx 450 g - 500 g)
1,Groceries,Fruits & Vegetables,Tender Coconut Cling Wrapped (1 pc) (Approx 90...
2,Groceries,Fruits & Vegetables,Mosambi 1 kg
3,Groceries,Fruits & Vegetables,Orange Imported 1 kg
4,Groceries,Fruits & Vegetables,Banana Robusta 6 pcs (Box) (Approx 800 g - 110...


In [25]:
jio_mart_df_cateogires = jio_mart_df["sub_category"].unique()
jio_mart_df_cateogires

array(['Fruits & Vegetables', 'Premium Fruits', 'Dairy & Bakery',
       'Staples', 'Snacks & Branded Foods', 'Beverages', 'Personal Care',
       'Home Care', 'Apparel', 'Mom & Baby Care', 'Books', 'Pets',
       'Kitchenware', 'Dining', 'Furnishing', 'Home Decor', 'Furniture',
       'Home Appliances', 'Toys, Games & Fitness', 'Electrical',
       'Bathroom & Laundry Accessories', 'Disposables', 'Stationery',
       'Bags & Travel Luggage', 'Mops, Brushes & Scrubs', 'Auto Care',
       'Garden & Outdoor', 'Carpentry & work accessories', 'Pooja Needs',
       'Bathroom & Laundry', 'Industrial & Scientific Supplies',
       'Building Supplies & Measuring Tools', 'Hardware & Plumbing',
       'Home Safety & Automation', 'Kitchen & Bath Fixtures',
       'Paint, Wall Treatments & Supplies', 'Power & Hand Tools',
       'Handloom & Handicraft', 'Personal Wear', 'Men', 'Women', 'Boys',
       'Girls', 'Junior Boys', 'Infants', 'Mobiles & Tablets',
       'TV & Speaker', 'Computers', 'Camer

In [26]:
with open(JIO_MART_DATASET_MAPPING, "r", encoding="utf-8") as f:
    mapping = json.load(f)

In [27]:
def map_to_gpc(category, subcategory):
    try:
        return mapping[category][subcategory]
    except KeyError:
        return [None, None, None, None]

In [28]:
jio_mart_df[["Segment", "Family", "Class", "Brick"]] = jio_mart_df.apply(
    lambda row: pd.Series(map_to_gpc(row["category"], row["sub_category"])),
    axis=1
)

In [29]:
jio_mart_df.to_csv(JIO_MART_DATASET_MAPPED, index=False)

In [30]:
jio_mart_df.head()

Unnamed: 0,category,sub_category,product name,Segment,Family,Class,Brick
0,Groceries,Fruits & Vegetables,Fresh Dates (Pack) (Approx 450 g - 500 g),Food/Beverage,Fruits/Vegetables Fresh Cut,,
1,Groceries,Fruits & Vegetables,Tender Coconut Cling Wrapped (1 pc) (Approx 90...,Food/Beverage,Fruits/Vegetables Fresh Cut,,
2,Groceries,Fruits & Vegetables,Mosambi 1 kg,Food/Beverage,Fruits/Vegetables Fresh Cut,,
3,Groceries,Fruits & Vegetables,Orange Imported 1 kg,Food/Beverage,Fruits/Vegetables Fresh Cut,,
4,Groceries,Fruits & Vegetables,Banana Robusta 6 pcs (Box) (Approx 800 g - 110...,Food/Beverage,Fruits/Vegetables Fresh Cut,,


## Big Market

In [11]:
big_mart_df = pd.read_csv(BIG_BASKET_DATASET)
big_mart_df.head()

Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [None]:
big_mart_df = big_mart_df.loc[~jio_mart_df['product'].isna(), :]
big_mart_df = big_mart_df.drop(columns=['brand', 'sale_price', 'market_price', 'rating','index'])
big_mart_df.rename(columns={'product': 'product name', 'category' : 'level_1', 'sub_category' : 'level_2', 'type' : 'level_3'}, inplace=True)
big_mart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27555 entries, 0 to 27554
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   product name  27554 non-null  object
 1   level_1       27555 non-null  object
 2   level_2       27555 non-null  object
 3   level_3       27555 non-null  object
 4   description   27440 non-null  object
dtypes: object(5)
memory usage: 1.1+ MB


In [13]:
big_mart_df.head()

Unnamed: 0,product name,level_1,level_2,level_3,description
0,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Hair Oil & Serum,This Product contains Garlic Oil that is known...
1,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Water & Fridge Bottles,"Each product is microwave safe (without lid), ..."
2,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Lamp & Lamp Oil,"A perfect gift for all occasions, be it your m..."
3,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,"Laundry, Storage Baskets",Multipurpose container with an attractive desi...
4,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Bathing Bars & Soaps,Nivea Creme Soft Soap gives your skin the best...


In [21]:
big_mart_df_level_1 = big_mart_df["level_1"].unique()
big_mart_df_level_1

array(['Beauty & Hygiene', 'Kitchen, Garden & Pets',
       'Cleaning & Household', 'Gourmet & World Food',
       'Foodgrains, Oil & Masala', 'Snacks & Branded Foods', 'Beverages',
       'Bakery, Cakes & Dairy', 'Baby Care', 'Fruits & Vegetables',
       'Eggs, Meat & Fish'], dtype=object)

In [19]:
big_mart_df_level_2 = big_mart_df["level_2"].nunique()
big_mart_df_level_2

90

In [18]:
big_mart_df_level_3 = big_mart_df["level_3"].nunique()
big_mart_df_level_3

426

## Flip Kart

In [2]:
flip_kart_df = pd.read_csv(FLIP_KART_DATASET)
flip_kart_df.head()

Unnamed: 0,uniq_id,crawl_timestamp,product_url,product_name,product_category_tree,pid,retail_price,discounted_price,image,is_FK_Advantage_product,description,product_rating,overall_rating,brand,product_specifications
0,c2d766ca982eca8304150849735ffef9,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2FF9KEDEFGF,999.0,379.0,"[""http://img5a.flixcart.com/image/short/u/4/a/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
1,7f7036a6d550aaa89d34c77bd39a5e48,2016-03-25 22:59:23 +0000,http://www.flipkart.com/fabhomedecor-fabric-do...,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",SBEEH3QGU7MFYJFY,32157.0,22646.0,"[""http://img6a.flixcart.com/image/sofa-bed/j/f...",False,FabHomeDecor Fabric Double Sofa Bed (Finish Co...,No rating available,No rating available,FabHomeDecor,"{""product_specification""=>[{""key""=>""Installati..."
2,f449ec65dcbc041b6ae5e6a32717d01b,2016-03-25 22:59:23 +0000,http://www.flipkart.com/aw-bellies/p/itmeh4grg...,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",SHOEH4GRSUBJGZXE,999.0,499.0,"[""http://img5a.flixcart.com/image/shoe/7/z/z/r...",False,Key Features of AW Bellies Sandals Wedges Heel...,No rating available,No rating available,AW,"{""product_specification""=>[{""key""=>""Ideal For""..."
3,0973b37acd0c664e3de26e97e5571454,2016-03-25 22:59:23 +0000,http://www.flipkart.com/alisha-solid-women-s-c...,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",SRTEH2F6HUZMQ6SJ,699.0,267.0,"[""http://img5a.flixcart.com/image/short/6/2/h/...",False,Key Features of Alisha Solid Women's Cycling S...,No rating available,No rating available,Alisha,"{""product_specification""=>[{""key""=>""Number of ..."
4,bc940ea42ee6bef5ac7cea3fb5cfbee7,2016-03-25 22:59:23 +0000,http://www.flipkart.com/sicons-all-purpose-arn...,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",PSOEH3ZYDMSYARJ5,220.0,210.0,"[""http://img5a.flixcart.com/image/pet-shampoo/...",False,Specifications of Sicons All Purpose Arnica Do...,No rating available,No rating available,Sicons,"{""product_specification""=>[{""key""=>""Pet Type"",..."


In [4]:
flip_kart_df = flip_kart_df.loc[~flip_kart_df['product_name'].isna(), :]
flip_kart_df = flip_kart_df.drop(columns=['uniq_id', 'product_url', 'crawl_timestamp', 'pid','retail_price', 'discounted_price', 'image', 'is_FK_Advantage_product', 'product_rating', 'overall_rating', 'brand', 'product_specifications'])
flip_kart_df.head()

Unnamed: 0,product_name,product_category_tree,description
0,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...
1,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",Key Features of AW Bellies Sandals Wedges Heel...
3,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...
4,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",Specifications of Sicons All Purpose Arnica Do...


In [17]:
flip_kart_df_tree = flip_kart_df["product_category_tree"].iloc[7867]
flip_kart_df_tree

'["Home Decor & Festive Needs >> Wall Decor & Clocks >> Clocks >> Wall Clocks >> Blacksmith Wall Clocks"]'

In [19]:
def extract_levels(category_tree_str):
   if pd.isna(category_tree_str):
       return [None, None, None, None]
   
   try:
       clean_str = category_tree_str.strip('[""]')
       levels = clean_str.split(' >> ')
       result = (levels + [None, None, None, None])[:4]
       return result
   except:
       return [None, None, None, None]

In [20]:
flip_kart_df[['level_1', 'level_2', 'level_3', 'level_4']] = pd.DataFrame(
   flip_kart_df['product_category_tree'].apply(extract_levels).tolist(), 
   index=flip_kart_df.index
)
flip_kart_df.head()

Unnamed: 0,product_name,product_category_tree,description,level_1,level_2,level_3,level_4
0,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
1,FabHomeDecor Fabric Double Sofa Bed,"[""Furniture >> Living Room Furniture >> Sofa B...",FabHomeDecor Fabric Double Sofa Bed (Finish Co...,Furniture,Living Room Furniture,Sofa Beds & Futons,FabHomeDecor Fabric Double Sofa Bed (Finish Co...
2,AW Bellies,"[""Footwear >> Women's Footwear >> Ballerinas >...",Key Features of AW Bellies Sandals Wedges Heel...,Footwear,Women's Footwear,Ballerinas,AW Bellies
3,Alisha Solid Women's Cycling Shorts,"[""Clothing >> Women's Clothing >> Lingerie, Sl...",Key Features of Alisha Solid Women's Cycling S...,Clothing,Women's Clothing,"Lingerie, Sleep & Swimwear",Shorts
4,Sicons All Purpose Arnica Dog Shampoo,"[""Pet Supplies >> Grooming >> Skin & Coat Care...",Specifications of Sicons All Purpose Arnica Do...,Pet Supplies,Grooming,Skin & Coat Care,Shampoo
