In [1]:
#eda stacks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import s3fs
import os
import sys

#display settings
pd.set_option("display.max_columns", 200)      # show all columns
pd.set_option("display.max_rows", 20)         # show more rows
pd.set_option("display.width", 1200)           # wider output
pd.set_option("display.max_colwidth", 100)     # long text columns
pd.set_option("display.float_format", "{:.2f}".format)  # clean decimals
sns.set_theme(style="whitegrid")

In [2]:
bucket="kerala-ayurveda-s3"
prefix="raw-data/"

fs=s3fs.S3FileSystem()
dfs=[]

for i in range(0, 1):  # last is 31
    file_key=f"{prefix}events_{i:012d}.parquet"
    s3_path=f"s3://{bucket}/{file_key}"
    
    print(f"Reading {s3_path}")
    df=pd.read_parquet(s3_path, filesystem=fs)
    dfs.append(df)

df=pd.concat(dfs, ignore_index=True)
print(df.shape)

Reading s3://kerala-ayurveda-s3/raw-data/events_000000000000.parquet
(164139, 11)


In [3]:
df.sample()

Unnamed: 0,user_id,session_id,event,event_ts,date_ist,time_ist,page_location,page_type,page_load_ts,event_params,device
21976,1722542484.176624,1766240687,page_view,1766240803432016,2025-12-20,2025-12-20 19:56:43 UTC+0530,https://keralaayurveda.com/collections/best-sellers,collection,1766240797350276,"[{'key': 'page_location', 'value': {'string_value': 'https://keralaayurveda.com/collections/best...",mobile


# Identifiers

In [4]:
identifiers=df.sort_values(by=['date_ist','user_id','session_id'],ascending=[True,True,True])
identifiers=identifiers.drop_duplicates(subset=['date_ist','user_id','session_id'])[['date_ist','user_id','session_id','page_location','page_type','device']]

In [5]:
identifiers.rename(columns={'page_location':'landing_page','page_type':'landing_page_type'},inplace=True)

In [6]:
identifiers.sample()

Unnamed: 0,date_ist,user_id,session_id,landing_page,landing_page_type,device
153428,2025-12-20,433347606.1766212,1766221216,https://keralaayurveda.com/products/sukumaram-kwath,products,mobile


# Dimensions

In [7]:
start = df.loc[df["event"] == "session_start"].copy()

In [8]:
def get_event_param(params, key):
    if params is None:
        return None

    # handle list OR numpy array
    if isinstance(params, (list, np.ndarray)):
        for p in params:
            if p.get("key") == key:
                val = p.get("value", {})
                return (
                    val.get("string_value")
                    or val.get("int_value")
                    or val.get("float_value")
                    or val.get("double_value")
                )
    return None


In [9]:
start["source"] = start["event_params"].apply(
    lambda x: get_event_param(x, "source")
)

start["medium"] = start["event_params"].apply(
    lambda x: get_event_param(x, "medium")
)

start["campaign"] = start["event_params"].apply(
    lambda x: get_event_param(x, "campaign")
)

In [10]:
dimensions=identifiers.merge(start[['source','medium','campaign','user_id','session_id']],how='left',on=['user_id','session_id'])
dimensions.sample()

Unnamed: 0,date_ist,user_id,session_id,landing_page,landing_page_type,device,source,medium,campaign
8233,2025-12-20,2145500946.1765769,1766198124,https://keralaayurveda.com/,homepage,desktop,,,


# Funnels

In [11]:
events=['product_view', 'add_to_cart', 'begin_checkout', 'add_shipping_info', 'add_payment_info', 'purchase','view_item','view_product_page_loaded','add_to_cart','add_to_cart_custom_event','begin_checkout','gokwik_checkout_initiated']
focus=df[df['event'].isin(events)]
focus.sample()

Unnamed: 0,user_id,session_id,event,event_ts,date_ist,time_ist,page_location,page_type,page_load_ts,event_params,device
49188,1596635963.1766205,1766205518,view_product_page_loaded,1766205523006536,2025-12-20,2025-12-20 10:08:43 UTC+0530,https://keralaayurveda.com/products/ajax-capsule/variant_id = 50087063060793,products,1766205518158821,"[{'key': 'item_name', 'value': {'string_value': 'Ajax Capsule (50 Nos)', 'int_value': None, 'flo...",mobile


In [12]:
focus['event']=np.where(focus['event'].isin(['view_item','view_product_page_loaded']),'product_view',focus['event'])
focus['event']=np.where(focus['event'].isin(['add_to_cart','add_to_cart_custom_event']),'add_to_cart',focus['event'])
focus['event']=np.where(focus['event'].isin(['begin_checkout','gokwik_checkout_initiated']),'begin_checkout',focus['event'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  focus['event']=np.where(focus['event'].isin(['view_item','view_product_page_loaded']),'product_view',focus['event'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  focus['event']=np.where(focus['event'].isin(['add_to_cart','add_to_cart_custom_event']),'add_to_cart',focus['event'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#ret

In [13]:
funnel=focus.groupby(['user_id','session_id','event']).agg(
    first_timestamp=('time_ist','min')
).reset_index()

In [14]:
funnel.sample()

Unnamed: 0,user_id,session_id,event,first_timestamp
6912,2055356250.1766224,1766225276,purchase,2025-12-20 16:12:27 UTC+0530


In [15]:
users = funnel[['user_id','session_id']].drop_duplicates()

In [16]:
events=pd.DataFrame(
    {"event":funnel['event'].unique()}
)
events

Unnamed: 0,event
0,product_view
1,add_to_cart
2,add_payment_info
3,add_shipping_info
4,begin_checkout
5,purchase


In [17]:
users=users.merge(events,how='cross')

In [18]:
users[users['user_id']=='184043494.1766206028']

Unnamed: 0,user_id,session_id,event
26226,184043494.1766206,1766206027,product_view
26227,184043494.1766206,1766206027,add_to_cart
26228,184043494.1766206,1766206027,add_payment_info
26229,184043494.1766206,1766206027,add_shipping_info
26230,184043494.1766206,1766206027,begin_checkout
26231,184043494.1766206,1766206027,purchase


In [19]:
funnel_new=users.merge(funnel,how='left',on=['user_id','session_id','event'])
funnel_new.sample()

Unnamed: 0,user_id,session_id,event,first_timestamp
36810,262129672.17662328,1766233007,product_view,2025-12-20 17:46:58 UTC+0530


In [20]:
funnel_new.fillna(0,inplace=True)

In [21]:
funnel_new['flag']=np.where(funnel_new['first_timestamp']==0,0,1)

In [45]:
funnel_new.head(10)

Unnamed: 0,user_id,session_id,event,first_timestamp,flag,event_payload
0,1000081962.176625,1766250666,product_view,2025-12-20 22:41:08 UTC+0530,1,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 22:41:08 UTC+0530'}}"
1,1000081962.176625,1766250666,add_to_cart,0,0,"{'add_to_cart': {'flag': 0, 'first_timestamp': 0}}"
2,1000081962.176625,1766250666,add_payment_info,0,0,"{'add_payment_info': {'flag': 0, 'first_timestamp': 0}}"
3,1000081962.176625,1766250666,add_shipping_info,0,0,"{'add_shipping_info': {'flag': 0, 'first_timestamp': 0}}"
4,1000081962.176625,1766250666,begin_checkout,0,0,"{'begin_checkout': {'flag': 0, 'first_timestamp': 0}}"
5,1000081962.176625,1766250666,purchase,0,0,"{'purchase': {'flag': 0, 'first_timestamp': 0}}"
6,1000121629.1766247,1766246174,product_view,2025-12-20 21:26:15 UTC+0530,1,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 21:26:15 UTC+0530'}}"
7,1000121629.1766247,1766246174,add_to_cart,0,0,"{'add_to_cart': {'flag': 0, 'first_timestamp': 0}}"
8,1000121629.1766247,1766246174,add_payment_info,0,0,"{'add_payment_info': {'flag': 0, 'first_timestamp': 0}}"
9,1000121629.1766247,1766246174,add_shipping_info,0,0,"{'add_shipping_info': {'flag': 0, 'first_timestamp': 0}}"


# Purchases

In [23]:
purchases=df.loc[df["event"] == "purchase"].copy()

In [24]:
purchases["transaction_id"] = purchases["event_params"].apply(
    lambda x: get_event_param(x, "transaction_id")
)

purchases["value"] = purchases["event_params"].apply(
    lambda x: get_event_param(x, "value")
)

purchases.sample()

Unnamed: 0,user_id,session_id,event,event_ts,date_ist,time_ist,page_location,page_type,page_load_ts,event_params,device,transaction_id,value
102421,1232469594.176617,1766169632,purchase,1766171852092905,2025-12-20,2025-12-20 00:47:32 UTC+0530,https://keralaayurveda.com/products/khadirarishta/variant_id = 50087059587385,products,1766171687042679,"[{'key': 'discount', 'value': {'string_value': None, 'int_value': 299.0, 'float_value': None, 'd...",desktop,#104700,700.0


In [25]:
purchases=purchases[['user_id','session_id','time_ist','transaction_id','value']]

In [26]:
purchases.sample()

Unnamed: 0,user_id,session_id,time_ist,transaction_id,value
123867,358445720.17662066,1766206557,2025-12-20 10:29:22 UTC+0530,#104761,437.0


In [27]:
purchases=purchases.groupby(['user_id','session_id']).agg(
    orders=('transaction_id','nunique'),
    orders_dups_inclusive=('transaction_id','count'),
    rev=('value','sum')
).reset_index()

In [28]:
purchases['aov']=purchases['rev']/purchases['orders']
purchases.sample(5)

Unnamed: 0,user_id,session_id,orders,orders_dups_inclusive,rev,aov
38,1258388839.1766236,1766241795,1,1,1853.0,1853.0
22,1166901611.1766205,1766205243,1,1,480.0,480.0
100,1721799680.1766195,1766197111,1,1,600.0,600.0
43,1305381878.1765919,1766213599,1,1,489.0,489.0
53,1378133107.1766205,1766204831,1,1,1270.0,1270.0


# Final Merging and Nesting

Tables: dimensions, funnel_new, purchases

In [29]:
dimensions.sample()

Unnamed: 0,date_ist,user_id,session_id,landing_page,landing_page_type,device,source,medium,campaign
10382,2025-12-20,503101149.1766215,1766214671,https://keralaayurveda.com/products/durdurapathradi-keram/variant_id = 50087058014521,products,mobile,google,cpc,Google_Shopping_Skin_and_Hair_Category_MC_07092025


In [30]:
check=funnel_new.groupby('user_id').agg(
    sessions=('session_id','nunique')
).reset_index()
check[check['sessions']>1]

Unnamed: 0,user_id,sessions
5,1000965042.1764346036,2
23,1004291177.1766226964,2
30,10066028.1765469857,5
65,1012620133.1766231536,2
77,1014748778.1763448614,2
...,...,...
9184,970910465.1766222094,2
9244,984404422.1766214575,2
9273,990471350.1766176009,2
9279,991467908.1766221339,2


In [31]:
funnel_new[funnel_new['user_id']=='1000965042.1764346036'] #QA

Unnamed: 0,user_id,session_id,event,first_timestamp,flag
30,1000965042.1764346,1766204227,product_view,2025-12-20 09:47:12 UTC+0530,1
31,1000965042.1764346,1766204227,add_to_cart,0,0
32,1000965042.1764346,1766204227,add_payment_info,0,0
33,1000965042.1764346,1766204227,add_shipping_info,0,0
34,1000965042.1764346,1766204227,begin_checkout,0,0
35,1000965042.1764346,1766204227,purchase,0,0
36,1000965042.1764346,1766238593,product_view,2025-12-20 19:19:55 UTC+0530,1
37,1000965042.1764346,1766238593,add_to_cart,0,0
38,1000965042.1764346,1766238593,add_payment_info,0,0
39,1000965042.1764346,1766238593,add_shipping_info,0,0


In [32]:
purchases.sample()

Unnamed: 0,user_id,session_id,orders,orders_dups_inclusive,rev,aov
125,1925106170.1766248,1766246537,1,1,520.0,520.0


In [33]:
final=dimensions.copy()
cols_to_nest = [
    "landing_page",
    "landing_page_type",
    "source",
    "medium",
    "campaign",
    "device"
]

final["identifiers"] = final[cols_to_nest].to_dict(orient="records")
final = final.drop(columns=cols_to_nest)
final.head()

Unnamed: 0,date_ist,user_id,session_id,identifiers
0,2025-12-20,1000081962.176625,1766250666,{'landing_page': 'https://keralaayurveda.com/products/murivenna-thailam/variant_id = 50087060406...
1,2025-12-20,1000121629.1766247,1766246174,"{'landing_page': 'https://keralaayurveda.com/products/aswagandhadi-lehyam', 'landing_page_type':..."
2,2025-12-20,1000133075.1766233,1766234813,"{'landing_page': 'https://keralaayurveda.com/collections/joint-muscle-care', 'landing_page_type'..."
3,2025-12-20,1000506278.1766171,1766173736,"{'landing_page': 'https://keralaayurveda.com/', 'landing_page_type': 'homepage', 'source': 'goog..."
4,2025-12-20,1000603621.1761928,1766171245,"{'landing_page': 'https://keralaayurveda.com/products/anu-thailam/variant_id = 50087058243897', ..."


In [34]:
final[final['session_id']==1766246174] #QA

Unnamed: 0,date_ist,user_id,session_id,identifiers
1,2025-12-20,1000121629.1766247,1766246174,"{'landing_page': 'https://keralaayurveda.com/products/aswagandhadi-lehyam', 'landing_page_type':..."


In [35]:
final[final['user_id']=='1000965042.1764346036'] #QA

Unnamed: 0,date_ist,user_id,session_id,identifiers
7,2025-12-20,1000965042.1764346,1766204227,"{'landing_page': 'https://keralaayurveda.com/products/anu-thailam/variant_id = 50087058243897', ..."
8,2025-12-20,1000965042.1764346,1766238593,{'landing_page': 'https://keralaayurveda.com/products/nalpamaradi-keram/variant_id = 50216339505...


In [36]:
funnel_event_cols=["event", "flag", "first_timestamp"]

funnel_new["event_payload"] = funnel_new.apply(
    lambda r: {
        r["event"]: {
            "flag": r["flag"],
            "first_timestamp": r["first_timestamp"]
        }
    },
    axis=1
)

In [37]:
funnel_df = (
    funnel_new
    .groupby(["user_id", "session_id"], as_index=False)
    .agg({
        "event_payload": lambda x: {
            k: v
            for d in x
            for k, v in d.items()
        }
    })
    .rename(columns={"event_payload": "funnels"})
)

In [38]:
funnel_df[funnel_df['user_id']=='1000965042.1764346036'] #QA

Unnamed: 0,user_id,session_id,funnels
5,1000965042.1764346,1766204227,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 09:47:12 UTC+0530'}, 'add_to_cart': ..."
6,1000965042.1764346,1766238593,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 19:19:55 UTC+0530'}, 'add_to_cart': ..."


In [39]:
final=final.merge(funnel_df,how='inner',on=['user_id','session_id'])
final.head()

Unnamed: 0,date_ist,user_id,session_id,identifiers,funnels
0,2025-12-20,1000081962.176625,1766250666,{'landing_page': 'https://keralaayurveda.com/products/murivenna-thailam/variant_id = 50087060406...,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 22:41:08 UTC+0530'}, 'add_to_cart': ..."
1,2025-12-20,1000121629.1766247,1766246174,"{'landing_page': 'https://keralaayurveda.com/products/aswagandhadi-lehyam', 'landing_page_type':...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 21:26:15 UTC+0530'}, 'add_to_cart': ..."
2,2025-12-20,1000603621.1761928,1766171245,"{'landing_page': 'https://keralaayurveda.com/products/anu-thailam/variant_id = 50087058243897', ...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 00:37:30 UTC+0530'}, 'add_to_cart': ..."
3,2025-12-20,1000876917.17645,1766219022,"{'landing_page': 'https://keralaayurveda.com/', 'landing_page_type': 'homepage', 'source': 'goog...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 13:54:06 UTC+0530'}, 'add_to_cart': ..."
4,2025-12-20,1000888200.1765997,1766240942,"{'landing_page': 'https://keralaayurveda.com/products/chyavanprash', 'landing_page_type': 'produ...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 19:59:04 UTC+0530'}, 'add_to_cart': ..."


In [40]:
final=final.merge(purchases,how='left',on=['user_id','session_id'])
final.fillna(0,inplace=True)
final['orders'].unique()

array([0., 1., 2.])

In [41]:
cols_to_nest = [
    "orders",
    "orders_dups_inclusive",
    "rev",
    "aov"
]

final["purchase"] = final[cols_to_nest].to_dict(orient="records")
final = final.drop(columns=cols_to_nest)
final.head()

Unnamed: 0,date_ist,user_id,session_id,identifiers,funnels,purchase
0,2025-12-20,1000081962.176625,1766250666,{'landing_page': 'https://keralaayurveda.com/products/murivenna-thailam/variant_id = 50087060406...,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 22:41:08 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"
1,2025-12-20,1000121629.1766247,1766246174,"{'landing_page': 'https://keralaayurveda.com/products/aswagandhadi-lehyam', 'landing_page_type':...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 21:26:15 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"
2,2025-12-20,1000603621.1761928,1766171245,"{'landing_page': 'https://keralaayurveda.com/products/anu-thailam/variant_id = 50087058243897', ...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 00:37:30 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"
3,2025-12-20,1000876917.17645,1766219022,"{'landing_page': 'https://keralaayurveda.com/', 'landing_page_type': 'homepage', 'source': 'goog...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 13:54:06 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"
4,2025-12-20,1000888200.1765997,1766240942,"{'landing_page': 'https://keralaayurveda.com/products/chyavanprash', 'landing_page_type': 'produ...","{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 19:59:04 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"


In [42]:
final.rename(columns={'identifiers':'dimensions'},inplace=True)
final.sample()

Unnamed: 0,date_ist,user_id,session_id,dimensions,funnels,purchase
9289,2025-12-20,850761342.1765089,1766216865,{'landing_page': 'https://keralaayurveda.com/products/murivenna-thailam/variant_id = 50216326299...,"{'product_view': {'flag': 1, 'first_timestamp': '2025-12-20 13:17:50 UTC+0530'}, 'add_to_cart': ...","{'orders': 0.0, 'orders_dups_inclusive': 0.0, 'rev': 0.0, 'aov': 0.0}"


In [43]:
final.shape

(10033, 6)

In [44]:
df.shape

(164139, 11)