In [15]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
import calendar

In [16]:
# Load the raw event sample from CSV and preview its first rows
merged_df = pd.read_csv(
    filepath_or_buffer='Resources/sample_tableau_30m.csv',
    encoding='utf-8',
)
merged_df.head()


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id
0,2019-11-01 05:23:11 UTC,view,1004317,2053013555631882655,electronics.smartphone,meizu,159.26,513395491
1,2019-11-02 06:18:53 UTC,view,1005174,2053013555631882655,electronics.smartphone,samsung,643.23,532847621
2,2019-11-22 14:30:13 UTC,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76,550765008
3,2019-11-08 18:02:49 UTC,view,2702700,2053013563911439225,appliances.kitchen.refrigerators,midea,1029.6,548415316
4,2019-11-30 16:13:09 UTC,view,1004870,2053013555631882655,electronics.smartphone,samsung,282.89,513752448


In [17]:
# Count how many rows each user_id accounts for
user_id_column = merged_df['user_id']
visitor_id_counts = user_id_column.value_counts()
visitor_id_counts

user_id
597644399    15739
569335945     6767
568804062     6214
637360772     3943
625149076     3935
             ...  
600709285        1
603765853        1
603124640        1
606153445        1
647277668        1
Name: count, Length: 6263923, dtype: int64

In [18]:
# Map each raw column name to the presentation-friendly name used from here on
column_renames = dict(
    event_time='Timestamp',
    event_type='Visitor_Action',
    product_id='Product',
    category_id='Product_Category',
    category_code='Category_Name',
    brand='Brand',
    price='Price',
    user_id='Visitor_ID',
)

# rename() returns a new frame; merged_df keeps its original column names
df_cleaned = merged_df.rename(columns=column_renames)
df_cleaned

Unnamed: 0,Timestamp,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price,Visitor_ID
0,2019-11-01 05:23:11 UTC,view,1004317,2053013555631882655,electronics.smartphone,meizu,159.26,513395491
1,2019-11-02 06:18:53 UTC,view,1005174,2053013555631882655,electronics.smartphone,samsung,643.23,532847621
2,2019-11-22 14:30:13 UTC,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76,550765008
3,2019-11-08 18:02:49 UTC,view,2702700,2053013563911439225,appliances.kitchen.refrigerators,midea,1029.60,548415316
4,2019-11-30 16:13:09 UTC,view,1004870,2053013555631882655,electronics.smartphone,samsung,282.89,513752448
...,...,...,...,...,...,...,...,...
29999995,2020-04-27 04:34:07 UTC,view,10900392,2232732105912091273,appliances.kitchen.mixer,vitek,102.94,590911528
29999996,2020-04-19 03:22:38 UTC,view,52900028,2232732095568937384,sport.trainer,stanley,84.94,542124965
29999997,2020-04-21 10:02:05 UTC,view,1004839,2232732093077520756,construction.tools.light,oppo,160.08,539384386
29999998,2020-04-26 01:33:05 UTC,view,2501614,2232732092565815652,appliances.kitchen.oven,redmond,154.42,647277668


In [19]:
# Row masks for the two kinds of activity of interest
is_view = df_cleaned['Visitor_Action'].eq('view')
is_cart_or_purchase = df_cleaned['Visitor_Action'].isin(['cart', 'purchase'])

# Distinct visitor IDs seen with each kind of activity
view_users = df_cleaned.loc[is_view, 'Visitor_ID'].unique()
cart_purchase_users = df_cleaned.loc[is_cart_or_purchase, 'Visitor_ID'].unique()

# Visitors with at least one "view" event AND at least one "cart"/"purchase" event
view_and_cart_purchase_users = set(view_users).intersection(cart_purchase_users)

# Keep every row belonging to those visitors and renumber the index from 0
filtered_df = (
    df_cleaned[df_cleaned['Visitor_ID'].isin(view_and_cart_purchase_users)]
    .reset_index(drop=True)
)

# Show the filtered DataFrame
filtered_df

Unnamed: 0,Timestamp,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price,Visitor_ID
0,2019-11-22 14:30:13 UTC,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76,550765008
1,2019-11-16 08:47:08 UTC,cart,1004833,2053013555631882655,electronics.smartphone,samsung,179.16,562112994
2,2019-11-23 10:10:16 UTC,view,21402079,2053013561579406073,electronics.clocks,lorus,102.19,514759577
3,2019-11-29 08:34:48 UTC,view,1801704,2053013554415534427,electronics.video.tv,samsung,385.83,563926853
4,2019-11-15 15:02:09 UTC,view,22400054,2053013554474254687,electronics.audio.microphone,trust,15.16,517410481
...,...,...,...,...,...,...,...,...
12028190,2020-04-28 08:26:36 UTC,view,100112241,2053013551932506308,construction.tools.drill,escan,69.24,513039757
12028191,2020-04-26 08:38:32 UTC,view,1004259,2232732093077520756,construction.tools.light,apple,823.44,560848082
12028192,2020-04-03 15:11:39 UTC,view,100086203,2232732093077520756,construction.tools.light,samsung,103.45,512519873
12028193,2020-04-21 10:02:05 UTC,view,1004839,2232732093077520756,construction.tools.light,oppo,160.08,539384386


In [20]:
# Convert the 'Timestamp' strings (e.g. "2019-11-01 05:23:11 UTC") to timezone-aware datetimes.
# Any exception is allowed to propagate: printing it and carrying on would leave the column
# as strings and only fail later, less clearly, when the `.dt` accessor is used.
missing_before = filtered_df['Timestamp'].isna().sum()
filtered_df['Timestamp'] = pd.to_datetime(
    filtered_df['Timestamp'],
    format='%Y-%m-%d %H:%M:%S %Z',
    errors='coerce',  # values that do not match the format become NaT instead of raising
)

# errors='coerce' is silent, so report how many values were turned into NaT by the parse;
# such rows get missing date features downstream.
unparsed_count = filtered_df['Timestamp'].isna().sum() - missing_before
if unparsed_count:
    print(f"Warning: {unparsed_count} Timestamp values could not be parsed and were set to NaT")


In [21]:
# Show the dtype of every column, followed by a blank line
print("Data types of each column:", filtered_df.dtypes, "", sep="\n")

Data types of each column:
Timestamp           datetime64[ns, UTC]
Visitor_Action                   object
Product                           int64
Product_Category                  int64
Category_Name                    object
Brand                            object
Price                           float64
Visitor_ID                        int64
dtype: object



In [22]:
# Derive calendar features from Timestamp: hour of day, weekday name, day of month and month number.

# Mapping from the numeric weekday (Monday=0, Sunday=6) to the actual day name
day_mapping = {
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}

# Build all features in a single assign() on a fresh frame. The weekday names are mapped here,
# before the column reordering, so nothing is assigned onto a column-subset of another frame
# (the pattern that can raise SettingWithCopyWarning).
timestamp_parts = filtered_df['Timestamp'].dt
filtered_df = filtered_df.assign(
    Hour_of_Day=timestamp_parts.hour,
    Day_of_Week=timestamp_parts.dayofweek.map(day_mapping),
    Day_of_Month=timestamp_parts.day,
    Month=timestamp_parts.month,
)

# Put the time features first, followed by the visitor and product attributes
filtered_df = filtered_df[['Hour_of_Day','Timestamp','Day_of_Week','Day_of_Month','Month','Visitor_ID','Visitor_Action','Product','Product_Category','Category_Name','Brand','Price']]
filtered_df

Unnamed: 0,Hour_of_Day,Timestamp,Day_of_Week,Day_of_Month,Month,Visitor_ID,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price
0,14,2019-11-22 14:30:13+00:00,Friday,22,11,550765008,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76
1,8,2019-11-16 08:47:08+00:00,Saturday,16,11,562112994,cart,1004833,2053013555631882655,electronics.smartphone,samsung,179.16
2,10,2019-11-23 10:10:16+00:00,Saturday,23,11,514759577,view,21402079,2053013561579406073,electronics.clocks,lorus,102.19
3,8,2019-11-29 08:34:48+00:00,Friday,29,11,563926853,view,1801704,2053013554415534427,electronics.video.tv,samsung,385.83
4,15,2019-11-15 15:02:09+00:00,Friday,15,11,517410481,view,22400054,2053013554474254687,electronics.audio.microphone,trust,15.16
...,...,...,...,...,...,...,...,...,...,...,...,...
12028190,8,2020-04-28 08:26:36+00:00,Tuesday,28,4,513039757,view,100112241,2053013551932506308,construction.tools.drill,escan,69.24
12028191,8,2020-04-26 08:38:32+00:00,Sunday,26,4,560848082,view,1004259,2232732093077520756,construction.tools.light,apple,823.44
12028192,15,2020-04-03 15:11:39+00:00,Friday,3,4,512519873,view,100086203,2232732093077520756,construction.tools.light,samsung,103.45
12028193,10,2020-04-21 10:02:05+00:00,Tuesday,21,4,539384386,view,1004839,2232732093077520756,construction.tools.light,oppo,160.08


In [23]:
# Remove the Timestamp column from the frame
filtered_df = filtered_df.drop(columns=['Timestamp'])
filtered_df

Unnamed: 0,Hour_of_Day,Day_of_Week,Day_of_Month,Month,Visitor_ID,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price
0,14,Friday,22,11,550765008,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76
1,8,Saturday,16,11,562112994,cart,1004833,2053013555631882655,electronics.smartphone,samsung,179.16
2,10,Saturday,23,11,514759577,view,21402079,2053013561579406073,electronics.clocks,lorus,102.19
3,8,Friday,29,11,563926853,view,1801704,2053013554415534427,electronics.video.tv,samsung,385.83
4,15,Friday,15,11,517410481,view,22400054,2053013554474254687,electronics.audio.microphone,trust,15.16
...,...,...,...,...,...,...,...,...,...,...,...
12028190,8,Tuesday,28,4,513039757,view,100112241,2053013551932506308,construction.tools.drill,escan,69.24
12028191,8,Sunday,26,4,560848082,view,1004259,2232732093077520756,construction.tools.light,apple,823.44
12028192,15,Friday,3,4,512519873,view,100086203,2232732093077520756,construction.tools.light,samsung,103.45
12028193,10,Tuesday,21,4,539384386,view,1004839,2232732093077520756,construction.tools.light,oppo,160.08


In [24]:
# Relabel Brand as "Others" for rows whose Category_Name contains "construction"
# (case-insensitive) and whose Brand is one of the brands listed below.
brands_to_relabel = ['apple', 'samsung', 'oppo', 'xiaomi', 'huawei']

# na=False: rows with a missing Category_Name count as "no match", so the mask is strictly boolean
is_construction = filtered_df['Category_Name'].str.contains('construction', case=False, na=False)
is_listed_brand = filtered_df['Brand'].isin(brands_to_relabel)

filtered_df.loc[is_construction & is_listed_brand, 'Brand'] = 'Others'
filtered_df

Unnamed: 0,Hour_of_Day,Day_of_Week,Day_of_Month,Month,Visitor_ID,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price
0,14,Friday,22,11,550765008,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76
1,8,Saturday,16,11,562112994,cart,1004833,2053013555631882655,electronics.smartphone,samsung,179.16
2,10,Saturday,23,11,514759577,view,21402079,2053013561579406073,electronics.clocks,lorus,102.19
3,8,Friday,29,11,563926853,view,1801704,2053013554415534427,electronics.video.tv,samsung,385.83
4,15,Friday,15,11,517410481,view,22400054,2053013554474254687,electronics.audio.microphone,trust,15.16
...,...,...,...,...,...,...,...,...,...,...,...
12028190,8,Tuesday,28,4,513039757,view,100112241,2053013551932506308,construction.tools.drill,escan,69.24
12028191,8,Sunday,26,4,560848082,view,1004259,2232732093077520756,construction.tools.light,Others,823.44
12028192,15,Friday,3,4,512519873,view,100086203,2232732093077520756,construction.tools.light,Others,103.45
12028193,10,Tuesday,21,4,539384386,view,1004839,2232732093077520756,construction.tools.light,Others,160.08


In [25]:
# Month number -> "Mon-YYYY" label for the six months covered by this mapping
month_numbers = [1, 2, 3, 4, 11, 12]
month_labels = ['Jan-2020', 'Feb-2020', 'Mar-2020', 'Apr-2020', 'Nov-2019', 'Dec-2019']
replacement_dict = dict(zip(month_numbers, month_labels))

# Swap the numeric 'Month' values for their labels; values not in the dict are left as they are
month_column = filtered_df['Month']
filtered_df['Month'] = month_column.replace(to_replace=replacement_dict)

In [26]:
# Display filtered_df to inspect its current contents
filtered_df

Unnamed: 0,Hour_of_Day,Day_of_Week,Day_of_Month,Month,Visitor_ID,Visitor_Action,Product,Product_Category,Category_Name,Brand,Price
0,14,Friday,22,Nov-2019,550765008,view,3700779,2053013565983425517,appliances.environment.vacuum,xiaomi,295.76
1,8,Saturday,16,Nov-2019,562112994,cart,1004833,2053013555631882655,electronics.smartphone,samsung,179.16
2,10,Saturday,23,Nov-2019,514759577,view,21402079,2053013561579406073,electronics.clocks,lorus,102.19
3,8,Friday,29,Nov-2019,563926853,view,1801704,2053013554415534427,electronics.video.tv,samsung,385.83
4,15,Friday,15,Nov-2019,517410481,view,22400054,2053013554474254687,electronics.audio.microphone,trust,15.16
...,...,...,...,...,...,...,...,...,...,...,...
12028190,8,Tuesday,28,Apr-2020,513039757,view,100112241,2053013551932506308,construction.tools.drill,escan,69.24
12028191,8,Sunday,26,Apr-2020,560848082,view,1004259,2232732093077520756,construction.tools.light,Others,823.44
12028192,15,Friday,3,Apr-2020,512519873,view,100086203,2232732093077520756,construction.tools.light,Others,103.45
12028193,10,Tuesday,21,Apr-2020,539384386,view,1004839,2232732093077520756,construction.tools.light,Others,160.08


In [27]:
# Month labels used in the 'Month' column, keyed to their calendar month number
month_mapping = dict(zip(
    ('Jan-2020', 'Feb-2020', 'Mar-2020', 'Apr-2020', 'Nov-2019', 'Dec-2019'),
    (1, 2, 3, 4, 11, 12),
))

# For each month label, draw up to 1,000,000 rows without replacement using a fixed seed;
# a month with fewer rows than that contributes all of its rows.
sampled_dfs = []
for month_label in month_mapping:
    rows_for_month = filtered_df.loc[filtered_df['Month'] == month_label]
    rows_to_take = min(1000000, len(rows_for_month))
    sampled_dfs.append(
        rows_for_month.sample(n=rows_to_take, replace=False, random_state=42)
    )

# Stack the per-month samples into one frame (original row labels are kept)
sampled_df = pd.concat(sampled_dfs)

# Show the combined sample
print(sampled_df)

         Hour_of_Day Day_of_Week  Day_of_Month     Month  Visitor_ID  \
5198059           18      Friday            17  Jan-2020   523723855   
4716385           14      Friday            31  Jan-2020   513631028   
4726983            8      Friday             3  Jan-2020   558618462   
5465811            6      Friday            31  Jan-2020   607840104   
4524172           17   Wednesday             1  Jan-2020   557545137   
...              ...         ...           ...       ...         ...   
3099925           14    Thursday            19  Dec-2019   568520034   
3049284           13      Monday             2  Dec-2019   514051064   
2670287           11    Saturday             7  Dec-2019   545244966   
2958151           17      Sunday             8  Dec-2019   542217176   
2356149           10   Wednesday            18  Dec-2019   535930047   

        Visitor_Action    Product     Product_Category  \
5198059           view    1305232  2053013554658804075   
4716385           c

In [29]:
# Write the sampled DataFrame to a new CSV file, leaving out the row index
output_csv = 'Resources/sample_tableau_6m.csv'
sampled_df.to_csv(output_csv, encoding='utf-8', index=False)