# Importing libraries

In [1]:
# import libraries
import pandas as pd
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation and dtype
# warnings; consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# Reading Files

In [6]:
# Load the three raw CSV inputs; paths are relative to the notebook's working directory

events = pd.read_csv(filepath_or_buffer='events.csv')
item_properties = pd.read_csv(filepath_or_buffer='item_properties_part1.1.csv')
category_tree = pd.read_csv(filepath_or_buffer='category_tree.csv')

In [7]:
# Show the first rows of each table under its own heading.
# Each entry is (heading, frame, extra print arguments); the first two tables
# are followed by a "\n" argument to leave a blank line before the next heading.
previews = (
    ("Events Data:", events, ("\n",)),
    ("Item Properties Data:", item_properties, ("\n",)),
    ("Category Tree:", category_tree, ()),
)
for heading, frame, trailing in previews:
    print(heading)
    print(frame.head(), *trailing)

Events Data:
       timestamp  visitorid event  itemid  transactionid
0  1433221332117     257597  view  355908            NaN
1  1433224214164     992329  view  248676            NaN
2  1433221999827     111016  view  318965            NaN
3  1433221955914     483717  view  253185            NaN
4  1433221337106     951259  view  367447            NaN 

Item Properties Data:
       timestamp  itemid    property                            value
0  1435460400000  460429  categoryid                             1338
1  1441508400000  206783         888          1116713 960601 n277.200
2  1439089200000  395014         400  n552.000 639502 n720.000 424566
3  1431226800000   59481         790                       n15360.000
4  1431831600000  156781         917                           828513 

Category Tree:
   categoryid  parentid
0        1016     213.0
1         809     169.0
2         570       9.0
3        1691     885.0
4         536    1691.0


In [8]:
# Convert timestamps from UNIX epoch milliseconds to datetime
events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
item_properties['timestamp'] = pd.to_datetime(item_properties['timestamp'], unit='ms')

# Check types and nulls.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line after each report.
events.info()
item_properties.info()
category_tree.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
 #   Column         Dtype         
---  ------         -----         
 0   timestamp      datetime64[ns]
 1   visitorid      int64         
 2   event          object        
 3   itemid         int64         
 4   transactionid  float64       
dtypes: datetime64[ns](1), float64(1), int64(2), object(1)
memory usage: 105.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999999 entries, 0 to 10999998
Data columns (total 4 columns):
 #   Column     Dtype         
---  ------     -----         
 0   timestamp  datetime64[ns]
 1   itemid     int64         
 2   property   object        
 3   value      object        
dtypes: datetime64[ns](1), int64(1), object(2)
memory usage: 335.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1669 entries, 0 to 1668
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      -------------- 

In [10]:
# Helper that reports how many values are missing in each column
def missing_info(df, name):
    """Print the null count and null percentage of every column in ``df``.

    Args:
        df: DataFrame to inspect.
        name: Label inserted into the printed heading.

    Returns:
        None. The summary table is written to stdout.
    """
    null_counts = df.isnull().sum()
    # Nulls per column as a percentage of the frame's row count.
    null_pct = null_counts.div(len(df)).mul(100)
    summary = pd.DataFrame({
        'Missing Count': null_counts,
        'Missing %': null_pct.round(2)
    })
    heading = f"\nMissing Values in {name}:\n"
    print(heading, summary)

# Report missing values for the events and item-properties tables
for label, frame in (("events", events), ("item_properties", item_properties)):
    missing_info(frame, label)



Missing Values in events:
                Missing Count  Missing %
timestamp                  0       0.00
visitorid                  0       0.00
event                      0       0.00
itemid                     0       0.00
transactionid        2733644      99.19

Missing Values in item_properties:
            Missing Count  Missing %
timestamp              0        0.0
itemid                 0        0.0
property               0        0.0
value                  0        0.0


#### The output above shows that the only column with missing values is `transactionid` in the events file, where 99.19% of rows are null. The usual next step would be to drop such a column, but we keep it because a transaction ID is only associated with purchase-related events (for example 'add to cart' or 'transaction'), so the nulls are expected.
#### Hence it will be useful in our analysis.

# Understanding User Behaviour Paths

In [11]:
# Order the event log by visitor, then by event time within each visitor
sort_keys = ['visitorid', 'timestamp']
events_sorted = events.sort_values(sort_keys)


In [13]:
# Take the visitor id on the first row of the sorted log and keep all of
# that visitor's events as a worked example.
first_visitor_id = events_sorted['visitorid'].iloc[0]
is_first_visitor = events_sorted['visitorid'] == first_visitor_id
sample_user = events_sorted[is_first_visitor]
print(sample_user)


                      timestamp  visitorid event  itemid  transactionid
1361687 2015-09-11 20:49:49.439          0  view  285930            NaN
1367212 2015-09-11 20:52:39.591          0  view  357564            NaN
1367342 2015-09-11 20:55:17.175          0  view   67045            NaN


The output shows the chronological sequence of item views for a single user (visitorid == 0).
This tells us:
The user viewed three different items within about five and a half minutes (20:49:49 to 20:55:17).

No add-to-cart or transaction occurred during this time.

It may represent a single session where the user browsed but didn't convert into a purchase.

To predict the properties of items added to cart, we need to:

Identify sequences where a user views items and then adds to cart

Use the viewed items to predict what they added

(Optional) Aggregate multiple such instances across users to build a model