In [None]:
import sys
!{sys.executable} -m pip install tqdm

In [None]:
import os
from typing import List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm


### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [None]:
# Read the three Retailrocket CSV files from the shared lecture data folder,
# which sits one directory above this notebook.
events = pd.read_csv(
    os.path.join('..', 'timeseries_lec_data', 'events.csv')
)
item_properties = pd.read_csv(
    os.path.join('..', 'timeseries_lec_data', 'item_properties_part1.csv')
)
category_tree = pd.read_csv(
    os.path.join('..', 'timeseries_lec_data', 'category_tree.csv')
)

In [None]:
# The raw 'timestamp' column is converted with unit='ms' into a proper
# datetime column, from which a date-only column is derived.
events = events.assign(
    event_datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'),
)
events = events.assign(
    event_date=lambda frame: frame['event_datetime'].dt.date,
)
events

In [None]:
item_properties

In [None]:
category_tree

#### Q1: What is the Spearman correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Filtering: view / addtocart events
# Keep only the two event types being compared.
view_events = events[events['event'] == 'view']
addtocart_events = events[events['event'] == 'addtocart']

# Number of events of each type per item (indexed by itemid).
view_counts = view_events['itemid'].value_counts()
addtocart_counts = addtocart_events['itemid'].value_counts()

# Turn the counts into two-column DataFrames: itemid + <event>_count.
# rename_axis(...) + reset_index(name=...) names both columns explicitly.
# Renaming a column called 'count' only works on pandas >= 2.0; on older
# versions value_counts() labels the columns 'index'/'itemid', the rename is
# a silent no-op, and the merge below would join on the counts instead.
view_df = view_counts.rename_axis('itemid').reset_index(name='view_count')
addtocart_df = addtocart_counts.rename_axis('itemid').reset_index(name='addtocart_count')

# One row per item that has at least one view AND at least one addtocart
# event; the inner join drops items that appear in only one of the tables.
merged_df = pd.merge(view_df, addtocart_df, on='itemid', how='inner')

In [None]:
# Spearman correlation between per-item view counts and add-to-cart counts.
spearman_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='spearman',
)
spearman_corr

#### Q2: Create a scatter plot with an ordinary least squares (OLS) trend line to show the correlation between the number of "view" events and the number of "addtocart" events per item.

#### Q3: Plot a line chart of the number of events (view, transaction, etc.) over time, color-coded by the event type.

In [None]:
# Count events per calendar day and event type: one row per (date, event) pair.
events_grouped = (
    events
    .groupby(['event_date', 'event'])
    .size()
    .reset_index(name='event_count')
)

# One line per event type, daily counts on the y-axis.
fig = px.line(events_grouped,
              x='event_date',
              y='event_count',
              color='event',
              title="Event counts over time (by event type)",
              labels={'event_count': 'Number of events', 'event_date': 'Date'})
# Save a standalone interactive copy next to the notebook.
fig.write_html('events_vs_date.html')
fig.show()

#### pandas `resample` method

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html

#### Q4: Calculate the monthly number of events and create a line chart.

In [None]:
events

In [None]:
# Set the event datetime as the index, resample to Month-Start ('MS') bins
# and count the number of events that fall into each bin.
# 'ME' (Month-End) would also work for monthly statistics.
events_monthly = (
    events
    .set_index('event_datetime')
    .resample('MS')
    .size()
    .reset_index(name='event_count')
)
# Year of each monthly bin, used to colour the lines.
events_monthly['year'] = events_monthly['event_datetime'].dt.year

fig = px.line(
    events_monthly,
    x='event_datetime',
    y='event_count',
    color='year',
    title="Monthly total event counts",
    # The y-axis shows totals per month, so label it as totals rather than averages.
    labels={'event_datetime': 'Month', 'event_count': 'Total Events', 'year': 'Year'}
)
fig.show()

#### Q5: Calculate the total number of events per day of the week for each year and visualize the trend with a line chart.

In [None]:
# Work on a copy so the helper columns do not leak into `events`.
events_v2 = events.copy()

events_v2['year'] = events_v2['event_datetime'].dt.year
# .dt.dayofweek is 0-based (Monday = 0);
# add 1 to make Monday = 1, Sunday = 7
events_v2['dayofweek'] = events_v2['event_datetime'].dt.dayofweek + 1

# Total number of events for each (year, day-of-week) pair.
events_per_dayofweek = (
    events_v2
    .groupby(['year', 'dayofweek'])
    .size()
    .reset_index(name='event_count')
)

fig = px.line(
    events_per_dayofweek,
    x='dayofweek',
    y='event_count',
    color='year',
    title="Total events per day of the week",
    labels={'dayofweek': 'Day of Week (1=Monday, 7=Sunday)', 'event_count': 'Total Events'}
)
fig.show()

In [None]:
events

#### Q6: Aggregate the number of events per day and plot the trend for the total number of events over time.

In [None]:


# Resample on the event datetime into daily ('D') bins and count the events
# in each bin; days without any event get a count of 0.
events_per_day = (
    events
    .set_index('event_datetime')
    .resample('D')
    .size()
    .reset_index(name='event_count')
)

fig = px.line(
    events_per_day, 
    x='event_datetime', 
    y='event_count', 
    title="Total events per day", 
    labels={'event_datetime': 'Date', 'event_count': 'Total Events'}
)
fig.show()

### One-hot encoding

One-hot encoding is a method for converting categorical data (non-numeric data that can take on a limited number of values, e.g., "view", "addtocart", "transaction") into a numerical format that machine learning algorithms can understand.

#### `pandas get_dummies` function

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

#### Q7: One-hot encode the events data

Let's first determine the data type of the `event` column.

In [None]:
# Show the dtype of the `event` column.
print(events['event'].dtype)

Explore all the `events` DataFrame column datatypes.

In [None]:
events.head()

Let's drop `event_date` column.

In [None]:
def one_hot_encoder(df: pd.DataFrame, nan_as_category: bool = True) -> Tuple[pd.DataFrame, List[str]]:
    """
    One-hot encode categorical columns in the given DataFrame.

    Columns with an object, string or categorical dtype are replaced by one
    indicator column per distinct value (via ``pd.get_dummies``); all other
    columns are passed through unchanged. The input DataFrame is not modified.

    Parameters:
        df (pd.DataFrame): The input DataFrame.
        nan_as_category (bool): Whether to include NaN as a separate category.

    Returns:
        Tuple[pd.DataFrame, List[str]]: The transformed DataFrame and the list of new column names.
    """
    original_columns = list(df.columns)

    # Treat object/string and categorical columns as categorical data.
    categorical_columns = [
        column
        for column, dtype in df.dtypes.items()
        if pd.api.types.is_string_dtype(dtype) or isinstance(dtype, pd.CategoricalDtype)
    ]

    # Nothing to encode: hand back a copy so callers never alias the input.
    if not categorical_columns:
        return df.copy(), []

    encoded = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)

    # Every column that was not in the input was created by the encoding.
    new_columns = [column for column in encoded.columns if column not in original_columns]
    return encoded, new_columns

In [None]:
events

## Statistical model for timeseries analysis

### 1. Exponential Moving Average (EMA)
EMA is a quantitative technique used as a forecasting model for time series analysis. It is a statistical method used to smooth time-series data by giving more weight to recent observations and less weight to older ones. It's useful in scenarios where recent data points are considered more relevant and informative. Typical use case scenarios include financial time-series analysis or e-commerce product popularity prediction.

#### Q8: Use an Exponential Moving Average (EMA) model to calculate the average itemid view counts per category in the events dataset. Calculate the EMA for each category over a specified span.

In [None]:
item_properties

#### Filter `item_properties` to get only rows where `property` is 'categoryid'.

In [None]:
# Keep only the rows of item_properties that describe an item's category.
# 'value' column contains the category IDs, so let's rename value to categoryid
category_items = (
    item_properties[item_properties['property'] == 'categoryid']
    .rename(columns={'value': 'categoryid'})
)
category_items

#### Type conversion to `int`.

In [None]:
# Give both frames an integer 'categoryid' so they can be merged on it.
category_items['categoryid'] = category_items['categoryid'].astype(int)
category_tree['categoryid'] = category_tree['categoryid'].astype(int)

#### Merge `category_items` with `category_tree` to get `parentid` (product family) for each category.

In [None]:
# Attach the category tree's columns (including parentid) to every item/category row.
# A left join keeps rows whose categoryid is absent from the category tree.
category_items = category_items.merge(category_tree, on='categoryid', how='left')
category_items

#### Compute the unique category ids (product families).

In [None]:
# Unique product families. NOTE(review): assumes the product family is the
# 'parentid' column, as described in the merge step - confirm.
# Rows without a parent (NaN) are dropped.
categories = category_items['parentid'].dropna().unique()
categories

#### Create a dataframe to store the results

In [None]:
# Empty frame with one row per (category, timestamp) and its EMA-smoothed view count.
results = pd.DataFrame(columns=['category', 'timestamp', 'ema_view_count'])
results

#### Compute EMA for view event counts

In [None]:
results_list = []

# The 'view' filter does not depend on the category, so apply it once up front.
all_view_events = events[events['event'] == 'view']

for category in tqdm(categories, total=len(categories)):
    # get the itemids associated with the current category
    # NOTE(review): assumes `categories` holds 'parentid' values - confirm.
    category_items_for_category = category_items.loc[
        category_items['parentid'] == category, 'itemid'
    ].unique()

    # filter the events dataset for the current category's items and 'view' events
    category_events = all_view_events[
        all_view_events['itemid'].isin(category_items_for_category)
    ]

    # group by timestamp to get the view counts per timestamp
    view_counts = category_events.groupby('timestamp').size()

    # apply the Exponential Moving Average (EMA) with a span of 16 to smooth the view counts
    ema_values = view_counts.ewm(span=16, adjust=False).mean()

    # store the results with the EMA values for the category
    category_results = pd.DataFrame({
        'category': category,
        'timestamp': ema_values.index,
        'ema_view_count': ema_values.values,
    })

    # append the current category's results to the results list
    results_list.append(category_results)

# concatenate all the results into a single DataFrame
# (pd.concat raises on an empty list, so fall back to an empty frame)
if results_list:
    results = pd.concat(results_list, ignore_index=True)
else:
    results = pd.DataFrame(columns=['category', 'timestamp', 'ema_view_count'])

In [None]:
results