In [None]:
# Install the extra packages with the interpreter that backs this kernel
# (sys.executable) instead of whichever `pip` happens to be first on PATH.
# NOTE(review): plotly and statsmodels are imported in the next cell but are
# not installed here — confirm they are already available in the environment.
import sys
!{sys.executable} -m pip install tqdm
!{sys.executable} -m pip install scikit-learn

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
from typing import Tuple, List
# new import statements
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [None]:
# Read the three Retailrocket CSV files from ../timeseries_lec_data (relative to
# the notebook's working directory) and bind them in one unpacking step.
events, item_properties, category_tree = (
    pd.read_csv(os.path.join('..', 'timeseries_lec_data', csv_name))
    for csv_name in ('events.csv', 'item_properties_part1.csv', 'category_tree.csv')
)

In [None]:
# Interpret the integer `timestamp` column as milliseconds since the epoch and
# derive two columns from it: a full datetime and the calendar date alone.
event_datetimes = pd.to_datetime(events['timestamp'], unit='ms')
events['event_datetime'] = event_datetimes
events['event_date'] = event_datetimes.dt.date
events

In [None]:
# Display the events frame (it now carries the event_datetime / event_date columns added above).
events

In [None]:
# Display the item properties table read from item_properties_part1.csv.
item_properties

In [None]:
# Display the category tree table read from category_tree.csv.
category_tree

#### Q4: [CORRECTION FROM LAST LECTURE] Calculate the monthly number of events and create a line chart.

In [None]:
# Count the events that fall in each calendar month: index by event_datetime,
# bucket on month start ('MS') and take the size of every bucket.
# ('ME', month end, would also work for monthly statistics.)
# Note: despite its name, `monthly_avg` holds monthly event COUNTS.
monthly_avg = (
    events
    .set_index('event_datetime')
    .resample('MS')
    .size()
    .reset_index(name='event_count')
)

# Year of each monthly bucket, used to colour the line.
monthly_avg['year'] = monthly_avg['event_datetime'].dt.year

axis_labels = {'event_datetime': 'Month', 'event_count': 'Total Events', 'year': 'Year'}
fig = px.line(
    monthly_avg,
    x='event_datetime',
    y='event_count',
    color='year',
    title="Monthly total event counts",
    labels=axis_labels,
)
fig.show()

### One-hot encoding

One-hot encoding is a method for converting categorical data (non-numeric data that can take on a limited number of values, e.g., "view", "addtocart", "transaction") into a numerical format that machine learning algorithms can understand.

#### `pandas get_dummies` function

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

#### Q4: One-hot encode the events data

Let's first determine the type of the `event` column.

In [None]:
def one_hot_encoder(df: pd.DataFrame, nan_as_category: bool = True) -> Tuple[pd.DataFrame, List[str]]:
    """
    Expand every ``category``/``object`` column of ``df`` into indicator columns.

    The input column names, the columns selected for encoding and the names of
    the newly created columns are printed, in that order.

    Parameters:
        df (pd.DataFrame): Frame to encode. The caller's frame is not modified;
            a new frame is returned.
        nan_as_category (bool): Forwarded to ``pd.get_dummies`` as ``dummy_na``,
            i.e. whether missing values get an indicator column of their own.

    Returns:
        Tuple[pd.DataFrame, List[str]]: The encoded frame and the names of the
        columns in it that were not present in the input.
    """
    columns_before = [*df.columns]
    print(columns_before)

    # Only these dtypes are treated as categorical; everything else passes through.
    to_encode = df.select_dtypes(["category", "object"]).columns.tolist()
    print(to_encode)

    encoded = pd.get_dummies(df, columns=to_encode, dummy_na=nan_as_category)

    # Anything that was not an input column must be an indicator column.
    added = []
    for name in encoded.columns:
        if name not in columns_before:
            added.append(name)
    print(added)

    return encoded, added

In [None]:
# Per-column dtypes of the events frame (including the `event` column).
# A DataFrame exposes `.dtypes`; `.dtype` exists only on a Series/Index and
# raises AttributeError on a DataFrame.
events.dtypes

In [None]:
# Preview the first rows of the events frame.
events.head()

In [None]:
# One-hot encode the events frame without its event_date column.
# NOTE(review): event_date is presumably dropped because `.dt.date` yields an
# object-dtype column that the encoder would otherwise expand — confirm.
events_without_date = events.drop(columns='event_date')
events_encoded, event_columns = one_hot_encoder(events_without_date)
events_encoded

## Statistical model for timeseries analysis

### 1. Exponential Moving Average (EMA)
EMA is a quantitative technique used as a forecasting model for time series analysis. It is a statistical method used to smooth time-series data by giving more weight to recent observations and less weight to older ones. It's useful in scenarios where recent data points are considered more relevant and informative. Typical use case scenarios include financial time-series analysis or e-commerce product popularity prediction.

### pandas ewm

- Documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html.

#### Q5: Use an Exponential Moving Average (EMA) model to calculate the average itemid view counts per category in the events dataset. Calculate the EMA for each category over a specified span.

In [None]:
# Display item_properties again before filtering it on the `property` column.
item_properties

#### Filter `item_properties` to get only rows where `property` is 'categoryid'.

In [None]:
# Keep only the rows whose `property` is 'categoryid'; for those rows the
# `value` column holds the category id, so it is renamed to `categoryid`.
# The frame is built with a single .loc selection and an explicit copy so that
# later in-place column assignments on `category_items` operate on an
# independent frame rather than on a chained-indexing slice of
# `item_properties` (which triggers SettingWithCopyWarning).
is_category_row = item_properties['property'] == 'categoryid'
category_items = (
    item_properties
    .loc[is_category_row, ['itemid', 'value']]
    .rename(columns={'value': 'categoryid'})
    .copy()
)
category_items

#### Type conversion to `int`.

In [None]:
# Cast the category ids to int; astype raises if a value cannot be converted.
# NOTE(review): the ids presumably arrive as strings because `value` is shared
# by every property type — confirm against the CSV.
category_items['categoryid'] = category_items['categoryid'].astype(int)

#### Compute the unique category ids (product families).

In [None]:
# Distinct category ids, in order of first appearance in category_items.
categories = category_items['categoryid'].unique()
categories

In [None]:
# Number of distinct category ids.
len(categories)

#### Create a dataframe to store the results

In [None]:
# Empty frame with the result schema: timestamp, categoryid, ema.
# NOTE(review): the EMA cell further down rebinds `results` to the output of
# pd.concat, so this placeholder is never filled — confirm it is still needed.
results = pd.DataFrame(columns=["timestamp", "categoryid", "ema"])
results

#### tqdm

Lets us display a progress meter for any iterable. Very helpful for keeping track of programs that run for a long time.
Documentation: https://tqdm.github.io/

#### Compute EMA for view event counts per timestamp

In [None]:
# Only 'view' events are used, so filter them once up front instead of
# re-scanning the full events frame for every category inside the loop.
view_events = events[events['event'] == 'view']

results_list = []

for category in tqdm(categories, total=len(categories)):
    # item ids associated with the current category
    category_items_for_category = category_items.loc[
        category_items['categoryid'] == category, 'itemid'
    ]

    # view events whose item belongs to the current category
    category_events = view_events[view_events['itemid'].isin(category_items_for_category)]
    if category_events.empty:
        # No views for this category: it would contribute zero rows, so skip it
        # rather than handing an empty frame to pd.concat.
        continue

    # number of views at each distinct raw timestamp (groupby sorts the timestamps)
    # NOTE(review): the timestamp is in milliseconds, so most groups presumably
    # contain a single event — consider counting per hour/day before smoothing.
    view_counts = category_events.groupby("timestamp").size()

    # Exponential Moving Average with span 16; adjust=False selects the
    # recursive formulation
    ema_values = view_counts.ewm(span=16, adjust=False).mean()

    # one row per timestamp, tagged with the category id
    category_results = pd.DataFrame({
        "timestamp": ema_values.index,
        "categoryid": category,
        "ema": ema_values.values
    })

    results_list.append(category_results)

# pd.concat raises on an empty list, so fall back to an empty frame that has
# the same columns when no category had any view events.
if results_list:
    results = pd.concat(results_list, ignore_index=True)
else:
    results = pd.DataFrame(columns=["timestamp", "categoryid", "ema"])

In [None]:
# Display the concatenated per-category EMA results.
results

### 2. ARIMA (Autoregressive Integrated Moving Average) model

### Autocorrelation Function (ACF) and Partial Autocorrelation Function (PACF)

Autocorrelation and partial autocorrelation are statistical measures that help analyze the relationship between a time series and its lagged values. They help identify patterns, trends, and dependencies in the data, which are crucial for model selection and evaluation.

### Autocorrelation

- Autocorrelation measures the linear relationship between a time series and its lagged values. In simpler terms, it assesses how much the current value of a series depends on its past values. 
- Used for identifying the order of a moving average (MA) process.
- Represents the overall correlation structure of the time series.
- Autocorrelation measures the linear relationship between an observation and its previous observations at different lags.
- Use case:
    - To identify repeated patterns (seasonality).
    - To assess whether a time series is random or has some dependency.
    - To decide if a time series has autoregressive (AR) components.

### Partial Autocorrelation
- Partial autocorrelation removes the influence of intermediate lags, providing a clearer picture of the direct relationship between a variable and its past values. Unlike autocorrelation, partial autocorrelation focuses on the direct correlation at each lag.
- Used for identifying the order of an autoregressive (AR) process.
- Highlights the direct relationships between observations at specific lags.
- Partial Autocorrelation measures the direct linear relationship between an observation and its previous observations at a specific lag, excluding the contributions from intermediate lags.
- Use case:
    - To determine the order of an autoregressive (AR) process.
    - To identify the number of lags that have a significant and direct impact on the current value.

In [None]:
# `events_with_categories` is not created by any earlier cell, so build it
# here: attach a category id to every event through its item id.
# The (itemid, categoryid) pairs are de-duplicated first so that a repeated
# mapping row cannot multiply events in the merge.
# NOTE(review): the inner join drops events whose item has no categoryid row,
# and an item mapped to several categories is counted in each of them —
# confirm this matches the intended definition.
item_categories = category_items[['itemid', 'categoryid']].drop_duplicates()
events_with_categories = events.merge(item_categories, on='itemid', how='inner')

# Number of 'view' events per (event_datetime, categoryid) pair.
grouped_data = (
    events_with_categories[events_with_categories['event'] == 'view']
    .groupby(['event_datetime', 'categoryid'])
    .size()
    .reset_index(name='view_count')
)
grouped_data

In [None]:
# Total number of views per category, largest total first.
category_totals = (
    grouped_data
    .groupby('categoryid', as_index=False)['view_count']
    .sum()
    .sort_values(by='view_count', ascending=False)
)

In [None]:
# Category ids of the first 20 rows of category_totals, as a NumPy array.
top_categories = category_totals['categoryid'].iloc[:20].values

In [None]:
# Draw the ACF and PACF of the daily view counts for each top category.
for category in top_categories:
    fig = None
    try:
        category_data = grouped_data[grouped_data['categoryid'] == category]

        # Resample to daily view counts. Only view_count is selected so the
        # categoryid column is not summed along with it; sum() already yields
        # 0 for days without any events, so no fillna is needed.
        category_daily = (
            category_data.set_index('event_datetime')[['view_count']]
            .resample('D')
            .sum()
        )

        fig, ax = plt.subplots(1, 2, figsize=(15, 5))

        # Plot ACF and PACF side by side
        plot_acf(category_daily['view_count'], lags=20, ax=ax[0], title=f"AUTOCORRELATION\nCategory: {category}")
        plot_pacf(category_daily['view_count'], lags=20, ax=ax[1], title=f"PARTIAL AUTOCORRELATION\nCategory: {category}")

        plt.show()
    except Exception as e:
        # Best effort: report the failing category and continue with the rest.
        print(f"Error processing category {category}: {e}")
    finally:
        # Release this iteration's figure so figures do not accumulate
        # (this also discards the figure created before a failure).
        if fig is not None:
            plt.close(fig)

plt.close('all')