In [None]:
!pip install plotly --upgrade
!pip install statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [None]:
# Read the three CSV files from the local `data` directory, one DataFrame each.
events, item_properties, category_tree = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in (
        'events.csv',
        'item_properties_part1.csv',
        'category_tree.csv',
    )
)

In [None]:
# Preview the events table as loaded from events.csv.
events

In [None]:
# Preview the item properties table as loaded from item_properties_part1.csv.
item_properties

In [None]:
# Preview the category tree table as loaded from category_tree.csv.
category_tree

## Exploratory Data Analysis (EDA)

EDA is about understanding the data and forming hypotheses about it. 

- Visualizing Data: Histograms, scatter plots, box plots, etc., to understand distributions and relationships.
- Summary Statistics: Calculating mean, median, mode, standard deviation, and correlation to gain insights into the dataset.
- Detecting Outliers: Identifying values that deviate significantly from the rest of the data.
- Assessing Data Types and Structure: Checking data types, unique values, and identifying missing values.

#### Q1: How many events happen on average per day?

In [None]:
# Derive two helper columns from the raw `timestamp` column:
#   event_datetime - the timestamp parsed as milliseconds since the Unix epoch
#   event_date     - the calendar date part of event_datetime
events = events.assign(
    event_datetime=lambda frame: pd.to_datetime(frame['timestamp'], unit='ms'),
    event_date=lambda frame: frame['event_datetime'].dt.date,
)
events

In [None]:
# Count the events on each calendar date, then average those daily counts.
# Only dates that occur in the data appear in the grouped result, so the mean
# is taken over days that have at least one event.
events_per_day = events.groupby('event_date').size()
avg_events_per_day = events_per_day.mean()

# Q1 asks for the average, so report it explicitly rather than only computing it.
print(f"Average number of events per day: {avg_events_per_day:,.2f}")

# Line chart of the daily counts for context.
events_per_day.plot(kind='line', title='Events per Day', color='black')
plt.xlabel('Date')
plt.ylabel('Number of Events')
plt.xticks(rotation=45)
plt.show()

# TBD: change this to plotly?

#### Q2: Calculate the count of each unique property corresponding to "addtocart" items.

In [None]:
# Re-inspect events (now carrying the derived event_datetime / event_date columns).
events

In [None]:
# Re-inspect item_properties, which is joined onto the add-to-cart events below.
item_properties

In [None]:
# Keep only the rows whose event type is "addtocart", then display them.
addtocart_events = events.loc[events['event'].eq('addtocart')]
addtocart_events

In [None]:
# Select the add-to-cart rows and left-join the item properties onto them by
# item id, so every add-to-cart row is kept even when no property row matches.
addtocart_events = events.loc[events['event'].eq('addtocart')]
merged_data = addtocart_events.merge(item_properties, how='left', on='itemid')
merged_data

In [None]:
# Tally how many merged rows carry each distinct value of the `property` column.
property_counts = merged_data.loc[:, 'property'].value_counts()
print(property_counts)

### Correlation Methods

In EDA, various correlation methods are used to understand the relationship between numerical variables. 

#### What is correlation? 

Two variables are said to be correlated if a change in one variable is accompanied by a corresponding change in the other.

#### Pearson correlation (Linear relationship)

The Pearson correlation coefficient measures the strength of the linear relationship between two variables, i.e. how closely they are associated with one another (linear correlation).

#### Spearman correlation (Monotonic relationship)

Spearman correlation enables us to assess the monotonic relationship between two ranked variables. That is, how well the relationship between two variables could be represented using a monotonic function.

- The Spearman Rank Correlation can take a value from +1 to -1 where:
    - +1 means a perfect positive association
    - 0 means that there is no association
    - -1 means a perfect negative association
- Further description of the correlation
    - .00-.19 "very weak"
    - .20-.39 "weak"
    - .40-.59 "moderate"
    - .60-.79 "strong"
    - .80-1.0 "very strong"

#### Kendall's Tau correlation
Kendall's tau is a measure of the correspondence between two rankings.

#### Which correlation type should you choose?

- Use Pearson for linear relationships when data assumptions are met.
- Use Spearman for ordinal data or when data isn't linear.
- Use Kendall's Tau for smaller datasets and many rank ties

#### Readings:

1. https://www.simplilearn.com/tutorials/statistics-tutorial/spearmans-rank-correlation#:~:text=Spearman's%20rank%20correlation%20measures%20the,represented%20using%20a%20monotonic%20function. 
2. https://www.simplilearn.com/tutorials/statistics-tutorial/pearson-correlation-coefficient-in-statistics#pearsons_correlation_coefficient
3. https://datatab.net/tutorial/pearson-correlation
4. https://datatab.net/tutorial/spearman-correlation
5. https://datatab.net/tutorial/dispersion-parameter

#### Q3: What is the Spearman correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Split out the rows for the two event types being compared.
view_events = events.loc[events['event'].eq('view')]
addtocart_events = events.loc[events['event'].eq('addtocart')]

In [None]:
# Number of events per item id, computed separately for each event type.
view_counts = view_events.loc[:, 'itemid'].value_counts()
addtocart_counts = addtocart_events.loc[:, 'itemid'].value_counts()

In [None]:
# Turn each count Series into a two-column frame: `itemid` plus a named count.
# The index and value column are named explicitly instead of renaming a column
# assumed to be called 'count'. Before pandas 2.0, value_counts().reset_index()
# produced the columns ['index', 'itemid'], so that rename was a silent no-op
# and a merge on 'itemid' would have joined on the counts, not the item ids.
view_df = view_counts.rename_axis('itemid').reset_index(name='view_count')
addtocart_df = addtocart_counts.rename_axis('itemid').reset_index(name='addtocart_count')

In [None]:
# Inner join on item id: only items present in both count tables are kept.
merged_df = view_df.merge(addtocart_df, how='inner', on='itemid')

In [None]:
# Spearman (rank-based) correlation between per-item view and add-to-cart counts.
spearman_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='spearman',
)
spearman_corr

#### Q4: What is the Pearson correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Pearson (linear) correlation between per-item view and add-to-cart counts.
pearson_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='pearson',
)
pearson_corr

#### Q5: What is the Kendall's Tau correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Kendall's tau correlation between per-item view and add-to-cart counts.
kendall_tau_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='kendall',
)
kendall_tau_corr

#### Q6: Create a scatter plot with ordinary least squares' trend line to show correlation between the number of "view" events and the number of "addtocart" events per item. 

In [None]:
# Scatter of per-item view counts against add-to-cart counts, with an
# ordinary-least-squares trend line drawn in red.
fig = px.scatter(
    merged_df,
    x="view_count",
    y="addtocart_count",
    trendline="ols",
    trendline_color_override="red",
)

# Write a standalone interactive HTML copy, then render the figure inline.
fig.write_html('scatter_plot_with_ols_trendline.html')
fig.show()

In [None]:
# Re-inspect events before building the time-series charts below.
events

#### Q7: Plot a line chart of the number of events (view, transaction, etc.) over time, color-coded by the event type.

In [None]:
# Count the events for every (calendar date, event type) pair.
events_grouped = (
    events
    .groupby(['event_date', 'event'])
    .size()
    .reset_index(name='event_count')
)

# One line per event type; save an interactive HTML copy, then render inline.
fig = px.line(
    events_grouped,
    x='event_date',
    y='event_count',
    color='event',
    title="Event counts over time (by event type)",
    labels={'event_count': 'Number of events', 'event_date': 'Date'},
)
fig.write_html('events_vs_date.html')
fig.show()

#### pandas `resample` method

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html

#### Q8: Calculate the total number of events per month and create a line chart.

In [None]:
# Index the events by their parsed datetime, bucket them by calendar month
# (labelled with the month start) and count the rows in each bucket.
# NOTE: despite its name, `monthly_avg` holds monthly *totals*; the name is
# kept so that any other cell referring to it keeps working.
monthly_avg = (
    events
    .set_index('event_datetime')
    .resample('MS')
    .size()
    .reset_index(name='event_count')
)

# Year of each month bucket, used to colour the line by year.
monthly_avg['year'] = monthly_avg['event_datetime'].dt.year

fig = px.line(
    monthly_avg,
    x='event_datetime',
    y='event_count',
    color='year',
    title="Monthly total event counts",
    # The plotted values are totals, so the axis label says so (it used to read
    # "Average Events", contradicting both the title and the data).
    labels={'event_datetime': 'Month', 'event_count': 'Total Events', 'year': 'Year'}
)
fig.show()

#### Q9: Calculate the total number of events per day of the week for each year and visualize the trend with a line chart.

In [None]:
# Tag every event with its year and ISO-style weekday number (pandas numbers
# Monday as 0, so 1 is added to get Monday = 1 ... Sunday = 7), then count the
# events for each (year, weekday) pair. `events` itself is left untouched.
events_v2 = (
    events
    .assign(
        year=events['event_datetime'].dt.year,
        dayofweek=events['event_datetime'].dt.dayofweek + 1,
    )
    .groupby(['year', 'dayofweek'])
    .size()
    .reset_index(name='event_count')
)

# One line per year across the seven weekdays.
fig = px.line(
    events_v2,
    x="dayofweek",
    y="event_count",
    color="year",
    title="Total events per day of the week",
    labels={'dayofweek': 'Day of Week (1=Monday, 7=Sunday)', 'event_count': 'Total Events'},
)
fig.show()

In [None]:
# Re-inspect events before the daily aggregation below.
events

#### Q10: Aggregate the number of events per day and plot the trend for the total number of events over time.

In [None]:
# Resample the datetime-indexed events into daily buckets and count the rows
# in each one, producing a frame with `event_datetime` and `event_count`.
events_per_day = (
    events
    .set_index('event_datetime')
    .resample('D')
    .size()
    .reset_index(name='event_count')
)

# Single line showing the total number of events on each day.
fig = px.line(
    events_per_day,
    x='event_datetime',
    y='event_count',
    title="Total events per day",
    labels={'event_datetime': 'Date', 'event_count': 'Total Events'},
)
fig.show()