In [None]:
import sys
# Run pip through sys.executable so the packages are installed for the same
# interpreter that is executing this notebook.
!{sys.executable} -m pip install plotly
# NOTE(review): statsmodels is never imported directly in this notebook;
# presumably it backs plotly's OLS trendline used in Q5 — confirm.
!{sys.executable} -m pip install statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [None]:
# Read the three dataset CSV files from the sibling data directory.
data_dir = os.path.join('..', 'timeseries_lec_data')

events = pd.read_csv(os.path.join(data_dir, 'events.csv'))
item_properties = pd.read_csv(
    os.path.join(data_dir, 'item_properties_part1.csv'))
category_tree = pd.read_csv(os.path.join(data_dir, 'category_tree.csv'))

In [None]:
# Show the freshly loaded events frame as the cell output.
events

In [None]:
# Show the freshly loaded item_properties frame as the cell output.
item_properties

In [None]:
# Show the freshly loaded category_tree frame as the cell output.
category_tree

In [None]:
# Convert the raw `timestamp` column (read as milliseconds, unit='ms') into
# a datetime column, then keep the calendar date in a column of its own.
event_datetime = pd.to_datetime(events['timestamp'], unit='ms')
events['event_datetime'] = event_datetime
events['event_date'] = event_datetime.dt.date
events

In [None]:
# Count the rows for every (date, item, event type) combination and expose
# the count as a regular `event_count` column.
event_sizes = events.groupby(['event_date', 'itemid', 'event']).size()
grouped_events = event_sizes.reset_index(name='event_count')
grouped_events

## Data imputation

Data imputation is the process of replacing missing values in a dataset with substituted values.

### Other options for Data Imputation

- Next or previous value
- Maximum or Minimum Value
- Statistical methods: mean, median, mode
- Missing Value Prediction: using a machine learning model to determine the final imputation value

In [None]:
# Fixed seed so the toy series is identical on every run.
np.random.seed(639)

# 15 consecutive days of normally distributed sales (mean 200, std 20).
date_range = pd.date_range(start='2024-01-01', periods=15, freq='D')
n_days = len(date_range)
sales_data = np.random.normal(loc=200, scale=20, size=n_days)
# Blank out positions 0, 5 and 10 to simulate missing observations.
sales_data[np.arange(0, n_days, 5)] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})
df

Replace with median.

In [None]:
# Rebuild the frame from the raw arrays so this cell always starts from the
# un-imputed data, regardless of what earlier cells did to `df`.
df = pd.DataFrame({
    'date': date_range,
    'sales': sales_data
})
print(df)

# Median imputation: every missing `sales` value is replaced by the median
# of the observed values (Series.median skips NaN by default).
df = df.fillna({'sales': df['sales'].median()})
df

Replace with mode.

In [None]:
# Fixed seed so the toy series is identical on every run.
np.random.seed(639)

# Discrete sales levels, so that a most-frequent value (mode) is meaningful.
date_range = pd.date_range(start='2024-01-01', periods=15, freq='D')
values = [100, 150, 200, 250]
sales_data = np.random.choice(values, size=15).tolist()

# Blank out every 5th entry (positions 0, 5, 10) to simulate missing data.
for i in range(0, len(sales_data), 5):
    sales_data[i] = np.nan

df = pd.DataFrame({
    'date': date_range,
    'sales': sales_data
})
print(df)

# Mode imputation: fill the gaps with the most frequent observed value.
# Series.mode() ignores NaN and returns the modes sorted ascending, so
# `.iloc[0]` picks the smallest one when several values tie.
df = df.fillna({'sales': df['sales'].mode().iloc[0]})
df

### Data Interpolation

Interpolation is a technique that can be useful for handling missing values, particularly when the missing data is assumed to follow a pattern or trend based on the existing values in the dataset. This is often the case with time series or ordered data, where the missing values are assumed to lie between known values. Interpolation fills in these gaps by estimating the missing data points using existing values.

When **NOT** to Use Interpolation:
- Large gaps: If the data has large gaps between observations, interpolation might not provide meaningful or reliable estimates.
- Randomness in missing values: If the missing values are random or don't follow any pattern (Missing Completely at Random - MCAR), interpolation may not be appropriate, as it assumes a relationship between values.
- Categorical or non-numeric data: Interpolation is typically used for continuous numerical data. For categorical or binary data, interpolation is not suitable.

In [None]:
# Fixed seed so the toy series is identical on every run.
np.random.seed(639)

# 60 days of noisy sales with a gap at every 5th position (0, 5, 10, ...).
date_range = pd.date_range(start='2024-01-01', periods=60, freq='D')
n_days = len(date_range)
sales_data = np.random.normal(loc=200, scale=20, size=n_days)
sales_data[np.arange(0, n_days, 5)] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df.head())

# Plot the raw series; the line breaks wherever a value is missing.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['date'], df['sales'], marker='o', linestyle='-', color='black')
plt.xticks(rotation=45)
ax.set_ylabel('Sales')
ax.grid(True)
plt.show()

### Linear interpolation

- Assumption: the missing data points lie along a straight line between the known data points.
- Linear interpolation is commonly used for time series where changes are expected to be linear between data points.
- When to use: when the relationship between consecutive values is roughly linear or changes gradually.

In [None]:
# linear interpolation
# Each gap is filled with the value on the straight line joining its nearest
# known neighbours. The result goes into a new column so the raw `sales`
# series stays untouched and the cell can be re-run safely. The very first
# observation is missing and has no earlier neighbour, so it stays NaN.
df['sales_linear'] = df['sales'].interpolate(method='linear')

plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['sales_linear'], marker='o', linestyle='-',
         label='Interpolated Sales', color='orange')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

### Polynomial interpolation

- Polynomial interpolation fits a polynomial curve through the known data points and uses it to estimate the missing values.
- When to use: when the data shows a nonlinear relationship between points (for example, seasonal effects or periodic patterns).

In [None]:
# polynomial interpolation
# Order-2 polynomial interpolation (pandas delegates this to SciPy, which
# must be installed). The result goes into a new column so the raw `sales`
# series stays untouched and the cell can be re-run safely.
df['sales_polynomial'] = df['sales'].interpolate(method='polynomial', order=2)

plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['sales_polynomial'], marker='o', linestyle='-',
         label='Interpolated Sales', color='orange')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

#### Why did both methods produce the same result?

- Polynomial interpolation of order 2 fits a quadratic function (a parabola) between the two surrounding data points.
- A quadratic function can curve, but for the simple case where the data points are relatively close to each other and do not exhibit any highly nonlinear or curving behavior, the quadratic curve might end up being very similar to the straight line in terms of interpolation.
- A missing value usually sits between just two known neighbours (as is typical with simple time series data). Two points alone do not determine a parabola (three are needed), so the quadratic fit also draws on nearby known points; when those points lie close to a straight line locally, the fitted curve does not "bend" across the gap in a noticeable way, and the estimate ends up very close to the linear one.

In [None]:
# Non-linear timeseries - sine curve
np.random.seed(42)
date_range = pd.date_range(start='2024-01-01', periods=60, freq='D')
n_days = len(date_range)
phase = np.linspace(0, 3 * np.pi, n_days)
sales_data = 100 + 50 * np.sin(phase)

# Knock out 18 distinct, randomly chosen positions.
missing_indices = np.random.choice(range(n_days), size=18, replace=False)
sales_data[missing_indices] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})

# Plot the series with its gaps.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df['date'], df['sales'], marker='o', linestyle='-',
        label='Original Sales with Missing Values', color='k')
plt.xticks(rotation=45)
ax.set_ylabel('Sales')
ax.grid(True)

In [None]:
# Interpolate using linear method
df['sales_linear'] = df['sales'].interpolate(method='linear')

# Interpolate using polynomial method (order 2)
# pandas delegates this method to SciPy, which must be installed.
df['sales_polynomial'] = df['sales'].interpolate(method='polynomial', order=2)

# Overlay both reconstructions so the difference on curved data is visible.
plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['sales_linear'], marker='o', linestyle='-',
         label='Interpolated Sales (Linear)', color='orange')
plt.plot(df['date'], df['sales_polynomial'], marker='x', linestyle='-',
         label='Interpolated Sales (Polynomial)', color='green')
plt.legend()
plt.title('Linear vs Polynomial Interpolation on Nonlinear Data')
plt.grid(True)
plt.show()

### Spline interpolation

- Spline interpolation fits a smooth curve (a piecewise polynomial, typically cubic) through the known data points and estimates the missing values.
- When to use: when the data exhibits a smooth, nonlinear trend (often used for time series with cycles or seasonal patterns).

### Spline vs Polynomial

- When to use spline interpolation: when you need smooth, piecewise fits, especially when the data is non-linear or has noise. It's ideal for smooth, continuous data that needs to be modeled accurately across a range of values.
- When to use polynomial interpolation: when you have a simple, small dataset, and you want a single polynomial that fits all points exactly. Avoid polynomial interpolation with large or noisy datasets because it can cause overfitting and oscillations.

In [None]:
# Fixed seed so the toy series is identical on every run.
np.random.seed(639)
date_range = pd.date_range(start='2024-01-01', periods=60, freq='D')
sales_data = 100 * np.sin(np.linspace(0, 3 * np.pi, len(date_range)))

# Introduce missing values randomly
# Candidates are positions 5, 10, ..., 55; ten of those eleven are removed.
missing_indices = np.random.choice(range(5, len(sales_data), 5),
                                   size=10, replace=False)
sales_data[missing_indices] = np.nan

df = pd.DataFrame({
    'date': date_range,
    'sales': sales_data
})

plt.figure(figsize=(10, 6))
plt.plot(df['date'], df['sales'], marker='o', linestyle='-',
         label='Original Sales with Missing Values', color='blue')
plt.xticks(rotation=45)
plt.ylabel('Sales')
plt.grid(True)

# Interpolate using cubic spline method
# order=3 selects a cubic spline; pandas delegates the fit to SciPy, which
# must be installed. The result goes into a new column so the raw `sales`
# series stays available for comparison in the same figure.
df['sales_spline'] = df['sales'].interpolate(method='spline', order=3)

plt.plot(df['date'], df['sales_spline'], marker='x', linestyle='-',
         label='Interpolated Sales (Spline)', color='green')
plt.legend()
plt.title('Spline Interpolation on Sinusoidal Data')
plt.show()

### Retailrocket recommender system dataset analysis

#### Q1: Calculate the count of each unique property corresponding to "addtocart" items.

In [None]:
# Show events again as a reference for Q1.
events

In [None]:
# Show item_properties again as a reference for Q1.
item_properties

In [None]:
# Keep only the rows whose event type is "addtocart".
addtocart_events = events[events['event'] == 'addtocart']
addtocart_events

In [None]:
addtocart_events = events[events['event'] == 'addtocart']
# left join
# Every add-to-cart event is kept; the properties recorded for its item are
# attached, giving one output row per (event, property) pair.
# Assumes item_properties has an 'itemid' column — confirm against the CSV.
# Explicit suffixes keep any columns present in both frames distinguishable.
merged_data = addtocart_events.merge(
    item_properties,
    on='itemid',
    how='left',
    suffixes=('_event', '_property'),
)
merged_data

In [None]:
# Number of merged rows per distinct property, most frequent first.
# Assumes the merged frame has a 'property' column contributed by
# item_properties — confirm against the CSV.
property_counts = merged_data['property'].value_counts()
print(property_counts)

### Correlation Methods

In EDA, various correlation methods are used to understand the relationship between numerical variables. 

#### What is correlation? 

Variables are said to be correlated if changes in one variable result in corresponding changes in another variable.

#### Pearson correlation (Linear relationship)

The Pearson correlation coefficient measures the strength of the linear relationship between two variables and their association with one another (linear correlation).

#### Spearman correlation (Monotonic relationship)

Spearman correlation enables us to assess the monotonic relationship between two ranked variables. That is, how well the relationship between two variables could be represented using a monotonic function.

- The Spearman Rank Correlation can take a value from +1 to -1 where:
    - +1 means a perfect positive association
    - 0 means that there is no association
    - -1 means a perfect negative association
- Further description of the correlation
    - .00-.19 "very weak"
    - .20-.39 "weak"
    - .40-.59 "moderate"
    - .60-.79 "strong"
    - .80-1.0 "very strong"

#### Kendall's Tau correlation
Kendall's tau is a measure of the correspondence between two rankings

#### Which correlation type should you choose?

- Use Pearson for linear relationships when data assumptions are met.
- Use Spearman for ordinal data or when data isn't linear.
- Use Kendall's Tau for smaller datasets and many rank ties

#### Readings:

1. https://www.simplilearn.com/tutorials/statistics-tutorial/spearmans-rank-correlation#:~:text=Spearman's%20rank%20correlation%20measures%20the,represented%20using%20a%20monotonic%20function. 
2. https://www.simplilearn.com/tutorials/statistics-tutorial/pearson-correlation-coefficient-in-statistics#pearsons_correlation_coefficient
3. https://datatab.net/tutorial/pearson-correlation
4. https://datatab.net/tutorial/spearman-correlation
5. https://datatab.net/tutorial/dispersion-parameter

#### Q2: What is the Spearman correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Filter 'view' and 'addtocart' events
view_events = events[events['event'] == 'view']
addtocart_events = events[events['event'] == 'addtocart']

In [None]:
# Item count
# One entry per item: how many events of each type the item received.
view_counts = view_events.groupby('itemid').size()
addtocart_counts = addtocart_events.groupby('itemid').size()

In [None]:
# Turn the per-item count Series into two-column frames. rename_axis makes
# sure the key column is called 'itemid' however the Series index was named.
view_df = view_counts.rename_axis('itemid').reset_index(name='view_count')
addtocart_df = addtocart_counts.rename_axis('itemid').reset_index(
    name='addtocart_count')

In [None]:
# One row per item with both counts. An outer join keeps items that appear in
# only one of the two frames; their missing count really is zero, so it is
# filled with 0 rather than dropped (an inner join would silently restrict
# the correlation to items that have both event types).
merged_df = (
    view_df.merge(addtocart_df, on='itemid', how='outer')
    .fillna({'view_count': 0, 'addtocart_count': 0})
    .astype({'view_count': 'int64', 'addtocart_count': 'int64'})
)

In [None]:
# Rank-based (monotonic) correlation between the two per-item counts.
spearman_corr = merged_df['view_count'].corr(merged_df['addtocart_count'],
                                             method='spearman')
spearman_corr

#### Q3: What is the Pearson correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Linear correlation between the two per-item counts.
pearson_corr = merged_df['view_count'].corr(merged_df['addtocart_count'],
                                            method='pearson')
pearson_corr

#### Q4: What is the Kendall's Tau correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Kendall's Tau: agreement between the two rankings of the per-item counts.
kendall_tau_corr = merged_df['view_count'].corr(merged_df['addtocart_count'],
                                                method='kendall')
kendall_tau_corr

#### Q5: Create a scatter plot with ordinary least squares' trend line to show correlation between the number of "view" events and the number of "addtocart" events per item. 

In [None]:


# Scatter of per-item counts with an ordinary-least-squares trend line.
# plotly computes the OLS fit through statsmodels, which must be installed.
fig = px.scatter(
    merged_df,
    x='view_count',
    y='addtocart_count',
    trendline='ols',
    title='Views vs add-to-cart events per item',
    labels={'view_count': 'View events',
            'addtocart_count': 'Add-to-cart events'},
)
fig.write_html('scatter_plot_with_ols_trendline.html')
fig.show()

In [None]:
# Show events again as a reference for Q6.
events

#### Q6: Plot a line chart of the number of events (view, transaction, etc.) over time, color-coded by the event type.

In [None]:
# Daily number of events for each event type.
events_grouped = (
    events.groupby(['event_date', 'event'])
    .size()
    .reset_index(name='event_count')
)

# One line per event type.
fig = px.line(
    events_grouped,
    x='event_date',
    y='event_count',
    color='event',
    title='Number of events over time by event type',
    labels={'event_date': 'Date', 'event_count': 'Number of events',
            'event': 'Event type'},
)
fig.write_html('events_vs_date.html')
fig.show()

#### pandas `resample` method

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.resample.html

#### Q7: Calculate the monthly number of events and create a line chart.

In [None]:
# Set timestamp as the index, resample to Month-Start, and calculate the count of number of events
# NOTE: despite its name, `monthly_avg` holds monthly *counts*; the name is
# kept because it comes from the original template.
monthly_avg = (
    events.set_index('event_datetime')
    .resample('MS')
    .size()
    .reset_index(name='event_count')
)
# Year of each month bucket, used to colour the lines.
monthly_avg['year'] = monthly_avg['event_datetime'].dt.year

fig = px.line(
    monthly_avg,
    x='event_datetime',
    y='event_count',
    color='year',
    title="Monthly total event counts",
    labels={'event_datetime': 'Month', 'event_count': 'Number of events', 'year': 'Year'}
)
fig.show()

#### Q8: Calculate the total number of events per day of the week for each year and visualize the trend with a line chart.

In [None]:
# Work on a copy so the shared `events` frame is not modified.
events_v2 = events.copy()

events_v2['year'] = events_v2['event_datetime'].dt.year
# pandas numbers weekdays Monday=0 ... Sunday=6.
# Add 1 to make Monday = 1, Sunday = 7
events_v2['dayofweek'] = events_v2['event_datetime'].dt.dayofweek + 1

# Total number of events for every (year, weekday) pair.
events_v2 = (
    events_v2.groupby(['year', 'dayofweek'])
    .size()
    .reset_index(name='event_count')
)

# One line per year across the seven weekdays.
fig = px.line(
    events_v2,
    x='dayofweek',
    y='event_count',
    color='year',
    title='Total events per day of the week by year',
    labels={'dayofweek': 'Day of week (1 = Monday)',
            'event_count': 'Number of events', 'year': 'Year'},
)
fig.show()

In [None]:
# Show events again as a reference for Q9.
events

#### Q9: Aggregate the number of events per day and plot the trend for the total number of events over time.

In [None]:
# Total number of events (all types together) per calendar day.
events_per_day = (
    events.groupby('event_date')
    .size()
    .reset_index(name='event_count')
)

fig = px.line(
    events_per_day,
    x='event_date',
    y='event_count',
    title='Total number of events per day',
    labels={'event_date': 'Date', 'event_count': 'Number of events'},
)
fig.show()