In [None]:
!pip install plotly --upgrade
!pip install statsmodels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [None]:
# Read the three dataset files from the local `data` directory, in the same
# order as the names on the left-hand side.
# NOTE(review): only `item_properties_part1.csv` is read here; if the dataset
# ships further parts, confirm whether they should be concatenated as well.
events, item_properties, category_tree = (
    pd.read_csv(os.path.join('data', csv_name))
    for csv_name in ('events.csv', 'item_properties_part1.csv', 'category_tree.csv')
)

In [None]:
# Preview the events table loaded from events.csv.
events

In [None]:
# Preview the item properties table loaded from item_properties_part1.csv.
item_properties

In [None]:
# Preview the category tree loaded from category_tree.csv.
category_tree

#### Q1: Convert `parentid` column to `int32` type.

In [None]:
# Look at category_tree again before changing the dtype of `parentid`.
category_tree

In [None]:
# Three independent working copies, so every missing-value strategy below
# starts from its own frame and `category_tree` itself stays untouched.
category_tree_v2, category_tree_v3, category_tree_v4 = (
    category_tree.copy() for _ in range(3)
)

In [None]:
# Try the direct cast. The original cell read from `category_tree2`, a name
# that is never defined, so it failed with NameError before reaching the cast.
# `parentid` contains NaN (see the imputation options below), and NaN has no
# representation in a plain NumPy integer dtype, so pandas rejects the cast
# with a ValueError. The error is caught and shown so the notebook still runs
# top to bottom; `category_tree_v2` is left unchanged when the cast fails.
try:
    category_tree_v2['parentid'] = category_tree_v2['parentid'].astype("int32")
except ValueError as err:
    print(f"Direct cast to int32 failed: {err}")
category_tree_v2

## Data imputation

Data imputation is the process of replacing missing values in a dataset with substituted values.

### Handling missing values with `category_tree`

#### Option 1: Fill `NaN` with a placeholder value (e.g., -1 or another integer)

In [None]:
# Option 1: substitute -1 for every missing parent so the column can be
# stored as a plain int32.
category_tree_v2['parentid'] = (
    category_tree['parentid']
    .fillna(-1)
    .astype("int32")
)
category_tree_v2

#### Option 2: Drop rows with `NaN` in the `parentid` column

In [None]:
# Option 2: discard the rows whose `parentid` is missing, then cast the
# remaining values to int32.
# Written as a single chain that returns a new frame. The earlier form assigned
# through attribute access (`frame.parentid = ...`) onto the result of
# `dropna`, which can raise SettingWithCopyWarning on some pandas versions and
# would silently create a plain attribute if the column name were mistyped.
category_tree_v3 = (
    category_tree_v3
    .dropna(subset=['parentid'])
    .astype({'parentid': "int32"})
)
category_tree_v3

#### Option 3: Use `Int32` (nullable integer type)

In [None]:
# Option 3: cast to the nullable "Int32" extension dtype (capital I), which
# can hold integers and missing values in the same column.
category_tree_v4 = category_tree_v4.astype({'parentid': "Int32"})
category_tree_v4

### Other options for Data Imputation

- Next or previous value
- Maximum or Minimum Value
- Statistical methods: mean, median, mode
- Missing Value Prediction: using a machine learning model to determine the final imputation value

Replace with next value.

In [None]:
# Build a 15-day synthetic daily sales series with a gap on every 5th day.
np.random.seed(639)

date_range = pd.date_range('2024-01-01', periods=15, freq='D')
sales_data = np.random.normal(200, 20, len(date_range))
sales_data[0::5] = np.nan  # rows 0, 5 and 10 become missing

df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df)

# Back-fill: every missing value takes the next valid observation below it.
df = df.bfill()
print(df)

Replace with previous value.

In [None]:
# Rebuild the frame from the shared `date_range` / `sales_data` so this demo
# starts again from the series that still contains the gaps.
df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df)

# Forward-fill: every missing value takes the last valid observation above it.
# A missing value in the very first row has no predecessor and stays NaN.
df = df.ffill()
print(df)

Replace with minimum / maximum.

In [None]:
# Start again from the series that still contains the gaps.
df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df)

# Impute every gap with the smallest observed sale (use max() for the largest).
sales_min = df['sales'].min()
df = df.fillna(sales_min)
print(df)

Replace with mean.

In [None]:
# Start again from the series that still contains the gaps.
df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df)

# Impute every gap with the arithmetic mean of the observed sales.
sales_mean = df['sales'].mean()
df = df.fillna(sales_mean)
print(df)

Replace with median.

In [None]:
# Start again from the series that still contains the gaps.
df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df)

# Impute every gap with the median of the observed sales.
sales_median = df['sales'].median()
df = df.fillna(sales_median)
print(df)

Replace with mode.

In [None]:
# Mode imputation needs repeated values, so draw the sales from a small set of
# discrete amounts instead of a continuous distribution.
np.random.seed(639)

date_range = pd.date_range(start='2024-01-01', periods=15, freq='D')
values = [100, 150, 200, 250]
sales_data = np.random.choice(values, size=len(date_range)).tolist()

# Blank out every 5th entry. The loop bound must come from `sales_data`; the
# earlier `len(data)` referred to a name that is never defined and raised
# NameError on a fresh kernel.
for i in range(0, len(sales_data), 5):
    sales_data[i] = np.nan

df = pd.DataFrame({
    'date': date_range,
    'sales': sales_data
})
print(df)

# mode() returns a Series because several values can tie; take the first one.
df = df.fillna(df['sales'].mode().iloc[0])
print(df)

### Data Interpolation

Interpolation is a technique that can be useful for handling missing values, particularly when the missing data is assumed to follow a pattern or trend based on the existing values in the dataset. This is often the case with time series or ordered data, where the missing values are assumed to lie between known values. Interpolation fills in these gaps by estimating the missing data points using existing values.

When **NOT** to Use Interpolation:
- Large gaps: If the data has large gaps between observations, interpolation might not provide meaningful or reliable estimates.
- Randomness in missing values: If the missing values are random or don't follow any pattern (Missing Completely at Random - MCAR), interpolation may not be appropriate, as it assumes a relationship between values.
- Categorical or non-numeric data: Interpolation is typically used for continuous numerical data. For categorical or binary data, interpolation is not suitable.

In [None]:
np.random.seed(639)

# 60 days of noisy sales centred on 200, with every 5th observation removed.
date_range = pd.date_range('2024-01-01', periods=60, freq='D')
sales_data = np.random.normal(200, 20, len(date_range))
sales_data[0::5] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})
print(df.head())

# Plot the series as-is, before any gap filling.
plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales'],
    color='black',
    linestyle='-',
    marker='o',
)
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.grid(True)
plt.show()

### Linear interpolation

- Assumption: the missing data points lie along a straight line between the known data points.
- Linear interpolation is commonly used for time series where changes are expected to be linear between data points.
- When to use: when the relationship between consecutive values is roughly linear or changes gradually.

In [None]:
# Estimate each gap from a straight line between its neighbouring known values.
# The result is written back to `df['sales']`, so later cells that read this
# column see the filled series rather than the original gaps.
sales_filled = df['sales'].interpolate(method='linear')
df['sales'] = sales_filled

plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales'],
    color='orange',
    label='Interpolated Sales',
    linestyle='-',
    marker='o',
)
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

### Polynomial interpolation

- Polynomial interpolation fits a polynomial curve through the known data points and uses it to estimate the missing values.
- When to use: when the data shows a nonlinear relationship between points (for example, seasonal effects or periodic patterns).

In [None]:
# NOTE: the linear-interpolation cell has already written its filled values
# back into `df['sales']`, so this call finds no interior gaps left to
# estimate and the plot below matches the linear one.
sales_filled = df['sales'].interpolate(method='polynomial', order=2)
df['sales'] = sales_filled

plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales'],
    color='orange',
    label='Interpolated Sales',
    linestyle='-',
    marker='o',
)
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.show()

#### Why did both methods produce the same result?

- The linear-interpolation cell above stored its result back into `df['sales']`, so by the time the polynomial cell runs the interior gaps are already filled. `interpolate(method='polynomial', order=2)` therefore has nothing left to estimate, and the two plots are identical.
- Polynomial interpolation of order 2 fits quadratic (parabolic) pieces through the known data points. A parabola needs three points to be uniquely determined, so — unlike a straight line — it is not fixed by the two neighbours of a gap alone and can bend between them.
- When the known points are close to each other and do not exhibit strongly nonlinear or curving behavior, the quadratic curve can still end up very similar to the straight line. To compare the two methods fairly, apply both to the same series that still contains the missing values, as in the next example.

In [None]:
# Non-linear time series: one and a half periods of a sine wave around 100.
np.random.seed(42)
date_range = pd.date_range('2024-01-01', periods=60, freq='D')
sales_data = 100 + 50 * np.sin(np.linspace(0, 3 * np.pi, len(date_range)))

# Remove 18 distinct observations at randomly drawn positions.
missing_indices = np.random.choice(
    range(len(sales_data)),
    size=18,
    replace=False,
)
sales_data[missing_indices] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})

plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales'],
    label='Original Sales with Missing Values',
    linestyle='-',
    marker='o',
)
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.grid(True)

In [None]:
# Fill the same gappy series in two ways, keeping each result in its own
# column so that `df['sales']` still holds the original missing values.
gappy_sales = df['sales']
df['sales_linear'] = gappy_sales.interpolate(method='linear')
df['sales_polynomial'] = gappy_sales.interpolate(method='polynomial', order=2)

# Overlay both reconstructions on one set of axes.
plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales_linear'],
    color='orange',
    label='Interpolated Sales (Linear)',
    linestyle='-',
    marker='o',
)
plt.plot(
    df['date'],
    df['sales_polynomial'],
    color='green',
    label='Interpolated Sales (Polynomial)',
    linestyle='-',
    marker='x',
)
plt.legend()
plt.title('Linear vs Polynomial Interpolation on Nonlinear Data')
plt.grid(True)
plt.show()

### Spline interpolation

- Spline interpolation fits a smooth curve (a piecewise polynomial, typically cubic) through the known data points and estimates the missing values.
- When to use: when the data exhibits a smooth, nonlinear trend (often used for time series with cycles or seasonal patterns).

### Spline vs Polynomial

- When to use spline interpolation: when you need smooth, piecewise fits, especially when the data is non-linear or has noise. It's ideal for smooth, continuous data that needs to be modeled accurately across a range of values.
- When to use polynomial interpolation: when you have a simple, small dataset, and you want a single polynomial that fits all points exactly. Avoid polynomial interpolation with large or noisy datasets because it can cause overfitting and oscillations.

In [None]:
np.random.seed(639)
date_range = pd.date_range('2024-01-01', periods=60, freq='D')
sales_data = 100 * np.sin(np.linspace(0, 3 * np.pi, len(date_range)))

# Remove 10 observations, drawn at random from the positions 5, 10, ..., 55.
candidate_positions = range(5, len(sales_data), 5)
missing_indices = np.random.choice(candidate_positions, size=10, replace=False)
sales_data[missing_indices] = np.nan

df = pd.DataFrame({'date': date_range, 'sales': sales_data})

# First layer: the series with its gaps.
plt.figure(figsize=(10, 6))
plt.plot(
    df['date'],
    df['sales'],
    color='blue',
    label='Original Sales with Missing Values',
    linestyle='-',
    marker='o',
)
plt.ylabel('Sales')
plt.xticks(rotation=45)
plt.grid(True)

# Second layer: gaps filled with an order-3 (cubic) spline, stored in its own
# column so the original series stays available.
df['sales_spline'] = df['sales'].interpolate(method='spline', order=3)

plt.plot(
    df['date'],
    df['sales_spline'],
    color='green',
    label='Interpolated Sales (Spline)',
    linestyle='-',
    marker='x',
)
plt.legend()
plt.title('Spline Interpolation on Sinusoidal Data')
plt.show()

## Exploratory Data Analysis (EDA)

EDA is about understanding the data and forming hypotheses about it. 

- Visualizing Data: Histograms, scatter plots, box plots, etc., to understand distributions and relationships.
- Summary Statistics: Calculating mean, median, mode, standard deviation, and correlation to gain insights into the dataset.
- Detecting Outliers: Identifying values that deviate significantly from the rest of the data.
- Assessing Data Types and Structure: Checking data types, unique values, and identifying missing values.

#### Q2: How many unique transactions are in the dataset?

In [None]:
# Count the distinct transaction ids; nunique() leaves missing values out of
# the count by default.
transaction_ids = events['transactionid']
unique_transactions = transaction_ids.nunique()
unique_transactions

#### Q3: What is the distribution of transactions?

In [None]:
# Number of event rows recorded for each transaction id; rows without a
# transaction id are left out.
transaction_counts = events['transactionid'].dropna().value_counts()

# Histogram of those per-transaction row counts.
transaction_counts.plot.hist(bins=30, color='black')
plt.title('Distribution of Transactions')
plt.xlabel('Number of Events per Transaction')
plt.ylabel('Frequency')
plt.show()

#### Q4: How many events happen on average per day?

In [None]:
# Determine the calendar day of every event. No visible cell creates an
# `event_date` column, so on a fresh kernel `events['event_date']` raises
# KeyError; derive the day from `timestamp` when the column is absent.
if 'event_date' in events.columns:
    event_dates = events['event_date']
else:
    # NOTE(review): assumes `timestamp` holds Unix epoch milliseconds, as the
    # Retailrocket dataset description states - confirm against the CSV.
    event_dates = (
        pd.to_datetime(events['timestamp'], unit='ms')
        .dt.normalize()
        .rename('event_date')
    )

# One row per day with the number of events recorded on it.
events_per_day = events.groupby(event_dates).size()
avg_events_per_day = events_per_day.mean()

# Q4 asks for this number, so show it instead of only storing it.
print(f"Average events per day: {avg_events_per_day:,.1f}")

events_per_day.plot(kind='line', title='Events per Day', color='black')
plt.xlabel('Date')
plt.ylabel('Number of Events')
plt.xticks(rotation=45)
plt.show()

#### Q5: Calculate the count of each unique property corresponding to "addtocart" items.

In [None]:
# Look at the events table again; Q5 filters it on the `event` column.
events

In [None]:
# Look at item_properties again; Q5 joins it to the events on `itemid`.
item_properties

In [None]:
# Keep only the rows whose event type is 'addtocart'.
is_addtocart = events['event'] == 'addtocart'
addtocart_events = events[is_addtocart]
addtocart_events

In [None]:
# Attach the item properties to every add-to-cart row. The left join keeps
# add-to-cart rows even when the item has no match in `item_properties`.
addtocart_events = events.loc[events['event'] == 'addtocart']
merged_data = addtocart_events.merge(item_properties, how='left', on='itemid')
merged_data

In [None]:
# Tally how often each distinct value of the `property` column occurs among
# the merged add-to-cart rows.
property_counts = merged_data.loc[:, 'property'].value_counts()
print(property_counts)

#### Q6: What is the Spearman correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Split the event log by type: 'view' rows and 'addtocart' rows.
view_events = events.loc[events['event'] == 'view']
addtocart_events = events.loc[events['event'] == 'addtocart']

In [None]:
# Number of rows per item id within each event type.
view_counts = view_events.loc[:, 'itemid'].value_counts()
addtocart_counts = addtocart_events.loc[:, 'itemid'].value_counts()

In [None]:
# Turn each count Series into a two-column frame: `itemid` plus the count.
# The column names are set explicitly instead of renaming a column called
# 'count': value_counts() only names its result 'count' from pandas 2.0 on.
# On older versions the rename was a silent no-op and the frames ended up with
# the columns 'index' / 'itemid', so the later merge on 'itemid' joined on the
# counts rather than on the item ids.
view_df = view_counts.rename_axis('itemid').reset_index(name='view_count')
addtocart_df = addtocart_counts.rename_axis('itemid').reset_index(name='addtocart_count')

In [None]:
# Inner join on `itemid`: only items present in both count frames are kept.
merged_df = view_df.merge(addtocart_df, how='inner', on='itemid')

In [None]:
# Spearman rank correlation between per-item view and add-to-cart counts.
spearman_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='spearman',
)
spearman_corr

#### Q7: What is the Pearson correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Pearson (linear) correlation between per-item view and add-to-cart counts.
pearson_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='pearson',
)
pearson_corr

#### Q8: What is the Kendall's Tau correlation between the number of "view" events and the number of "addtocart" events per item?

In [None]:
# Kendall's tau correlation between per-item view and add-to-cart counts.
kendall_tau_corr = merged_df['view_count'].corr(
    merged_df['addtocart_count'],
    method='kendall',
)
kendall_tau_corr

#### Q9: Create a scatter plot with an ordinary least squares (OLS) trend line to show the correlation between the number of "view" events and the number of "addtocart" events per item.

In [None]:
import plotly.express as px

# Scatter of per-item view counts against add-to-cart counts, with an OLS
# trend line drawn in red.
fig = px.scatter(
    merged_df,
    x="view_count",
    y="addtocart_count",
    trendline="ols",
    trendline_color_override="red",
)

# Write the interactive figure to a standalone HTML file.
fig.write_html('scatter_plot_with_ols_trendline.html')