In [1]:
!pip install tqdm
!pip install scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import os
from typing import Tuple, List
# new import statements
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [3]:
events = pd.read_csv(os.path.join('data', 'events.csv'))
item_properties = pd.read_csv(os.path.join('data', 'item_properties_part1.csv'))
category_tree = pd.read_csv(os.path.join('data', 'category_tree.csv'))

In [4]:
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,
2756097,1438399813142,762376,view,115946,
2756098,1438397820527,1251746,view,78144,
2756099,1438398530703,1184451,view,283392,


In [5]:
item_properties

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


In [6]:
category_tree

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
...,...,...
1664,49,1125.0
1665,1112,630.0
1666,1336,745.0
1667,689,207.0


## Exploratory Data Analysis (EDA)

EDA is about understanding the data and forming hypotheses about it. 

- Visualizing Data: Histograms, scatter plots, box plots, etc., to understand distributions and relationships.
- Summary Statistics: Calculating mean, median, mode, standard deviation, and correlation to gain insights into the dataset.
- Detecting Outliers: Identifying values that deviate significantly from the rest of the data.
- Assessing Data Types and Structure: Checking data types, unique values, and identifying missing values.

In [7]:
events['event_datetime'] = pd.to_datetime(events['timestamp'], unit='ms')
events['event_date'] = events['event_datetime'].dt.date
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02
...,...,...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01
2756097,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01
2756098,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01
2756099,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01


### One-hot encoding

One-hot encoding is a method for converting categorical data (non-numeric data that can take on a limited number of values, e.g., "view", "addtocart", "transaction") into a numerical format that machine learning algorithms can understand

#### `pandas get_dummies` function

Documentation: https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html

#### Q4: One-hot encode the events data

Let's first determine the type of `event` column.

In [8]:
def one_hot_encoder(df: pd.DataFrame, nan_as_category: bool = True) -> Tuple[pd.DataFrame, List[str]]:
    """
    One-hot encode categorical columns in the given DataFrame.
    
    Parameters:
        df (pd.DataFrame): The input DataFrame.
        nan_as_category (bool): Whether to include NaN as a separate category.
    
    Returns:
        Tuple[pd.DataFrame, List[str]]: The transformed DataFrame and the list of new column names.
    """
    original_columns = list(df.columns)
    print(original_columns)
    
    categorical_columns = df.select_dtypes(["category", "object"]).columns.tolist()
    print(categorical_columns)
    
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category)
    
    new_columns = [col for col in df.columns if col not in original_columns]
    print(new_columns)
        
    return df, new_columns

In [9]:
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02
...,...,...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01
2756097,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01
2756098,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01
2756099,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01


In [10]:
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02
...,...,...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01
2756097,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01
2756098,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01
2756099,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01


In [11]:
events_encoded, event_columns = one_hot_encoder(events.drop('event_date', axis=1))
events_encoded

['timestamp', 'visitorid', 'event', 'itemid', 'transactionid', 'event_datetime']
['event']
['event_addtocart', 'event_transaction', 'event_view', 'event_nan']


Unnamed: 0,timestamp,visitorid,itemid,transactionid,event_datetime,event_addtocart,event_transaction,event_view,event_nan
0,1433221332117,257597,355908,,2015-06-02 05:02:12.117,False,False,True,False
1,1433224214164,992329,248676,,2015-06-02 05:50:14.164,False,False,True,False
2,1433221999827,111016,318965,,2015-06-02 05:13:19.827,False,False,True,False
3,1433221955914,483717,253185,,2015-06-02 05:12:35.914,False,False,True,False
4,1433221337106,951259,367447,,2015-06-02 05:02:17.106,False,False,True,False
...,...,...,...,...,...,...,...,...,...
2756096,1438398785939,591435,261427,,2015-08-01 03:13:05.939,False,False,True,False
2756097,1438399813142,762376,115946,,2015-08-01 03:30:13.142,False,False,True,False
2756098,1438397820527,1251746,78144,,2015-08-01 02:57:00.527,False,False,True,False
2756099,1438398530703,1184451,283392,,2015-08-01 03:08:50.703,False,False,True,False


## Statistical model for timeseries analysis

### 1. Exponential Moving Average (EMA)
EMA is a quantitative technique used as a forecasting model for time series analysis. It is a statistical method used to smooth time-series data by giving more weight to recent observations and less weight to older ones. It's useful in scenarios where recent data points are considered more relevant and informative. Typical use case scenarios include financial time-series analysis or e-commerce product popularity prediction.

### pandas ewm

- Documentation: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html.

#### Q5: Use an Exponential Moving Average (EMA) model to calculate the average itemid view counts per category in the events dataset. Calculate the EMA for each category over a specified span.

In [12]:
item_properties

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


#### Filter `item_properties` to get only rows where `property` is 'categoryid'.

In [13]:
category_items = item_properties[item_properties['property'] == 'categoryid'][['itemid', 'value']]
# 'value' column contains the category IDs, so let's rename value to categoryid
category_items.columns = ['itemid', 'categoryid']  
category_items

Unnamed: 0,itemid,categoryid
0,460429,1338
140,281245,1277
151,35575,1059
189,8313,1147
197,55102,47
...,...,...
10999880,441523,1167
10999917,250848,769
10999932,116380,1509
10999960,84186,209


#### Type conversion to `int`.

In [14]:
category_items['categoryid'] = category_items['categoryid'].astype(int)

#### Compute the unique category ids (product families).

In [15]:
categories = category_items['categoryid'].unique()
categories

array([1338, 1277, 1059, ...,  934,  480, 1354])

In [16]:
len(categories)

1196

#### Create a dataframe to store the results

In [17]:
results = pd.DataFrame(columns=["timestamp", "categoryid", "ema"])
results

Unnamed: 0,timestamp,categoryid,ema


#### tqdm

Enables us to view progress meter for any iterable. Very helpful to keep track of programs that run for a long time.
Documentation: https://tqdm.github.io/.                                                                                                   

#### Compute EMA for view event counts per timestamp

In [18]:
results_list = []

for category in tqdm(categories, total=len(categories)):
    # get the itemids associated with the current category
    category_items_for_category = category_items[category_items['categoryid'] == category]['itemid']
    
    # filter the events dataset for the current category's items and 'view' events
    category_events = events[(events['itemid'].isin(category_items_for_category)) & (events['event'] == 'view')]
    
    # group by timestamp to get the view counts per timestamp
    view_counts = category_events.groupby("timestamp").size()
    
    # apply the Exponential Moving Average (EMA) with a span of 16 to smooth the view counts
    ema_values = view_counts.ewm(span=16, adjust=False).mean()
    
    # store the results with the EMA values for the category
    category_results = pd.DataFrame({
        "timestamp": ema_values.index,
        "categoryid": category,
        "ema": ema_values.values
    })
    
    # append the current category's results to the results list
    results_list.append(category_results)

# concatenate all the results into a single DataFrame
results = pd.concat(results_list, ignore_index=True)

100%|██████████████████████████████████████████████████████████████| 1196/1196 [09:24<00:00,  2.12it/s]


In [19]:
results

Unnamed: 0,timestamp,categoryid,ema
0,1430649263692,1338,1.0
1,1430649323031,1338,1.0
2,1430659604240,1338,1.0
3,1430668046337,1338,1.0
4,1430668062168,1338,1.0
...,...,...,...
1459107,1438633754909,480,1.0
1459108,1440457508755,480,1.0
1459109,1442344855260,480,1.0
1459110,1442169053784,1354,1.0


#### P6 note

In P6, you'll have nested iteration: for each store, you'll be iterating over each family and using train data to compute EMA. Once you do that, you should iterate over the test set, use the results for generating a list of predictions and then invoke make_submission function to generate the csv file.

#### 2. Linear Regression

If the dataset exhibits a linear trend (either increasing or decreasing), linear regression can be a good starting point for short-term forecasting. 

A simple linear regression model fits a line to the data, assuming a linear relationship between the independent variable (time) and the dependent variable (views in our example).

#### Merge `events` with `category_items`.

In [20]:
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02
...,...,...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01
2756097,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01
2756098,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01
2756099,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01


In [21]:
category_items

Unnamed: 0,itemid,categoryid
0,460429,1338
140,281245,1277
151,35575,1059
189,8313,1147
197,55102,47
...,...,...
10999880,441523,1167
10999917,250848,769
10999932,116380,1509
10999960,84186,209


In [22]:
events_with_categories = pd.merge(events, category_items, on="itemid", how="left")

#### Group `events` by `category` and `timestamp` to get view counts.

In [23]:
grouped_data = (
    events_with_categories[events_with_categories['event'] == 'view']
    .groupby(['timestamp', 'categoryid'])
    .size()
    .reset_index(name='view_count')  # Add a new column for aggregated counts
)

In [27]:
grouped_data

Unnamed: 0,timestamp,categoryid,view_count
0,1430622028399,421.0,5
1,1430622028399,1674.0,13
2,1430622033686,1339.0,1
3,1430622036210,368.0,1
4,1430622040988,683.0,1
...,...,...,...
1459107,1442545134195,1196.0,1
1459108,1442545152365,1293.0,1
1459109,1442545153842,956.0,1
1459110,1442545174109,491.0,1


#### Train test split

In [24]:
train, test = train_test_split(grouped_data, test_size=0.2, random_state=42)

In [25]:
categories = train['categoryid'].unique()
models = {}

# Train a Linear Regression model for each category
for category in tqdm(categories, total=len(categories)):
    category_train_data = train[train['categoryid'] == category]
    
    # Define target and features
    y = category_train_data['view_count']
    X = category_train_data[['timestamp']]  
    # In P6, you will be dropping id, date, and sales to create your feature vector 
    
    X = np.array(X).reshape(-1, 1).astype(np.float32)
    y = np.array(y).astype(np.float32)
    
    # Train the model
    model = LinearRegression()
    model.fit(X, y)
    
    # Store the model
    models[category] = model

100%|█████████████████████████████████████████████████████████████| 1112/1112 [00:08<00:00, 126.43it/s]


#### Predict on test set

In [26]:
test['predicted_view_count'] = np.nan  # Placeholder for predictions

for category in tqdm(categories, total=len(categories)):
    # Filter test data for the current category
    category_test_data = test[test['categoryid'] == category]
    
    # Skip if there's no data for this category
    if category_test_data.empty:
        print(f"No test data for category {category}. Skipping predictions.")
        continue
    
    if category in models:
        X_test = category_test_data[['timestamp']].to_numpy().reshape(-1, 1).astype(np.float32)
        
        # Skip prediction if there are no rows in X_test
        if X_test.shape[0] == 0:
            print(f"No valid test samples for category {category}. Skipping.")
            continue
        
        test.loc[test['categoryid'] == category, 'predicted_view_count'] = models[category].predict(X_test)

# Output the predictions
print(test[['timestamp', 'categoryid', 'view_count', 'predicted_view_count']].head())

 46%|████████████████████████████▌                                 | 513/1112 [00:02<00:02, 219.32it/s]

No test data for category 1100.0. Skipping predictions.


 55%|██████████████████████████████████                            | 611/1112 [00:03<00:02, 219.27it/s]

No test data for category 780.0. Skipping predictions.


 72%|████████████████████████████████████████████▌                 | 800/1112 [00:04<00:01, 216.36it/s]

No test data for category 1590.0. Skipping predictions.


 85%|████████████████████████████████████████████████████▉         | 950/1112 [00:05<00:00, 171.44it/s]

No test data for category 713.0. Skipping predictions.


 89%|███████████████████████████████████████████████████████▎      | 991/1112 [00:05<00:00, 216.91it/s]

No test data for category 1499.0. Skipping predictions.
No test data for category 501.0. Skipping predictions.
No test data for category 922.0. Skipping predictions.
No test data for category 1068.0. Skipping predictions.
No test data for category 1134.0. Skipping predictions.
No test data for category 1310.0. Skipping predictions.


 91%|███████████████████████████████████████████████████████▊     | 1017/1112 [00:05<00:00, 190.10it/s]

No test data for category 60.0. Skipping predictions.
No test data for category 902.0. Skipping predictions.
No test data for category 667.0. Skipping predictions.
No test data for category 45.0. Skipping predictions.
No test data for category 357.0. Skipping predictions.
No test data for category 1406.0. Skipping predictions.
No test data for category 908.0. Skipping predictions.
No test data for category 601.0. Skipping predictions.
No test data for category 981.0. Skipping predictions.
No test data for category 1449.0. Skipping predictions.
No test data for category 1354.0. Skipping predictions.
No test data for category 749.0. Skipping predictions.
No test data for category 433.0. Skipping predictions.
No test data for category 1427.0. Skipping predictions.
No test data for category 534.0. Skipping predictions.
No test data for category 32.0. Skipping predictions.
No test data for category 1489.0. Skipping predictions.
No test data for category 412.0. Skipping predictions.
No test 

100%|█████████████████████████████████████████████████████████████| 1112/1112 [00:05<00:00, 194.85it/s]

No test data for category 264.0. Skipping predictions.
No test data for category 1442.0. Skipping predictions.
No test data for category 587.0. Skipping predictions.
No test data for category 784.0. Skipping predictions.
No test data for category 632.0. Skipping predictions.
No test data for category 708.0. Skipping predictions.
No test data for category 709.0. Skipping predictions.
No test data for category 1459.0. Skipping predictions.
No test data for category 621.0. Skipping predictions.
No test data for category 28.0. Skipping predictions.
No test data for category 781.0. Skipping predictions.
No test data for category 672.0. Skipping predictions.
No test data for category 820.0. Skipping predictions.
No test data for category 756.0. Skipping predictions.
No test data for category 1361.0. Skipping predictions.
No test data for category 129.0. Skipping predictions.
No test data for category 748.0. Skipping predictions.
No test data for category 393.0. Skipping predictions.
No test 


