In [1]:
# **DATA ANALYTICS PROJECT : OTTO – Multi-Objective Recommender System**

## Description of the competition and models used

### Importing packages

In [2]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import datetime
from tqdm.notebook import tqdm

In [3]:
from sklearn.model_selection import train_test_split

### Importing the data

In [4]:
# Read the train and test event tables from the attached input CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/my-data/train (2).csv',
        '../input/my-data/test (2).csv',
    )
)

In [5]:
# Display the (rows, columns) shape of the train and test frames as a pair.
train.shape,test.shape

((10285561, 5), (928115, 5))

In [6]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone previously raised a KeyError.
train.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

In [7]:
# Remove the "Unnamed: 0" column (presumably an index saved into the CSV).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone previously raised a KeyError.
test.drop(columns="Unnamed: 0", inplace=True, errors="ignore")

Splitting the dataframe into 70% training and 30% testing

In [8]:
## Number of distinct values in every column of `train`
## (the `session` entry is the number of unique sessions)
train.nunique()

session      200000
aid          951294
ts         10132591
type              3
dtype: int64

In [9]:
# Positional split of `train` at a fixed row offset: rows before the offset
# form the training part, the remaining rows the hold-out part.
# NOTE(review): the offset presumably coincides with a session boundary
# (70% of the sessions) — confirm before changing the input data.
_SPLIT_ROW = 7331864
train_split = train.iloc[:_SPLIT_ROW]
test_split = train.iloc[_SPLIT_ROW:]

In [10]:
# Display the training part of the split (pandas truncates the rendered table).
train_split

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
7331859,139999,269200,1660804010508,clicks
7331860,139999,734876,1660804124988,clicks
7331861,139999,1499541,1661149361690,clicks
7331862,139999,1623295,1661597051499,clicks


In [None]:
import warnings
# Silence every warning category for the rest of the notebook.
# NOTE(review): this is a blanket filter, so deprecation and pandas
# chained-assignment warnings raised by later cells are hidden too —
# consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')
import pathlib
import seaborn as sns

### EXPLORATORY DATA ANALYSIS

#### 1. Looking for NA values

In [11]:
# Count missing values per column of the training frame.
train.isna().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

In [12]:
# Count missing values per column of the test frame.
test.isna().sum()

session    0
aid        0
ts         0
type       0
dtype: int64

Thus we have no missing values

####  Description of the data

The train data contains user sessions and the events that happened in those sessions, ordered by timestamp. The type of each event is either a "click", a "cart" (i.e. the article referred to by the article id was added to the cart) or an "order" (i.e. the article was ordered).

The test data is truncated at a timestamp, i.e. each session is shorter, and we are to predict what happens after that timestamp.

Thus the data mostly covers July–August 2022 (the `ts` values, e.g. 1659304800025, are Unix timestamps in milliseconds).

#### SHAPE OF DATASET

In [None]:
# Load the pickled train/test event frames and report their dimensions.
# NOTE(review): unpickling executes code embedded in the file — only load
# pickles from a trusted dataset.
dataset_directory = pathlib.Path('../input/otto-multi-objective-recommender-system-pickle')
df_train = pd.read_pickle(dataset_directory.joinpath('train.pkl'))
df_test = pd.read_pickle(dataset_directory.joinpath('test.pkl'))

for _shape_line in (
    f'Training Shape: {df_train.shape}',
    f'Test Shape: {df_test.shape}',
):
    print(_shape_line)

Sessions consist of user events such as clicking to a product, adding a product to cart or ordering a product. Each session belongs to a unique user and duration of sessions are different from each other.

session: unique ID of the user session
aid: unique ID of the product
ts: timestamp of the event
type: category of the event

It is a multi-objective ranking problem since there are 3 categories (click, cart, order) to recommend for each session

In [None]:
# --- Total number of events per split ---
train_events, test_events = len(df_train), len(df_test)
print(f'Number of Events - Training: {train_events} | Test: {test_events}')

# --- Distinct sessions per split (temporary arrays are released afterwards) ---
train_unique_sessions = df_train['session'].unique()
test_unique_sessions = df_test['session'].unique()
print(f'Number of Unique Sessions - Training: {len(train_unique_sessions)} | Test: {len(test_unique_sessions)}')
del train_unique_sessions, test_unique_sessions

# --- Distinct products per split and how many appear in both ---
train_unique_aids = df_train['aid'].unique()
test_unique_aids = df_test['aid'].unique()
overlapping_aids = set(train_unique_aids) & set(test_unique_aids)
print(f'Number of Unique Products - Training: {len(train_unique_aids)} | Test: {len(test_unique_aids)} - ({len(overlapping_aids)} Overlapping Products)')
del train_unique_aids, test_unique_aids, overlapping_aids

# --- Events per type code (0 = clicks, 1 = carts, 2 = orders) ---
train_clicks = int((df_train['type'] == 0).sum())
test_clicks = int((df_test['type'] == 0).sum())
print(f'Number of Clicks - Training: {train_clicks} | Test: {test_clicks}')

train_carts = int((df_train['type'] == 1).sum())
test_carts = int((df_test['type'] == 1).sum())
print(f'Number of Carts - Training: {train_carts} | Test: {test_carts}')

train_orders = int((df_train['type'] == 2).sum())
test_orders = int((df_test['type'] == 2).sum())
print(f'Number of Orders - Training: {train_orders} | Test: {test_orders}')

In [None]:
# Earliest and latest event timestamp of each split, formatted for display.
# Relies on the `ts` values exposing `.strftime` (i.e. datetime-like values).
_TS_FORMAT = '%Y.%m.%d %X'
_train_ts, _test_ts = df_train['ts'], df_test['ts']
train_start = _train_ts.min().strftime(_TS_FORMAT)
train_end = _train_ts.max().strftime(_TS_FORMAT)
test_start = _test_ts.min().strftime(_TS_FORMAT)
test_end = _test_ts.max().strftime(_TS_FORMAT)
print(f'Events Time Range\nTraining: {train_start} - {train_end}\nTest: {test_start} - {test_end}')

The training data covers a period of 4 weeks and the test data covers 1 week.

In [None]:
def visualize_categorical_feature_distribution(df, feature, path=None):
    """
    Draw a bar chart of the value counts of one categorical column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to summarize.
    feature : str
        Name of the column whose value counts are plotted.
    path : str or path-like, optional
        When given, the figure is saved there and closed; otherwise it is
        shown inline.
    """
    # Count once and reuse the result for the figure size, bars and labels.
    counts = df[feature].value_counts()
    categories = list(counts.index)

    fig, ax = plt.subplots(figsize=(24, counts.shape[0] + 4), dpi=100)
    sns.barplot(
        y=counts.values,
        x=counts.index,
        # Pin the bar order to the value_counts order. Without it seaborn
        # picks the category order itself (numeric categories get sorted),
        # which can disagree with the tick labels written below.
        order=categories,
        color='tab:blue',
        ax=ax
    )
    ax.set_xlabel('')
    ax.set_ylabel('')
    # Fix the tick positions before replacing their labels.
    ax.set_xticks(range(len(categories)))
    ax.set_xticklabels([
        f'{x} ({value_count:,})' for x, value_count in counts.items()
    ])
    ax.tick_params(axis='x', labelsize=15, pad=10)
    ax.tick_params(axis='y', labelsize=15, pad=10)
    ax.set_title(f'Value Counts {feature}', size=20, pad=15)

    if path is None:
        plt.show()
    else:
        plt.savefig(path)
        plt.close(fig)


# Event-type distribution: first the training frame, then the test frame.
for _events in (df_train, df_test):
    visualize_categorical_feature_distribution(df=_events, feature='type')

0:CLICKS
1:CART
2:ORDERS

In [None]:
# Number of events per session (one-column frames indexed by session id).
df_train_session_counts = df_train.groupby('session')[['session']].count()
df_test_session_counts = df_test.groupby('session')[['session']].count()

# Number of events per product, as (aid, count) frames.
df_train_aid_counts = (
    df_train.groupby('aid')[['aid']].count()
    .rename(columns={'aid': 'count'})
    .reset_index()
)
df_test_aid_counts = (
    df_test.groupby('aid')[['aid']].count()
    .rename(columns={'aid': 'count'})
    .reset_index()
)

# Train + test counts: stack both tables and add up the counts per aid.
df_all_aid_counts = (
    pd.concat((df_train_aid_counts, df_test_aid_counts), axis=0, ignore_index=True)
    .groupby('aid')['count'].sum()
    .reset_index()
)

# Most frequent products first.
df_train_aid_counts = df_train_aid_counts.sort_values(by='count', ascending=False)
df_test_aid_counts = df_test_aid_counts.sort_values(by='count', ascending=False)
df_all_aid_counts = df_all_aid_counts.sort_values(by='count', ascending=False)

# aid -> count mappings for the 20 most frequent products.
train_20_most_frequent_aids = df_train_aid_counts.set_index('aid').head(20).to_dict()['count']
test_20_most_frequent_aids = df_test_aid_counts.set_index('aid').head(20).to_dict()['count']
all_20_most_frequent_aids = df_all_aid_counts.set_index('aid').head(20).to_dict()['count']

# The full count tables are no longer needed.
del df_train_aid_counts, df_test_aid_counts, df_all_aid_counts

Top 20 most frequent products are visualized for training, test and training + test sets below. X axis is the frequency and y axis labels are aid and its frequency 

In [None]:
def visualize_prod_frequencies(aid_frequencies, title, path=None):
    """
    Draw a horizontal bar chart of product frequencies.

    Parameters
    ----------
    aid_frequencies : dict
        Mapping of aid -> frequency; bars are drawn in the mapping's
        iteration order, first entry at the top.
    title : str
        Title of the figure.
    path : str or path-like, optional
        When given, the figure is saved there and closed; otherwise it is
        shown inline.
    """
    positions = range(len(aid_frequencies))
    tick_labels = [f'{x} ({value_count:,})' for x, value_count in aid_frequencies.items()]

    fig, ax = plt.subplots(figsize=(24, 20), dpi=100)
    ax.barh(positions, aid_frequencies.values(), align='center')
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_yticks(positions)
    ax.set_yticklabels(tick_labels)
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=15, pad=10)
    ax.set_title(title, size=20, pad=15)
    # Flip the y axis so the first entry ends up at the top.
    ax.invert_yaxis()

    if path is not None:
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()
    

In [None]:
# One chart per split: training, test, and training + test combined.
for _aid_frequencies, _title in (
    (train_20_most_frequent_aids, 'Top 20 Most Frequent PRODUCTS in Training Set'),
    (test_20_most_frequent_aids, 'Top 20 Most Frequent PRODUCTS in Test Set'),
    (all_20_most_frequent_aids, 'Top 20 Most Frequent PRODUCTS in Training + Test Set'),
):
    visualize_prod_frequencies(aid_frequencies=_aid_frequencies, title=_title)


In [None]:
# Event counts per (type, aid) pair; the first index level is the type code.
df_train_aids_by_types = df_train.groupby(['type', 'aid'])[['aid']].count()
df_test_aids_by_types = df_test.groupby(['type', 'aid'])[['aid']].count()

_AS_COUNT = {'aid': 'count'}

# Per-type (aid, count) tables: 0 = clicks, 1 = carts, 2 = orders.
df_train_click_aid_counts = df_train_aids_by_types.loc[0].rename(columns=_AS_COUNT).reset_index()
df_train_cart_aid_counts = df_train_aids_by_types.loc[1].rename(columns=_AS_COUNT).reset_index()
df_train_order_aid_counts = df_train_aids_by_types.loc[2].rename(columns=_AS_COUNT).reset_index()

df_test_click_aid_counts = df_test_aids_by_types.loc[0].rename(columns=_AS_COUNT).reset_index()
df_test_cart_aid_counts = df_test_aids_by_types.loc[1].rename(columns=_AS_COUNT).reset_index()
df_test_order_aid_counts = df_test_aids_by_types.loc[2].rename(columns=_AS_COUNT).reset_index()

# Train + test tables: stack both splits and add up the counts per aid.
df_all_click_aid_counts = (
    pd.concat((df_train_click_aid_counts, df_test_click_aid_counts), axis=0, ignore_index=True)
    .groupby('aid')['count'].sum()
    .reset_index()
)

df_all_cart_aid_counts = (
    pd.concat((df_train_cart_aid_counts, df_test_cart_aid_counts), axis=0, ignore_index=True)
    .groupby('aid')['count'].sum()
    .reset_index()
)

df_all_order_aid_counts = (
    pd.concat((df_train_order_aid_counts, df_test_order_aid_counts), axis=0, ignore_index=True)
    .groupby('aid')['count'].sum()
    .reset_index()
)

In [None]:
def _top_20_counts(aid_counts):
    """Return an aid -> count dict built from the first 20 rows of a count table."""
    return aid_counts.set_index('aid').head(20).to_dict()['count']


# Order every count table from most to least frequent product.
df_train_click_aid_counts = df_train_click_aid_counts.sort_values(by='count', ascending=False)
df_test_click_aid_counts = df_test_click_aid_counts.sort_values(by='count', ascending=False)
df_all_click_aid_counts = df_all_click_aid_counts.sort_values(by='count', ascending=False)

df_train_cart_aid_counts = df_train_cart_aid_counts.sort_values(by='count', ascending=False)
df_test_cart_aid_counts = df_test_cart_aid_counts.sort_values(by='count', ascending=False)
df_all_cart_aid_counts = df_all_cart_aid_counts.sort_values(by='count', ascending=False)

df_train_order_aid_counts = df_train_order_aid_counts.sort_values(by='count', ascending=False)
df_test_order_aid_counts = df_test_order_aid_counts.sort_values(by='count', ascending=False)
df_all_order_aid_counts = df_all_order_aid_counts.sort_values(by='count', ascending=False)

# Keep the 20 most frequent products of each table as aid -> count mappings.
train_20_most_frequent_click_aids = _top_20_counts(df_train_click_aid_counts)
test_20_most_frequent_click_aids = _top_20_counts(df_test_click_aid_counts)
all_20_most_frequent_click_aids = _top_20_counts(df_all_click_aid_counts)

train_20_most_frequent_cart_aids = _top_20_counts(df_train_cart_aid_counts)
test_20_most_frequent_cart_aids = _top_20_counts(df_test_cart_aid_counts)
all_20_most_frequent_cart_aids = _top_20_counts(df_all_cart_aid_counts)

train_20_most_frequent_order_aids = _top_20_counts(df_train_order_aid_counts)
test_20_most_frequent_order_aids = _top_20_counts(df_test_order_aid_counts)
all_20_most_frequent_order_aids = _top_20_counts(df_all_order_aid_counts)

Top 20 most frequent clicked products are visualized for training, test and training + test sets below. X axis is the frequency and y axis labels are aid and its frequency

In [None]:
# Most clicked products: training, test, and training + test combined.
for _aid_frequencies, _title in (
    (train_20_most_frequent_click_aids, 'Top 20 Most Frequent click products in Training Set'),
    (test_20_most_frequent_click_aids, 'Top 20 Most Frequent click products in Test Set'),
    (all_20_most_frequent_click_aids, 'Top 20 Most Frequent click products in Training + Test Set'),
):
    visualize_prod_frequencies(aid_frequencies=_aid_frequencies, title=_title)

Top 20 most frequent carted products are visualized for training, test and training + test sets below. X axis is the frequency and y axis labels are aid and its frequency

In [None]:
# Most carted products: training, test, and training + test combined.
for _aid_frequencies, _title in (
    (train_20_most_frequent_cart_aids, 'Top 20 Most Frequent cart products in Training Set'),
    (test_20_most_frequent_cart_aids, 'Top 20 Most Frequent cart products in Test Set'),
    (all_20_most_frequent_cart_aids, 'Top 20 Most Frequent cart products in Training + Test Set'),
):
    visualize_prod_frequencies(aid_frequencies=_aid_frequencies, title=_title)

Top 20 most frequent ordered products are visualized for training, test and training + test sets below. X axis is the frequency and y axis labels are aid and its frequency

In [None]:
# Most ordered products: training, test, and training + test combined.
for _aid_frequencies, _title in (
    (train_20_most_frequent_order_aids, 'Top 20 Most Frequent order aids in Training Set'),
    (test_20_most_frequent_order_aids, 'Top 20 Most Frequent order aids in Test Set'),
    (all_20_most_frequent_order_aids, 'Top 20 Most Frequent order aids in Training + Test Set'),
):
    visualize_prod_frequencies(aid_frequencies=_aid_frequencies, title=_title)

In [None]:
# For each split: count events per (session, type), express each count as a
# share of the session's total events ("rate"), then summarize the rates per
# type. A (session, type) row exists only where the grouping produced one, so
# the statistics are taken over those rows only.

# --- training split ---
train_session_type_groupby = df_train.groupby(['session', 'type'])
df_train_sessions_types = (
    train_session_type_groupby[['session']]
    .count()
    .rename(columns={'session': 'count'})
    .reset_index()
)
df_train_sessions_types['total'] = df_train_sessions_types.groupby('session')['count'].transform('sum')
df_train_sessions_types = df_train_sessions_types.sort_values(by=['total', 'session', 'type'], ascending=False)
df_train_sessions_types['rate'] = df_train_sessions_types['count'] / df_train_sessions_types['total']
_train_rates_by_type = df_train_sessions_types.groupby('type')['rate']
df_train_sessions_type_rate_means = _train_rates_by_type.mean().to_dict()
df_train_sessions_type_rate_stds = _train_rates_by_type.std().to_dict()

# --- test split ---
test_session_type_groupby = df_test.groupby(['session', 'type'])
df_test_sessions_types = (
    test_session_type_groupby[['session']]
    .count()
    .rename(columns={'session': 'count'})
    .reset_index()
)
df_test_sessions_types['total'] = df_test_sessions_types.groupby('session')['count'].transform('sum')
df_test_sessions_types = df_test_sessions_types.sort_values(by=['total', 'session', 'type'], ascending=False)
df_test_sessions_types['rate'] = df_test_sessions_types['count'] / df_test_sessions_types['total']
_test_rates_by_type = df_test_sessions_types.groupby('type')['rate']
df_test_sessions_type_rate_means = _test_rates_by_type.mean().to_dict()
df_test_sessions_type_rate_stds = _test_rates_by_type.std().to_dict()

# Calculating the mean and standard deviation of the per-session event-type rates in the train and test data

In [None]:
def _format_type_rates(means, stds):
    """Render 'type: mean(±std)' for the type codes 0, 1 and 2, joined by ' | '."""
    return ' | '.join(
        f'{event_type}: {means[event_type]:.4f}(±{stds[event_type]:.4f})'
        for event_type in (0, 1, 2)
    )


# Same text as a triple-quoted block: blank line, one line per split, blank line.
_train_rates_line = _format_type_rates(df_train_sessions_type_rate_means, df_train_sessions_type_rate_stds)
_test_rates_line = _format_type_rates(df_test_sessions_type_rate_means, df_test_sessions_type_rate_stds)
print(f'\nTraining: {_train_rates_line}\nTest: {_test_rates_line}\n')

# The summary dictionaries are not needed after printing.
del df_train_sessions_type_rate_means, df_train_sessions_type_rate_stds, df_test_sessions_type_rate_means, df_test_sessions_type_rate_stds

A natural sequence is clicking to a product, adding it to the cart and ordering it. It can be seen from the visualization of session 3 below that cart events lead to order events multiple times. However, this doesn't apply to all sessions.

In [None]:
def visualize_session(df, session, path=None):
    """
    Plot the sequence of event types of a single session over time.

    Parameters
    ----------
    df : pandas.DataFrame
        Event table with `session`, `ts` and `type` columns.
    session : int
        Id of the session to draw.
    path : str or path-like, optional
        When given, the figure is saved there and closed; otherwise it is
        shown inline.
    """
    df_session = df[df['session'] == session]
    event_types = df_session['type']
    n_clicks = (event_types == 0).sum()
    n_carts = (event_types == 1).sum()
    n_orders = (event_types == 2).sum()

    fig, ax = plt.subplots(figsize=(24, 6))
    # Type code on the y axis against the event timestamp on the x axis.
    ax.plot(df_session.set_index('ts')['type'], 'o-')
    ax.set_yticks(range(3), ['Click (0)', 'Cart (1)', 'Order (2)'])
    for axis_name in ('x', 'y'):
        ax.tick_params(axis=axis_name, labelsize=12.5)
    ax.set_xlabel('Timestamps', fontsize=15, labelpad=12.5)
    ax.set_ylabel('Events', fontsize=15, labelpad=12.5)
    title = f'''
    Session: {session}
    Clicks: {n_clicks} \n  Carts: {n_carts} \n Orders: {n_orders}
    '''
    ax.set_title(title, size=20, pad=15)

    if path is not None:
        plt.savefig(path)
        plt.close(fig)
    else:
        plt.show()


# Three example sessions from the training frame.
for _session_id in (
    3,
    39,   # starts with carts
    747,  # start with order
):
    visualize_session(df=df_train, session=_session_id)

Ground-truth of both the training and test sets can be created as long as a session has more than 1 timestep. Ground-truth of clicks and of the other events is created differently. The ground-truth click of a timestep is the next click in that session. The ground-truth of carts or orders of a timestep is the collection of all subsequent unique carts or orders in that session.

###### GROUND TRUTH 

In [None]:
def get_labels(aids, event_types):
    """
    Build the ground-truth label of every event of one session.

    The label of the event at position ``i`` describes what happens strictly
    after that event within the same session:

    * ``label[0]`` - aid of the next click,
    * ``label[1]`` - set of all aids added to the cart later on,
    * ``label[2]`` - set of all aids ordered later on.

    A target that does not exist is ``np.nan``. The next-click target used to
    be ``None`` in some labels and ``np.nan`` in the last one; it is now
    ``np.nan`` everywhere, so a single missing-value check covers all labels.

    Parameters
    ----------
    aids : sequence of int (pandas Series, numpy array or list)
        Product ids of the session's events, in chronological order.
    event_types : sequence of int
        Event type codes aligned with ``aids`` (0 = click, 1 = cart, 2 = order).

    Returns
    -------
    list of dict
        One ``{0: ..., 1: ..., 2: ...}`` dict per event, in the same order as
        the input. The last label contains only ``np.nan``. An empty session
        yields an empty list.
    """
    aids = np.asarray(aids)
    event_types = np.asarray(event_types)

    # No events -> no labels (previously a single all-NaN label was returned).
    if len(aids) == 0:
        return []

    next_click = np.nan
    future_carts = set()
    future_orders = set()
    labels = []

    # Walk the session backwards so that everything seen so far is "the future"
    # of the event currently being visited.
    for aid, event_type in zip(aids[::-1], event_types[::-1]):

        # Snapshot BEFORE registering the current event: the label must only
        # contain events that come strictly after it.
        labels.append({
            0: next_click,
            1: future_carts.copy() if future_carts else np.nan,
            2: future_orders.copy() if future_orders else np.nan,
        })

        if event_type == 0:
            next_click = aid
        elif event_type == 1:
            future_carts.add(aid)
        elif event_type == 2:
            future_orders.add(aid)

    # Labels were collected last-to-first; restore chronological order.
    return labels[::-1]

In [None]:
# Work on an explicit copy of the session's rows: adding the `label` column to
# a slice of `df_train` is a chained assignment (pandas warns about it, and
# the write may not land where intended).
df_session747 = df_train.loc[df_train['session'] == 747, :].copy()
session747_labels = get_labels(aids=df_session747['aid'], event_types=df_session747['type'])
# One label dict per event, stored as an object column.
df_session747['label'] = session747_labels
df_session747

Labels of session 747 are extracted with the function above. It can be seen that after a cart or order event, corresponding aid is dropped from the labels.

### MODELING THE DATA

#### MODEL 1 (absolutely simple)(wordtovec),(glove)?

In [16]:
from gensim.models import word2vec
from gensim.models import KeyedVectors

In [17]:
# Stack the train and test events row-wise into one frame. The original row
# labels of both frames are kept, so index values can repeat.
total = pd.concat((train, test))

In [18]:
# Display the combined train + test frame (pandas truncates the rendered table).
total

Unnamed: 0,session,aid,ts,type
0,0,1517085,1659304800025,clicks
1,0,1563459,1659304904511,clicks
2,0,1309446,1659367439426,clicks
3,0,16246,1659367719997,clicks
4,0,1781822,1659367871344,clicks
...,...,...,...,...
928110,13099776,1159407,1661844072138,clicks
928111,13099776,546448,1661844142618,clicks
928112,13099777,468584,1661795832787,clicks
928113,13099778,926609,1661795832939,clicks


In [19]:
count=0
corpus_clicks=[]
corpus_carts=[]
corpus_orders=[]
## Split the combined events into one frame per event type.
## Each group is selected by its label rather than by unpacking the groupby
## positionally: unpacking only worked because the labels happen to sort as
## carts < clicks < orders, and it breaks if a type is absent.
## get_group raises a KeyError if a label is missing.
_total_by_type = total.groupby('type')
carts = _total_by_type.get_group('carts')
clicks = _total_by_type.get_group('clicks')
orders = _total_by_type.get_group('orders')

In [20]:
def _session_sentences(events):
    """
    Return one "sentence" per session: the list of "<aid>_<type>" tokens of
    that session's rows, in row order.

    NOTE(review): assumes the rows of each session are already in
    chronological order — TODO confirm, otherwise sort by `ts` first.
    """
    return [
        list(group_df['aid'].astype(str) + '_' + group_df['type'])
        for _, group_df in events.groupby('session')
    ]


# The corpora are rebuilt from scratch here instead of appended to lists
# created in another cell, so re-running this cell no longer duplicates
# every sentence.
corpus_clicks = _session_sentences(clicks)
corpus_carts = _session_sentences(carts)
corpus_orders = _session_sentences(orders)

In [21]:
# Train one Word2Vec model per event type, all with the same hyper-parameters.
_W2V_PARAMS = dict(vector_size=100, window=5, min_count=1)
v_2_vec_clicks = word2vec.Word2Vec(sentences=corpus_clicks, **_W2V_PARAMS)
v_2_vec_carts = word2vec.Word2Vec(sentences=corpus_carts, **_W2V_PARAMS)
v_2_vec_orders = word2vec.Word2Vec(sentences=corpus_orders, **_W2V_PARAMS)

In [22]:
# Persist the word vectors of each model in binary word2vec format.
for _model, _file_name in (
    (v_2_vec_clicks, 'otto_v_2_vec_clicks.bin'),
    (v_2_vec_carts, 'otto_v_2_vec_carts.bin'),
    (v_2_vec_orders, 'otto_v_2_vec_orders.bin'),
):
    _model.wv.save_word2vec_format(_file_name, binary=True)

##### Predictions with word2vec model

In [23]:
# Split the test events into one frame per event type. Each group is selected
# by its label rather than by unpacking the groupby positionally, which relied
# on the alphabetical order of the labels and on all three types being present.
# get_group raises a KeyError if a label is missing.
_test_by_type = test.groupby('type')
carts_t = _test_by_type.get_group('carts')
clicks_t = _test_by_type.get_group('clicks')
orders_t = _test_by_type.get_group('orders')

In [24]:
# For every test session that has cart events, recommend the aids whose
# vectors are closest to the session's carted items.
all_aids_carts=[]
# Group by the plain column name: with a one-element list key, newer pandas
# yields tuple keys, which would turn the id into "(123,)_carts".
for session,group_df in carts_t.groupby('session'):
    query_tokens=list(group_df['aid'].astype(str)+"_"+group_df['type'])
    results_carts=v_2_vec_carts.wv.most_similar(positive=query_tokens,topn=20)
    aids_carts=[]
    for res in results_carts:
        # Tokens look like "<aid>_<type>"; keep the aid part only.
        aid=res[0].split('_')[0]
        if aid not in aids_carts:
            aids_carts.append(aid)
    # Record the session even when fewer than 20 neighbours were returned
    # (such sessions used to be dropped silently).
    sesh_id=f"{session}_carts"
    all_aids_carts.append([sesh_id,aids_carts])

In [None]:
# For every test session that has click events, recommend the aids whose
# vectors are closest to the session's clicked items.
all_aids_clicks=[]
# Group by the plain column name: with a one-element list key, newer pandas
# yields tuple keys, which would turn the id into "(123,)_clicks".
for session,group_df in clicks_t.groupby('session'):
    query_tokens=list(group_df['aid'].astype(str)+"_"+group_df['type'])
    results_clicks=v_2_vec_clicks.wv.most_similar(positive=query_tokens,topn=20)
    aids_clicks=[]
    for res in results_clicks:
        # Tokens look like "<aid>_<type>"; keep the aid part only.
        aid=res[0].split('_')[0]
        if aid not in aids_clicks:
            aids_clicks.append(aid)
    # Record the session even when fewer than 20 neighbours were returned
    # (such sessions used to be dropped silently).
    sesh_id=f"{session}_clicks"
    all_aids_clicks.append([sesh_id,aids_clicks])

In [None]:
# For every test session that has order events, recommend the aids whose
# vectors are closest to the session's ordered items.
all_aids_orders=[]
# Group by the plain column name: with a one-element list key, newer pandas
# yields tuple keys, which would turn the id into "(123,)_orders".
for session,group_df in orders_t.groupby('session'):
    query_tokens=list(group_df['aid'].astype(str)+"_"+group_df['type'])
    results_orders=v_2_vec_orders.wv.most_similar(positive=query_tokens,topn=20)
    aids_orders=[]
    for res in results_orders:
        # Tokens look like "<aid>_<type>"; keep the aid part only.
        aid=res[0].split('_')[0]
        if aid not in aids_orders:
            # Fixed: the misspelled `aids_orderss` raised a NameError here.
            aids_orders.append(aid)
    # Fixed: order predictions were labelled "<session>_carts".
    # Sessions with fewer than 20 neighbours are recorded as well.
    sesh_id=f"{session}_orders"
    all_aids_orders.append([sesh_id,aids_orders])

In [None]:
# Dump each prediction list to its own text file, one [session_id, aids]
# entry per line, and print 'Done' after each file.
for _file_name, _predictions in (
    (r'carts.txt', all_aids_carts),
    (r'clicks.txt', all_aids_clicks),
    (r'orders.txt', all_aids_orders),
):
    with open(_file_name, 'w') as fp:
        for item in _predictions:
            # write each item on a new line
            fp.write("%s\n" % item)
        print('Done')

In [None]:
# NOTE(review): `all_aids` is created empty and printed immediately, so this
# cell always prints an empty list — it looks like a placeholder; confirm
# whether the per-type prediction lists were meant to be combined here.
all_aids=[]
print("Output :",all_aids)

#### Model 2

The test data contains truncated session data similar to that of the training data. The task is to predict the next aid clicked after the session truncation, as well as the remaining aids that are added to carts and orders; you may predict up to 20 values for each session type.

Submissions are evaluated on Recall for each action type, and the three recall values are weight-averaged: {'clicks': 0.10, 'carts': 0.30, 'orders': 0.60}. It is important to get the 'orders' predictions correct as they carry most of the weighting.

For each session in the test data, we have to predict the aid values for each type that occur after the last timestamp i.e predict what occurs after the point of truncation.

For clicks there is only a single ground truth value for each session, which is the next aid clicked during the session . The ground truth for carts and orders contains all aid values that were added to a cart and ordered respectively during the session.

This baseline will use the fact that people will often interact with articles they have previously interacted with. The prediction will consist of the top 20 most frequent articles in the session. If there are fewer than 20 articles in the session, the prediction will be padded with the most frequent articles in the training data.

In [None]:
from collections import Counter
from tqdm.notebook import tqdm
DATA_PATH = pathlib.Path('../input/otto-recommender-system')
TEST_PATH = DATA_PATH/'test.jsonl'
TRAIN_PATH = DATA_PATH/'train.jsonl'
from heapq import nlargest

In [None]:
sample_size = 150000
# Save time by not using all the data: only the first chunks are read.
max_chunks = 3

clicks_article_list = []
carts_article_list = []
orders_article_list = []

# The reader is used as a context manager so the file handle is closed
# once the sample has been collected.
with pd.read_json(TRAIN_PATH, lines=True, chunksize = sample_size) as chunks:

    # zip() stops as soon as range() is exhausted, so no further chunk is
    # parsed (the previous `if e > 2: break` parsed a whole extra chunk of
    # `sample_size` sessions just to discard it).
    for e, c in zip(range(max_chunks), chunks):

        sample_train_df = c

        # Iterate the `events` column directly; each value is the list of
        # event dicts of one session.
        for actions in c['events']:
            for action in actions:
                if action['type'] == 'clicks':
                    clicks_article_list.append(action['aid'])
                elif action['type'] == 'carts':
                    carts_article_list.append(action['aid'])
                else:
                    # Every other event type is counted as an order.
                    orders_article_list.append(action['aid'])

# 20 most frequent articles per event type within the sample.
article_click_freq = Counter(clicks_article_list)
article_carts_freq = Counter(carts_article_list)
article_order_freq = Counter(orders_article_list)
top_click_article = nlargest(20, article_click_freq, key = article_click_freq.get)
top_carts_article = nlargest(20, article_carts_freq, key = article_carts_freq.get)
top_order_article = nlargest(20, article_order_freq, key = article_order_freq.get)
# Note the key for orders is 'order' (singular).
frequent_articles = {'clicks': top_click_article, 'carts':top_carts_article, 'order':top_order_article}

In [None]:
# Maximum number of articles predicted per (session, action type).
n_predictions = 20

preds = []

# NOTE(review): total=1671 is presumably the number of 1000-session chunks in
# the test file — confirm; it only affects the progress bar.
with pd.read_json(TEST_PATH, lines=True, chunksize=1000) as test_data:

    for chunk in tqdm(test_data, total=1671):

        # Each value of the `events` column is the list of event dicts of one
        # session.
        for actions in chunk['events']:
            article_freq = Counter(action['aid'] for action in actions)
            top_articles = nlargest(n_predictions, article_freq, key = article_freq.get)
            session_articles = set(top_articles)
            n_missing = n_predictions - len(top_articles)

            # One prediction string per action type, in the order
            # clicks, carts, order.
            for action in ['clicks', 'carts', 'order']:
                # Pad with the MOST popular training articles of this type
                # that are not already predicted. Three defects are fixed:
                #  * a `[-0:]` slice added 20 extra items to full sessions,
                #  * the padding accumulated across the three action types,
                #  * the slice took the least popular end of the list.
                padding = [
                    aid for aid in frequent_articles[action]
                    if aid not in session_articles
                ][:n_missing]
                preds.append(" ".join(str(aid) for aid in top_articles + padding))

preds has the required predictions