In [1]:
import numpy as np
import pandas as pd
import seaborn as sn
import scipy
import urllib, json
import os
from pathlib import Path
from glob import glob
from tqdm import tqdm
from scipy.sparse.linalg import svds
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import matplotlib.pyplot as plt

In [2]:
# Source data: https://www.kaggle.com/competitions/otto-recommender-system/data?select=train.jsonl
# Set the OTTO_TRAIN_PATH environment variable to point at your own copy of
# train.jsonl; the original author's local path is kept as the fallback.
path = os.environ.get("OTTO_TRAIN_PATH", "C:/Users/AdamS/Downloads/train.jsonl")

In [3]:
def loader(path, chunksize=100_000, max_chunks=2):
    """Flatten the head of a JSON-lines session file into one row per event.

    Each input line is expected to hold a ``session`` value and an ``events``
    list whose items are dicts with ``aid``, ``ts`` and ``type`` keys.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.jsonl`` file.
    chunksize : int, default 100_000
        Number of lines parsed per chunk.
    max_chunks : int or None, default 2
        How many chunks to read from the start of the file. ``None`` reads
        the whole file.

    Returns
    -------
    pandas.DataFrame
        Columns ``session``, ``aid``, ``ts``, ``type`` with a fresh
        0..n-1 index, one row per event.
    """
    from itertools import islice

    sessions, aids, timestamps, types = [], [], [], []

    # The reader is used as a context manager so the file handle is closed
    # even when we stop early; islice stops *before* parsing an extra chunk.
    with pd.read_json(path, lines=True, chunksize=chunksize) as reader:
        for chunk in islice(reader, max_chunks):
            for session, events in zip(chunk['session'].tolist(), chunk['events'].tolist()):
                for event in events:
                    sessions.append(session)
                    aids.append(event['aid'])
                    timestamps.append(event['ts'])
                    types.append(event['type'])

    # Build the frame once instead of concatenating per chunk, which would
    # re-copy everything accumulated so far on every iteration.
    return pd.DataFrame({
        'session': sessions,
        'aid': aids,
        'ts': timestamps,
        'type': types,
    })

In [4]:
# Event-level frame (columns: session, aid, ts, type) built by loader() from the file at `path`.
df = loader(path)

In [5]:
# One 0/1 indicator column per event type, so that a later groupby-sum yields
# per-(session, aid) click / cart / order counts.
# The indicators are derived directly from the comparison instead of copying
# the string `type` column and overwriting it through `.values[:]`: that
# approach leaves object-dtype columns and depends on writing into the
# frame's backing array, which is not writable under pandas Copy-on-Write.
df2 = df.copy()
df2['clickCount'] = (df2['type'] == 'clicks').astype('int64')
df2['cartCount'] = (df2['type'] == 'carts').astype('int64')
df2['orderCount'] = (df2['type'] == 'orders').astype('int64')

In [7]:
# Collapse events to one row per (session, aid) with summed indicator counts.
# The original cell also called sort_values('orderCount') and discarded the
# result; that was a no-op on dfMack, so it is dropped here and the row order
# (sorted by the groupby keys) is unchanged.
dfMack = (
    df2.groupby(['session', 'aid'])
    .agg({'clickCount': 'sum', 'cartCount': 'sum', 'orderCount': 'sum'})
    .reset_index()
)
# Weighted interaction score: click = 1, cart = 2, order = 4.
dfMack['interactionCount'] = dfMack.clickCount + dfMack.cartCount * 2 + dfMack.orderCount * 4

In [8]:
# Work on the first 100000 (session, aid) rows only, then pivot them into a
# session x aid matrix of interaction scores (0.0 where there was no interaction).
dfShort = dfMack.iloc[:100000].copy()
df6 = (
    dfShort
    .groupby(['session', 'aid'])['interactionCount']
    .first()
    .unstack(fill_value=0.0)
)

In [9]:
# Truncated SVD of the session x aid matrix, keeping 32 singular triplets.
df6Num = df6.to_numpy().astype(float)
U, sigma, Vt = svds(df6Num, k=32)
# Show the factor shapes as the cell output.
tuple(factor.shape for factor in (U, sigma, Vt))

((2302, 32), (32,), (32, 73139))

In [10]:
# Rebuild the low-rank approximation U * diag(sigma) * Vt and label it with
# the same session index / aid columns as the matrix that was factorised.
sigma_diag_matrix = np.diag(sigma)
train_full_matrix = U @ sigma_diag_matrix @ Vt
prediction_train_full = pd.DataFrame(
    train_full_matrix,
    index=df6.index,
    columns=df6.columns,
)

# Session x aid lookup tables of raw order counts and interaction scores.
user_orders = (
    dfShort
    .groupby(['session', 'aid'])['orderCount']
    .first()
    .unstack(fill_value=0.0)
)
user_interaction = (
    dfShort
    .groupby(['session', 'aid'])['interactionCount']
    .first()
    .unstack(fill_value=0.0)
)

In [12]:
# Per-item popularity: total orders and total interaction score across all
# sessions, ordered by interaction score, plus a rank for each measure
# (rank 1 = highest total).
dfRanking = (
    dfMack
    .groupby('aid')
    .agg(
        orderCount=('orderCount', 'sum'),
        interactionCount=('interactionCount', 'sum'),
    )
    .sort_values(by='interactionCount', ascending=False)
    .assign(
        orderRank=lambda totals: totals['orderCount'].rank(ascending=False),
        interactionRank=lambda totals: totals['interactionCount'].rank(ascending=False),
    )
)

In [14]:
# Build one row of recommendations per session.
# NOTE: get_products_rec must already be defined when this cell runs.
# Iterate over the index *labels* (session ids): get_products_rec looks rows
# up with .loc, so feeding it positions from range(len(...)) is only correct
# when the session ids happen to be exactly 0..n-1.
recs = [get_products_rec(session_id) for session_id in prediction_train_full.index]
df_user_recs = pd.DataFrame(recs, index=prediction_train_full.index)
# Name the columns from the actual width instead of assuming exactly ten:
# each recommendation set can hold fewer items, and shorter rows are padded
# with NaN. get_products_rec returns a set, so item1..itemN carry no ranking.
df_user_recs.columns = [f'item{pos}' for pos in range(1, df_user_recs.shape[1] + 1)]
df_user_recs

Unnamed: 0,item1,item2,item3,item4,item5,item6,item7,item8,item9,item10
0,1700164,1252357,1022566,351335,1761069,393073,166037,403257,1603001,80222.0
1,301441,1022566,351335,705229,393073,166037,1603001,335674,80222,131391.0
2,1022566,351335,1274120,500553,1603001,628141,833777,166037,953177,80222.0
3,1610178,1252357,1022566,351335,1072782,393073,166037,403257,1603001,80222.0
4,1252357,1022566,351335,705229,1072782,393073,166037,403257,1603001,80222.0
...,...,...,...,...,...,...,...,...,...,...
2297,1252357,1022566,351335,1072782,393073,166037,403257,1603001,335674,80222.0
2298,1252357,1022566,351335,1350828,1072782,393073,166037,403257,1603001,80222.0
2299,1022566,351335,29735,393073,1128786,166037,1019736,1603001,493115,80222.0
2300,1156034,1057154,1022566,351335,552688,969875,884244,166037,1603001,80222.0


In [13]:
def get_products_rec(userId, n_rec=5, n_popular=5, order_threshold=5):
    """Recommend items for one session.

    Combines the ``n_rec`` highest-scoring items from the reconstructed
    matrix ``prediction_train_full`` with the ``n_popular`` best-ranked items
    by ``dfRanking['orderRank']``, then removes items the session has already
    ordered more than ``order_threshold`` times.

    Parameters
    ----------
    userId :
        Index label (looked up with ``.loc``) in ``user_orders`` and
        ``prediction_train_full``.
    n_rec : int, default 5
        Number of top predicted items to include.
    n_popular : int, default 5
        Number of globally most-ordered items to include.
    order_threshold : int or float, default 5
        Items whose order count for this session is strictly greater than
        this value are excluded.

    Returns
    -------
    set
        Unordered set of recommended item ids.

    Raises
    ------
    KeyError
        If ``userId`` is not a label in the lookup tables.
    """
    # NOTE(review): with the default threshold only items ordered more than
    # five times in a session are filtered out; if the intent is "anything
    # already ordered", pass order_threshold=0 — confirm.
    ordered_counts = user_orders.loc[userId]
    user_items_ordered = ordered_counts[ordered_counts > order_threshold].index

    items_rec = (
        prediction_train_full.loc[userId]
        .sort_values(ascending=False)
        .head(n_rec)
        .index
    )
    most_popular = dfRanking.sort_values('orderRank', ascending=True).head(n_popular).index

    return set(items_rec).union(most_popular) - set(user_items_ordered)