In [1]:
import numpy as np
import pandas as pd
import reco
from tqdm import tqdm
import datetime
from collections import Counter

In [2]:
# Load the purchase log: article_id is kept as text instead of being parsed
# as a number, and t_dat is parsed into datetimes so it can be range-filtered.
transactions = pd.read_csv(
    "./data/transactions_train.csv",
    parse_dates=['t_dat'],
    dtype={'article_id': str},
)

In [3]:
# Slice the log into four consecutive windows, newest first. Each window is
# half-open: start <= t_dat < end.
# NOTE(review): the first window covers 7 calendar days while the other three
# cover 8 each — confirm the uneven lengths are intended.
week_1_transactions, week_2_transactions, week_3_transactions, week_4_transactions = (
    transactions[(transactions["t_dat"] >= window_start) & (transactions['t_dat'] < window_end)]
    for window_start, window_end in (
        ('2020-9-16', '2020-9-23'),
        ('2020-9-8', '2020-9-16'),
        ('2020-8-31', '2020-9-8'),
        ('2020-8-23', '2020-8-31'),
    )
)

In [4]:
# The two newest windows, stacked row-wise (newest window first).
recent_transactions = pd.concat(
    [
        week_1_transactions,
        week_2_transactions,
    ]
)

In [5]:
# From here on `transactions` holds only the four windows above, stacked
# row-wise with the newest window first; the full log is no longer reachable
# under this name.
transactions = pd.concat(
    [
        week_1_transactions,
        week_2_transactions,
        week_3_transactions,
        week_4_transactions,
    ]
)

In [6]:
def get_most_freq_next_item(user_group):
    """Map each item to the item that most often directly follows it.

    Parameters
    ----------
    user_group : mapping-like (needs ``.keys()`` and ``[]``)
        For every user, the ordered list of items that user bought.

    Returns
    -------
    dict
        ``{item: successor}``. An item is included only when at least 5
        different-item successors were observed for it and the most common
        successor accounts for at least 10% of them.
    """
    followers = {}
    for customer in tqdm(user_group.keys()):
        basket = user_group[customer]
        # Walk consecutive (current, successor) pairs of the basket.
        for current, successor in zip(basket, basket[1:]):
            observed = followers.setdefault(current, [])
            # An item followed by itself is not counted as a successor.
            if current != successor:
                observed.append(successor)

    pred_next = {}
    for article, successors in followers.items():
        total = len(successors)
        if total < 5:
            continue
        top_article, top_count = Counter(successors).most_common()[0]
        if top_count / total >= 0.1:
            pred_next[article] = top_article

    return pred_next

In [7]:
# `transactions` was concatenated newest-week-first, and groupby keeps the
# original row order inside each group. Sort by purchase date first so every
# customer's list is chronological and "next item" really means "bought
# afterwards". A stable sort keeps the existing order of same-day purchases.
user_group = (
    transactions
    .sort_values('t_dat', kind='mergesort')
    .groupby(['customer_id'])['article_id']
    .apply(list)
)
pred_next = get_most_freq_next_item(user_group)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250619/250619 [00:01<00:00, 186413.06it/s]


In [8]:
# For each weekly window: customer_id -> list of article_ids bought in that window.
(
    week_1_purchase_per_user,
    week_2_purchase_per_user,
    week_3_purchase_per_user,
    week_4_purchase_per_user,
) = (
    weekly_frame.groupby(['customer_id'])['article_id'].apply(list)
    for weekly_frame in (
        week_1_transactions,
        week_2_transactions,
        week_3_transactions,
        week_4_transactions,
    )
)

In [9]:
# Recency weight per purchase: 1 / (whole days between the purchase and
# 2020-09-23). Computed column-wise instead of one Python call per row.
recent_transactions['popularity_factor'] = 1 / (
    datetime.datetime(2020, 9, 23) - recent_transactions['t_dat']
).dt.days
recent_popular_items_group = recent_transactions.groupby(['article_id'])['popularity_factor'].sum()

# Article ids ordered by summed weight, highest first; equal weights are
# ordered by article id, descending. Built without assigning to `_`, which
# IPython uses for the previous cell output.
recent_popular_items = tuple(
    article_id
    for _weight, article_id in sorted(
        zip(recent_popular_items_group, recent_popular_items_group.keys()),
        reverse=True,
    )
)

In [10]:
# Squared recency weight per purchase: 1 / (whole days before 2020-09-23) ** 2,
# then summed per article.
transactions['popularity_factor'] = 1 / (
    datetime.datetime(2020, 9, 23) - transactions['t_dat']
).dt.days ** 2
popular_items_group = transactions.groupby(['article_id'])['popularity_factor'].sum()

# One row per (customer, article); after the sum, `rating` is the purchase count.
# numeric_only=True keeps the datetime column t_dat out of the sum — datetimes
# cannot be summed and recent pandas versions raise instead of dropping them.
transactions['rating'] = 1
transactions = (
    transactions
    .groupby(['customer_id', 'article_id'])
    .sum(numeric_only=True)
    .reset_index()
    .drop(columns=['price', 'sales_channel_id'])
)

# Rating = purchase count divided by the article's summed popularity weight,
# capped at 5.0. The per-article weight is looked up with a vectorised map.
transactions['rating'] = (
    transactions['rating'] / transactions['article_id'].map(popular_items_group)
).clip(upper=5.0)

In [11]:
from reco.recommender import FunkSVD

# Factorisation settings and the column-role mapping handed to fit().
# NOTE(review): method='stochastic' is used and no seed is set in this cell —
# results may differ between runs; confirm whether FunkSVD can be seeded.
funk_svd_settings = {
    'k': 8,
    'learning_rate': 0.005,
    'regularizer': 0.01,
    'iterations': 200,
    'method': 'stochastic',
    'bias': True,
}
column_roles = {'user': 'customer_id', 'item': 'article_id', 'value': 'rating'}

model = FunkSVD(**funk_svd_settings)
model.fit(X=transactions, formatizer=column_roles, verbose=False)

In [12]:
# Submission template; its customer_id column drives the prediction loop.
submission = pd.read_csv(
    "./data/sample_submission.csv",
)

In [13]:
def get_most_frequent_items_bought(model, user, userindexes, purchase_per_user):
    """Rank the articles one customer bought, best-scored first.

    The customer's 20 most frequently bought articles are scored with the
    factorisation model (user factors . item factors + item bias). Articles
    the model cannot score fall back to the customer's own purchase count.

    Parameters
    ----------
    model : object
        Must expose ``items`` (supports ``.index``), ``userfeatures``,
        ``itemfeatures`` and ``item_bias`` (indexed as ``[0, item_index]``).
    user : hashable
        Customer id; must be a key of both ``userindexes`` and
        ``purchase_per_user``.
    userindexes : mapping
        Customer id -> row index into ``model.userfeatures``.
    purchase_per_user : mapping
        Customer id -> list of purchased article ids.

    Returns
    -------
    list
        At most 12 article ids, highest score first.

    Raises
    ------
    KeyError
        If ``user`` is missing from ``userindexes`` or ``purchase_per_user``.
    """
    purchase_counts = dict(Counter(purchase_per_user[user]).most_common())
    user_index = userindexes[user]

    scores = {}
    # most_common() yields descending counts, so this keeps the 20 most bought.
    for article in list(purchase_counts)[:20]:
        try:
            itemindex = model.items.index(article)
            score = np.dot(model.userfeatures[user_index], model.itemfeatures[itemindex].T) + model.item_bias[0, itemindex]
        except (ValueError, IndexError, KeyError):
            # Article (or its factors) unknown to the model: use the raw count.
            score = purchase_counts[article]
        scores[article] = score

    # Highest score first. The previous ascending sort returned the
    # lowest-scored articles and cut off the best ones at 12.
    ranked = sorted(scores, key=scores.get, reverse=True)
    return ranked[:12]

In [14]:
def _extend_unique(target, candidates, limit):
    """Append candidates not yet in target, in order, until target holds limit items."""
    for candidate in candidates:
        if len(target) >= limit:
            break
        if candidate not in target:
            target.append(candidate)


outputs = []
popular_items = list(recent_popular_items)
userindexes = {known_user: position for position, known_user in enumerate(model.users)}

# Newest window first, so the most recent purchases take the leading slots.
weekly_purchases = (
    week_1_purchase_per_user,
    week_2_purchase_per_user,
    week_3_purchase_per_user,
    week_4_purchase_per_user,
)
max_items = 12  # number of articles emitted per customer

for user in tqdm(submission['customer_id']):
    user_output = []

    # 1) The customer's own purchases, window by window, without repeats.
    for purchase_per_user in weekly_purchases:
        if len(user_output) >= max_items:
            break
        if user in purchase_per_user:
            _extend_unique(
                user_output,
                get_most_frequent_items_bought(model, user, userindexes, purchase_per_user),
                max_items,
            )

    # 2) Frequent follow-up articles of what is already listed. The list is
    #    built first so the items appended here are not themselves expanded.
    follow_ups = [pred_next[item] for item in user_output if item in pred_next]
    _extend_unique(user_output, follow_ups, max_items)

    # 3) Fill the remaining slots with recently popular articles. Slicing with
    #    `12 - len(user_output)` went negative for long lists and appended
    #    almost the entire popularity ranking; filling up to the cap avoids that.
    _extend_unique(user_output, popular_items, max_items)

    outputs.append(user_output)

str_outputs = [" ".join(str(x) for x in output) for output in outputs]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1371980/1371980 [06:35<00:00, 3470.91it/s]


In [15]:
# Attach the space-joined predictions and write the file without the index column.
submission = submission.assign(prediction=str_outputs)
submission.to_csv(
    "./submission/svd.csv",
    index=False,
)