In [1]:
import pandas as pd
import numpy as np
from implicit.nearest_neighbours import BM25Recommender
import scipy.sparse as sp

In [2]:
# Load the four competition tables in one pass; the order of the file names
# matches the order of the target variables on the left-hand side.
sample_submission, interactions, items, users = map(
    pd.read_csv,
    ('sample_submission.csv', 'interactions.csv', 'items.csv', 'users.csv'),
)

In [3]:
# Collapse repeated (user_id, item_id) pairs into a single row each.
# `keep=False` flags every member of a duplicated pair, not only the later ones.
repeat = interactions.duplicated(subset=['user_id', 'item_id'], keep=False)
interactions_repeat = interactions[repeat]
interactions = interactions[~repeat]

# For a repeated pair keep the furthest progress, the highest rating and the
# earliest start date. All three aggregations are order-independent, so no
# pre-sorting is needed (groupby already returns the keys in sorted order).
interactions_repeat = interactions_repeat.groupby(['user_id', 'item_id']).agg({
    'progress': 'max',
    'rating': 'max',
    'start_date': 'min'})

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and yields the same rows with a fresh RangeIndex.
interactions = pd.concat([interactions, interactions_repeat.reset_index()],
                         ignore_index=True)
interactions

Unnamed: 0,user_id,item_id,progress,rating,start_date
0,90133,82910,100,,2018-01-01
1,159130,331068,70,,2018-01-01
2,80061,26540,69,4.0,2018-01-01
3,12811,301895,16,,2018-01-01
4,5778,127872,100,,2018-01-01
...,...,...,...,...,...
1562500,160045,291585,85,,2018-02-25
1562501,161224,7819,58,,2018-04-25
1562502,163719,27040,99,,2018-05-11
1562503,165754,303933,42,0.0,2018-08-09


In [4]:
# Users listed in `users` that never appear in `interactions`.
diff = np.setdiff1d(users['user_id'], interactions['user_id'].unique())
num_users = len(users['user_id'])
# Remove them with one vectorised mask. Dropping row by row re-copies the frame
# on every hit and relies on the enumerate position being equal to the index
# label, which only holds for a default RangeIndex.
users = users[~users['user_id'].isin(diff)]

In [5]:
# Users that have interactions but no row in `users`.
diff = np.setdiff1d(interactions['user_id'].unique(), users['user_id'])
# Give them placeholder rows: `user_id` is taken from `diff`, every other column
# of `users` is filled with the string 'unknown'. `reindex` keeps the column
# order of `users` and works for any number of feature columns.
users_with_interactions = (
    pd.DataFrame({'user_id': diff})
    .reindex(columns=users.columns, fill_value='unknown')
)
# `DataFrame.append` was removed in pandas 2.0; use `pd.concat`.
# `ignore_index=True` avoids duplicated index labels in the combined frame.
users = pd.concat([users, users_with_interactions], ignore_index=True)

In [6]:
# Number of interaction rows per user, computed in a single pass over
# `interactions` instead of one full boolean scan per user.
interaction_counts = interactions['user_id'].value_counts()
num_interactions = (
    users['user_id']
    .map(interaction_counts)
    .fillna(0)          # users absent from `interactions` have zero rows
    .astype(int)
    .tolist()
)
users['num_its'] = num_interactions

In [7]:
class Popular_Recommender():
    """Recommend the most frequent items within a recent time window.

    After `fit`, `recommendations` holds up to `max_K` item ids ordered by
    how many rows they have in the window, most frequent first.
    """

    def __init__(self, max_K=100, days=30, item_column='item_id', dt_column='date'):
        """Store the configuration.

        Parameters
        ----------
        max_K : int
            Maximum number of items kept in `recommendations`.
        days : int
            Length of the window, counted back from the latest date in the data.
        item_column : str
            Name of the column holding item ids.
        dt_column : str
            Name of the column holding the interaction date.
        """
        self.max_K = max_K
        self.days = days
        self.item_column = item_column
        self.dt_column = dt_column
        self.recommendations = []

    def fit(self, df):
        """Compute the most frequent items of the last `days` days of `df`.

        The window starts strictly after (latest date, truncated to midnight)
        minus `days` days. Returns `self` so calls can be chained.
        """
        # Parse the date column once and reuse it for both the cutoff and the mask.
        dates = pd.to_datetime(df[self.dt_column])
        min_date = dates.max().normalize() - pd.DateOffset(days=self.days)
        recent_items = df.loc[dates > min_date, self.item_column]
        self.recommendations = recent_items.value_counts().head(self.max_K).index.values
        return self

In [8]:
# Popularity baseline over the last 10 days of interactions; its first ten
# items are the fallback list for users the collaborative model cannot serve.
pr = Popular_Recommender(dt_column='start_date', days=10)
pr.fit(interactions)
populars = [item for item in pr.recommendations[:10]]

In [9]:
n_interactions = 4

# One dictionary lookup per submission user instead of recomputing
# `users['user_id'].unique()` and scanning the whole frame on every iteration.
num_its_by_user = dict(zip(users['user_id'], users['num_its']))

# A submission user is "dropped" (served by the popularity fallback) when it is
# unknown to `users` or has fewer than `n_interactions` interactions.
# Order and duplicates follow `sample_submission['Id']`.
dropped_users = [
    user for user in sample_submission['Id']
    if user not in num_its_by_user or num_its_by_user[user] < n_interactions
]

In [10]:
# Submission users that were not sent to the popularity fallback.
submission_ids = sample_submission['Id']
kept_ids = submission_ids[~submission_ids.isin(dropped_users)]
remained_users = users[users['user_id'].isin(kept_ids)]

In [11]:
# (users served by the model, users served by the popularity fallback)
tuple(len(group) for group in (remained_users, dropped_users))

(2792, 282)

In [12]:
def get_coo_matrix(interactions,
                   users_mapping,
                   items_mapping,
                   user_col='user_id',
                   item_col='item_id',
                   weight_col=None):
    """Build a sparse user-by-item matrix from an interactions table.

    Parameters
    ----------
    interactions : pandas.DataFrame
        One row per interaction; must contain `user_col` and `item_col`
        (and `weight_col` when it is given).
    users_mapping, items_mapping : dict
        Map raw user / item ids to integer row / column indices.
    user_col, item_col : str
        Column names of the user and item ids.
    weight_col : str or None
        Column used as the cell value; every interaction weighs 1 when None.

    Returns
    -------
    scipy.sparse.coo_matrix
        float32 matrix whose shape covers every index present in the mappings,
        not only the indices that occur in `interactions`.

    Raises
    ------
    ValueError
        If an id in `interactions` has no entry in the corresponding mapping.
    """
    if weight_col is None:
        weights = np.ones(len(interactions), dtype=np.float32)
    else:
        weights = interactions[weight_col].to_numpy(dtype=np.float32)

    row_idx = interactions[user_col].map(users_mapping.get)
    col_idx = interactions[item_col].map(items_mapping.get)

    # Ids missing from a mapping come back as NaN; fail loudly instead of
    # handing invalid indices to scipy.
    if row_idx.isna().any():
        raise ValueError(f"'{user_col}' contains ids that are missing from users_mapping")
    if col_idx.isna().any():
        raise ValueError(f"'{item_col}' contains ids that are missing from items_mapping")

    # Size the matrix from the mappings. Without an explicit shape scipy infers
    # it from the largest index seen, so users/items at the end of a mapping
    # that have no interactions would be cut off.
    shape = (
        max(users_mapping.values(), default=-1) + 1,
        max(items_mapping.values(), default=-1) + 1,
    )

    interaction_matrix = sp.coo_matrix(
        (
            weights,
            (
                row_idx.to_numpy(dtype=np.int64),
                col_idx.to_numpy(dtype=np.int64),
            ),
        ),
        shape=shape,
    )
    return interaction_matrix

def make_mapping(data):
    """Map each value in `data` to its position; a repeated value keeps its last position."""
    return {value: position for position, value in enumerate(data)}

In [13]:
# Raw id -> matrix index. Users come from the interactions table, items from
# the item catalogue.
users_mapping = make_mapping(interactions['user_id'].unique())
items_mapping = make_mapping(items['id'].unique())

# Matrix index -> raw item id, used to translate model output back to item ids.
items_inv_mapping = {index: item_id for item_id, index in items_mapping.items()}

In [14]:
# Unweighted user-by-item matrix, converted from COO to CSR.
interactions_coo = get_coo_matrix(interactions, users_mapping, items_mapping)
interactions_matrix = interactions_coo.tocsr()

In [15]:
# Item-to-item nearest-neighbour model with BM25 weighting, 10 neighbours per item.
imp_model = BM25Recommender(K=10)
# The model is fitted on the transposed (item-by-user) matrix.
# NOTE(review): this looks like the calling convention of implicit < 0.5; newer
# releases appear to expect the user-by-item matrix here — confirm against the
# installed version before upgrading.
imp_model.fit(interactions_matrix.T)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=63758.0), HTML(value='')))




In [16]:
top_n = 10

# Ask the model for `top_n` items per served user, skipping items the user has
# already interacted with, and translate matrix indices back to raw item ids.
# Each returned recommendation is indexed with [0] to get the item index.
total_preds = {}
for user in remained_users['user_id']:
    user_idx = users_mapping[user]
    recs = imp_model.recommend(user_idx, interactions_matrix,
                               N=top_n, filter_already_liked_items=True)
    total_preds[user] = [items_inv_mapping[rec[0]] for rec in recs]

In [17]:
# Every dropped user gets the same popular-items list (one shared list object).
total_preds.update(dict.fromkeys(dropped_users, populars))

In [18]:
# One row per submission id; predictions are written as space-separated item ids.
submission = pd.DataFrame({'Id': sample_submission['Id']})
submission['Predicted'] = [
    ' '.join(str(item) for item in total_preds[user])
    for user in submission['Id']
]

In [19]:
# Persist the submission without the DataFrame index column.
output_path = 'submission_4.csv'
submission.to_csv(output_path, index=False)