In [24]:
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm


class EASE:
    """Item-item linear recommender with a closed-form, L2-regularised fit.

    ``fit`` builds a sparse user-item matrix X, inverts the regularised item
    Gram matrix and derives an item-item weight matrix B with a zero diagonal.
    ``predict`` ranks, per user, the items that user has not interacted with,
    using the dense score matrix X @ B computed during ``fit``.
    """

    def __init__(self):
        # Map raw user / item ids to contiguous integer codes (matrix indices).
        self.user_enc = LabelEncoder()
        self.item_enc = LabelEncoder()

    def _get_users_and_items(self, df):
        """Fit both encoders on ``df`` and return the encoded user and item codes."""
        users = self.user_enc.fit_transform(df.loc[:, 'user_id'])
        items = self.item_enc.fit_transform(df.loc[:, 'item_id'])
        return users, items

    def fit(self, df, lambda_: float = 0.5, implicit=True):
        """
        Fit the item-item weights and cache the dense score matrix.

        df: pandas.DataFrame with columns user_id, item_id and (rating)
        lambda_: l2-regularization term added to the diagonal of the Gram matrix
        implicit: if True, ratings are ignored and taken as 1, else ratings
            divided by the maximum rating are used

        Sets ``self.X`` (sparse user-item matrix), ``self.B`` (item-item
        weights) and ``self.pred`` (``X @ B``, one row of scores per user).
        """
        users, items = self._get_users_and_items(df)
        values = np.ones(df.shape[0]) if implicit else df['rating'].to_numpy() / df['rating'].max()

        # Repeated (user, item) rows are summed by the sparse constructor.
        X = csr_matrix((values, (users, items)))
        self.X = X

        # Dense items x items Gram matrix with the ridge term on the diagonal.
        G = X.T.dot(X).toarray()
        diagIndices = np.diag_indices(G.shape[0])
        G[diagIndices] += lambda_
        P = np.linalg.inv(G)
        # Column j is divided by -P[j, j]; the diagonal is then zeroed so an
        # item never contributes to its own score.
        B = P / (-np.diag(P))
        B[diagIndices] = 0

        self.B = B
        self.pred = X.dot(B)

    def predict(self, train, users, items, k):
        """
        Rank unseen items for every requested user that has history in ``train``.

        train: DataFrame with user_id and item_id columns; its rows determine
            which items each user has already seen. Its ids are passed through
            the fitted encoders, so they are expected to be labels seen in ``fit``.
        users: iterable of raw user ids to score; ids without rows in ``train``
            produce no output rows.
        items: iterable of raw item ids to choose recommendations from.
        k: maximum number of rows returned per user (``None`` means no limit).
            It is clamped to the number of unseen candidates, so passing the
            total item count returns every unseen item.

        Returns a DataFrame with columns user_id, item_id (raw ids) and score,
        grouped by user and sorted by descending score within each user.
        """
        columns = ["user_id", "item_id", "score"]
        item_codes = np.asarray(self.item_enc.transform(items))

        # Copy the filtered slice so adding helper columns neither warns nor
        # risks writing through to the caller's frame.
        history = train.loc[train.user_id.isin(users)].copy()
        if history.empty:
            return pd.DataFrame(columns=columns)
        history['ci'] = self.item_enc.transform(history.item_id)
        history['cu'] = self.user_enc.transform(history.user_id)

        # Collect per-user frames and concatenate once: growing a DataFrame
        # inside the loop is quadratic, and DataFrame.append no longer exists
        # in pandas >= 2.0.
        frames = []
        for user, group in tqdm(history.groupby('user_id')):
            watched = group['ci'].to_numpy()
            candidates = item_codes[~np.isin(item_codes, watched)]
            n_keep = len(candidates) if k is None else min(int(k), len(candidates))
            if n_keep <= 0:
                continue

            user_code = group['cu'].iloc[0]
            scores = np.take(self.pred[user_code, :], candidates)
            if n_keep < len(candidates):
                # Partial selection of the n_keep best scores (unordered).
                top = np.argpartition(scores, -n_keep)[-n_keep:]
            else:
                top = np.arange(len(candidates))
            # Order the kept candidates by descending score; stable for ties.
            ranked = top[np.argsort(-scores[top], kind='stable')]

            frames.append(pd.DataFrame({
                "user_id": [user] * len(ranked),
                "item_id": candidates[ranked],
                "score": scores[ranked],
            }))

        if not frames:
            return pd.DataFrame(columns=columns)
        result = pd.concat(frames, ignore_index=True)
        result['item_id'] = self.item_enc.inverse_transform(result['item_id'])
        return result

In [3]:
# Interaction log: the model is fitted on it and predict() uses it to look up
# what each user has already seen. EASE.fit reads its user_id / item_id columns.
train = pd.read_csv('train.csv')

In [4]:
# In the cells shown here only test.user_id is used: predict() takes the set
# of users to score from it.
test = pd.read_csv('test.csv')

In [25]:
# Unfitted model: the constructor only creates the user and item label encoders.
model = EASE()

In [26]:
%%time
# Fit with the defaults (lambda_=0.5, implicit=True), so any rating column is
# ignored and every interaction counts as 1.
model.fit(train)

CPU times: user 7min 28s, sys: 13.2 s, total: 7min 41s
Wall time: 23.6 s


In [27]:
# Number of distinct items in train; predict() passes it as the `k` argument
# of EASE.predict.
num_items = len(train.item_id.unique())

In [28]:
# Bare expression: shows the item count as the cell output.
num_items

7895

In [31]:
from joblib import Parallel, delayed

In [101]:
def predict(model, N, i):
    """Score the i-th of N contiguous slices of test users and write them to CSV.

    The distinct ``test.user_id`` values are sorted and cut into N slices by
    position; slice ``i`` is passed to ``model.predict`` together with the
    module-level ``train`` frame, its distinct item ids and ``num_items``.
    The returned frame is written to ``pred_full_<i>.csv`` without the index.

    Relies on the notebook globals ``test``, ``train`` and ``num_items``.
    """
    ordered_users = sorted(test.user_id.unique())
    total = len(ordered_users)

    # Slice boundaries, clamped to the valid index range.
    start = max(0, int(total * float(i) / N))
    stop = min(int(total * float(i + 1) / N), total)
    chunk_users = ordered_users[start:stop]

    recommendations = model.predict(train, chunk_users, train.item_id.unique(), num_items)
    recommendations.to_csv('pred_full_{}.csv'.format(i), index=False)

In [102]:
N = 10

# One task per user slice; each call to predict() writes its own
# pred_full_<i>.csv, so the return values of the pool are not collected.
tasks = [delayed(predict)(model, N, chunk) for chunk in range(N)]
with Parallel(n_jobs=N) as pool:
    pool(tasks)

KeyboardInterrupt: 

In [90]:
import os

In [96]:
# Merge the per-slice prediction files written by predict() into one frame.
# os.listdir() returns names in arbitrary order, so they are sorted to keep the
# row order reproducible (lexicographic, which equals numeric order for slice
# indices 0-9). Matching the exact 'pred_full_<i>.csv' shape keeps stray files
# that merely contain 'pred_full_' (backups, temp files) out of the result.
chunk_files = sorted(
    name for name in os.listdir()
    if name.startswith('pred_full_') and name.endswith('.csv')
)
df = pd.concat([pd.read_csv(name) for name in chunk_files], ignore_index=True)

In [98]:
# Write the merged predictions to a single file; index=False leaves the row
# index out of the CSV.
df.to_csv('preds_full.csv', index=False)