In [1]:
%matplotlib inline
import pandas as pd
import numpy as np

from scipy import sparse as sp
from scipy.sparse import coo_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import NearestNeighbors

In [2]:
train_full = pd.read_csv('./data/train.csv')
print(train_full.shape)

(31795064, 4)


In [3]:
item_data = pd.read_csv('./data/item_data.csv')
print(item_data.shape)

(5436440, 9)


In [4]:
test_users = pd.read_csv('./data/test_users.csv')
print(test_users.shape)

(100000, 1)


# 1. Data preparation 

In [5]:
# label encode user_id & microcat_id for sparse matrix construction
mc_le = LabelEncoder()
uid_le = LabelEncoder()

train_full.user_id = uid_le.fit_transform(train_full.user_id)
item_data.microcat_id = mc_le.fit_transform(item_data.microcat_id)

In [6]:
# split all data into train & validation
train = train_full[train_full.event_date < '2017-02-21 00:00:00']

# drop user/item pairs present in train from validation
train_copy = train_full.copy()
train_copy.sort_values(by='event_date', inplace=True)
train_copy.drop_duplicates(subset=['user_id', 'item_id'], inplace=True)
validation = train_copy[train_copy.event_date > '2017-02-21 00:00:00']
print(train.shape, validation.shape)
del train_copy

(27259520, 4) (2938292, 4)


In [7]:
def create_train_matrix(train, item_data):
    ''' Transform clickstream to user-microcat sparse matrix '''
    # merge with item data to get microcat ids of interactions
    train_mc = train.merge(item_data[['item_id', 'microcat_id']])
    
    # get counts of user/microcat interactions
    train_mc_cnt = train_mc.groupby(['user_id', 'microcat_id']).apply(len).reset_index().rename(columns={0: 'cnt'})
    
    # create sparse matrix of user-microcat iterests
    user_microcat = coo_matrix((train_mc_cnt.cnt, (train_mc_cnt.user_id, train_mc_cnt.microcat_id))).tocsr()
    
    return user_microcat
    

In [8]:
user_microcat = create_train_matrix(train, item_data)

In [9]:
# set weight for each eventtype according to rules
event_weigts = {57: 1, 15: 2, 25: 2, 36: 5}
validation.is_copy = False
validation.loc[:, 'weight'] = validation.eventtype_id.apply(lambda x: event_weigts.get(x, 0))

# 2. Making recommendations 

In [10]:
class NearestNeighborsRecommender():
    def __init__(self, data, n_neighbors, max_recs_per_user, metric, batch_size):
        self.data = data
        self.n_neighbors = n_neighbors
        self.max_recs_per_user = max_recs_per_user
        self.metric = metric
        self.batch_size = batch_size
        self.nn_model = NearestNeighbors(n_neighbors=self.n_neighbors, metric=self.metric, algorithm='brute')
        self.nn_model.fit(self.data)
        
    def _get_similar_users(self, user_ids):
        '''
        Create dataframe with 3 columns: 
            target_user_id - user we want to make recommendations for
            user_id - most similar users to target user
            score - user similarity (1 - cosine_distance)
        '''
        users = self.data[user_ids, :]
        df = pd.DataFrame({'target_user_id': np.repeat(user_ids, self.nn_model.n_neighbors-1)})
        neighbors = self.nn_model.kneighbors(users)
        df['user_id'] = neighbors[1][:, 1:].flatten()
        df['score'] = 1-neighbors[0][:, 1:].flatten()
        return df
        
    def get_similar_users(self, user_ids):
        ''' 
        Batch version of similar users.
        Helps to avoid huge memory overhead but slightly increases working time
        '''
        num_batches = np.ceil(len(user_ids) / self.batch_size).astype(int)
        dfs = [self._get_similar_users(batch) for batch in np.array_split(user_ids, num_batches)]
        return pd.concat(dfs)
    
    def recommend_items(self, user_ids, user_items, test_items):
        ''' 
        Create recommendations for users based on item interactions of similar users:
        - choose all items similar users iteracted with
        - sort them by user similarity and event_date
        - include only test_items
        '''
        similar_users = self.get_similar_users(user_ids)
        recs = pd.merge(user_items, similar_users) \
            .sort_values(by=['score', 'event_date'], ascending=False) \
            .merge(test_items).drop_duplicates(subset=['target_user_id', 'item_id'])
        recs['rank'] = recs.groupby('target_user_id').cumcount()
        recs = recs[recs['rank'] < self.max_recs_per_user]
        return recs[['target_user_id', 'item_id']].rename(columns={'target_user_id': 'user_id'})



In [11]:
# select users present in both train and validation
val_users = list(set(train.user_id).intersection(validation.user_id.unique()))
user_ids = np.random.choice(val_users, 10000, replace=False)

In [12]:
# restrict recommended items with items from test only
val_items = validation[['item_id']].drop_duplicates()

In [13]:
nnr = NearestNeighborsRecommender(user_microcat, 21, 50, 'cosine', 100)

In [14]:
recs = nnr.recommend_items(user_ids, train, val_items)

In [15]:
test_items = item_data[item_data.active_during_test==1][['item_id']]

# 3. Evaluation 

In [16]:
def create_random_recs(user_ids, item_ids, max_recs_per_user):
    users = np.repeat(user_ids, max_recs_per_user)
    items = np.random.choice(item_ids, len(user_ids)*max_recs_per_user)
    recs = pd.DataFrame({'user_id': users, 'item_id': items})
    return recs
        

In [17]:
def score_predictions(true, pred):
    """
    Score is a weighted sum of matching (user, item) pairs.
    Weights are based on eventtype: view - 1, contact - 5, favorite - 2, message - 2.
    """
    score = pd.merge(true.drop_duplicates(subset=['user_id', 'item_id']), pred.drop_duplicates(subset=['user_id', 'item_id']))['weight'].sum()
    return score

In [18]:
true = pd.merge(validation, pd.DataFrame({'user_id': user_ids}))

In [19]:
score_predictions(true, recs)

404

In [20]:
score_predictions(true, create_random_recs(user_ids, val_items.item_id, 50))

6

# 4. Train on all data and make a submission. 

In [21]:
user_microcat_full = create_train_matrix(train_full, item_data)

In [22]:
user_microcat_full.shape

(619615, 4732)

In [23]:
test_user_ids = uid_le.transform(test_users.user_id)

In [24]:
test_user_ids.shape

(100000,)

In [25]:
test_items = item_data[item_data.active_during_test==1][['item_id']]

In [26]:
nnr_full = NearestNeighborsRecommender(user_microcat_full, 21, 50, 'cosine', 100)

In [27]:
recs = nnr_full.recommend_items(test_user_ids, train_full, test_items)

In [28]:
recs.user_id = uid_le.inverse_transform(recs.user_id)

In [29]:
recs.to_csv('./data/sample_submission.csv', index=None)