In [34]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from typing import Tuple, Callable, Dict, Optional, List

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/recsyschallenge/alg_sample_submission.csv
/kaggle/input/recsyschallenge/data_ICM_type.csv
/kaggle/input/recsyschallenge/data_target_users_test.csv
/kaggle/input/recsyschallenge/interactions_and_impressions.csv
/kaggle/input/recsyschallenge/data_ICM_length.csv


In [35]:
def get_URM():
    return pd.read_csv('../input/recsyschallenge/interactions_and_impressions.csv')

In [36]:
ratings = get_URM()

  """Entry point for launching an IPython kernel.


In [37]:
ratings

Unnamed: 0,UserID,ItemID,Impressions,Data
0,0,11,012345678910111213141516171819,1
1,0,21,,0
2,0,21,,0
3,0,21,20212223242526272829,0
4,0,21,,1
...,...,...,...,...
5826501,41628,20448,,0
5826502,41628,20896,,1
5826503,41628,21506,,1
5826504,41628,22882,,0


In [38]:
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
def dataset_splits(ratings, num_users, num_items, validation_percentage: float, testing_percentage: float):
    seed = 1234
    (user_ids_training, user_ids_test,
     item_ids_training, item_ids_test,
     #mpressions_training, impressions_test,
     data_training, data_test)       = train_test_split(ratings.UserID,
                                                        ratings.ItemID,
                                                        #sers_id.Impressions,
                                                        ratings.Data,
                                                        test_size=testing_percentage,
                                                        shuffle=True,
                                                        random_state=seed)
    
    (user_ids_training, user_ids_validation,
     item_ids_training, item_ids_validation,
     #mpressions_training, impressions_validation,
     data_training, data_validation) = train_test_split(user_ids_training,
                                                              item_ids_training,
                                                              #mpressions_training,
                                                              data_training,
                                                              test_size=validation_percentage,
                                                             )
    
    urm_train = sp.csr_matrix(( data_training, (user_ids_training, item_ids_training)), 
                              shape=(num_users, num_items))
    
    urm_validation = sp.csr_matrix((data_validation, (user_ids_validation, item_ids_validation)), 
                              shape=(num_users, num_items))
    
    urm_test = sp.csr_matrix((data_test, (user_ids_test, item_ids_test)), 
                              shape=(num_users, num_items))
    
    
    return urm_train, urm_validation, urm_test
    

In [39]:
urm_train, urm_validation, urm_test = dataset_splits(ratings, 
                                                     num_users=ratings.UserID.unique().shape[0], 
                                                     num_items=ratings.ItemID.unique().shape[0], 
                                                     validation_percentage=0.10, 
                                                     testing_percentage=0.20)

In [40]:
def matrix_similarity(urm: sp.csc_matrix, shrink: int):
    item_weights = np.sqrt(
        np.sum(urm.power(2), axis=0)
    ).A
    
    numerator = urm.T.dot(urm)
    denominator = item_weights.T.dot(item_weights) + shrink + 1e-6
    weights = numerator / denominator
    np.fill_diagonal(weights, 0.0)
    
    return weights

In [41]:
urm_csc = urm_train.tocsc()
shrink = 5
slice_size = 100


matrix_weights = matrix_similarity(urm_csc[:slice_size,:slice_size], shrink)
matrix_weights

matrix([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.27639316],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.27639316, 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [42]:
class CFItemKNN(object):
    def __init__(self, shrink: int):
        self.shrink = shrink
        self.weights = None
    
    
    def fit(self, urm_train: sp.csc_matrix, similarity_function):
        if not sp.isspmatrix_csc(urm_train):
            raise TypeError(f"We expected a CSC matrix, we got {type(urm_train)}")
        
        self.weights = similarity_function(urm_train, self.shrink)
        
    def recommend(self, user_id: int, urm_train: sp.csr_matrix, at: Optional[int] = None, remove_seen: bool = True):
        user_profile = urm_train[user_id]
        
        ranking = user_profile.dot(self.weights).A.flatten()
        
        if remove_seen:
            user_profile_start = urm_train.indptr[user_id]
            user_profile_end = urm_train.indptr[user_id+1]
            
            seen_items = urm_train.indices[user_profile_start:user_profile_end]
            
            ranking[seen_items] = -np.inf
            
        ranking = np.flip(np.argsort(ranking))
        return ranking[:at]

In [43]:
itemknn_recommender = CFItemKNN(shrink=50)
itemknn_recommender

<__main__.CFItemKNN at 0x7f155a4c1450>

In [44]:
%%time

itemknn_recommender.fit(urm_train.tocsc(), matrix_similarity)

CPU times: user 5.3 s, sys: 4.79 s, total: 10.1 s
Wall time: 8.36 s


In [45]:
for user_id in range(10):
    print(itemknn_recommender.recommend(user_id=user_id, urm_train=urm_train, at=10, remove_seen=True))

[  544  1979  5476 18985   431  1378   916   770  2638   645]
[15868  1000 21403  2127  5636 24022 21475  4333  2466 17928]
[ 5222  4681 13622  2160   207   816  8070  6069  5875 13387]
[ 353  917   21  771  124  433  269 1911 2388   29]
[  963    80 17001    83    77  7413 16160 13865 18086    73]
[ 2628   940 20742   773   393    21   395    60   348  1068]
[  272    62    58  1068    54   394    60 20725   395   396]
[ 8511  2047  1873  7067  1410 13941 19240  5793  6466  2785]
[  827  1695  3638  1696  2535  1620  1665 23483   445 12718]
[ 2712  1512 19229   698  3156 18850  7813 15915   319  6420]


In [47]:
def recall(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    recall_score = np.sum(is_relevant) / relevant_items.shape[0]
    
    return recall_score
    
    
def precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_score = np.sum(is_relevant) / recommendations.shape[0]

    return precision_score

def mean_average_precision(recommendations: np.array, relevant_items: np.array) -> float:
    is_relevant = np.in1d(recommendations, relevant_items, assume_unique=True)
    
    precision_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    map_score = np.sum(precision_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return map_score

In [50]:
def evaluator(recommender: object, urm_train: sp.csr_matrix, urm_test: sp.csr_matrix):
    recommendation_length = 10
    accum_precision = 0
    accum_recall = 0
    accum_map = 0
    
    num_users = urm_train.shape[0]
    
    num_users_evaluated = 0
    num_users_skipped = 0
    for user_id in range(num_users):
        user_profile_start = urm_test.indptr[user_id]
        user_profile_end = urm_test.indptr[user_id+1]
        
        relevant_items = urm_test.indices[user_profile_start:user_profile_end]
        
        if relevant_items.size == 0:
            num_users_skipped += 1
            continue
            
        recommendations = recommender.recommend(user_id=user_id, 
                                               at=recommendation_length, 
                                               urm_train=urm_train, 
                                               remove_seen=True)
        
        accum_precision += precision(recommendations, relevant_items)
        accum_recall += recall(recommendations, relevant_items)
        accum_map += mean_average_precision(recommendations, relevant_items)
        
        num_users_evaluated += 1
        
    
    accum_precision /= max(num_users_evaluated, 1)
    accum_recall /= max(num_users_evaluated, 1)
    accum_map /=  max(num_users_evaluated, 1)
    
    return accum_precision, accum_recall, accum_map, num_users_evaluated, num_users_skipped

In [51]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped = evaluator(itemknn_recommender, 
                                                                                            urm_train, 
                                                                                            urm_test)

In [52]:
accum_precision, accum_recall, accum_map, num_user_evaluated, num_users_skipped


(0.006725970926448153, 0.005029428298894253, 0.002898723795621795, 41481, 148)

In [53]:
def hyperparameter_tuning():
    shrinks = [0,1,5,10,50, 100, 200, 500]
    results = []
    for shrink in shrinks:
        print(f"Currently trying shrink {shrink}")
        
        itemknn_recommender = CFItemKNN(shrink=shrink)
        itemknn_recommender.fit(urm_train.tocsc(), matrix_similarity)
        
        ev_precision, ev_recall, ev_map, _, _ = evaluator(itemknn_recommender, urm_train, urm_validation)
        
        results.append((shrink, (ev_precision, ev_recall, ev_map)))
        
    return results

In [54]:
hyperparameter_results = hyperparameter_tuning()
hyperparameter_results

Currently trying shrink 0
Currently trying shrink 1
Currently trying shrink 5
Currently trying shrink 10
Currently trying shrink 50
Currently trying shrink 100
Currently trying shrink 200
Currently trying shrink 500


[(0, (0.0029804622653555905, 0.004166548664042646, 0.0017977599605987454)),
 (1, (0.0029881241220788696, 0.004168188143492985, 0.0018043846683525122)),
 (5, (0.003023879453454172, 0.004209069987937038, 0.0018410532556311602)),
 (10, (0.0030366492146596366, 0.004279448083493119, 0.0018672274736216336)),
 (50, (0.003090282211722588, 0.004540810311079554, 0.0019668271026776025)),
 (100, (0.0031439152087855424, 0.004675294948235588, 0.0019949557593247867)),
 (200, (0.003172008683437565, 0.0048104039539685825, 0.001968434088969238)),
 (500, (0.0031158217341335157, 0.004757315015952019, 0.0019408778596310184))]