In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy.sparse import *

import os

# Load the interaction data (URM source), the target users and the three item-feature files,
# giving each frame readable column names right after it is read.
URM_df = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_train.csv', dtype={0:int, 1:int, 2:float})
URM_df.columns = ['user', 'item', 'interaction']

target_users = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_target_users_test.csv', dtype={0:int})
target_users.columns = ['user_id']

genre_matrix = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_ICM_genre.csv', dtype={0:int, 1:int, 2:int})
genre_matrix.columns = ['item', 'genre', 'hasgenre']

subgenre_matrix = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_ICM_subgenre.csv', dtype={0:int, 1:int, 2:int})
subgenre_matrix.columns = ['item', 'subgenre', 'hassubgenre']

channel_matrix = pd.read_csv('../input/recommender-system-2021-challenge-polimi/data_ICM_channel.csv', dtype={0:int, 1:int, 2:int})
channel_matrix.columns = ['item', 'channel', 'onchannel']

# Join the three feature tables on the item id (default inner join), drop the
# genre/subgenre flag columns and keep the channel flag under the name 'data'.
genre_subgenre_ICM = genre_matrix.merge(subgenre_matrix, on='item')
ICM_df = (
    genre_subgenre_ICM
    .merge(channel_matrix, on='item')
    .drop(columns=['hasgenre', 'hassubgenre'])
    .rename(columns={'onchannel': 'data'})
)
ICM_df

In [None]:
# Builds the sparse URM and the per-feature item matrices, then counts genres, subgenres and channels

# Without an explicit shape, coo_matrix sizes each matrix from the largest index
# it happens to contain. The three feature files (and the URM) may not all reach
# the same largest item id, which would break the horizontal stacking of the
# feature blocks and the URM x similarity product. Use one shared item dimension.
n_items = 1 + int(max(
    URM_df['item'].max(),
    genre_matrix['item'].max(),
    subgenre_matrix['item'].max(),
    channel_matrix['item'].max(),
))
# Users that appear only in the target list still get an (empty) URM row,
# so they can be indexed when recommendations are generated.
n_users = 1 + int(max(URM_df['user'].max(), target_users['user_id'].max()))

# User-item interaction matrix (rows: users, columns: items)
URM_all = coo_matrix(
    (URM_df['interaction'].values, (URM_df['user'].values, URM_df['item'].values)),
    shape=(n_users, n_items),
)
URM_csr = URM_all.tocsr()

# Item-feature matrices (rows: items, columns: feature ids), one per feature file
genre_coo = coo_matrix(
    (genre_matrix['hasgenre'].values, (genre_matrix['item'].values, genre_matrix['genre'].values)),
    shape=(n_items, int(genre_matrix['genre'].max()) + 1),
)
genre_csc = genre_coo.tocsc()
subgenre_coo = coo_matrix(
    (subgenre_matrix['hassubgenre'].values, (subgenre_matrix['item'].values, subgenre_matrix['subgenre'].values)),
    shape=(n_items, int(subgenre_matrix['subgenre'].max()) + 1),
)
subgenre_csc = subgenre_coo.tocsc()
channel_coo = coo_matrix(
    (channel_matrix['onchannel'].values, (channel_matrix['item'].values, channel_matrix['channel'].values)),
    shape=(n_items, int(channel_matrix['channel'].max()) + 1),
)
channel_csc = channel_coo.tocsc()

# Feature ids are used directly as column indices, so each count is (largest id + 1)
n_of_genres = genre_coo.shape[1]

n_of_subgenres = subgenre_coo.shape[1]

n_of_channels = channel_coo.shape[1]

print(n_of_genres, n_of_subgenres, n_of_channels)

In [None]:
# Build the full ICM by placing the genre, subgenre and channel blocks side by side
feature_blocks = [genre_coo, subgenre_coo, channel_coo]
ICM = hstack(feature_blocks)

# CSR version of the same matrix
ICM_csr = ICM.tocsr()

In [None]:
# Display the dimensions of the stacked ICM (rows, columns)
ICM.shape

In [None]:
import matplotlib.pyplot as plt

# The ICM has items on the rows, so the number of stored features of each item
# is the difference between consecutive CSR row pointers. (The CSC pointer
# would instead count how many items carry each feature.)
features_per_item = np.ediff1d(ICM.tocsr().indptr)
features_per_item = np.sort(features_per_item)

# Plot the sorted counts
plt.plot(features_per_item, 'ro')
plt.xlabel('Items (sorted by number of features)')
plt.ylabel('Number of features')
plt.show()

In [None]:
# Make the course library importable, then build the train/validation split and the evaluator

# Import sys directly: `from os import sys` only works because the os module
# happens to import sys itself.
import sys

package_paths = [
    '../input/mauriziofd/',
]

for pth in package_paths:
    # Skip paths that are already registered so re-running the cell does not add duplicates
    if pth not in sys.path:
        sys.path.append(pth)

from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample

# Seed NumPy's global RNG so the hold-out split (and every metric computed on it)
# is the same on each run.
# NOTE(review): assumes the split helper samples through NumPy's global RNG — TODO confirm.
np.random.seed(42)

# 80% of the interactions go to training, the rest to validation
URM_train, URM_validation = split_train_in_two_percentage_global_sample(URM_csr, train_percentage = 0.80)

# Validation metrics are computed at cutoff 10
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])



In [None]:
# Similarity matrix computed from the transposed ICM

from Recommenders.Similarity.Compute_Similarity_Python import Compute_Similarity_Python

# Settings grouped in one place: cosine similarity, shrink 10, 100 neighbours, normalised
similarity_kwargs = dict(topK=100, shrink=10, similarity="cosine", normalize=True)
similarity_object = Compute_Similarity_Python(ICM_csr.T, **similarity_kwargs)

W_sparse = similarity_object.compute_similarity()
W_sparse

In [None]:
# Recommendation Class
class ItemKNNCBFRecommender(object):
    """Content-based item KNN recommender.

    Scores items for a user by multiplying the user's URM row with a
    similarity matrix computed from the ICM.

    NOTE: a later cell imports a library class with this same name, which
    rebinds the name from that point on.
    """

    def __init__(self, URM, ICM):
        """Store the interaction and feature matrices.

        Args:
            URM: scipy sparse user-item matrix (rows: users, columns: items).
            ICM: item-feature matrix; its transpose is handed to the
                similarity computation in ``fit``.
        """
        # recommend() slices rows and filter_seen() reads indptr/indices as
        # row pointers; both are only valid on CSR. Converting here prevents
        # another sparse format (e.g. CSC) from silently masking the wrong items.
        self.URM = URM.tocsr()
        self.ICM = ICM

    def fit(self, topK=50, shrink=100, normalize=True, similarity="cosine"):
        """Compute and store the similarity matrix ``self.W_sparse``.

        All arguments are forwarded unchanged to ``Compute_Similarity_Python``.
        """
        similarity_object = Compute_Similarity_Python(self.ICM.T, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()

    def recommend(self, user_id, at=None, exclude_seen=True):
        """Return item indices ranked by decreasing score for one user.

        Args:
            user_id: row index of the user in the URM.
            at: number of items to return; ``None`` returns the full ranking.
            exclude_seen: when True, items the user already interacted with
                are pushed to the end of the ranking.

        Returns:
            1-D numpy array of item indices.
        """
        # Scores are the dot product between the user profile and the similarity matrix
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # argsort is ascending, so reverse it to rank the best items first
        ranking = scores.argsort()[::-1]

        return ranking[:at]

    def filter_seen(self, user_id, scores):
        """Set the score of every item in the user's URM row to -inf.

        ``scores`` is modified in place and also returned.
        """
        # CSR layout: the column indices of row `user_id` live in
        # indices[indptr[user_id]:indptr[user_id + 1]]
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id + 1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [None]:
# Content-based KNN on the training split: 100 neighbours per item, shrink term 10
recommender = ItemKNNCBFRecommender(URM_train, ICM_csr)
recommender.fit(topK=100, shrink=10)


In [None]:
# Hyperparameter tuning: validation MAP at cutoff 10 for each candidate shrink value (topK fixed at 8)

from Recommenders.KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender

x_tick = [80,90,100,110,120]
MAP_per_k = []

for shrink in x_tick:

    recommender = ItemKNNCBFRecommender(URM_train, ICM_csr)
    # Fit with the candidate value of this iteration so each MAP entry matches
    # its x_tick; a fixed shrink would make every iteration identical.
    recommender.fit(shrink=shrink, topK=8)

    result_df, _ = evaluator_validation.evaluateRecommender(recommender)

    # Row 10 of the result frame is the cutoff configured in the evaluator
    MAP_per_k.append(result_df.loc[10]["MAP"])

In [None]:
from matplotlib import pyplot

# Validation MAP for each shrink value tried in the tuning loop
pyplot.plot(x_tick, MAP_per_k)
pyplot.xlabel('Shrink')
pyplot.ylabel('MAP')
pyplot.show()
# best topK parameter = 8, best Shrink parameter = 80

In [None]:
# Create CSV submission

# Fit the final model on every known interaction. The 80% training split is
# only needed for validation; fitting on it would discard known data and let
# already-seen (held-out) items slip into the recommendations.
recommender = ItemKNNCBFRecommender(URM_csr, ICM_csr)
recommender.fit(shrink=80, topK=8)

# The list length is passed positionally because its keyword differs between
# implementations: the class defined in this notebook names it `at`, while the
# library class imported in the tuning cell presumably names it `cutoff`
# — TODO confirm against the library.
submission = [
    (user, recommender.recommend(user, 10))
    for user in target_users['user_id'].values
]

def write_submission(submissions, path="./submission.csv"):
    """Write recommendations to a CSV file in the submission format.

    The file starts with the header ``user_id,item_list``; each following
    line holds a user id, a comma, and that user's items separated by
    single spaces.

    Args:
        submissions: iterable of ``(user_id, items)`` pairs, where ``items``
            is an iterable of item ids.
        path: destination file; defaults to ``./submission.csv``. An existing
            file at that path is overwritten.
    """
    with open(path, "w") as f:
        f.write("user_id,item_list\n")
        f.writelines(
            f"{user_id},{' '.join(str(item) for item in items)}\n"
            for user_id, items in submissions
        )


# Write the collected (user, items) pairs to the submission CSV
write_submission(submission)