In [1]:
# Copy the contents of the course repository dataset into the working directory.
# Presumably this is what makes the Evaluation, Data_manager and Recommenders
# packages imported below resolvable -- confirm against the dataset layout.
!cp -r ../input/recsys-repo/RecSys_Course_AT_PoliMi-master/* ./

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from urllib.request import urlretrieve
import zipfile, os
import scipy.sparse as sps
import matplotlib.pyplot as pyplot
from Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.split_functions.split_train_validation_random_holdout import split_train_in_two_percentage_global_sample
## In order to evaluate put it in a recommender class
from Recommenders.BaseRecommender import BaseRecommender
from lightfm import LightFM
# Input locations of the interaction data and of the item "length" feature.
URM_path = "../input/urm-true-binary/URM_True_Binary.csv"
# Kept under its own name: `ICM_type` is reserved for the sparse matrix built
# from this file, so the path string is never mistaken for (or overwritten by) it.
ICM_type_path = "../input/competition-data/data_ICM_length.csv"

# Item features in long format, one row per (item_id, feature_id, data) entry.
ICM_type_df = pd.read_csv(ICM_type_path)
ICM_type_df



Unnamed: 0,item_id,feature_id,data
0,0,0,1
1,1,0,1
2,2,0,21
3,3,0,1
4,4,0,1
...,...,...,...
23086,27963,0,1
23087,27964,0,1
23088,27965,0,1
23089,27966,0,1


In [2]:
# Pull the three long-format columns that describe the sparse entries.
items = ICM_type_df["item_id"]
features = ICM_type_df["feature_id"]
data = ICM_type_df["data"]

# No explicit shape is given, so scipy infers it from the largest row/column
# index present; the values are then stored as 32-bit integers.
ICM_type = sps.csr_matrix((data, (items, features))).astype(np.int32)
ICM_type.shape

(27968, 1)

In [3]:
# Matrix dimensions: the user count is fixed by hand, the item count is taken
# from the ICM so that URM columns and ICM rows cover the same item range.
n_users = 41629
n_itemsFromICM = ICM_type.shape[0]

URM_all_dataframe = pd.read_csv(
    filepath_or_buffer=URM_path,
    sep=",",
    dtype={0:int, 1:int, 2:float},
    header=0,
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Data"]

# Assemble the triplets as COO, then convert to CSR for fast access to rows (users).
URM_all = sps.coo_matrix(
    (
        URM_all_dataframe["Data"].values,
        (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values),
    ),
    shape=(n_users, n_itemsFromICM),
).tocsr()


In [4]:
class LightFMCBFRecommender(BaseRecommender):
    """LightFM model trained with item features, exposed through BaseRecommender.

    Wrapping the model in a recommender class is what allows it to be passed
    to the evaluator objects used in this notebook.

    Parameters
    ----------
    URM_train : sparse matrix
        User-item interactions, forwarded to ``BaseRecommender.__init__``.
    ICM_train : sparse matrix
        Item feature matrix, one row per item. A copy is stored on the instance.
    """

    RECOMMENDER_NAME = "LightFMCBFRecommender"

    def __init__(self, URM_train, ICM_train):
        super(LightFMCBFRecommender, self).__init__(URM_train)

        # Private copy: later changes to the caller's matrix must not leak into the model.
        self.ICM_train = ICM_train.copy()


    def fit(self, epochs = 300, alpha = 1e-6, n_factors = 10, n_threads = 4):
        """Train a WARP-loss LightFM model on this recommender's own data.

        Parameters
        ----------
        epochs : int
            Number of training epochs.
        alpha : float
            Passed to LightFM as ``item_alpha``.
        n_factors : int
            Passed to LightFM as ``no_components``.
        n_threads : int
            Passed to LightFM ``fit`` as ``num_threads``.
        """

        # Let's fit a WARP model
        self.lightFM_model = LightFM(loss='warp',
                                     item_alpha=alpha,
                                     no_components=n_factors)

        # Train on the matrix held by this instance. Relying on a notebook-level
        # `URM_train` variable would silently train on whatever that global
        # happens to be (or fail with NameError when the class is reused elsewhere).
        self.lightFM_model = self.lightFM_model.fit(self.URM_train,
                                       item_features=self.ICM_train,
                                       epochs=epochs,
                                       num_threads=n_threads)


    def _compute_item_score(self, user_id_array, items_to_compute = None):
        """Return a (len(user_id_array), n_items) array of predicted scores.

        Parameters
        ----------
        user_id_array : sequence of int
            Users to score; row ``i`` of the result belongs to ``user_id_array[i]``.
        items_to_compute : sequence of int, optional
            Item ids to score. When given, only those columns are filled and
            every other column is left at ``-inf``. When ``None`` all items
            are scored.
        """

        if items_to_compute is None:
            item_ids = np.arange(self.n_items)
        else:
            item_ids = np.asarray(items_to_compute)

        # Start from -inf everywhere so that items that are not scored can
        # never outrank a scored one.
        item_scores = np.full((len(user_id_array), self.n_items), -np.inf)

        for user_index, user_id in enumerate(user_id_array):
            # The full feature matrix is passed; `item_ids` selects its rows.
            item_scores[user_index, item_ids] = self.lightFM_model.predict(int(user_id),
                                                                           item_ids,
                                                                           item_features = self.ICM_train)

        return item_scores

In [5]:
import scipy.sparse as sps

class LightFMItemHybridRecommender(LightFMCBFRecommender):
    """LightFM recommender whose item features are an identity block plus the ICM.

    The identity block gives every item its own indicator feature, so each
    ItemID is present in the model in addition to the supplied features.
    """

    RECOMMENDER_NAME = "LightFMItemHybridRecommender"

    def __init__(self, URM_train, ICM_train):
        super().__init__(URM_train, ICM_train)

        # Prepend one indicator column per item to the features stored by the parent.
        item_identity = sps.eye(self.n_items, format="csr")
        self.ICM_train = sps.hstack([item_identity, self.ICM_train], format="csr")

In [6]:
# First carve the test interactions out of the full URM, then split what is
# left into train and validation. Both splits keep 80% on the training side.
URM_train_validation, URM_test = split_train_in_two_percentage_global_sample(
    URM_all,
    train_percentage=0.8,
)
URM_train, URM_validation = split_train_in_two_percentage_global_sample(
    URM_train_validation,
    train_percentage=0.8,
)

# One holdout evaluator per held-out matrix, both reporting at cutoff 10.
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[10])


EvaluatorHoldout: Ignoring 729 ( 1.8%) Users that have less than 1 test interactions
EvaluatorHoldout: Ignoring 331 ( 0.8%) Users that have less than 1 test interactions


In [7]:
# Train the hybrid recommender on the training split for a short run of 10 epochs.
recommender = LightFMItemHybridRecommender(URM_train=URM_train, ICM_train=ICM_type)
recommender.fit(epochs=10)

# The evaluator returns a pair; only its first element is kept and displayed.
result_df, _ = evaluator_validation.evaluateRecommender(recommender)
result_df

LightFMItemHybridRecommender: URM Detected 3461 (12.4%) items with no interactions.
EvaluatorHoldout: Processed 40900 (100.0%) in 1.65 min. Users per second: 413


Unnamed: 0_level_0,PRECISION,PRECISION_RECALL_MIN_DEN,RECALL,MAP,MAP_MIN_DEN,MRR,NDCG,F1,HIT_RATE,ARHR_ALL_HITS,...,COVERAGE_USER,COVERAGE_USER_HIT,USERS_IN_GT,DIVERSITY_GINI,SHANNON_ENTROPY,RATIO_DIVERSITY_HERFINDAHL,RATIO_DIVERSITY_GINI,RATIO_SHANNON_ENTROPY,RATIO_AVERAGE_POPULARITY,RATIO_NOVELTY
cutoff,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10,0.000328,0.000535,0.00048,0.000117,0.00019,0.001172,0.000486,0.000389,0.003276,0.001172,...,0.982488,0.003219,0.982488,0.000853,4.890824,0.960209,0.001934,0.362628,0.169272,0.430117


In [8]:
# Re-read the interactions and rebuild URM_all. No explicit shape is passed
# here, so the matrix dimensions are inferred from the largest ids in the file.
URM_all_dataframe = pd.read_csv(
    filepath_or_buffer=URM_path,
    sep=",",
    dtype={0:int, 1:int, 2:float},
    header=0,
)
URM_all_dataframe.columns = ["UserID", "ItemID", "Data"]

# Assemble the triplets as COO, then convert to CSR for fast access to rows (users).
URM_all = sps.coo_matrix(
    (
        URM_all_dataframe["Data"].values,
        (URM_all_dataframe["UserID"].values, URM_all_dataframe["ItemID"].values),
    )
).tocsr()

In [9]:
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# Fixed seed so that the fitted model -- and therefore the metrics below and
# the submission produced from it -- can be reproduced on a fresh run.
SEED = 42

# Pure collaborative BPR model fitted on every available interaction.
model = LightFM(learning_rate=0.05, loss='bpr', random_state=SEED)
model.fit(URM_all, epochs=10)

# Both metrics are computed on the same matrix the model was fitted on, so
# they describe fit quality on the training data, not held-out performance.
train_precision = precision_at_k(model, URM_all, k=10).mean()

train_auc = auc_score(model, URM_all).mean()

print('Precision all: %.2f' % (train_precision))
print('AUC all: %.2f' % (train_auc))

Precision all: 0.21
AUC all: 0.69


In [10]:
# Users for which the submission must contain a recommendation list.
test_users = pd.read_csv(
    filepath_or_buffer='/kaggle/input/competition-data/data_target_users_test.csv'
)
test_users

Unnamed: 0,user_id
0,0
1,1
2,2
3,3
4,4
...,...
41111,41624
41112,41625
41113,41626
41114,41627


In [11]:
# Target user ids in file order, plus the (initially empty) list that
# sample_recommendation appends one top-10 array to per user.
user_ids = test_users.user_id
recommendations = list()

In [12]:
def sample_recommendation(model, URM_all, user_ids):
    """Append the ten best-scored item ids of every user to `recommendations`.

    `model` must expose ``predict(user_id, item_ids)`` returning one score per
    item id. `URM_all` is only read for its shape, which is printed and used
    as the number of candidate items. Nothing is returned: results are
    appended, in `user_ids` order, to the module-level `recommendations` list.
    """
    user_count, item_count = URM_all.shape[0], URM_all.shape[1]
    print(f"N_users: {user_count}")
    print(f"N_items: {item_count}")

    for uid in user_ids:
        candidate_items = np.arange(item_count)
        # Negating the scores turns the ascending argsort into a best-first ranking.
        ranking = np.argsort(-model.predict(uid, candidate_items))
        recommendations.append(ranking[:10])
        

In [13]:
# sample_recommendation appends to the global `recommendations` list, so start
# from an empty list here: re-running this cell then yields exactly one entry
# per target user instead of stacking a second copy on top of the first.
recommendations = []
sample_recommendation(model, URM_all, user_ids)
print(len(recommendations))

N_users: 41629
N_items: 24507
41116


In [14]:
# One recommendation array is expected per target user (41116 on the run shown above).
if len(recommendations) != len(test_users):
    raise ValueError(
        f"Expected {len(test_users)} recommendation lists, got {len(recommendations)}; "
        "reset `recommendations` and re-run the recommendation cell."
    )

# Serialise each top-10 array as item ids separated by single spaces. Joining
# the ids explicitly avoids the text form of a numpy array, which pads numbers
# to a common width and depends on the active print options.
test_users['item_list'] = [
    " ".join(str(item_id) for item_id in top_items)
    for top_items in recommendations
]
test_users.to_csv('submission.csv', index=False)