In [None]:
import pandas as pd
import numpy as np
import scipy.sparse as sps

from scipy.sparse import *

Content Based recommender for competition data


In [None]:
# Absolute path of the CSV holding the training interactions.
urm_path = '/content/data_train.csv'

# Parse the three columns as (int, int, float) with the pure-Python engine;
# the first line of the file is treated as the header row.
urm_all_df = pd.read_csv(
    urm_path,
    sep=",",
    header=0,
    dtype={0: int, 1: int, 2: float},
    engine='python',
)

# Replace the header names from the file with the names used throughout the notebook.
urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [None]:
# Preview the first 10 interaction rows to check the load and the renamed columns.
urm_all_df.head(10)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
5,1,187,1.0
6,1,205,1.0
7,1,222,1.0
8,1,237,1.0
9,1,354,1.0


In [None]:
# One row of the frame corresponds to one stored interaction.
print(f"The number of interactions is {len(urm_all_df)}")

The number of interactions is 478730


In [None]:
# Distinct user / item ids that occur in the interaction data.
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users, n_items = len(userID_unique), len(itemID_unique)
n_interactions = len(urm_all_df)

print(f"Number of items\t {n_items}, Number of users\t {n_users}")
print(f"Max ID items\t {max(itemID_unique)}, Max Id users\t {max(userID_unique)}\n")
print(f"Average interactions per user {n_interactions/n_users:.2f}")
print(f"Average interactions per item {n_interactions/n_items:.2f}\n")

# Fraction of the (distinct users x distinct items) grid that holds an interaction;
# sparsity is its complement, reported as a percentage.
density = float(n_interactions) / (n_items * n_users)
print(f"Sparsity {(1 - density) * 100:.2f} %")

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


In [None]:
# Build the user-item interaction matrix in COO format. The raw ids are used
# directly as row / column indices and no explicit shape is passed, so the
# shape is inferred from the largest ids.
interaction_values = urm_all_df["Interaction"].values
user_rows = urm_all_df["UserID"].values
item_cols = urm_all_df["ItemID"].values

urm_all = sps.coo_matrix((interaction_values, (user_rows, item_cols)))

urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [None]:
# Display-only: the CSR conversion is not assigned, so `urm_all` itself stays in
# COO format (its .row / .col attributes are read later to build the training matrix).
urm_all.tocsr()

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [None]:
def precision(recommended_items, relevant_items):
    """Fraction of the recommended items that are relevant.

    Parameters
    ----------
    recommended_items : array-like
        Recommended item ids; assumed to contain no duplicates.
    relevant_items : array-like
        Item ids the user actually interacted with; assumed to contain no duplicates.

    Returns
    -------
    float
        ``hits / number of recommended items``. Returns ``0.0`` when the
        recommendation list is empty instead of producing ``nan`` from 0/0.
    """
    # Flatten so the denominator counts every recommended item.
    recommended_items = np.asarray(recommended_items).ravel()

    # Nothing recommended -> no hits; avoids a 0/0 division.
    if recommended_items.size == 0:
        return 0.0

    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):
    """Fraction of the relevant items that were recommended.

    Parameters
    ----------
    recommended_items : array-like
        Recommended item ids; assumed to contain no duplicates.
    relevant_items : array-like
        Item ids the user actually interacted with; assumed to contain no duplicates.

    Returns
    -------
    float
        ``hits / number of relevant items``. Returns ``0.0`` when there are no
        relevant items instead of dividing by zero.
    """
    # Accept plain lists as well as arrays; `.shape` is needed below.
    relevant_items = np.asarray(relevant_items)

    # No ground truth for this user -> recall is defined as 0 here.
    if relevant_items.shape[0] == 0:
        return 0.0

    is_relevant = np.isin(np.asarray(recommended_items).ravel(), relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):
    """Average precision of a ranked recommendation list.

    Parameters
    ----------
    recommended_items : array-like
        Recommended item ids in ranking order; assumed to contain no duplicates.
    relevant_items : array-like
        Item ids the user actually interacted with; assumed to contain no duplicates.

    Returns
    -------
    float
        Sum of precision@k over the positions k that hold a relevant item,
        divided by ``min(number of relevant items, number of recommended items)``.
        Returns ``0.0`` when either input is empty, where that denominator
        would otherwise be zero.
    """
    recommended_items = np.asarray(recommended_items).ravel()
    relevant_items = np.asarray(relevant_items)

    # Either side empty -> the normalising min(...) would be 0.
    if recommended_items.size == 0 or relevant_items.shape[0] == 0:
        return 0.0

    is_relevant = np.isin(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum: precision at 1, at 2, at 3 ...
    # Multiplying by is_relevant keeps only the positions that are hits.
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

In [None]:
def evaluate_algorithm(URM_test, recommender_object, at=5):
    """Evaluate a recommender on held-out interactions and print the averaged metrics.

    Parameters
    ----------
    URM_test : scipy sparse matrix
        Test interactions, one row per user id. Converted to CSR because the
        per-user items are read through ``indptr`` / ``indices``.
    recommender_object : object
        Must expose ``recommend(user_id, at=...)`` returning the recommended item ids.
    at : int, default 5
        Length of the recommendation list requested for each user.

    Returns
    -------
    dict
        ``{"precision": ..., "recall": ..., "MAP": ...}`` averaged over the users
        that have at least one test interaction. All values are ``0.0`` when no
        such user exists (instead of raising ``ZeroDivisionError``).
    """
    # Row slicing through indptr/indices below requires CSR storage.
    URM_test = sps.csr_matrix(URM_test)

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0

    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        # Users without test interactions carry no ground truth: skip them.
        if len(relevant_items) == 0:
            continue

        recommended_items = recommender_object.recommend(user_id, at=at)
        num_eval += 1

        cumulative_precision += precision(recommended_items, relevant_items)
        cumulative_recall += recall(recommended_items, relevant_items)
        cumulative_AP += AP(recommended_items, relevant_items)

    # Average only when at least one user was evaluated; otherwise keep the zeros.
    if num_eval > 0:
        cumulative_precision /= num_eval
        cumulative_recall /= num_eval
        MAP = cumulative_AP / num_eval
    else:
        MAP = 0.0

    print("Recommender results are: Precision = {:.4f}, Recall = {:.4f}, MAP = {:.4f}".format(
        cumulative_precision, cumulative_recall, MAP))

    return {"precision": cumulative_precision, "recall": cumulative_recall, "MAP": MAP}

# Predictor

Here's the class for our predictor

In [None]:
# Clone the course repository from GitHub into the current working directory.
# NOTE(review): the later import loads `Recommenders` from the unzipped archive,
# so this clone may not be needed — confirm before keeping it.
!git clone https://github.com/MaurizioFD/RecSys_Course_AT_PoliMi.git

Cloning into 'RecSys_Course_AT_PoliMi'...
remote: Enumerating objects: 1476, done.[K
remote: Counting objects: 100% (221/221), done.[K
remote: Compressing objects: 100% (130/130), done.[K
remote: Total 1476 (delta 95), reused 209 (delta 89), pack-reused 1255[K
Receiving objects: 100% (1476/1476), 50.55 MiB | 20.92 MiB/s, done.
Resolving deltas: 100% (826/826), done.


In [None]:
!unzip /content/Recommenders.zip

Archive:  /content/Recommenders.zip
   creating: Recommenders/__pycache__/
  inflating: Recommenders/__pycache__/Recommender_utils.cpython-38.pyc  
  inflating: Recommenders/Recommender_utils.py  
   creating: Recommenders/Similarity/
   creating: Recommenders/Similarity/__pycache__/
  inflating: Recommenders/Similarity/__pycache__/Compute_Similarity_Python.cpython-38.pyc  
  inflating: Recommenders/Similarity/Compute_Similarity_Python.py  


In [None]:
from Recommenders.Similarity.Compute_Similarity_Python import Compute_Similarity_Python
class ItemKNNCFRecommender(object):
    """KNN collaborative-filtering recommender driven by a similarity matrix.

    The similarity matrix is computed from the user-item matrix by
    ``Compute_Similarity_Python``; a user's scores are the product of their
    interaction row with that matrix.
    """

    # Fixed ranking served to users listed in `users_not_in_train`.
    # NOTE(review): presumably the globally most popular items ("topRec") —
    # confirm how this list was derived.
    COLD_START_ITEMS = (517, 189, 44, 0, 284, 808, 285, 1, 557, 1266)

    def __init__(self, URM):
        """Store the user-item matrix.

        Parameters
        ----------
        URM : scipy sparse matrix
            User-item interactions (rows = users, columns = items). Converted to
            CSR because `recommend` / `filter_seen` index rows and read
            ``indptr`` / ``indices``.
        """
        self.URM = sps.csr_matrix(URM)


    def fit(self, topK=50, shrink=100, normalize=True, similarity="cosine"):
        """Compute and store the similarity matrix as ``self.W_sparse``.

        All arguments are forwarded unchanged to ``Compute_Similarity_Python``.
        """
        similarity_object = Compute_Similarity_Python(self.URM, shrink=shrink,
                                                      topK=topK, normalize=normalize,
                                                      similarity=similarity)

        self.W_sparse = similarity_object.compute_similarity()


    def recommend(self, user_id, at=None, exclude_seen=True, users_not_in_train=()):
        """Return the top-ranked item ids for one user.

        Parameters
        ----------
        user_id : int
            Row index of the user in the URM.
        at : int or None
            Number of items to return; ``None`` returns the full ranking.
        exclude_seen : bool
            When True, items the user already interacted with are pushed to the
            bottom of the ranking.
        users_not_in_train : collection of int
            Ids of users without training interactions; they receive the first
            ``at`` entries of ``COLD_START_ITEMS`` instead of personalised scores.

        Returns
        -------
        numpy.ndarray of int, or None
            Item ids in ranking order. ``None`` (after printing a message) when
            `user_id` is neither a cold-start user nor a valid row index.
        """
        # Cold-start users are handled first: their ids may lie outside the
        # matrix bounds, and they must still receive the fallback ranking.
        if users_not_in_train is not None and user_id in users_not_in_train:
            return np.array(self.COLD_START_ITEMS)[:at]

        # Check if user_id is a valid index
        if user_id < 0 or user_id >= self.URM.shape[0]:
            print(f"Invalid user_id: {user_id}")
            return None

        # compute the scores using the dot product
        user_profile = self.URM[user_id]
        scores = user_profile.dot(self.W_sparse).toarray().ravel()

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items from highest to lowest score
        ranking = scores.argsort()[::-1]

        return ranking[:at]


    def filter_seen(self, user_id, scores):
        """Set the score of every item in the user's URM row to ``-inf`` (in place).

        Returns the same `scores` array for convenience.
        """
        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        # Column indices stored for this user's row.
        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

# **Make the recommendations**

Here's the model:

In [None]:
# Rebuild the full interaction matrix in CSR format from the COO triplets
# (values, row indices, column indices) of `urm_all`.
urm_all_train = sps.csr_matrix(
    (urm_all.data, (urm_all.row, urm_all.col))
)

In [None]:
# Train on every available interaction. `shrink=10` overrides the default of 100;
# topK, normalisation and the cosine similarity keep their defaults from `fit`.
recommender = ItemKNNCFRecommender(urm_all_train)
recommender.fit(shrink=10)

Similarity column 22348 (100.0%), 1345.88 column/sec. Elapsed time 16.60 sec


**Now let's read the users we want to predict for and check whether their UserIDs overlap with those in the training data**

In [None]:
# CSV listing the users for whom recommendations must be produced.
# NOTE(review): this is a relative path, unlike the absolute training-data path —
# confirm it resolves from the notebook's working directory.
urm_pred_path = '../content/data_target_users_test.csv'

# Single integer column; the first line of the file is treated as the header.
urm_pred_df = pd.read_csv(filepath_or_buffer=urm_pred_path,
                          sep=",",
                          header=0,
                          dtype={0: int},
                          engine='python')

urm_pred_df.columns = ["UserID"]

print('Unique user id to predict:', urm_pred_df['UserID'].nunique())

Unique user id to predict: 10882


Now we make the predictions for every user in our test set and add them to pred_df

Here we list the users that are in our test set but not in our whole train set

In [None]:
# Target users whose id never appears in the training interactions.
users_not_in_train_df = urm_pred_df[~urm_pred_df['UserID'].isin(urm_all_df['UserID'])]

print("Users in urm_pred_df but not in urm_all_df:")
print(users_not_in_train_df)
print(len(users_not_in_train_df))

# Plain array of ids, in the form passed to `recommend(..., users_not_in_train=...)`.
users_not_in_train = users_not_in_train_df['UserID'].to_numpy()

Users in urm_pred_df but not in urm_all_orgdf:
       UserID
54         60
58         65
147       168
223       261
272       316
...       ...
10682   12775
10699   12798
10729   12837
10802   12921
10856   12992

[221 rows x 1 columns]
221


In [None]:
# Sanity check on user 60, which the listing above shows is missing from the
# training data: the fallback recommendation should be returned.
recommendations = recommender.recommend(60, at=10, users_not_in_train=users_not_in_train)
print(recommendations)


['517 189 44 0 284 808 285 1 557 1266']


In [None]:
# Collect one (user id, space-separated item ids) record per target user and
# build the frame once at the end; appending rows to a DataFrame inside the
# loop re-allocates on every iteration and is much slower.
prediction_rows = []

for userid in urm_pred_df['UserID']:
    recommended_items = recommender.recommend(userid, at=10, users_not_in_train=users_not_in_train)
    item_list = " ".join(str(item) for item in recommended_items)
    prediction_rows.append((userid, item_list))

pred_df = pd.DataFrame(prediction_rows, columns=['user_id', 'item_list'])

In [None]:
# Inspect the assembled predictions (one row per target user).
pred_df

Unnamed: 0,user_id,item_list
0,1,101 506 403 36 515 1546 977 637 3316 922
1,2,2 50 11 1095 47 5 28 19 14 9
2,3,59 956 584 259 536 648 857 1281 414 2748
3,4,28 50 249 145 136 1 5 111 277 14
4,5,4 8 20 116 75 170 24 135 278 44
...,...,...
10877,13020,6450 6198 6452 6749 7395 7394 6724 7029 5915 6429
10878,13021,6720 6179 6426 6749 7027 6451 7395 6721 13621 ...
10879,13022,1668 1446 1411 1674 809 10998 11056 1561 4608 ...
10880,13023,1124 837 1534 706 978 828 1290 1051 1532 1715


In [None]:
# Write the predictions to CSV; index=False keeps the row index out of the file,
# so only the user_id and item_list columns are written.
pred_df.to_csv('/content/predCF_Max.csv',index=False)

-------------------------------------------------------------------------------