# Data Load
### Create the Pandas Dataframe

In [72]:
import pandas as pd
path = "recommender-system-2022-challenge-polimi-data/interactions_and_impressions.csv"

Interactions_and_Impressions = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserId', 'ItemId', 'ImpressionList', 'Data'])
Interactions_and_Impressions

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
...,...,...,...,...
5826500,41628,20448,,0
5826501,41628,20896,,1
5826502,41628,21506,,1
5826503,41628,22882,,0


In [73]:
print(Interactions_and_Impressions["UserId"].max())
print(Interactions_and_Impressions["ItemId"].max())

41628
24506


### Data Preparation
First we remove empty indeces:

In [74]:
userId_unique = Interactions_and_Impressions["UserId"].unique()
itemId_unique = Interactions_and_Impressions["ItemId"].unique()

In [75]:
mapped_id, original_id = pd.factorize(Interactions_and_Impressions["UserId"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(Interactions_and_Impressions["ItemId"].unique())
item_original_Id_to_index = pd.Series(mapped_id, index=original_id)

In [76]:
Interactions_and_Impressions["UserId"] = Interactions_and_Impressions["UserId"].map(user_original_Id_to_index)
Interactions_and_Impressions["ItemId"] = Interactions_and_Impressions["ItemId"].map(item_original_Id_to_index)

In [77]:
Interactions_and_Impressions.head(15)

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,0,,0
1,0,0,,0
2,0,0,20212223242526272829,0
3,0,0,,1
4,0,0,,1
5,0,0,,1
6,0,0,,1
7,0,0,,1
8,0,0,,1
9,0,0,,1


### Remove Single 0 Interactions

First remove ImpressionList

In [78]:
Distinct_Interactions = Interactions_and_Impressions.copy(deep=True)
Distinct_Interactions = Distinct_Interactions.drop(["ImpressionList"], axis=1)
Distinct_Interactions

Unnamed: 0,UserId,ItemId,Data
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,1
4,0,0,1
...,...,...,...
5826500,41628,3699,0
5826501,41628,12693,1
5826502,41628,15131,1
5826503,41628,18127,0


Group the interactions by UserId+ItemId with Data==0 and add their respective count: we have to get rid of those couples with count == 1

In [79]:
g = Distinct_Interactions[Distinct_Interactions["Data"]==0].groupby(['UserId', 'ItemId'], as_index=False).count()
g = g[g["Data"]==1]
g # queste sono le coppie UserId-ItemId da eliminare in Distinct_Interactions

Unnamed: 0,UserId,ItemId,Data
1,0,5,1
2,0,10,1
3,0,12,1
4,0,14,1
5,0,24,1
...,...,...,...
1051823,41628,10974,1
1051824,41628,14572,1
1051825,41628,18127,1
1051826,41628,19539,1


Let's drop the duplicates now

In [80]:
Distinct_Interactions = Distinct_Interactions.drop_duplicates()
Distinct_Interactions

Unnamed: 0,UserId,ItemId,Data
0,0,0,0
3,0,0,1
12,0,1,1
27,0,2,1
28,0,3,1
...,...,...,...
5826500,41628,3699,0
5826501,41628,12693,1
5826502,41628,15131,1
5826503,41628,18127,0


Non so come funzioni questa cella ma funziona gg

In [81]:
g["Data"]=0
new = pd.merge(Distinct_Interactions, g, indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1)
new

Unnamed: 0,UserId,ItemId,Data
0,0,0,0
1,0,0,1
2,0,1,1
3,0,2,1
4,0,3,1
...,...,...,...
1715186,41628,12162,1
1715188,41628,22724,1
1715189,41628,2284,1
1715193,41628,12693,1


We want an Implicit/Binary Matrix, so:
1) We drop the Data column, only keeping User and Item Id
2) We remove duplicates
3) Split data into Train/Test/Validation
4) We build the Sparse URM using Scipy

##### Step 1

In [82]:
new = new.drop(["Data"], axis=1)

##### Step 2

In [83]:
new = new.drop_duplicates()
new['Rating'] = 1
new

Unnamed: 0,UserId,ItemId,Rating
0,0,0,1
2,0,1,1
3,0,2,1
4,0,3,1
5,0,4,1
...,...,...,...
1715186,41628,12162,1
1715188,41628,22724,1
1715189,41628,2284,1
1715193,41628,12693,1


In [84]:
unique_userId = Distinct_Interactions.UserId.unique()
num_users = len(unique_userId)
print(num_users)

unique_itemId = Distinct_Interactions.ItemId.unique()
num_items = len(unique_itemId)
print(num_items)

41629
24507


We can see some User profiles are now empty

In [85]:
print(len(new.UserId.unique()))
print(len(new.ItemId.unique()))

41624
24507


#### Step 3

In [86]:
from sklearn.model_selection import train_test_split
import numpy as np

seed = 1234


(user_ids_training, user_ids_test,
 item_ids_training, item_ids_test,
 ratings_training, ratings_test) = train_test_split(new.UserId,
                                                    new.ItemId,
                                                    new.Rating,
                                                    test_size=0.20,
                                                    shuffle=True,
                                                    random_state=seed)


(user_ids_training, user_ids_validation,
 item_ids_training, item_ids_validation,
 ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                          item_ids_training,
                                                          ratings_training,
                                                          test_size=0.10,
                                                          )

##### Step 4

In [87]:
import scipy.sparse as sp
urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                          shape=(num_users, num_items))

urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                         shape=(num_users, num_items))

urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                               shape=(num_users, num_items))

urm_train, urm_test, urm_validation

(<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 590024 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 163896 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 65559 stored elements in Compressed Sparse Row format>)

# Now we try running SLIM

In [88]:
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender

In [89]:
MySlimRecommender = SLIMElasticNetRecommender(urm_train)

SLIMElasticNetRecommender: URM Detected 50 ( 0.1%) users with no interactions.
SLIMElasticNetRecommender: URM Detected 3 ( 0.0%) items with no interactions.


In [100]:
liS = [0.1, 0.01, 0.001]
alphaS = [0.1, 0.01, 0.001]

from Evaluation.Evaluator import EvaluatorHoldout
e = EvaluatorHoldout(urm_test, [10])

for li in liS:
    for alp in alphaS:
        MySlimRecommender.fit(l1_ratio=li, alpha = alp, positive_only=True, topK = 100)
        print("\n-----------------------------------------------------\n")
        print(str(li) + "       " + str(alp) + "\n")
        print(e.evaluateRecommender(MySlimRecommender))
        print("\n-----------------------------------------------------\n")

EvaluatorHoldout: Ignoring 3970 ( 9.5%) Users that have less than 1 test interactions
SLIMElasticNetRecommender: Processed 24507 (100.0%) in 2.73 min. Items per second: 149.51

-----------------------------------------------------

0.1       0.1

EvaluatorHoldout: Processed 37659 (100.0%) in 14.98 sec. Users per second: 2514
(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
cutoff                                                                      
10      0.007841                 0.017955  0.017339  0.004883    0.011151   

             MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
cutoff                                                        ...   
10      0.044179  0.019302  0.010799  0.070368      0.046398  ...   

       COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
cutoff                                                              
10          0.904634          0.063658    0.904634       0.000683   

       SHANNON_ENTROPY R

### No parameter tuning for now

# Computing the submission
First we compute the original indexes

In [None]:
target = pd.read_csv(filepath_or_buffer="recommender-system-2022-challenge-polimi-data/data_target_users_test.csv",
                                          sep=",",
                                          header=0,
                                          engine='python',
                                          names=['id'])
target

In [None]:
def write_recommendations(file_name):
    recommendations = 'user_id,item_list'
    f = open("submissions/" + file_name + ".csv", "w")

    for id in target['id']:
        recommendations_per_user = MySlimRecommender.recommend(user_id_array=id, remove_seen_flag=True, cutoff=10)

        recommendation_string = str(user_original_Id_to_index[user_original_Id_to_index==id].index.item()
                                    ) + ','

        for rec in recommendations_per_user:
            recommendation_string = recommendation_string + str(item_original_Id_to_index[item_original_Id_to_index==rec].index.item()) + ' '

        recommendation_string = recommendation_string[:-1]
        recommendations = recommendations + '\n' + recommendation_string

    f.write(recommendations)
    f.close()