# Data Load
### Create the Pandas Dataframe

In [1]:
import pandas as pd
path = "recommender-system-2022-challenge-polimi-data/interactions_and_impressions.csv"

Interactions_and_Impressions = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserId', 'ItemId', 'ImpressionList', 'Data'])
Interactions_and_Impressions

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
...,...,...,...,...
5826500,41628,20448,,0
5826501,41628,20896,,1
5826502,41628,21506,,1
5826503,41628,22882,,0


In [2]:
print(Interactions_and_Impressions["UserId"].max())
print(Interactions_and_Impressions["ItemId"].max())

41628
24506


- First remove ImpressionList
- We want to give the same value to 0 and 1 interactions so we drop "Data" too
- We don't consider multiple interactions so we drop duplicates

In [3]:
Distinct_Interactions = Interactions_and_Impressions.copy(deep=True)
Distinct_Interactions = Distinct_Interactions.drop(["ImpressionList", "Data"], axis=1)
Distinct_Interactions = Distinct_Interactions.drop_duplicates()
Distinct_Interactions

Unnamed: 0,UserId,ItemId
0,0,21
12,0,22
27,0,24
28,0,44
29,0,54
...,...,...
5826500,41628,20448
5826501,41628,20896
5826502,41628,21506
5826503,41628,22882


Let's drop the duplicates now

### Let's now deal with outliers

In [4]:
Group_by_item = Distinct_Interactions.groupby(['ItemId'], as_index=False).count()
Group_by_item

Unnamed: 0,ItemId,UserId
0,0,41
1,1,87
2,2,50
3,3,63
4,4,66
...,...,...
24502,24502,22
24503,24503,31
24504,24504,25
24505,24505,24


In [5]:
print("The mean number of interactions per Item is {}".format(Group_by_item.UserId.mean()))
print("")
print("The 25% quantile number of interactions per Item is {}".format(Group_by_item.UserId.quantile(.25)))
print("")
print("The 75% quantile number of interactions per Item is {}".format(Group_by_item.UserId.quantile(.75)))
print("")
print("The max number of interactions per Item is {}".format(Group_by_item.UserId.max()))
print("\n")

The mean number of interactions per Item is 63.43652833884197

The 25% quantile number of interactions per Item is 29.0

The 75% quantile number of interactions per Item is 46.0

The max number of interactions per Item is 9481




In [6]:
def find_outliers(data, percentage):
    # the first quartile
    q1 = data.quantile(percentage)

    # the 3rd quartile
    q3 = data.quantile(1-percentage)

    # the iqr region
    iqr = q3 - q1

    # finding upper and lower whiskers
    upper_bound = q3+(1.5*iqr)
    lower_bound = q1-(1.5*iqr)

    return (upper_bound, lower_bound)


outliers_ratio = 0.10
(item_upper_bound, item_lower_bound) = find_outliers(Group_by_item.UserId, outliers_ratio)   # outliers are outside these bounds
item_upper_bound, item_lower_bound

(182.5, -69.5)

In [7]:
item_outliers = Group_by_item[Group_by_item['UserId'] >= item_upper_bound]
print("The number of Item outliers is {}".format(len(item_outliers)))
print("The total number of Items is {}, so outliers are the {:.3f}%".format(len(Distinct_Interactions.ItemId.unique()), len(item_outliers)/len(Distinct_Interactions.ItemId.unique())*100))

The number of Item outliers is 1070
The total number of Items is 24507, so outliers are the 4.366%


In [8]:
item_outliers

Unnamed: 0,ItemId,UserId
5,5,658
6,6,455
7,7,248
19,19,485
20,20,5016
...,...,...
22782,22782,273
22844,22844,187
22864,22864,372
22894,22894,214


# Experiment 1: Try removing the outliers items

In [9]:
Distinct_Interactions_no_outlier = Distinct_Interactions.copy(deep=True)
for v in item_outliers.ItemId.values:
    Distinct_Interactions_no_outlier = Distinct_Interactions_no_outlier[Distinct_Interactions_no_outlier["ItemId"] != v]

Distinct_Interactions_no_outlier

Unnamed: 0,UserId,ItemId
37,0,159
42,0,987
47,0,1715
54,0,2292
55,0,2730
...,...,...
5826500,41628,20448
5826501,41628,20896
5826502,41628,21506
5826503,41628,22882


In [10]:
Distinct_Interactions_no_outlier['Rating'] = 1
Distinct_Interactions_no_outlier

Unnamed: 0,UserId,ItemId,Rating
37,0,159,1
42,0,987,1
47,0,1715,1
54,0,2292,1
55,0,2730,1
...,...,...,...
5826500,41628,20448,1
5826501,41628,20896,1
5826502,41628,21506,1
5826503,41628,22882,1


In [11]:
itemId_unique_no_outlier = Distinct_Interactions_no_outlier["ItemId"].unique()
print("The new number of Items is {}".format(len(itemId_unique_no_outlier)))

The new number of Items is 23437


Let's remove the empty indexes

In [12]:
userId_unique = Distinct_Interactions_no_outlier["UserId"].unique()
itemId_unique = Distinct_Interactions_no_outlier["ItemId"].unique()

In [13]:
mapped_id, original_id = pd.factorize(Distinct_Interactions_no_outlier["UserId"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(Distinct_Interactions_no_outlier["ItemId"].unique())
item_original_Id_to_index = pd.Series(mapped_id, index=original_id)

In [14]:
Distinct_Interactions_no_outlier["UserId"] = Distinct_Interactions_no_outlier["UserId"].map(user_original_Id_to_index)
Distinct_Interactions_no_outlier["ItemId"] = Distinct_Interactions_no_outlier["ItemId"].map(item_original_Id_to_index)

In [15]:
Distinct_Interactions_no_outlier.head(15)

Unnamed: 0,UserId,ItemId,Rating
37,0,0,1
42,0,1,1
47,0,2,1
54,0,3,1
55,0,4,1
56,0,5,1
57,0,6,1
58,0,7,1
59,0,8,1
61,0,9,1


### Let's build the URM

Split the Dataset

In [16]:
from sklearn.model_selection import train_test_split
import numpy as np

seed = 1234


(user_ids_training_validation, user_ids_test,
 item_ids_training_validation, item_ids_test,
 ratings_training_validation, ratings_test) = train_test_split(Distinct_Interactions_no_outlier.UserId,
                                                               Distinct_Interactions_no_outlier.ItemId,
                                                               Distinct_Interactions_no_outlier.Rating,
                                                               test_size=0.20,
                                                               shuffle=True,
                                                               random_state=seed)

(user_ids_training, user_ids_validation,
 item_ids_training, item_ids_validation,
 ratings_training, ratings_validation) = train_test_split(user_ids_training_validation,
                                                          item_ids_training_validation,
                                                          ratings_training_validation,
                                                          test_size=0.20,
                                                          shuffle=True,
                                                          random_state=seed)

##### Step 4

In [17]:
num_users = len(userId_unique)
num_items = len(itemId_unique)

In [18]:
import scipy.sparse as sps

urm_all = sps.csr_matrix((Distinct_Interactions_no_outlier.Rating, (Distinct_Interactions_no_outlier.UserId, Distinct_Interactions_no_outlier.ItemId)),
                         shape = (num_users, num_items))

urm_train = sps.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                           shape = (num_users, num_items))

urm_validation = sps.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                                shape = (num_users, num_items))

urm_train_validation = sps.csr_matrix((ratings_training_validation, (user_ids_training_validation, item_ids_training_validation)),
                                      shape = (num_users, num_items))

urm_test = sps.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                          shape = (num_users, num_items))

urm_all, urm_train, urm_test, urm_validation, urm_train_validation

(<41629x23437 sparse matrix of type '<class 'numpy.int64'>'
 	with 976819 stored elements in Compressed Sparse Row format>,
 <41629x23437 sparse matrix of type '<class 'numpy.int64'>'
 	with 625164 stored elements in Compressed Sparse Row format>,
 <41629x23437 sparse matrix of type '<class 'numpy.int64'>'
 	with 195364 stored elements in Compressed Sparse Row format>,
 <41629x23437 sparse matrix of type '<class 'numpy.int64'>'
 	with 156291 stored elements in Compressed Sparse Row format>,
 <41629x23437 sparse matrix of type '<class 'numpy.int64'>'
 	with 781455 stored elements in Compressed Sparse Row format>)

# Now we try running SLIM

In [19]:
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender

In [20]:
MySlimRecommender = SLIMElasticNetRecommender(urm_train)

In [21]:
liS = [0.1, 0.01, 0.001]
alphaS = [0.1, 0.01, 0.001]

from Evaluation.Evaluator import EvaluatorHoldout
e = EvaluatorHoldout(urm_validation, [10])


for li in liS:
    for alp in alphaS:
        MySlimRecommender.fit(l1_ratio=li, alpha = alp, positive_only=True, topK = 100)
        print("\n-----------------------------------------------------\n")
        print(str(li) + "       " + str(alp) + "\n")
        print(e.evaluateRecommender(MySlimRecommender))
        print("\n-----------------------------------------------------\n")


"""
MySlimRecommender.fit(l1_ratio=0.085, alpha = 0.001, positive_only=True, topK = 100)
e.evaluateRecommender(MySlimRecommender)
"""

EvaluatorHoldout: Ignoring 1934 ( 4.6%) Users that have less than 1 test interactions
SLIMElasticNetRecommender: Processed 23437 (100.0%) in 3.90 min. Items per second: 100.18
EvaluatorHoldout: Processed 39695 (100.0%) in 16.14 sec. Users per second: 2460


(       PRECISION PRECISION_RECALL_MIN_DEN    RECALL       MAP MAP_MIN_DEN  \
 cutoff                                                                      
 10      0.011077                 0.021806  0.020866  0.005315    0.010629   
 
             MRR      NDCG        F1  HIT_RATE ARHR_ALL_HITS  ...  \
 cutoff                                                       ...   
 10      0.04199  0.020507  0.014471  0.087265      0.047031  ...   
 
        COVERAGE_USER COVERAGE_USER_HIT USERS_IN_GT DIVERSITY_GINI  \
 cutoff                                                              
 10          0.953542          0.083211    0.953542       0.065703   
 
        SHANNON_ENTROPY RATIO_DIVERSITY_HERFINDAHL RATIO_DIVERSITY_GINI  \
 cutoff                                                                   
 10           10.692296                   0.997723             0.089835   
 
        RATIO_SHANNON_ENTROPY RATIO_AVERAGE_POPULARITY RATIO_NOVELTY  
 cutoff                                      

### No parameter tuning for now

# Computing the submission
First we compute the original indexes

In [None]:
target = pd.read_csv(filepath_or_buffer="recommender-system-2022-challenge-polimi-data/data_target_users_test.csv",
                                          sep=",",
                                          header=0,
                                          engine='python',
                                          names=['id'])
target

In [None]:
def write_recommendations(file_name):
    recommendations = 'user_id,item_list'
    f = open("submissions/" + file_name + ".csv", "w")

    for id in target['id']:
        recommendations_per_user = MySlimRecommender.recommend(user_id_array=id, remove_seen_flag=True, cutoff=10)

        recommendation_string = str(user_original_Id_to_index[user_original_Id_to_index==id].index.item()
                                    ) + ','

        for rec in recommendations_per_user:
            recommendation_string = recommendation_string + str(item_original_Id_to_index[item_original_Id_to_index==rec].index.item()) + ' '

        recommendation_string = recommendation_string[:-1]
        recommendations = recommendations + '\n' + recommendation_string

    f.write(recommendations)
    f.close()