# Data Load
### Create the Pandas Dataframe

In [1]:
import pandas as pd
path = "recommender-system-2022-challenge-polimi-data/interactions_and_impressions.csv"

Interactions_and_Impressions = pd.read_csv(filepath_or_buffer=path,
                               sep=",",
                               header=1,
                               engine='python',
                               names=['UserId', 'ItemId', 'ImpressionList', 'Data'])
Interactions_and_Impressions

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,21,,0
1,0,21,,0
2,0,21,20212223242526272829,0
3,0,21,,1
4,0,21,,1
...,...,...,...,...
5826500,41628,20448,,0
5826501,41628,20896,,1
5826502,41628,21506,,1
5826503,41628,22882,,0


In [2]:
print(Interactions_and_Impressions["UserId"].max())
print(Interactions_and_Impressions["ItemId"].max())

41628
24506


### Data Preparation
First we remove empty indeces:

In [3]:
userId_unique = Interactions_and_Impressions["UserId"].unique()
itemId_unique = Interactions_and_Impressions["ItemId"].unique()

In [4]:
mapped_id, original_id = pd.factorize(Interactions_and_Impressions["UserId"].unique())
user_original_Id_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(Interactions_and_Impressions["ItemId"].unique())
item_original_Id_to_index = pd.Series(mapped_id, index=original_id)

In [5]:
Interactions_and_Impressions["UserId"] = Interactions_and_Impressions["UserId"].map(user_original_Id_to_index)
Interactions_and_Impressions["ItemId"] = Interactions_and_Impressions["ItemId"].map(item_original_Id_to_index)

In [6]:
Interactions_and_Impressions.head(15)

Unnamed: 0,UserId,ItemId,ImpressionList,Data
0,0,0,,0
1,0,0,,0
2,0,0,20212223242526272829,0
3,0,0,,1
4,0,0,,1
5,0,0,,1
6,0,0,,1
7,0,0,,1
8,0,0,,1
9,0,0,,1


First remove ImpressionList

In [7]:
Distinct_Interactions = Interactions_and_Impressions.copy(deep=True)
Distinct_Interactions = Distinct_Interactions.drop(["ImpressionList"], axis=1)
Distinct_Interactions

Unnamed: 0,UserId,ItemId,Data
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,1
4,0,0,1
...,...,...,...
5826500,41628,3699,0
5826501,41628,12693,1
5826502,41628,15131,1
5826503,41628,18127,0


Let's drop the duplicates now

We want an Implicit/Binary Matrix, so:
1) We drop the Data column, only keeping User and Item Id
2) We remove duplicates
3) Split data into Train/Test/Validation
4) We build the Sparse URM using Scipy

##### Step 1

In [8]:
Distinct_Interactions = Distinct_Interactions.drop(["Data"], axis=1).drop_duplicates()
Distinct_Interactions

Unnamed: 0,UserId,ItemId
0,0,0
12,0,1
27,0,2
28,0,3
29,0,4
...,...,...
5826500,41628,3699
5826501,41628,12693
5826502,41628,15131
5826503,41628,18127


##### Step 2

In [9]:
Distinct_Interactions['Rating'] = 1
Distinct_Interactions

Unnamed: 0,UserId,ItemId,Rating
0,0,0,1
12,0,1,1
27,0,2,1
28,0,3,1
29,0,4,1
...,...,...,...
5826500,41628,3699,1
5826501,41628,12693,1
5826502,41628,15131,1
5826503,41628,18127,1


In [10]:
unique_userId = Distinct_Interactions.UserId.unique()
num_users = len(unique_userId)
print(num_users)

unique_itemId = Distinct_Interactions.ItemId.unique()
num_items = len(unique_itemId)
print(num_items)

41629
24507


#### Step 3

In [11]:
from sklearn.model_selection import train_test_split
import numpy as np

seed = 1234


(user_ids_training, user_ids_test,
 item_ids_training, item_ids_test,
 ratings_training, ratings_test) = train_test_split(Distinct_Interactions.UserId,
                                                    Distinct_Interactions.ItemId,
                                                    Distinct_Interactions.Rating,
                                                    test_size=0.20,
                                                    shuffle=True,
                                                    random_state=seed)


(user_ids_training, user_ids_validation,
 item_ids_training, item_ids_validation,
 ratings_training, ratings_validation) = train_test_split(user_ids_training,
                                                          item_ids_training,
                                                          ratings_training,
                                                          test_size=0.10,
                                                          )

##### Step 4

In [12]:
import scipy.sparse as sp
urm_train = sp.csr_matrix((ratings_training, (user_ids_training, item_ids_training)),
                          shape=(num_users, num_items))

urm_test = sp.csr_matrix((ratings_test, (user_ids_test, item_ids_test)),
                         shape=(num_users, num_items))

urm_validation = sp.csr_matrix((ratings_validation, (user_ids_validation, item_ids_validation)),
                               shape=(num_users, num_items))

urm_train, urm_test, urm_validation

(<41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 1119339 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 310928 stored elements in Compressed Sparse Row format>,
 <41629x24507 sparse matrix of type '<class 'numpy.int64'>'
 	with 124372 stored elements in Compressed Sparse Row format>)

# Now we try running SLIM

In [13]:
from Recommenders.SLIM.SLIMElasticNetRecommender import SLIMElasticNetRecommender

In [14]:
MySlimRecommender = SLIMElasticNetRecommender(urm_train)

In [15]:
from Evaluation.Evaluator import EvaluatorHoldout
e = EvaluatorHoldout(urm_test, [10])

"""
liS = [0.1, 0.01, 0.001]
alphaS = [0.1, 0.01, 0.001]

for li in liS:
    for alp in alphaS:
        MySlimRecommender.fit(l1_ratio=li, alpha = alp, positive_only=True, topK = 100)
        print("\n-----------------------------------------------------\n")
        print(str(li) + "       " + str(alp) + "\n")
        print(e.evaluateRecommender(MySlimRecommender))
        print("\n-----------------------------------------------------\n")
"""

MySlimRecommender.fit(l1_ratio=0.085, alpha = 0.001, positive_only=True, topK = 100)
e.evaluateRecommender(MySlimRecommender)

EvaluatorHoldout: Ignoring 309 ( 0.7%) Users that have less than 1 test interactions


KeyboardInterrupt: 

### I want to see how good recommendations are for Users with few interactions

In [23]:
Group_by_user = Distinct_Interactions.groupby(["UserId"], as_index=False).count()
Group_by_user = Group_by_user.sort_values("ItemId")
Group_by_user

Unnamed: 0,UserId,ItemId
38400,38400,5
30257,30257,6
25061,25061,6
41083,41083,6
38965,38965,7
...,...,...
30753,30753,796
27985,27985,836
19407,19407,877
8693,8693,1176


# Computing the submission
First we compute the original indexes

In [None]:
target = pd.read_csv(filepath_or_buffer="recommender-system-2022-challenge-polimi-data/data_target_users_test.csv",
                                          sep=",",
                                          header=0,
                                          engine='python',
                                          names=['id'])
target

In [None]:
def write_recommendations(file_name):
    recommendations = 'user_id,item_list'
    f = open("submissions/" + file_name + ".csv", "w")

    for id in target['id']:
        recommendations_per_user = MySlimRecommender.recommend(user_id_array=id, remove_seen_flag=True, cutoff=10)

        recommendation_string = str(user_original_Id_to_index[user_original_Id_to_index==id].index.item()
                                    ) + ','

        for rec in recommendations_per_user:
            recommendation_string = recommendation_string + str(item_original_Id_to_index[item_original_Id_to_index==rec].index.item()) + ' '

        recommendation_string = recommendation_string[:-1]
        recommendations = recommendations + '\n' + recommendation_string

    f.write(recommendations)
    f.close()