In [1]:
import time
import pandas as pd
import numpy as np
import scipy.sparse as sps
import random as rnd

from scipy.sparse import *

In [2]:
urm_path = '../content/data_train.csv'
urm_all_df = pd.read_csv(filepath_or_buffer=urm_path,
                                sep=",",
                                header=0,
                                dtype={0:int, 1:int, 2:float},
                                engine='python')

urm_all_df.columns = ["UserID", "ItemID", "Interaction"]

In [3]:
urm_all_df.head(10)

Unnamed: 0,UserID,ItemID,Interaction
0,1,7,1.0
1,1,15,1.0
2,1,16,1.0
3,1,133,1.0
4,1,161,1.0
5,1,187,1.0
6,1,205,1.0
7,1,222,1.0
8,1,237,1.0
9,1,354,1.0


In [4]:
print ("The number of interactions is {}".format(len(urm_all_df)))

The number of interactions is 478730


In [5]:
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(urm_all_df)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22347, Max Id users	 13024

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


# Remove empty profiles

In [6]:
# Remove empty IDs
mapped_id, original_id = pd.factorize(urm_all_df["UserID"].unique())
user_original_ID_to_index = pd.Series(mapped_id, index=original_id)

mapped_id, original_id = pd.factorize(urm_all_df["ItemID"].unique())
item_original_ID_to_index = pd.Series(mapped_id, index=original_id)



# Let's create the mapping from items indices to original item_ids
item_index_to_original_ID = pd.Series(item_original_ID_to_index.index,index = item_original_ID_to_index.values)



original_item_ID = 125
print("New index for item {} is {}".format(original_item_ID, item_original_ID_to_index[original_item_ID]))


urm_all_df["UserID"] = urm_all_df["UserID"].map(user_original_ID_to_index)
urm_all_df["ItemID"] = urm_all_df["ItemID"].map(item_original_ID_to_index)
urm_all_df.head(n=10)

New index for item 125 is 93


Unnamed: 0,UserID,ItemID,Interaction
0,0,0,1.0
1,0,1,1.0
2,0,2,1.0
3,0,3,1.0
4,0,4,1.0
5,0,5,1.0
6,0,6,1.0
7,0,7,1.0
8,0,8,1.0
9,0,9,1.0


In [7]:
userID_unique = urm_all_df["UserID"].unique()
itemID_unique = urm_all_df["ItemID"].unique()

n_users = len(userID_unique)
n_items = len(itemID_unique)
n_interactions = len(urm_all_df)

print ("Number of items\t {}, Number of users\t {}".format(n_items, n_users))
print ("Max ID items\t {}, Max Id users\t {}\n".format(max(itemID_unique), max(userID_unique)))
print ("Average interactions per user {:.2f}".format(n_interactions/n_users))
print ("Average interactions per item {:.2f}\n".format(n_interactions/n_items))

print ("Sparsity {:.2f} %".format((1-float(n_interactions)/(n_items*n_users))*100))

Number of items	 22222, Number of users	 12638
Max ID items	 22221, Max Id users	 12637

Average interactions per user 37.88
Average interactions per item 21.54

Sparsity 99.83 %


In [12]:
urm_all = sps.coo_matrix((urm_all_df["Interaction"].values,
                          (urm_all_df["UserID"].values, urm_all_df["ItemID"].values)))

urm_all

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in COOrdinate format>

In [None]:
urm_all = urm_all.tocsr()
urm_all

<13025x22348 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [13]:
train_test_split = 0.80

n_interactions = urm_all.nnz

train_mask = np.random.choice([True,False], n_interactions, p=[train_test_split, 1-train_test_split])

urm_train = sps.csr_matrix((urm_all.data[train_mask],
                            (urm_all.row[train_mask], urm_all.col[train_mask])))

val_mask = np.logical_not(train_mask)

urm_val = sps.csr_matrix((urm_all.data[val_mask],
                            (urm_all.row[val_mask], urm_all.col[val_mask])))

In [9]:
def precision(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    precision_score = np.sum(is_relevant, dtype=np.float32) / len(is_relevant)

    return precision_score

def recall(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    recall_score = np.sum(is_relevant, dtype=np.float32) / relevant_items.shape[0]

    return recall_score

def AP(recommended_items, relevant_items):

    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

    # Cumulative sum: precision at 1, at 2, at 3 ...
    p_at_k = is_relevant * np.cumsum(is_relevant, dtype=np.float32) / (1 + np.arange(is_relevant.shape[0]))

    ap_score = np.sum(p_at_k) / np.min([relevant_items.shape[0], is_relevant.shape[0]])

    return ap_score

def evaluate_algorithm(URM_test, recommender_object, at=10):

    cumulative_precision = 0.0
    cumulative_recall = 0.0
    cumulative_AP = 0.0

    num_eval = 0


    for user_id in range(URM_test.shape[0]):

        relevant_items = URM_test.indices[URM_test.indptr[user_id]:URM_test.indptr[user_id+1]]

        if len(relevant_items)>0:

            recommended_items = recommender_object.recommend(user_id, at=at,exclude_seen=True)
            num_eval+=1

            cumulative_precision += precision(recommended_items, relevant_items)
            cumulative_recall += recall(recommended_items, relevant_items)
            cumulative_AP += AP(recommended_items, relevant_items)

    mean_precision = cumulative_precision / num_eval
    mean_recall = cumulative_recall / num_eval
    MAP = cumulative_AP / num_eval

    print("Recommender results are: Precision = {:.10f}, Recall = {:.10f}, MAP = {:.10f}".format(
        cumulative_precision, cumulative_recall, MAP))
    return MAP, mean_precision, mean_recall

# Preparing for training

In [14]:
urm_train

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 383042 stored elements in Compressed Sparse Row format>

In [15]:
train_n_interactions = urm_train.nnz
train_n_interactions

383042

In [16]:
n_users, n_items = urm_train.shape
print(f"The num of users is {n_users}")
print(f"The num of items is {n_items}")

The num of users is 12638
The num of items is 22222


In [14]:
num_factors = 190

user_factors = np.random.random((n_users, num_factors))
item_factors = np.random.random((n_items, num_factors))

# We define a function to transform the interaction data in a "confidence" value.

In [17]:
def popularity_confidence(URM_train):

    item_popularity = np.ediff1d(URM_train.tocsc().indptr)
    item_confidence = np.zeros(len(item_popularity))
    item_confidence[item_popularity!=0] = np.log(item_popularity[item_popularity!=0])

    C_URM_train = URM_train.copy()
    C_URM_train = C_URM_train.tocsc()

    for item_id in range(C_URM_train.shape[1]):
        start_pos = C_URM_train.indptr[item_id]
        end_pos = C_URM_train.indptr[item_id+1]

        C_URM_train.data[start_pos:end_pos] = item_confidence[item_id]

    C_URM_train = C_URM_train.tocsr()

    return C_URM_train

# Define the update rules for the user factors

In [18]:
def _update_row(interaction_profile, interaction_confidence, Y, YtY, regularization_diagonal):

    Y_interactions = Y[interaction_profile, :]

    A = Y_interactions.T.dot(((interaction_confidence - 1) * Y_interactions.T).T)

    B = YtY + A + regularization_diagonal

    return np.dot(np.linalg.inv(B), Y_interactions.T.dot(interaction_confidence))

# Apply updates on the user item factors as well

**Training loop with Early stopping**

In [19]:
C_urm_train = popularity_confidence(urm_train)

C_urm_train.data[:10]

array([7.05185562, 6.69950034, 6.39191711, 5.73979291, 5.3082677 ,
       5.37989735, 5.4161004 , 5.32787617, 5.48893773, 4.6443909 ])

In [25]:
def train_ials(C_urm_train,num_epochs,num_factors,regularization_coefficient,urm_train, urm_val,min_map_increase = 0.001,
               map_patience_thr = 15,user_factors = None,item_factors = None):
  C_urm_train_csc = C_urm_train.tocsc()

  if user_factors is not None:
    user_factors = user_factors
  else:
    user_factors = np.random.random((n_users, num_factors))

  if item_factors is not None:
    item_factors = item_factors
  else:
    item_factors = np.random.random((n_items, num_factors))


  regularization_diagonal = np.diag(regularization_coefficient * np.ones(num_factors))

  best_map = 0.0
  map_patience_count = 0
  map = 0.0

  for n_epoch in range(num_epochs):

    if map > best_map and n_epoch != 0:
      best_map = map
      map_patience_count = 0
    elif map < best_map + min_map_increase:
      map_patience_count += 1
      print(f"LR patience is incremented and is {map_patience_count}")

    if map_patience_count >= map_patience_thr:
      return user_factors, item_factors

    start_time = time.time()

    VV = item_factors.T.dot(item_factors)

    for user_id in range(C_urm_train.shape[0]):

        start_pos = C_urm_train.indptr[user_id]
        end_pos = C_urm_train.indptr[user_id + 1]

        user_profile = C_urm_train.indices[start_pos:end_pos]
        user_confidence = C_urm_train.data[start_pos:end_pos]

        user_factors[user_id, :] = _update_row(user_profile, user_confidence, item_factors, VV, regularization_diagonal)

        # Print some stats
        if (user_id +1)% 100000 == 0 or user_id == C_urm_train.shape[0]-1:
            elapsed_time = time.time() - start_time
            samples_per_second = user_id/elapsed_time
            print("Iteration {} in {:.2f} seconds. Users per second {:.2f}".format(user_id+1, elapsed_time, samples_per_second))

    UU = user_factors.T.dot(user_factors)

    for item_id in range(C_urm_train.shape[1]):

        start_pos = C_urm_train_csc.indptr[item_id]
        end_pos = C_urm_train_csc.indptr[item_id + 1]

        item_profile = C_urm_train_csc.indices[start_pos:end_pos]
        item_confidence = C_urm_train_csc.data[start_pos:end_pos]

        item_factors[item_id, :] = _update_row(item_profile, item_confidence, user_factors, UU, regularization_diagonal)

        # Print some stats
        if (item_id +1)% 100000 == 0 or item_id == C_urm_train.shape[1]-1:
            elapsed_time = time.time() - start_time
            samples_per_second = item_id/elapsed_time
            print("Iteration {} in {:.2f} seconds. Items per second {:.2f}".format(item_id+1, elapsed_time, samples_per_second))

    # Evaluate the model on val
    recommender_ials = IALSRecommender(urm_train,user_factors, item_factors)
    map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)

    total_epoch_time = time.time() - start_time
    print("Epoch {} complete in in {:.2f} seconds".format(n_epoch+1, total_epoch_time))

  return user_factors, item_factors

In [30]:
num_epochs = 200
num_factors = 350
regularization_coefficient = 0.7

min_map_increase = 0.0001
map_patience_thr = 5


usr_factors, itm_factors = train_ials(C_urm_train,num_epochs,num_factors,regularization_coefficient,urm_train,urm_val,
                                      min_map_increase=min_map_increase,map_patience_thr=map_patience_thr)

LR patience is incremented and is 1
Iteration 12638 in 169.75 seconds. Users per second 74.44
Iteration 22222 in 461.67 seconds. Items per second 48.13
Recommender results are: Precision = 399.3000000000, Recall = 609.7980149761, MAP = 0.0300155462
Epoch 1 complete in in 594.22 seconds
Iteration 12638 in 167.39 seconds. Users per second 75.50
Iteration 22222 in 462.61 seconds. Items per second 48.03
Recommender results are: Precision = 479.6000000000, Recall = 806.7936919020, MAP = 0.0443770326
Epoch 2 complete in in 593.93 seconds
Iteration 12638 in 172.79 seconds. Users per second 73.14
Iteration 22222 in 471.99 seconds. Items per second 47.08
Recommender results are: Precision = 592.6000000000, Recall = 964.6883843627, MAP = 0.0537692076
Epoch 3 complete in in 605.02 seconds
Iteration 12638 in 170.12 seconds. Users per second 74.28
Iteration 22222 in 466.30 seconds. Items per second 47.65
Recommender results are: Precision = 635.4000000000, Recall = 1020.5826652082, MAP = 0.05684182

In [27]:
usr_factors_chpt = usr_factors
itm_factors_chpt = itm_factors

# Evaluate the trained model


In [22]:
class IALSRecommender(object):

    def __init__(self, URM,usr_factors, itm_factors):
        self.URM = URM
        self.usr_factors = usr_factors
        self.itm_factors = np.transpose(itm_factors)

    def recommend(self, user_id, at=None, exclude_seen=True, users_not_in_train=[]):
        # Check if user_id is a valid index
        if user_id < 0 or user_id >= self.URM.shape[0]:
            print(f"Invalid user_id: {user_id}")
            return

        # Check if user_id not in train use the topRec
        if user_id in users_not_in_train:
            return ["517 189 44 0 284 808 285 1 557 1266"]

        # compute the scores using the dot product
        user_profile = self.usr_factors[user_id,:]
        scores = np.dot(user_profile,self.itm_factors)

        if exclude_seen:
            scores = self.filter_seen(user_id, scores)

        # rank items
        ranking = scores.argsort()[::-1]

        return ranking[:at]


    def filter_seen(self, user_id, scores):

        start_pos = self.URM.indptr[user_id]
        end_pos = self.URM.indptr[user_id+1]

        user_profile = self.URM.indices[start_pos:end_pos]

        scores[user_profile] = -np.inf

        return scores

In [42]:
#num_epochs = 3
#num_factors = 190
#regularization_coefficient = 1.5

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 599.2000, Recall = 959.0677, MAP = 0.0526
MAP@10 on val is 0.05255837011660953


In [40]:
#num_epochs = 3
#num_factors = 250
#regularization_coefficient = 1.5

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 598.1000, Recall = 953.8834, MAP = 0.0520
MAP@10 on val is 0.052036196808158006


In [38]:
#num_epochs = 5
#num_factors = 250
#regularization_coefficient = 1

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 667.3000, Recall = 1055.3109, MAP = 0.0577
MAP@10 on val is 0.057705989571089876


In [36]:
#num_epochs = 5
#num_factors = 250
#regularization_coefficient = 0.7

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 674.7000, Recall = 1069.0870, MAP = 0.0584
MAP@10 on val is 0.058409631100358045


In [None]:
#num_epochs = 80
#num_factors = 190
#regularization_coefficient = 0.7

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 709.4000, Recall = 1115.9259, MAP = 0.0613
MAP@10 on val is 0.061295767617697346


In [None]:
#num_epochs = 50
#num_factors = 200
#regularization_coefficient = 1


recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 694.6000, Recall = 1093.4653, MAP = 0.0602
MAP@10 on val is 0.06021209879799655


In [None]:
#num_epochs = 50
#num_factors = 200
#regularization_coefficient = 1e-1

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 701.0000, Recall = 1077.0758, MAP = 0.0597
MAP@10 on val is 0.05968736862896224


In [None]:
#num_epochs = 50
#num_factors = 150
#regularization_coefficient = 1e-1

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 697.5000, Recall = 1071.9676, MAP = 0.0595
MAP@10 on val is 0.059518880947133854


In [None]:
#num_epochs = 50
#num_factors = 150
#regularization_coefficient = 1e-2

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 698.7000, Recall = 1076.8531, MAP = 0.0592
MAP@10 on val is 0.0591967494090857


In [None]:
#num_epochs = 50
#num_factors = 90
#regularization_coefficient = 1e-2

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 692.0000, Recall = 1054.9991, MAP = 0.0571
MAP@10 on val is 0.05714597620898498


In [None]:
#num_epochs = 50
#num_factors = 90
#regularization_coefficient = 1e-3

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 693.9000, Recall = 1053.9768, MAP = 0.0571
MAP@10 on val is 0.05713325569086096


In [None]:
#num_epochs = 50
#num_factors = 90
#regularization_coefficient = 1e-4

recommender_ials = IALSRecommender(urm_train,usr_factors, itm_factors)
map, mp,mr = evaluate_algorithm(urm_val,recommender_ials)
print(f"MAP@10 on val is {map}")

Recommender results are: Precision = 687.8000, Recall = 1047.0269, MAP = 0.0567
MAP@10 on val is 0.05673956644904108


# Train on the whole data and submit the predictions

In [None]:
C_urm_all = popularity_confidence(urm_all)

C_urm_all.data[:10]

array([7.26612878, 6.91473089, 6.62539237, 5.99645209, 5.54126355,
       5.56452041, 5.6347896 , 5.58349631, 5.68017261, 4.82831374])

In [None]:
num_epochs = 80
num_factors = 190
regularization_coefficient = 0.7

usr_factors, itm_factors = train_ials(C_urm_all,num_epochs,num_factors,regularization_coefficient)

Iteration 12638 in 38.10 seconds. Users per second 331.70
Iteration 22222 in 129.64 seconds. Items per second 171.41
Epoch 1 complete in in 129.64 seconds
Iteration 12638 in 34.55 seconds. Users per second 365.80
Iteration 22222 in 99.40 seconds. Items per second 223.56
Epoch 2 complete in in 99.40 seconds
Iteration 12638 in 38.11 seconds. Users per second 331.58
Iteration 22222 in 162.56 seconds. Items per second 136.69
Epoch 3 complete in in 162.57 seconds
Iteration 12638 in 34.25 seconds. Users per second 368.96
Iteration 22222 in 96.54 seconds. Items per second 230.17
Epoch 4 complete in in 96.54 seconds
Iteration 12638 in 52.68 seconds. Users per second 239.90
Iteration 22222 in 118.37 seconds. Items per second 187.73
Epoch 5 complete in in 118.37 seconds
Iteration 12638 in 38.14 seconds. Users per second 331.29
Iteration 22222 in 103.71 seconds. Items per second 214.27
Epoch 6 complete in in 103.71 seconds
Iteration 12638 in 56.11 seconds. Users per second 225.21
Iteration 22222 

# Predict for the test data

In [None]:
urm_path = '/content/data_target_users_test.csv'

urm_pred_df = pd.read_csv(filepath_or_buffer=urm_path,
                                sep=",",
                                header=0,
                                dtype={0:int},
                                engine='python')

urm_pred_df.columns = ["UserID"]
len(urm_pred_df['UserID'])

10882

In [None]:
urm_all = urm_all.tocsr()
urm_all

<12638x22222 sparse matrix of type '<class 'numpy.float64'>'
	with 478730 stored elements in Compressed Sparse Row format>

In [None]:
Recommender = IALSRecommender(urm_all,usr_factors, itm_factors)

**Do the predictions, but bear in mind that before feeding the user_id to the model you need to translate it to the model user indices space. Also, you have to translate the recommended item IDs into the original IDs space form the model item IDs space**

In [None]:
pred_df = pd.DataFrame(columns = ['user_id','item_list'])

for userid in urm_pred_df['UserID']:
  if userid in user_original_ID_to_index.keys():
    # Map user ID to the index of model user space
    usr_idx = user_original_ID_to_index[userid]
    recomendatoins = Recommender.recommend(usr_idx, at=10,exclude_seen=True)
    # Map item indices from model item space to original item ID
    recomendatoins = [item_index_to_original_ID[idx] for idx in recomendatoins]
  else:
    # If the user isn't present in the given data, recommend TopPop items
    recomendatoins = ["517 189 44 0 284 808 285 1 557 1266"]

  recomendatoins = " ".join(str(item) for item in recomendatoins)
  pred_df.loc[len(pred_df)] = [userid,recomendatoins]

In [None]:
pred_df

Unnamed: 0,user_id,item_list
0,1,101 403 36 52 506 785 922 183 515 102
1,2,197 359 78 1095 949 47 283 12 196 629
2,3,59 259 536 414 584 648 956 999 1001 147
3,4,249 314 28 54 136 69 46 57 145 22
4,5,170 215 131 110 301 222 95 838 285 766
...,...,...
10877,13020,155 113 105 87 681 51 859 153 345 627
10878,13021,593 57 133 316 6179 6720 455 13 87 7027
10879,13022,809 1411 1446 1674 1668 776 105 705 37 415
10880,13023,32 23 96 408 324 706 153 837 1124 261


In [None]:
pred_df.to_csv('/content/predIALS_Max.csv',index=False)