# Top popular recommender

The most popular items are recommended to every user.

### Import libraries

In [None]:
import pandas as pd
import numpy as np

### Read data

In [None]:
# Directory that holds every CSV file used in this notebook.
base_path = "data"

# interactions_and_impressions.csv
# Training set with the implicit preferences expressed by the users.
#   user_id         : identifier of the user
#   item_id         : identifier of the item (a TV series)
#   impression_list : string with the items that were on screen when the user
#                     interacted with item_id; some interactions have no
#                     impression list
#   data            : "0" if the user watched the item, "1" if the user opened
#                     the item details page
# NOTE: the recommenders below access these columns as 'UserID', 'ItemID'
# and 'Data'.
interactions_df_path = f"{base_path}/interactions_and_impressions.csv"

# data_ICM_length.csv
# Number of episodes of each item (a TV series may have several episodes).
#   item_id    : identifier of the item
#   feature_id : identifier of the feature; always 0, because this ICM only
#                holds the feature "length"
#   data       : number of episodes; may be 0 where the data is incomplete
items_length_df_path = f"{base_path}/data_ICM_length.csv"

# data_ICM_type.csv
# Type of each item. An item has exactly one type; types are anonymized and
# described only by a numerical identifier.
#   item_id    : identifier of the item
#   feature_id : identifier of the type
#   data       : "1" if the item is described by the type
items_type_df_path = f"{base_path}/data_ICM_type.csv"

# data_target_users_test.csv
# Ids of the users that must appear in the submission file; the submission
# has to contain all of these users and no others.
users_df_path = f"{base_path}/data_target_users_test.csv"

In [None]:
# Column dtypes for the interactions file, keyed by column position:
# columns 0, 1 and 3 are read as integers, column 2 as a string.
dtype = {0: int, 1: int, 2: str, 3: int}

# keep_default_na=False stops pandas from turning its default NA markers
# (e.g. empty fields) into NaN while parsing.
interactions_df = pd.read_csv(
    interactions_df_path,
    keep_default_na=False,
    dtype=dtype,
)

# The remaining files are loaded with pandas' default parsing options.
items_length_pf = pd.read_csv(items_length_df_path)
items_types_df = pd.read_csv(items_type_df_path)
users_df = pd.read_csv(users_df_path)


### Calculate most seen items

Accuracy: 0.00154

In [None]:
class TopPopRecommender(object):
    """Non-personalised recommender that suggests the most watched items.

    After ``fit`` the attribute ``watched_items_df`` holds one row per watched
    item (index ``ItemID``, columns ``ItemID`` and ``Count``), ordered from
    the most to the least watched.
    """

    def fit(self, interactions_df):
        """Rank the items by the number of "watched" interactions.

        Args:
            interactions_df: DataFrame with at least the columns ``ItemID``
                and ``Data``. Only rows with ``Data == 0`` are counted.
        """
        # keep only the "watched" interactions
        watched_mask = interactions_df['Data'] == 0

        # number of watched interactions per item (index: ItemID)
        counts = interactions_df[watched_mask].groupby('ItemID').size()

        # one row per item, keeping ItemID both as index and as a column
        popular_items_df = pd.DataFrame(
            {'ItemID': counts.index.to_numpy(), 'Count': counts.to_numpy()},
            index=counts.index,
        )

        # order items by the most watched; the stable sort keeps ties in
        # ascending ItemID order so the ranking is deterministic
        popular_items_df = popular_items_df.sort_values(
            by='Count', ascending=False, kind='stable')

        self.watched_items_df = popular_items_df

    def recommend(self, at=10):
        """Return the ids of the ``at`` most watched items.

        Args:
            at: maximum number of items to return.

        Returns:
            A list of item ids, most watched first. It is shorter than ``at``
            when fewer items are available.
        """
        # Return plain ids (not a DataFrame) so callers can join them
        # directly into a submission string.
        return self.watched_items_df['ItemID'].iloc[:at].tolist()

In [None]:
# Build the top-popular recommender and fit it on the interactions table.
topPopRecommender = TopPopRecommender()
topPopRecommender.fit(interactions_df)

In [None]:
# Build the submission table: one row per target user, with the recommended
# items formatted as a space-separated string.
num_users = users_df['user_id'].shape[0]
count = 0

print(num_users, end=" - ")

# The top-popular list does not depend on the user, so it is computed and
# formatted once instead of once per user.
items_list = " ".join(map(str, topPopRecommender.recommend(at=10)))

# Collect the rows in a plain list and build the DataFrame once at the end;
# calling pd.concat inside the loop copies the whole frame on every iteration.
rows = []
for count, user_id in enumerate(users_df['user_id'], start=1):
    rows.append((user_id, items_list))

    # progress report (fraction of users processed) every 1000 users
    if count % 1000 == 0:
        print(count/num_users, end=" - ")

out_df = pd.DataFrame(rows, columns=["user_id", "item_list"])

### Calculate most seen items, but avoid items already seen by the user

Accuracy: 

In [325]:
class TopPopRecommenderAdvanced(object):
    """Top-popular recommender that prefers items the user has not watched.

    After ``fit``:
      * ``watched_items_df`` holds the "watched" interactions (``Data == 0``)
        with an extra ``Count`` column (times the row's item was watched);
      * ``popular_items_df`` holds one row per watched item (index ``ItemID``,
        columns ``ItemID`` and ``Count``), most watched first.
    """

    def fit(self, interactions_df):
        """Rank items by popularity and index what each user has watched.

        Args:
            interactions_df: DataFrame with at least the columns ``UserID``,
                ``ItemID`` and ``Data``. Only rows with ``Data == 0`` are
                treated as "watched".
        """
        # Copy AFTER filtering: adding a column to a filtered slice is what
        # triggers pandas' SettingWithCopyWarning.
        watched_items_df = interactions_df[interactions_df['Data'] == 0].copy()

        # number of watched interactions per item (index: ItemID)
        counts = watched_items_df.groupby('ItemID').size()

        # annotate every watched interaction with its item's popularity
        watched_items_df['Count'] = watched_items_df['ItemID'].map(counts)

        # one row per item, keeping ItemID both as index and as a column
        popular_items_df = pd.DataFrame(
            {'ItemID': counts.index.to_numpy(), 'Count': counts.to_numpy()},
            index=counts.index,
        )

        # order items by the most watched; the stable sort keeps ties in
        # ascending ItemID order so the ranking is deterministic
        popular_items_df = popular_items_df.sort_values(
            by='Count', ascending=False, kind='stable')

        self.watched_items_df = watched_items_df
        self.popular_items_df = popular_items_df

        # Lookup structures for recommend(): a plain ranked list of ids and a
        # set of watched ids per user, so that no DataFrame has to be
        # filtered for every candidate item.
        self._popular_item_ids = popular_items_df['ItemID'].tolist()
        self._watched_by_user = {
            user_id: set(item_ids.tolist())
            for user_id, item_ids in watched_items_df.groupby('UserID')['ItemID']
        }

    def recommend(self, user_id, at=10):
        """Return up to ``at`` item ids for ``user_id``.

        Items the user has not watched come first, in popularity order. If
        there are fewer than ``at`` of them, the list is completed with the
        most popular items the user has already watched.

        Args:
            user_id: identifier of the user; users with no watched
                interactions simply receive the most popular items.
            at: maximum number of items to return.

        Returns:
            A list of item ids. It is shorter than ``at`` only when fewer
            than ``at`` distinct items were watched overall.
        """
        res = []
        seen = self._watched_by_user.get(user_id, set())

        # most popular items not seen by the user, starting from rank 0
        for item_id in self._popular_item_ids:
            if len(res) >= at:
                break
            if item_id not in seen:
                res.append(item_id)

        # add items already seen to complete the list; every popular item
        # that is not yet in res is, by construction, one the user has seen
        for item_id in self._popular_item_ids:
            if len(res) >= at:
                break
            if item_id in seen:
                res.append(item_id)

        return res

In [326]:
# Build the advanced top-popular recommender and fit it on the interactions table.
recommender = TopPopRecommenderAdvanced()
recommender.fit(interactions_df)

In [329]:
# Build the submission table: one row per target user, with that user's
# recommended items formatted as a space-separated string.
num_users = users_df['user_id'].shape[0]
count = 0

print("[", end="")

# Collect the rows in a plain list and build the DataFrame once at the end;
# calling pd.concat inside the loop copies the whole frame on every iteration.
rows = []
for count, user_id in enumerate(users_df['user_id'], start=1):
    items_list = " ".join(map(str, recommender.recommend(user_id, at=10)))
    rows.append((user_id, items_list))

    # progress report: one dot every 1000 users
    if count % 1000 == 0:
        print(".", end="")

print("] Done!")

out_df = pd.DataFrame(rows, columns=["user_id", "item_list"])

[.........................................] Done!


### Save result

In [330]:
# Write the submission file with the two expected columns; index=False keeps
# the DataFrame index out of the CSV.
out_df.to_csv("submission.csv", columns=["user_id", "item_list"], index=False)