# Top popular recommender

Most popular items are recommended to each user.

In [217]:
# set to false to use fake data for testing
TEST = False

### Import libraries

In [11]:
import pandas as pd
import numpy as np

### Utilities

In [249]:
def pretty_print_progress(current, total, prepend):
    if current == total:
        print(" " * 100, end="\r")
        print(prepend, "finished!")
    elif current % 100 == 0:
        print("%s %8s of %8s" % (prepend, current, total), end="\r")

### Read data

In [218]:
# base path for csv files
base_path = "data"
if TEST:
    base_path = "fake_data"

In [219]:
# interactions_and_impressions.csv
# Contains the training set, describing implicit preferences expressed by the users.
# user_id : identifier of the user
# item_id : identifier of the item (TV series)
# impression_list : string containing the items that were present on the screen when the user interacted with the item in column item_id. Not all interactions have a corresponding impressions list.
# data : "0" if the user watched the item, "1" if the user opened the item details page.
interactions_df_path = base_path + "/interactions_and_impressions.csv"

# data_ICM_length.csv 
# Contains the number of episodes of the items. TV series may have multiple episodes.
# item_id : identifier of the item
# feature_id : identifier of the feature, only one value (0) exists since this ICM only contains the feature "length"
# data : number of episodes. Some values may be 0 due to incomplete data.
items_length_df_path = base_path + "/data_ICM_length.csv"

# data_ICM_type.csv:
# Contains the type of the items. An item can only have one type.
# All types are anonymized and described only by a numerical identifier.
# item_id : identifier of the item
# feature_id : identifier of the type
# data : "1" if the item is described by the type
items_type_df_path = base_path + "/data_ICM_type.csv"

# data_target_users_test.csv:
# Contains the ids of the users that should appear in your submission file.
# The submission file should contain all and only these users.
users_df_path = base_path + "/data_target_users_test.csv"

In [253]:
dtype = {0: int, 1: int, 2: str, 3: int}
interactions_df = pd.read_csv(
    filepath_or_buffer=interactions_df_path,
    dtype=dtype,
    keep_default_na=False  # avoid NaN
)
dtype = {0: int, 1: int, 2: int}
items_length_df = pd.read_csv(filepath_or_buffer=items_length_df_path, dtype=dtype)
items_types_df = pd.read_csv(filepath_or_buffer=items_type_df_path, dtype=dtype)
users_df = pd.read_csv(filepath_or_buffer=users_df_path)


### Top Popular v1

Accuracy: 0.00154

In [None]:
class TopPopRec(object):

    def fit(self, interactions_df):
        # avoid error messages
        watched_items_df = interactions_df.copy()
        
        # selecting watched items
        watched_items_df = watched_items_df[interactions_df['data'] == 0]
        
        # calculate how many times each item has been watched
        watched_items_df['count'] = watched_items_df.groupby(['item_id'])['item_id'].transform('count')
        
        # merge rows with same ItemID to obtain a list of items and watched times
        watched_items_df = watched_items_df.groupby(['item_id']).agg({'item_id':'first', 'count':'first'})
        
        # order items by the most watched
        watched_items_df = watched_items_df.sort_values(by='count', ascending=False)
        
        self.watched_items_df = watched_items_df

    def recommend(self, at=10):
        return [self.watched_items_df[0:at]]

In [None]:
rec = TopPopRec()
rec.fit(interactions_df)

In [None]:
out_df = pd.DataFrame(columns=["user_id", "item_list"])
num_users = users_df['user_id'].shape[0]
count = 0

print(num_users, end=" - ")

for user_id in users_df['user_id']:
    
    items_list = rec.recommend(at=10)
    items_list = map(str, items_list)
    items_list = " ".join(items_list)
    row = pd.DataFrame([[user_id, items_list]], columns=['user_id' , 'item_list'])
    out_df = pd.concat([out_df, row])
    
    count += 1
    if count % 1000 == 0:
        print(count/num_users, end=" - ")

### Top Popular v2

- Avoid already watched items


Accuracy: 

In [None]:
class TopPopRecV2(object):

    def fit(self, interactions_df):
        # avoid error messages
        watched_items_df = interactions_df.copy()

        # selecting watched items
        watched_items_df = watched_items_df[watched_items_df['data'] == 0]

        # calculate how many times each item has been watched
        watched_items_df['count'] = watched_items_df.groupby(
            ['item_id'])['item_id'].transform('count')

        # merge rows with same ItemID to obtain a list of items and watched times
        popular_items_df = watched_items_df.groupby(
            ['item_id']).agg({'item_id': 'first', 'count': 'first'})

        # order items by the most watched
        popular_items_df = popular_items_df.sort_values(
            by='count', ascending=False)

        self.watched_items_df = watched_items_df
        self.popular_items_df = popular_items_df

    def recommend(self, user_id, at=10):
        res = []
        count = 0
        
        df = self.watched_items_df
        user_watched_df = df[df['user_id'] == user_id]
        
        n_items = self.popular_items_df.shape[0]
        
        # find first ten items not seen by the user
        while len(res) < at and count < n_items:
            item_id = self.popular_items_df['item_id'].iloc[count + 1]

            # watched items by the user
            df = user_watched_df
            watched_df = df[df['item_id'] == item_id]
            
            # check if not watched
            if watched_df.shape[0] == 0:
                res.append(item_id)

            count += 1
        
        count = 0
        # add items already seen to complete the list
        while len(res) < at:
            item_id = self.popular_items_df['item_id'].iloc[count + 1]
            
            if item_id not in res:
                res.append(item_id)

        return res

In [None]:
rec = TopPopRecV2()
rec.fit(interactions_df)

In [None]:
out_df = pd.DataFrame(columns=["user_id", "item_list"])
num_users = users_df['user_id'].shape[0]
count = 0

print("[", end="")

for user_id in users_df['user_id']:
    
    items_list = rec.recommend(user_id, at=10)
    items_list = map(str, items_list)
    items_list = " ".join(items_list)
    row = pd.DataFrame([[user_id, items_list]], columns=['user_id' , 'item_list'])
    out_df = pd.concat([out_df, row])
    
    count += 1
    if count % 1000  == 0:
        print(".", end="")
        
print("] Done!")

### Top Popular v3

Like V2 but with custom weights

Accuracy: 0.01179

Too custom, the model is not efficient.

In [251]:
class TopPopRecV3(object):

    def fit(self):
        
        # switch watch with one and interacted with zero
        df = interactions_df.copy()
        num_interactions = df.shape[0]
        for index in df.index:
            pretty_print_progress(index + 1, num_interactions, "fixing interactions:")
            
            data = df.loc[index, 'data']
            
            if data == 0:
                df.loc[index, 'data'] = 1
            else:
                df.loc[index, 'data'] = 0
                
        # group items by user_id and item_id
        # and count number of interactions
        agg_interactions_df = df.groupby(['user_id', 'item_id'], as_index=False).sum(True)
        
        # custom weights
        # i     tv_series       film        not known
        # 1       < 0.4           0             0 
        # 2       < 0.6           1            > 0
        # 3       > 0.6          > 1            -
        num_interactions = agg_interactions_df.shape[0]
        for index in agg_interactions_df.index:
            pretty_print_progress(index + 1, num_interactions, "calculating weights:")
            
            df = agg_interactions_df
            item_id = df.loc[index,'item_id']
            data = df.loc[index,'data']
            
            df = items_length_df.copy()
            length = df[df['item_id'] == item_id]['data']
            
            if length.shape[0] == 0: 
                continue
                
            length = length.iloc[0]
            
            # tv series has more than one element
            df = agg_interactions_df
            if length > 1:            
                if data < length * 0.2:
                    df.loc[index,'data'] = 1
                elif data < length * 0.7:
                    df.loc[index,'data'] = 2
                else:
                    df.loc[index,'data'] = 3
            # films has only one element
            elif length == 1:
                if data == 0:
                    df.loc[index,'data'] = 1
                elif data == 1:
                    df.loc[index,'data'] = 2
                else:
                    df.loc[index,'data'] = 3
            # it is not known if this item is a film or a tv series
            else:
                if data == 0:
                    df.loc[index,'data'] = 1
                else:
                    df.loc[index,'data'] = 2
                            
        df = agg_interactions_df

        # merge rows with same ItemID to obtain a list of items and watched times
        df = df.groupby(
            ['item_id']).agg({'item_id': 'first', 'data': 'sum'})

        # order items based on total weights
        df = df.sort_values(by='data', ascending=False)
        
        # saved watched_items by the user
        df = interactions_df.copy()
        watched_items_df = df[df['data'] == 0]
        
        self.watched_items_df = watched_items_df
        self.popular_items_df = agg_interactions_df

    def recommend(self, user_id, at=10):
        res = []
        count = 0
        
        df = self.watched_items_df
        user_watched_df = df[df['user_id'] == user_id]
        
        n_items = self.popular_items_df.shape[0]
        
        # find first ten items not seen by the user
        while len(res) < at and count < n_items:
            item_id = self.popular_items_df['item_id'].iloc[count + 1]

            # watched items by the user
            df = user_watched_df
            watched_df = df[df['item_id'] == item_id]
            
            # check if not watched
            if watched_df.shape[0] == 0:
                res.append(item_id)

            count += 1
        
        count = 0
        # add items already seen to complete the list if needed
        while len(res) < at:
            item_id = self.popular_items_df['item_id'].iloc[count + 1]
            
            if item_id not in res:
                res.append(item_id)

        return res

In [254]:
rec = TopPopRecV3()
rec.fit()

fixing interactions: finished!                                                                      
calculating weights: finished!                                                                      


In [255]:
out_df = pd.DataFrame(columns=["user_id", "item_list"])
num_users = users_df['user_id'].shape[0]
count = 0

for user_id in users_df['user_id']:
    count += 1
    pretty_print_progress(count, num_users, "calculating recommendations:")
    
    items_list = rec.recommend(user_id, at=10)
    
    items_list = map(str, items_list)
    items_list = " ".join(items_list)
    row = pd.DataFrame([[user_id, items_list]], columns=['user_id' , 'item_list'])
    out_df = pd.concat([out_df, row])

calculating recommendations: finished!                                                              


### Save result

In [256]:
out_df.to_csv("submission.csv", columns=["user_id", "item_list"], index=False)