[LFM-2b](http://www.cp.jku.at/datasets/LFM-2b/) is a large dataset of over two billion listening events, spanning ~15 years, crawled from the Last.fm platform. It is supplemented with user demographic information and music track metadata. In this exercise we take a look at a small sample of the aggregated LFM-2b (lfm-tiny) as well as the MovieLens-1M dataset (ml-1m). Each of them consists of three files; for lfm-tiny these are:

* 'lfm-tiny.inter' - data about user-track interactions;
* 'lfm-tiny.item' - track-related information;
* 'lfm-tiny.user' - user-related information;
    
And for ml-1m respectively:
    
* 'ml-1m.inter' - data about user-movie ratings;
* 'ml-1m.item' - movie-related information;
* 'ml-1m.user' - user-related information;


In [1]:
import pandas as pd
import numpy as np

## Interaction Matrix
Interaction matrix with dimensions: [number of users] x [number of items]

In [2]:
def inter_matr_implicit(users: pd.DataFrame,
                        items: pd.DataFrame,
                        interactions: pd.DataFrame,
                        dataset_name: str,
                        threshold=1) -> np.ndarray:
    '''
    Builds a binary (implicit feedback) user-item interaction matrix.

    users - pandas Dataframe, use it as loaded from the dataset; its number of rows is the number of matrix rows;
    items - pandas Dataframe, use it as loaded from the dataset; its number of rows is the number of matrix columns;
    interactions - pandas Dataframe, use it as loaded from the dataset; columns are read by position:
                   column 0 - user id, column 1 - item id, column 2 - interaction strength.
                   The ids are used directly as row/column positions of the matrix, so they have to be integers;
    dataset_name - string out of ["lfm-ismir", "lfm-tiny", "ml-1m"], name of the dataset.
                   All supported datasets share the same positional column layout, so they are handled identically;
    threshold - int > 0, criteria of a valid interaction (column 2 has to be >= threshold);

    returns - 2D np.array, rows - users, columns - items; 1 marks a valid interaction, 0 everything else;
    raises - ValueError if dataset_name is not one of the supported names
             (instead of silently returning an all-zero matrix);
    '''
    supported_datasets = ("lfm-ismir", "lfm-tiny", "ml-1m")
    if dataset_name not in supported_datasets:
        raise ValueError(
            f"unknown dataset_name {dataset_name!r}, expected one of {supported_datasets}"
        )

    #initialize the binary interaction matrix with only zeros
    res = np.zeros((users.shape[0], items.shape[0]))

    #nothing to mark (also avoids indexing with the object-typed columns of an empty frame)
    if interactions.shape[0] == 0:
        return res

    #an interaction is valid if its strength (listening events / rating) reaches the threshold
    valid = interactions.iloc[:, 2].to_numpy() >= threshold

    #mark all valid (user, item) pairs at once instead of looping over the rows
    user_idx = interactions.iloc[:, 0].to_numpy()[valid]
    item_idx = interactions.iloc[:, 1].to_numpy()[valid]
    res[user_idx, item_idx] = 1

    return res

In [3]:
# load the data for both datasets, keep it as specified in the csv files
def read(dataset, file):
    '''
    dataset - name of the dataset, it is both the folder name and the file name prefix;
    file - file extension selecting the table, e.g. 'user', 'item' or 'inter';

    returns - pandas Dataframe parsed from the tab-separated file "<dataset>/<dataset>.<file>";
    '''
    file_name = '.'.join((dataset, file))
    path = '/'.join((dataset, file_name))
    return pd.read_csv(path, sep='\t')

# user, item and interaction tables of the LFM sample
users_lfm, items_lfm, interactions_lfm = (
    read("lfm-tiny", part) for part in ('user', 'item', 'inter')
)

# the same three tables for MovieLens-1M
users_ml, items_ml, interactions_ml = (
    read("ml-1m", part) for part in ('user', 'item', 'inter')
)

In [4]:
#interactions_lfm.head()

In [16]:
# Build the interaction matrix for the LFM sample; the threshold has to fit this dataset,
# here a single listening event already counts as an interaction
_interaction_matrix_test_lfm = inter_matr_implicit(
    users=users_lfm,
    items=items_lfm,
    interactions=interactions_lfm,
    dataset_name="lfm-ismir",
    threshold=1,
)
_interaction_matrix_test_lfm

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

## POP Recommender 
Baseline: recommends the [K] most popular unseen items to a given user.

The function takes three arguments: an np.array of arbitrary dimensions (supporting any number of users and items) in the format from task 1 (interaction matrix), a user ID (int) and K (int > 0).
Returns: a list or a 1D array with the [K] IDs of the most popular items (sorted in order of descending popularity) not seen by the user.


In [18]:
def recTopKPop(inter_matr: np.ndarray,
               user: int,
               top_k: int) -> np.ndarray:
    '''
    Recommends the most popular items that the target user has not interacted with.

    inter_matr - np.array from the task 1 (rows - users, columns - items, 1 marks an interaction);
    user - user_id, integer, used as the row index of inter_matr;
    top_k - expected length of the resulting list;

    returns - array of top K popular items that the user has never seen
              (sorted in the order of descending popularity, ties are resolved
              in favour of the lower item id). Seen items are never returned, so
              the result is shorter than top_k if fewer than top_k unseen items exist;
    '''

    #popularity of an item = number of users that interacted with it
    pop = np.sum(inter_matr, axis=0, dtype=float)

    #only items not seen by the target user are candidates; zeroing the
    #popularity of seen items instead would let them tie with never-consumed
    #items and leak into the recommendation
    unseen = np.flatnonzero(inter_matr[user] != 1)

    #rank the candidates by descending popularity, a stable sort keeps the result deterministic
    ranking = np.argsort(-pop[unseen], kind="stable")[:top_k]

    return unseen[ranking]

In [19]:
# ten most popular items that user 0 has not interacted with yet
top_10 = recTopKPop(inter_matr=_interaction_matrix_test_lfm, user=0, top_k=10)
top_10

array([ 42,  43,  51,  96, 105, 151,  12, 104,  68, 150])

## POP Recommender by country

In [20]:
def recTopKPopByCountry(inter_matr: np.ndarray,
               user: int,
               top_k: int,
               users: pd.DataFrame) -> np.ndarray:
    '''
    Recommends the items that are most popular in the target user's country
    and that the target user has not interacted with.

    inter_matr - np.array from the task 1 (rows - users, columns - items, 1 marks an interaction);
    user - user_id, integer, used as the row index of inter_matr and as the row position in users;
    top_k - expected length of the resulting list;
    users: pandas Dataframe consisting of user information for all users, requires a "country" column;
           its rows have to be in the same order as the rows of inter_matr;

    returns - array of top K popular items that the user has never seen
              (sorted in the order of descending popularity within the country, ties are
              resolved in favour of the lower item id). Seen items are never returned, so
              the result is shorter than top_k if fewer than top_k unseen items exist;
    '''

    # get the country of the target user; looked up by column name so that it
    # does not depend on "country" being the first column of the user table
    countries = users["country"]
    target_country = countries.iloc[user]

    # popularity of an item = number of users from the target country that interacted with it
    same_country = (countries == target_country).to_numpy()
    pop_by_country = np.sum(inter_matr[same_country], axis=0, dtype=float)

    # only items not seen by the target user are candidates; zeroing the
    # popularity of seen items instead would let them tie with never-consumed
    # items and leak into the recommendation
    unseen = np.flatnonzero(inter_matr[user] != 1)

    # rank the candidates by descending popularity, a stable sort keeps the result deterministic
    ranking = np.argsort(-pop_by_country[unseen], kind="stable")[:top_k]

    return unseen[ranking]

In [21]:
# user table handed to the recommender, it has to provide at least a "country" column
users = users_lfm
inter_matr_lfm = inter_matr_implicit(users_lfm, items_lfm, interactions_lfm,
                                     dataset_name="lfm-ismir", threshold=1)
top_10 = recTopKPopByCountry(inter_matr_lfm, 0, 10, users)
top_10

array([43, 42, 69, 30, 96, 33, 51, 11, 71, 65])