In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random
import sys

In [2]:
from ipynb.fs.full.user_playlist_utils import summarise_listening_history

In [3]:
def split_history(triplets, hidden_size, sort = True, users_summary = None, random_state = None, verbose = True):
    """
    Split the users' listening history into an apparent part and a hidden part.

    For each user, a proportion hidden_size of the rows of its listening history is drawn at random
    and placed in 'hidden_triplets'; the remaining rows (proportion 1 - hidden_size) go to 'apparent_triplets'.
    The rows of a given user are expected to be contiguous in triplets (which is guaranteed when sort is True),
    and users_summary.track_count is expected to be the number of rows of each user in triplets.

    Parameters:
        triplets (pandas.DataFrame): a dataframe whose columns are
            - user : the unique id of the user

            - track_id : a unique id for a track

            - listening_count : the number of times the user has listened to the track

            ... : other columns corresponding to the track's features and/or the user's features
                  and/or elements of context of the user/track interaction.

        hidden_size (float): a value between 0 and 1 corresponding to the proportion of each user's listening history to hide.
            The number of hidden rows per user is track_count * hidden_size rounded with numpy.around.

        sort (boolean, default = True): defines whether the triplets need to be sorted by user id.

        users_summary (pandas.DataFrame, default=None): a summary of the users' listening history whose columns are
            - user : the unique id of the user

            - listening_count : the total number of listenings of the user

            - track_count : the number of different tracks the user has listened to

            If not specified, it is computed from triplets with summarise_listening_history.
            If specified in a different order than the users of triplets, it is re-ordered to match them.

        random_state (int, default=None): pass an int for reproducible output across multiple function calls.
            Note that this seeds the global generator of the 'random' module.

        verbose (boolean, default=True): defines whether the function should display process progress messages.

    Return :
        apparent_triplets, hidden_triplets (pandas.DataFrame tuple) : the two parts of the listening history.

    Raises:
        ValueError: if hidden_size is not between 0 and 1, if users_summary contains duplicated users,
            or if users_summary and triplets do not describe the same set of users.
    """

    if not 0 <= hidden_size <= 1:
        raise ValueError('hidden_size must be between 0 and 1')

    # If users_summary is passed, verify it describes the same users as in triplets
    if users_summary is not None:
        if not users_summary.user.is_unique:
            raise ValueError('users_summary contains duplicated users')
        if set(users_summary.user) != set(triplets.user):
            raise ValueError('At least one user differ between triplets and users_summary')

    # Set a seed for the random number generator
    if random_state is not None:
        random.seed(random_state)

    # Sort the triplets by users' id
    if sort:
        if verbose:
            sys.stdout.write('Sorting the triplets by users id ...')
            sys.stdout.flush()

        triplets = triplets.sort_values('user')

        if verbose:
            sys.stdout.write('\r')
            sys.stdout.write('Sorting the triplets by users id ... Done')
            sys.stdout.flush()

    # Calculate the number of different tracks listened by the users
    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('Computing the number of tracks listened by the users ...')
        sys.stdout.flush()

    if users_summary is None:
        users_summary = summarise_listening_history(triplets)
    else:
        # The positional ranges computed below rely on users_summary following the
        # order in which the users appear in triplets, so re-order it when needed.
        user_order = triplets.user.unique()
        if not np.array_equal(users_summary.user.to_numpy(), user_order):
            users_summary = users_summary.set_index('user').loc[user_order].reset_index()

    t_count = users_summary.track_count

    if verbose:
        sys.stdout.write('\r')
        sys.stdout.write('Computing the number of tracks listened by the users ... Done')
        sys.stdout.flush()

    # Define the number of triplets to hide for each user according to the proportion defined by hidden_size
    n_hidden = np.around(np.array(t_count*hidden_size)).astype('int')

    # Get the users' first and last (exclusive) triplet positions
    last = t_count.cumsum()
    first = last.shift(1, fill_value = 0).tolist()
    last = last.tolist()

    # Define the positions of the hidden triplets
    if verbose:
        sys.stdout.write('\n')
        sys.stdout.write('Defining hidden tracks ids ...')
        sys.stdout.flush()

    hidden_ids = []
    for start, stop, k in zip(first, last, n_hidden):
        hidden_ids.extend(random.sample(range(start, stop), int(k)))
    # Explicit integer dtype: users with nothing to hide contribute no positions,
    # and the result must remain a valid positional indexer even when it is empty.
    hidden_ids = np.array(hidden_ids, dtype = np.intp)

    # Define the remaining triplets as the apparent set
    if verbose:
        sys.stdout.write('\r')
        sys.stdout.write('Defining hidden tracks ids ... Done')
        sys.stdout.flush()
        sys.stdout.write('\n')
        sys.stdout.write('Defining apparent tracks ids ...')
        sys.stdout.flush()

    # A boolean mask yields the apparent positions in ascending order
    is_apparent = np.ones(len(triplets), dtype = bool)
    is_apparent[hidden_ids] = False
    apparent_ids = np.flatnonzero(is_apparent)

    if verbose:
        sys.stdout.write('\r')
        sys.stdout.write('Defining apparent tracks ids ... Done')
        sys.stdout.flush()
        sys.stdout.write('\n')
        sys.stdout.write('Splitting the listening history ...')
        sys.stdout.flush()

    apparent_triplets = triplets.iloc[apparent_ids,:]
    hidden_triplets = triplets.iloc[hidden_ids,:]

    if verbose:
        sys.stdout.write('\r')
        sys.stdout.write('Splitting the listening history ... Done')
        sys.stdout.write('\n')
        sys.stdout.flush()

    # Return the apparent and hidden triplets
    return apparent_triplets, hidden_triplets