In [None]:
import os
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import ipynb
from scipy.spatial import distance_matrix
import random

In [None]:
def split_history(triplets, hidden_size, random_state=None):
    """
    Split users' listening histories into a hidden part and an apparent part.

    For every user, a proportion ``hidden_size`` of that user's rows is drawn
    at random and placed in ``hidden_triplets``; the remaining rows go to
    ``apparent_triplets``. The number of hidden rows of a user is
    ``round(n_rows * hidden_size)``, so it follows Python's rounding rule
    (ties go to the nearest even integer).

    Parameters:
        triplets (pandas.DataFrame): a dataframe whose columns are
            user : the unique id of the user

            track_id : a unique id for a track

            listening_count : the number of times the user has listened to the track

            ... : other columns describing the track, the user and/or the
            context of the user/track interaction.

        hidden_size (float): a value between 0 and 1 corresponding to the
            proportion of each user's listening history to hide.

        random_state (int, default=None): pass an int for reproducible output
            across multiple function calls. When given, it seeds Python's
            global ``random`` generator.

    Return :
        hidden_triplets, apparent_triplets (tuple of pandas.DataFrame): the two
        parts of the listening history. Both have the same columns and column
        dtypes as ``triplets`` and a fresh 0..n-1 index. Users appear in order
        of first appearance in ``triplets``; hidden rows of a user are in the
        order they were drawn, apparent rows keep their original order.

    Raises:
        ValueError: if ``hidden_size`` is not between 0 and 1.
    """
    if not 0 <= hidden_size <= 1:
        raise ValueError(
            f"hidden_size must be between 0 and 1, got {hidden_size!r}"
        )

    # Set a seed for the random number generator
    if random_state is not None:
        random.seed(random_state)

    # Collect the per-user pieces and concatenate only once at the end:
    # concatenating inside the loop recopies the accumulated frame each time,
    # and starting from an empty object-dtype frame can alter column dtypes.
    hidden_parts = []
    apparent_parts = []

    # sort=False keeps users in order of first appearance. Rows whose user is
    # missing (NaN) belong to no group and are therefore left out of both parts.
    for _, triplets_u in triplets.groupby("user", sort=False):
        n_rows = len(triplets_u)
        # Number of rows to hide for this user
        n_hidden = round(n_rows * hidden_size)
        # Positions (within the user's history) of the hidden rows
        i_hidden = random.sample(range(n_rows), n_hidden)
        # Every other position is apparent; a set makes the lookup O(1)
        hidden_positions = set(i_hidden)
        i_apparent = [i for i in range(n_rows) if i not in hidden_positions]

        hidden_parts.append(triplets_u.iloc[i_hidden])
        apparent_parts.append(triplets_u.iloc[i_apparent])

    # No user at all: return empty frames that keep the input's schema
    if not hidden_parts:
        empty = triplets.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    hidden_triplets = pd.concat(hidden_parts, ignore_index=True)
    apparent_triplets = pd.concat(apparent_parts, ignore_index=True)

    return hidden_triplets, apparent_triplets