# DATA PREPARATION Train_test_split

FICHIER: *'filtered_rating_fm_dataset.csv'*

## Imports

In [3]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Data imports

In [4]:
# Load the filtered listening/rating table (path is relative to the notebook's
# working directory) and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably the index of
# a previous `to_csv` export read back as data - confirm whether it is needed.
df = pd.read_csv('filtered data/filtered_rating_fm_dataset.csv')
df.head()

Unnamed: 0,user_id,time_stamp,artist_id,artist_name,track_id,track_name,rating
0,1,2009-05-03T15:10:18Z,4,Minus 8,0,Elysian Fields,9
1,1,2009-05-03T15:04:31Z,5,Beanfield,1,Planetary Deadlock,11
2,1,2009-05-03T14:50:51Z,7,Alif Tree,2,Deadly Species,11
3,1,2009-05-03T14:46:29Z,4,Minus 8,3,Cold Fusion,9
4,1,2009-05-02T15:00:59Z,14,4Hero,4,Look Inside,8


## Création d'une matrice d'interaction item/user/weight

In [5]:
# Collapse repeated listens: one row per (user_id, track_id) with the mean rating.
# Only `rating` is aggregated because the frame also holds text columns
# (time_stamp, artist_name, track_name); averaging those raises a TypeError on
# pandas >= 2.0, and `rating` is the only value column used to build the matrix.
df_gb = df.groupby(['user_id','track_id'], as_index=False)['rating'].mean()

In [6]:
# Pull the three coordinate vectors out of the aggregated table as independent
# 1-D arrays: track ids, user ids, and the rating attached to each pair.
item = df_gb['track_id'].to_numpy().copy()
user = df_gb['user_id'].to_numpy().copy()
weight = df_gb['rating'].to_numpy().copy()

In [7]:
from scipy.sparse import csr_matrix
# Sparse interaction matrix in (data, (row, col)) form:
# row index = track id (`item`), column index = user id (`user`), value = `weight`.
mat_music = csr_matrix((weight,(item, user)))

## Train_test_split

In [9]:
def select(x, samples):
    """Tell whether the (track_id, user_id) pair of ``x`` belongs to ``samples``.

    x       - any object indexable by the keys 'track_id' and 'user_id'
              (for instance a DataFrame row).
    samples - container of (track_id, user_id) tuples supporting ``in``.
    """
    track, user = x['track_id'], x['user_id']
    return (track, user) in samples

In [10]:
def make_train(df, ratings, pct_test = 0.2, seed = 0):
    '''
    Split the rows of ``df`` into a train and a test frame by holding out a
    random fraction of the (track, user) pairs stored in ``ratings``.

    Every row of ``df`` whose (track_id, user_id) pair was drawn goes to the
    test frame and every other row goes to the train frame, so a given pair
    never appears in both frames.

    parameters:

    df - DataFrame with at least the columns 'track_id' and 'user_id'.

    ratings - sparse matrix exposing ``nonzero()``; its row indices are matched
    against ``track_id`` and its column indices against ``user_id``. Pairs whose
    stored value is 0 are not returned by ``nonzero()`` and are therefore never
    held out.

    pct_test - fraction (between 0 and 1) of the non-zero pairs to hold out;
    the number of pairs drawn is rounded up.

    seed - seed of the private random generator used for the draw. The default
    (0) selects the same pairs as seeding the global ``random`` module with 0,
    without touching the global generator state.

    returns:

    train_df, test_df - the rows of ``df`` outside / inside the held-out pairs.

    raises:

    ValueError - if ``pct_test`` is not within [0, 1].
    '''
    if not 0 <= pct_test <= 1:
        raise ValueError('pct_test must be between 0 and 1')

    # (row, col) == (track, user) coordinates of every stored non-zero rating.
    row_inds, col_inds = ratings.nonzero()
    nonzero_pairs = list(zip(row_inds, col_inds))

    # Private generator: reproducible draw without reseeding the global RNG.
    rng = random.Random(seed)
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs))) # Number of pairs to hold out, rounded up
    samples = set(rng.sample(nonzero_pairs, num_samples)) # Drawn without replacement

    # Membership mask built from the two id columns only; this avoids the
    # row-wise DataFrame.apply, which materialises a Series for every row.
    filtre = np.fromiter(
        ((track, user) in samples
         for track, user in zip(df['track_id'], df['user_id'])),
        dtype=bool,
        count=len(df),
    )

    train_df = df[~filtre]
    test_df = df[filtre]

    return train_df, test_df

In [41]:
# Hold out 30% of the non-zero (track, user) pairs of `mat_music`; rows of `df`
# belonging to a held-out pair form `test_df`, all remaining rows form `train_df`.
train_df, test_df =  make_train(df, mat_music, pct_test=0.3)

### Filtre des track_id
Les modèles mixtes (collaborative filtering + content-based) nécessitent un ensemble de track_id homogène entre le train et le test.

In [42]:
# Number of distinct tracks in the full table and in each split.
for label, frame in (
    ('df nb track_id', df),
    ('df_train nb track_id', train_df),
    ('df_test nb track_id', test_df),
):
    print(label, frame['track_id'].nunique())

df nb track_id 80600
df_train nb track_id 80600
df_test nb track_id 80021


In [43]:
# Drop from the training split every row whose track never occurs in the test
# split, so the training split contains no track that is absent from the test one.
# `Series.isin` replaces the per-element lambda, and the mask is no longer bound
# to the name `filter`, which shadowed the Python built-in for the rest of the notebook.
track_to_keep = set(test_df['track_id'].unique())
train_df = train_df[train_df['track_id'].isin(track_to_keep)]

In [44]:
# Distinct track counts again, after restricting the training split.
for label, frame in (
    ('df nb track_id', df),
    ('df_train nb track_id', train_df),
    ('df_test nb track_id', test_df),
):
    print(label, frame['track_id'].nunique())

df nb track_id 80600
df_train nb track_id 80021
df_test nb track_id 80021


### Verification User_id

In [45]:
# Number of distinct users in the full table and in each split.
for label, frame in (
    ('df nb user_id', df),
    ('df_train nb user_id', train_df),
    ('df_test nb user_id', test_df),
):
    print(label, frame['user_id'].nunique())

df nb user_id 959
df_train nb user_id 959
df_test nb user_id 959


### Verification Artist_id

In [46]:
# Number of distinct artists in the full table and in each split.
for label, frame in (
    ('df nb artist_id', df),
    ('df_train nb artist_id', train_df),
    ('df_test nb artist_id', test_df),
):
    print(label, frame['artist_id'].nunique())

df nb artist_id 6336
df_train nb artist_id 6307
df_test nb artist_id 6303


## Sauvegarde

In [25]:
# Write the training split to disk.
# NOTE(review): `to_csv` also writes the row index by default, which shows up as an
# unnamed first column when the file is read back - confirm this is intended.
train_df.to_csv('filtered data/filtered_rating_fm_dataset_train.csv')

In [26]:
# Write the test split to disk.
# NOTE(review): `to_csv` also writes the row index by default, which shows up as an
# unnamed first column when the file is read back - confirm this is intended.
test_df.to_csv('filtered data/filtered_rating_fm_dataset_test.csv')

In [27]:
# Preview the first rows of the test split.
test_df.head()

Unnamed: 0,user_id,time_stamp,artist_id,artist_name,track_id,track_name,rating
3,0,2009-05-03T14:46:29Z,0,Minus 8,3,Cold Fusion,9
8,0,2009-04-30T17:42:39Z,3,4Hero,6,Dedication To The Horse,1
16,0,2009-04-28T13:25:52Z,0,Minus 8,3,Cold Fusion,9
18,0,2009-04-27T12:29:01Z,4,Röyksopp,10,The Girl And The Robot,25
21,0,2009-04-27T12:24:32Z,4,Röyksopp,10,The Girl And The Robot,25
