# CONTENT BASED RECOMMENDATION

Test des modèles de recommandation sur les faux utilisateurs
- User based
- Item based

## Imports

In [1]:
import warnings

import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import roc_auc_score

from tqdm import tqdm

import matplotlib.pyplot as plt

## Data Import

In [2]:
# Load the training ratings; the first CSV column is used as the DataFrame index.
df_train = pd.read_csv('filtered data/filtered_rating_fm_dataset_train.csv', index_col=0)

  mask |= (ar1 == a)


In [3]:
# Fake-user files (one per musical genre) appended to the training data.
files = ['last_fm_fake_user(1001)_jazz.csv',
         'last_fm_fake_user(1002)_classic.csv',
         'last_fm_fake_user(1003)_pop.csv',
         'last_fm_fake_user(1004)_rock.csv',
         'last_fm_fake_user(1005)_rap.csv']

# Read every file first and concatenate once: calling pd.concat inside the
# loop would copy the whole training frame on each iteration.
fake_user_frames = []
for file in files:
    df_temp = pd.read_csv(f'filtered data/fake_user/{file}', index_col=0)
    df_temp['rating'] = 100  # constant rating for every fake-user row -- TODO: revisit
    fake_user_frames.append(df_temp)

# NOTE: df_train is extended from itself, so re-running this cell appends the
# fake users a second time.
df_train = pd.concat([df_train, *fake_user_frames])

In [12]:
# Load the test ratings; the first CSV column is used as the DataFrame index.
df_test = pd.read_csv('filtered data/filtered_rating_fm_dataset_test.csv', index_col=0)

  mask |= (ar1 == a)


In [4]:
# Load the per-track feature table; the first CSV column is used as the DataFrame index.
df_track = pd.read_csv('filtered data/track_features_fm_dataset.csv', index_col=0)

## Data preparation

#### On ne garde que les track_id présents dans le df_track

In [5]:
# Set of the track ids present in df_track, used for fast membership tests.
track_ids = set(df_track['track_id'].unique())

In [6]:
# Keep only the training rows whose track is present in df_track.
# Series.isin is vectorised and avoids binding a variable named `filter`,
# which would shadow the Python builtin.
df_train = df_train[df_train['track_id'].isin(track_ids)]

In [13]:
# Keep only the test rows whose track is present in df_track.
# Series.isin is vectorised and avoids binding a variable named `filter`,
# which would shadow the Python builtin.
df_test = df_test[df_test['track_id'].isin(track_ids)]

#### Normalisation et vectorisation des track features

In [14]:
# One-hot encode the `key` column into key_* indicator columns appended on
# the right, and remove the original column.
key_dummies = pd.get_dummies(df_track['key'], prefix='key')
df_track = pd.concat([df_track.drop(columns=['key']), key_dummies], axis=1)

In [15]:
# Min-max scale `loudness`: subtract the column minimum, divide by the range.
loudness_min = df_track['loudness'].min()
loudness_max = df_track['loudness'].max()
df_track['loudness'] = (df_track['loudness'] - loudness_min) / (loudness_max - loudness_min)

In [16]:
# Min-max scale `tempo`: subtract the column minimum, divide by the range.
tempo_min = df_track['tempo'].min()
tempo_max = df_track['tempo'].max()
df_track['tempo'] = (df_track['tempo'] - tempo_min) / (tempo_max - tempo_min)

In [17]:
# Pack every feature column of a track into one numpy array per row.
# An already existing `vector` column is excluded first, so that re-running
# this cell does not nest the previous vectors inside the new ones.
track_features = (
    df_track
    .drop(columns=['track_id', 'artist_name', 'track_name'])
    .drop(columns=['vector'], errors='ignore')
)
df_track['vector'] = track_features.apply(lambda r: tuple(r), axis=1).apply(np.array).values

In [18]:
df_track.head(2)

Unnamed: 0,artist_name,track_name,track_id,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,...,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,vector
0,Minus 8,Elysian Fields,0,0.627,0.871,0.809418,0,0.0328,0.0962,0.358,...,0,0,0,0,0,0,1,0,0,"[0.627, 0.871, 0.8094177838836515, 0.0, 0.0328..."
1,Beanfield,Planetary Deadlock,1,0.775,0.624,0.798175,1,0.0475,0.137,0.849,...,0,0,0,0,1,0,0,0,0,"[0.775, 0.624, 0.798174554297535, 1.0, 0.0475,..."


#### Création d'un vecteur utilisateur

In [19]:
# Distinct (user_id, track_id) pairs of the training set, sorted by key.
# Only the two key columns are needed, so they are de-duplicated directly
# instead of averaging every other column of df_train (which also fails on
# non-numeric columns with recent pandas versions).
# Ratings are not used here: each distinct track weighs the same for a user.
df_gb = (
    df_train[['user_id', 'track_id']]
    .dropna()
    .drop_duplicates()
    .sort_values(['user_id', 'track_id'])
    .reset_index(drop=True)
)

In [20]:
# Attach the track features to every (user, track) pair, then average them
# per user to obtain one feature profile per user.
df_user = df_gb.merge(df_track, on='track_id')
df_user = (
    df_user
    .drop(columns=['track_id', 'artist_name', 'track_name'])
    # The per-track `vector` column holds arrays (object dtype) and must not
    # go through mean(); the user vector is rebuilt from the averaged columns.
    .drop(columns=['vector'], errors='ignore')
    .groupby('user_id')
    .agg('mean')
    .reset_index()
)

In [21]:
# Pack the averaged feature columns of each user into one numpy array.
# An already existing `vector` column is excluded first, so that re-running
# this cell does not nest the previous vectors inside the new ones.
user_features = (
    df_user
    .drop(columns=['user_id'])
    .drop(columns=['vector'], errors='ignore')
)
df_user['vector'] = user_features.apply(lambda r: tuple(r), axis=1).apply(np.array).values

In [22]:
df_user.head(2)

Unnamed: 0,user_id,danceability,energy,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,...,key_3,key_4,key_5,key_6,key_7,key_8,key_9,key_10,key_11,vector
0,1,0.532538,0.615229,0.786647,0.566343,0.07593,0.291969,0.428167,0.189321,0.416327,...,0.02589,0.071197,0.058252,0.093851,0.100324,0.048544,0.097087,0.080906,0.080906,"[0.5325381877022655, 0.6152288025889967, 0.786..."
1,2,0.494137,0.654593,0.825662,0.709108,0.057996,0.255986,0.100821,0.193513,0.474405,...,0.02881,0.096654,0.068309,0.052045,0.118959,0.042286,0.141729,0.038569,0.091543,"[0.49413661710037177, 0.654593401626394, 0.825..."


## User based recommendation

#### Test de la fonction cosine_similarity

In [25]:
def get_reco(vec, n=50):
    """Score every track of ``df_track`` by cosine similarity to ``vec``.

    Args:
        vec: 2-D array-like of query vectors (the notebook passes a
            one-element list); only the similarities of the first row are used.
        n: number of best matches to keep. With a negative value every track
            is returned, unsorted, in ``df_track`` order.

    Returns:
        DataFrame with the columns ``track_id`` and ``cosine_similarity``,
        indexed like ``df_track``; sorted by decreasing similarity when
        ``n >= 0``.
    """
    # Stack the per-track vectors into one (n_tracks, n_features) matrix.
    feature_matrix = np.array(df_track['vector'].tolist())
    similarities = cosine_similarity(vec, feature_matrix)[0]

    scored = df_track[['track_id']].assign(cosine_similarity=similarities)

    if n < 0:
        return scored
    return scored.sort_values(by=['cosine_similarity'], ascending=False).iloc[0:n]

### Utilisateur Jazz 1001

In [35]:
user = 1001

# Fetch the profile vector of this user.
vect = df_user.loc[df_user['user_id'] == user, 'vector'].values
# Keep the 10 tracks closest to the profile.
recomended_tracks = get_reco([vect[0]], 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,12166,Chuck Berry,No Money Down
1,77618,Seu Jorge,Queen Bitch
2,43907,The Fiery Furnaces,We Got Back The Plague
3,7893,Johnny Cash,Big River
4,35720,Big Joe Turner,"Shake, Rattle And Roll"
5,5254,T. Rex,One Inch Rock
6,73635,Muddy Waters,She Moves Me
7,73237,Patsy Cline,Walking After Midnight
8,47013,The Magnetic Fields,For We Are The King Of The Boudoir
9,34142,The Fiery Furnaces,Benton Harbor Blues


### Utilisateur Classique 1002

In [37]:
user = 1002

# Fetch the profile vector of this user.
vect = df_user.loc[df_user['user_id'] == user, 'vector'].values
# Keep the 10 tracks closest to the profile.
recomended_tracks = get_reco([vect[0]], 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,1144,Sufjan Stevens,Say Yes! To M!Ch!Gan!
1,58539,Johann Sebastian Bach,Air On A G String
2,33893,Devendra Banhart,Poughkeepsie
3,14616,Porcupine Tree,Collapse The Light Into Earth
4,33839,Oh No Oh My,Women Are Born In Love
5,1221,Devendra Banhart,So Long Old Bean
6,40704,Yo La Tengo,From Black To Blue
7,79674,The Glove,A Blues In Drag
8,70468,Michael Nyman,Dreams Of A Journey
9,62208,Sufjan Stevens,The Undivided Self (For Eppie And Popo)


### Utilisateur Pop 1003

In [46]:
user = 1003

# Fetch the profile vector of this user.
vect = df_user.loc[df_user['user_id'] == user, 'vector'].values
# Keep the 10 tracks closest to the profile.
recomended_tracks = get_reco([vect[0]], 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,4747,Lcd Soundsystem,Daft Punk Is Playing At My House
1,15955,Lcd Soundsystem,Watch The Tapes
2,52100,Freezepop,Parlez-Vous Freezepop?
3,10442,Le Tigre,On The Verge
4,50424,Cut Chemist,What'S The Altitude (Feat. Hymnal)
5,5501,Billy Joel,We Didn'T Start The Fire
6,74169,Ween,Shamemaker
7,10436,Le Tigre,Tell You Now
8,39284,Dr. Dre,Xxplosive
9,7237,Atmosphere,The Keys To Life Vs. 15 Minutes Of Fame


### Utilisateur Rock 1004

In [47]:
user = 1004

# Fetch the profile vector of this user.
vect = df_user.loc[df_user['user_id'] == user, 'vector'].values
# Keep the 10 tracks closest to the profile.
recomended_tracks = get_reco([vect[0]], 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,10447,Le Tigre,My My Metrocard
1,20859,Avril Lavigne,Girlfriend
2,31836,David Bowie,I Took A Trip On A Gemini Spaceship
3,65196,Hatebreed,Spitting Venom
4,13545,The Prodigy,Omen
5,5745,The Vines,Get Free
6,23370,Feeder,Seven Days In The Sun
7,48895,Hadouken!,That Boy That Girl
8,2999,Hot Hot Heat,Middle Of Nowhere
9,30577,Horrorpops,Freaks In Uniforms


### Utilisateur Rap 1005

In [48]:
user = 1005

# Fetch the profile vector of this user.
vect = df_user.loc[df_user['user_id'] == user, 'vector'].values
# Keep the 10 tracks closest to the profile.
recomended_tracks = get_reco([vect[0]], 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,26434,Michael Jackson,She Drives Me Wild
1,68527,Cut Chemist,Motivational Speaker
2,39554,2Pac,Troublesome '96
3,56360,Choking Victim,Crack Rock Steady
4,5965,Hieroglyphics,Oakland Blackouts
5,39307,Jurassic 5,Quality Control
6,28108,The Herbaliser,It Ain'T Nuttin'
7,39380,Outkast,The Rooster
8,39514,Gang Starr,Full Clip
9,22905,Dj Jazzy Jeff & The Fresh Prince,Boom! Shake The Room


## Item based recommendation

In [38]:
def get_reco_from_traklist(user_track_vector_list, n=50):
    """Pool the nearest neighbours of several seed tracks into one list.

    Every seed vector is scored against all tracks with ``get_reco``; the
    best match of each query is discarded because it is expected to be the
    seed track itself.

    Args:
        user_track_vector_list: iterable of 1-D feature vectors, one per seed
            track. Entries that are not 1-D (for example the NaN produced by
            a left merge for an unknown track) are skipped with a warning.
        n: number of recommendations to return. With a negative value every
            track is scored against every seed and the mean similarity per
            track is returned instead.

    Returns:
        DataFrame with the columns ``track_id`` and ``cosine_similarity``.
        For ``n >= 0``: at most ``n`` rows sorted by decreasing similarity
        (a track close to several seeds can appear more than once).
        For ``n < 0``: one row per ``track_id`` holding the mean similarity.
        ``track_id`` keeps the dtype returned by ``get_reco``.
    """
    # Request at least n + 1 neighbours per seed so that n candidates remain
    # once the seed itself is removed; 50 is the historical minimum.
    per_seed = max(50, n + 1) if n >= 0 else -1

    seed_recos = []
    for track_vec in user_track_vector_list:
        if np.ndim(track_vec) != 1:
            warnings.warn(f'seed skipped, not a 1-D feature vector: {track_vec!r}')
            continue
        try:
            reco = get_reco([track_vec], per_seed)
        except (ValueError, TypeError) as err:
            # Best effort: one unusable seed must not abort the whole list,
            # but the failure is reported instead of being swallowed.
            warnings.warn(f'seed skipped, similarity computation failed: {err}')
            continue
        if len(reco) == 0:
            continue

        # Remove the best match by position. get_reco(..., -1) returns the
        # tracks unsorted, so dropping the first row would remove the wrong one.
        best_pos = int(np.argmax(reco['cosine_similarity'].to_numpy()))
        seed_recos.append(reco.iloc[np.arange(len(reco)) != best_pos])

    # Concatenate once instead of growing a frame inside the loop.
    if seed_recos:
        reco_full = pd.concat(seed_recos)
    else:
        reco_full = pd.DataFrame({'track_id': [], 'cosine_similarity': []})

    if n >= 0:
        return reco_full.sort_values(by=['cosine_similarity'], ascending=False).iloc[0:n]
    return reco_full.groupby(['track_id']).mean().reset_index()

In [39]:
def top_user_music(user_id, n=10):
    """Return the ids of the tracks appearing most often for a user in ``df_train``.

    Args:
        user_id: value matched against the ``user_id`` column of ``df_train``.
        n: maximum number of track ids to return.

    Returns:
        Set of at most ``n`` ``track_id`` values, taken from the top of the
        per-track row counts of this user (empty set for an unknown user).
    """
    user_rows = df_train[df_train['user_id'] == user_id]
    play_counts = user_rows['track_id'].value_counts()
    # value_counts() is indexed by track_id, i.e. by numbers: .iloc states
    # explicitly that the first n rows are wanted, not the labels up to n.
    return set(play_counts.iloc[:n].index)

### Utilisateur Jazz 1001

In [45]:
user = 1001

# Distinct tracks of this user in the training data.
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
# Feature vectors of the user's 10 most frequent tracks (the seeds).
user_track_list = pd.DataFrame({'track_id': list(top_user_music(user, 10))})
vector_list = user_track_list.merge(df_track, how='left', on='track_id')['vector'].values

# Up to 10 recommendations pooled over all the seeds.
recomended_tracks = get_reco_from_traklist(vector_list, 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,4761.0,Ryan Adams,"Goodnight, Hollywood Blvd."
1,1591.0,Damien Rice,9 Crimes
2,10194.0,Sondre Lerche,Wet Ground
3,7465.0,Jack Johnson,Posters
4,1719.0,Death Cab For Cutie,The Ice Is Getting Thinner
5,58930.0,Amy Millan,Pour Me Up Another
6,31706.0,She & Him,You Really Got A Hold On Me
7,17775.0,Nellie Mckay,I Wanna Get Married
8,77149.0,Johnny Cash,In The Jailhouse Now
9,50458.0,Hello Saferide,Loneliness Is Better When You'Re Not Alone


### Utilisateur Classique 1002

In [49]:
user = 1002

# Distinct tracks of this user in the training data.
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
# Feature vectors of the user's 10 most frequent tracks (the seeds).
user_track_list = pd.DataFrame({'track_id': list(top_user_music(user, 10))})
vector_list = user_track_list.merge(df_track, how='left', on='track_id')['vector'].values

# Up to 10 recommendations pooled over all the seeds.
recomended_tracks = get_reco_from_traklist(vector_list, 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,2181.0,Sigur Rós,Síðasta Ferð
1,19457.0,Eluvium,Indoor Swimming At The Space Station
2,75262.0,Sarah Mclachlan,Wintersong
3,31449.0,Yo La Tengo,The Hour Grows Late
4,72168.0,Frank Sinatra,Young At Heart
5,49668.0,Frank Sinatra,It Came Upon A Midnight Clear
6,72634.0,Brightblack Morning Light,Come Another Rain Down
7,50286.0,Colleen,Floating In The Clearest Night
8,41771.0,No Age,Keechie
9,9733.0,Bat For Lashes,Seal Jubilee


### Utilisateur Pop 1003

In [50]:
user = 1003

# Distinct tracks of this user in the training data.
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
# Feature vectors of the user's 10 most frequent tracks (the seeds).
user_track_list = pd.DataFrame({'track_id': list(top_user_music(user, 10))})
vector_list = user_track_list.merge(df_track, how='left', on='track_id')['vector'].values

# Up to 10 recommendations pooled over all the seeds.
recomended_tracks = get_reco_from_traklist(vector_list, 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,64589.0,P!Nk,'Cuz I Can
1,63114.0,I Am Kloot,No Direction Home
2,2327.0,Tokyo Police Club,Your English Is Good
3,11032.0,Simply Red,Something Got Me Started
4,54745.0,Gnarls Barkley,Would Be Killer
5,41269.0,Inxs,Listen Like Thieves
6,11726.0,The Faint,I Disappear
7,658.0,Björk,Human Behavior
8,5347.0,Kings Of Leon,Happy Alone
9,13898.0,Michael Jackson,Off The Wall


### Utilisateur Rock 1004

In [51]:
user = 1004

# Distinct tracks of this user in the training data.
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
# Feature vectors of the user's 10 most frequent tracks (the seeds).
user_track_list = pd.DataFrame({'track_id': list(top_user_music(user, 10))})
vector_list = user_track_list.merge(df_track, how='left', on='track_id')['vector'].values

# Up to 10 recommendations pooled over all the seeds.
recomended_tracks = get_reco_from_traklist(vector_list, 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,29548.0,Less Than Jake,Danny Says
1,13998.0,Kaiser Chiefs,You Want History
2,64134.0,Atlas Sound,Quarantined
3,16254.0,Guided By Voices,Teenage Fbi
4,8764.0,Camera Obscura,Come Back Margaret
5,56455.0,Idlewild,Quiet Crown
6,58934.0,My Morning Jacket,Two Halves
7,27082.0,Bush,Greedy Fly
8,47793.0,Morrissey,Mama Lay Softly On The Riverbed
9,6678.0,Fall Out Boy,Bang The Doldrums


### Utilisateur Rap 1005

In [52]:
user = 1005

# Distinct tracks of this user in the training data.
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
# Feature vectors of the user's 10 most frequent tracks (the seeds).
user_track_list = pd.DataFrame({'track_id': list(top_user_music(user, 10))})
vector_list = user_track_list.merge(df_track, how='left', on='track_id')['vector'].values

# Up to 10 recommendations pooled over all the seeds.
recomended_tracks = get_reco_from_traklist(vector_list, 10)

# Attach artist and title to the recommended track ids for display.
recomended_tracks.merge(df_track, on='track_id')[['track_id', 'artist_name', 'track_name']]

Unnamed: 0,track_id,artist_name,track_name
0,45781.0,Counting Crows,A Murder Of One
1,70642.0,Alice Russell,Humankind
2,14599.0,Porcupine Tree,The Rest Will Flow
3,51130.0,Iced Earth,Ghost Of Freedom
4,33225.0,Magnolia Electric Co.,The Dark Don'T Hide It
5,4083.0,All Saints,Pure Shores
6,7824.0,Gym Class Heroes,Taxi Driver
7,38955.0,Blackalicious,Sky Is Falling
8,35636.0,Mae,This Time Is The Last Time (Live)
9,57246.0,Me First And The Gimme Gimmes,Danny'S Song
