# Hybride (lightFM)

Exploration du module LightFM : un modèle hybride combinant le filtrage basé sur le contenu (content-based) et le filtrage collaboratif (collaborative filtering).

## Imports

In [1]:
import numpy as np
import pandas as pd

from tqdm import tqdm

# all lightfm imports 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score

# imports re for text cleaning 
import re
from datetime import datetime, timedelta

# we will ignore pandas warning 
import warnings
warnings.filterwarnings('ignore')

#others
import itertools

from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt



## Data import

In [2]:
# Load the training ratings plus the user and track feature tables.
base_path = 'filtered data/'

df_train, df_user_features, df_track_features = (
    pd.read_csv(base_path + csv_name, index_col=0)
    for csv_name in (
        'filtered_rating_fm_dataset_train.csv',
        'user_features_fm_dataset.csv',
        'track_features_fm_dataset_normalized.csv',
    )
)

In [3]:
# Synthetic single-genre listeners (ids 1001-1005). Their ratings are appended
# to the training set and a matching row is added to the user features.
files = ['last_fm_fake_user(1001)_jazz.csv',
         'last_fm_fake_user(1002)_classic.csv',
         'last_fm_fake_user(1003)_pop.csv',
         'last_fm_fake_user(1004)_rock.csv',
         'last_fm_fake_user(1005)_rap.csv']

# Collect the frames first and concatenate once after the loop, instead of
# re-copying the whole training frame on every iteration.
fake_ratings = []
fake_users = []

for file in files:
    df_temp = pd.read_csv(f'{base_path}fake_user/{file}', index_col=0)
    df_temp['rating'] = 100  # TODO: placeholder rating, to be revisited
    fake_ratings.append(df_temp)

    # The user id is the number between parentheses in the file name.
    id_match = re.search(r'\((\d+)\)', file)
    if id_match is None:
        raise ValueError(f'no "(user_id)" found in file name: {file}')
    user_id = int(id_match.group(1))

    # Every fake user gets the same demographic profile.
    fake_users.append(pd.DataFrame({'user_id': [user_id],
                                    'gender': ['m'],
                                    'country': ['United States'],
                                    'region': ['North America'],
                                    'registered': [2006.0]}))

df_train = pd.concat([df_train, *fake_ratings])
df_user_features = pd.concat([df_user_features, *fake_users])
    
    

## Fonctions

In [4]:
##################################################
# Helper functions
#################################################
def create_features(dataframe, features_name, id_col_name):
    """
    Build the (id, features) tuples that are fed into lightfm.

    For every row, the selected columns are converted to strings and joined
    with commas, then the joined text is split on commas again. A cell that
    already contains several comma-separated tags is therefore expanded into
    one feature per tag.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains the features
    features_name : List
        List of feature column names available in dataframe
    id_col_name: String
        Name of the column holding the id (for instance user_id or
        track_id) that the features are attached to.

    Returns
    -------
    List of tuples
        One tuple per row of dataframe, in row order.
        The format of each value
        will be (id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
        An empty dataframe gives an empty list.
    """
    # Nothing to build for an empty frame; returning early also avoids
    # depending on what a row-wise apply returns when there are no rows.
    if len(dataframe) == 0:
        return []

    joined = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    tokens = [text.split(',') for text in joined]
    return list(zip(dataframe[id_col_name], tokens))



def generate_feature_list(dataframe, features_name):
    """
    Generate the flat list of feature values used for the lightfm mapping

    For every row, the selected columns are converted to strings, joined
    with commas and split on commas again; all resulting tokens are then
    flattened into a single Series. Duplicates are kept.

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains the features.
    features_name : List
        List of feature column names available in dataframe.

    Returns
    -------
    Pandas Series
        All feature tokens, row after row, with a fresh 0..n-1 index.
        An empty dataframe gives an empty Series.
    """
    if len(dataframe) == 0:
        return pd.Series([], dtype=object)

    joined = dataframe[features_name].apply(
        lambda row: ','.join(row.map(str)), axis=1)
    tokens = joined.str.split(',')
    # explode flattens the per-row lists directly; it does not build a wide
    # intermediate frame in which shorter rows are padded with NaN.
    return tokens.explode().reset_index(drop=True)


def calculate_auc_score(lightfm_model, interactions_matrix,
                        track_features, user_features, num_threads=3):
    """
    Measure the mean ROC AUC metric for a model.
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model
        A fitted lightfm model
    interactions_matrix :
        A lightfm interactions matrix
    track_features, user_features:
        Lightfm item and user feature matrices
    num_threads: int, optional
        Number of threads handed to lightfm's auc_score (default 3,
        the value that used to be hard-coded).

    Returns
    -------
    The mean of the values returned by auc_score
    """
    per_user_auc = auc_score(
        lightfm_model, interactions_matrix,
        item_features=track_features,
        user_features=user_features,
        num_threads=num_threads)
    return per_user_auc.mean()

In [5]:
class k_best():
    """
    Ranking metrics comparing a recommendation list with a user's tracks.

    Parameters
    ----------
    user_tracks : iterable
        Hashable track ids that count as hits for the user.
    recomended_tracks : sequence
        Recommended track ids, in ranking order (best first).
    """

    def __init__(self, user_tracks, recomended_tracks):
        self.user_tracks = user_tracks
        self.recomended_tracks = recomended_tracks

    def CG(self):
        """Return the number of distinct recommended tracks found in user_tracks."""
        return len(set(self.recomended_tracks).intersection(set(self.user_tracks)))

    def DCG(self):
        """Return the discounted gain: each hit at 0-based rank i adds 1/log2(i+2)."""
        # Same set-based membership as CG, so both metrics agree on what a
        # hit is whatever container type user_tracks happens to be.
        relevant = set(self.user_tracks)
        return sum(1/np.log2(rank+2)
                   for rank, track in enumerate(self.recomended_tracks)
                   if track in relevant)

    def IDCG(self):
        """Return the DCG obtained if the CG() hits occupied the first ranks."""
        # NOTE(review): the ideal ranking is built from the number of hits
        # actually recommended, not from len(user_tracks); NDCG therefore only
        # measures how well the hits are ordered. Confirm this is intended.
        return sum(1/np.log2(rank+2) for rank in range(self.CG()))

    def NDCG(self):
        """Return DCG/IDCG, or 0 when there is no hit at all."""
        ideal = self.IDCG()
        if ideal != 0:
            return self.DCG()/ideal
        return 0

    def P_at_k(self):
        """Return the share of hits in the recommendation list (0.0 if the list is empty)."""
        n_recommended = len(self.recomended_tracks)
        if n_recommended == 0:
            # An empty list used to raise ZeroDivisionError.
            return 0.0
        return self.CG()/n_recommended

    def __str__(self):
        """Return a multi-line summary of all the metrics."""
        result = 'K_best ressults: \n'
        result+= f'CG = {self.CG()}\n'
        result+= f'DCG = {self.DCG()}\n'
        result+= f'NDCG = {self.NDCG()}\n'
        result+= f'P@K = {self.P_at_k()}\n'

        return result

## Data preparation

In [6]:
# Interaction weight = rating shifted by one.
# NOTE(review): presumably so that a rating of 0 still yields a non-zero weight — confirm.
df_train['weights'] = df_train['rating']+1

### Filtre des Track_id

In [7]:
# Track ids present in both the track feature table and the training ratings.
track_id_to_keep_tack, track_id_to_keep_train = (
    set(frame['track_id'].unique())
    for frame in (df_track_features, df_train)
)

track_id_to_keep = track_id_to_keep_tack & track_id_to_keep_train

In [8]:
# Keep only the ratings whose track is in track_id_to_keep.
# isin is vectorised and avoids rebinding the builtin name `filter`.
df_train = df_train[df_train['track_id'].isin(track_id_to_keep)]

In [9]:
# Keep only the track feature rows whose track is in track_id_to_keep.
# isin is vectorised and avoids rebinding the builtin name `filter`.
df_track_features = df_track_features[df_track_features['track_id'].isin(track_id_to_keep)]

### Filtre des user_id

In [10]:
# User ids present in both the user feature table and the training ratings.
user_id_to_keep_user, user_id_to_keep_train = (
    set(frame['user_id'].unique())
    for frame in (df_user_features, df_train)
)

user_id_to_keep = user_id_to_keep_user & user_id_to_keep_train

In [11]:
# Keep only the ratings whose user is in user_id_to_keep.
# isin is vectorised and avoids rebinding the builtin name `filter`.
df_train = df_train[df_train['user_id'].isin(user_id_to_keep)]

In [12]:
# Keep only the user feature rows whose user is in user_id_to_keep.
# isin is vectorised and avoids rebinding the builtin name `filter`.
df_user_features = df_user_features[df_user_features['user_id'].isin(user_id_to_keep)]

### Verification

In [13]:
# Sanity check: after the track filtering, both tables are expected to
# report the same number of distinct tracks.
print('train nb track: ', df_train['track_id'].nunique())
print('track_features nb track: ', df_track_features['track_id'].nunique())

train nb track:  42304
track_features nb track:  42304


In [14]:
# Sanity check: after the user filtering, both tables are expected to
# report the same number of distinct users.
print('train nb user: ', df_train['user_id'].nunique())
print('user_features nb user: ', df_user_features['user_id'].nunique())

train nb user:  964
user_features nb user:  964


### Creation des tags (user et item)

### User

In [15]:
# One comma-separated tag string per user, built from the non-null values of
# the columns at positions 1 to 4.
# NOTE(review): columns are picked by position; presumably gender, country,
# region and registered — confirm against the CSV schema. A value that itself
# contains a comma will later be split into several tags.
df_user_features['user_tags'] = df_user_features[df_user_features.columns[1:5]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [16]:
# Flat Series of every user tag (duplicates included), used when fitting the Dataset.
user_features_list = generate_feature_list(df_user_features,['user_tags'])

In [17]:
# Per-user (user_id, [tags]) tuples, stored as a column for build_user_features.
df_user_features['user_features'] = create_features(df_user_features, ['user_tags'], 'user_id')

### Item

In [18]:
# One comma-separated tag string per track, built from the non-null values of
# the columns at positions 3 to 24.
# NOTE(review): columns are picked by position — confirm which features these
# are against the CSV schema. A value that itself contains a comma will later
# be split into several tags.
df_track_features['item_tags'] = df_track_features[df_track_features.columns[3:25]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)

In [19]:
# Flat Series of every track tag (duplicates included), used when fitting the Dataset.
item_features_list = generate_feature_list(df_track_features,['item_tags'])

In [20]:
# Per-track (track_id, [tags]) tuples, stored as a column for build_item_features.
df_track_features['item_features'] = create_features(df_track_features, ['item_tags'], 'track_id')

## Dataset build

In [21]:
# Distinct user and track ids handed to Dataset.fit.
# NOTE(review): the evaluation cells look up a user's internal index by its
# position in user_features_id, and labels by row position in
# df_track_features; this assumes the Dataset numbers ids in this same
# order — confirm.
user_features_id = df_user_features['user_id'].unique()
track_features_id =  df_track_features['track_id'].unique()

In [22]:
########################
# Train Dataset building for lightfm
########################


# Register every user id, track id and feature token with the lightfm Dataset.
dataset = Dataset()
dataset.fit(
    users=user_features_id,
    items=track_features_id,
    user_features=user_features_list,
    item_features=item_features_list,
)

In [23]:
# Build the item and user feature matrices from the (id, [tags]) tuples.
track_features = dataset.build_item_features(df_track_features['item_features'])
user_features = dataset.build_user_features(df_user_features['user_features'])

### Train

In [24]:
# One (user_id, track_id, weight) tuple per rating row.
df_train['user_track_id_tuple'] = list(zip(df_train.user_id, df_train.track_id, df_train.weights))

# build_interactions returns the interaction matrix and the matching weight matrix.
interactions_train, weights_train = dataset.build_interactions(df_train['user_track_id_tuple'])

### Modèle

In [25]:
################################
# Model building part
################################

# Define the lightfm model by specifying its hyper-parameters,
# then fit it with the interactions matrix, the item and user features
# and the per-interaction sample weights.
# NOTE(review): learning_rate=0.95 is high for an adagrad schedule — confirm it is intended.
model = LightFM(
    no_components=80,
    learning_rate=0.95,
    learning_schedule='adagrad',
    loss='warp',
    max_sampled=7,
    random_state=42)

model.fit(
    interactions_train,
    item_features = track_features,
    user_features = user_features, sample_weight = weights_train,
    epochs=5, num_threads=3, verbose=True)

Epoch: 100%|████████████████████████████████████████████████████████████████████████████| 5/5 [11:02<00:00, 132.59s/it]


<lightfm.lightfm.LightFM at 0x213fea183d0>

## Evaluation

In [62]:
# Display label ("artist--track") and track id for every row of df_track_features.
# Both arrays are read as globals by sample_recommendation.
# NOTE(review): `id` shadows the Python builtin of the same name; it is kept
# because sample_recommendation refers to it.
labels = np.array(df_track_features['artist_name']+'--'+df_track_features['track_name'])
id = np.array(df_track_features['track_id'])

In [72]:
def sample_recommendation(model, data, user_id, filter=()):
    """
    Print and return the ten best-scoring tracks for one user.

    Reads the notebook-level arrays `labels` and `id` (label and track id
    per item) as well as the `track_features` and `user_features` matrices.

    Parameters
    ----------
    model : LightFM model
        A fitted lightfm model.
    data :
        Interactions matrix; only its shape is used, to get the item count.
    user_id : int
        Index of the user as passed to model.predict.
    filter : iterable, optional
        Track ids to leave out of the recommendations (for instance the
        tracks already seen in training). Defaults to no exclusion.

    Returns
    -------
    numpy array
        Labels of the (at most) ten recommended tracks, best first.
    """
    _, n_items = data.shape

    scores = model.predict(user_id, np.arange(n_items),
                           item_features=track_features,
                           user_features=user_features)

    # Rank once and reuse the order for both the labels and the ids.
    ranking = np.argsort(-scores)
    excluded = set(filter)
    keep = np.array([track not in excluded for track in id[ranking]], dtype=bool)

    top_items_for_user = labels[ranking][keep][:10]

    print("Top Recommended tracks For User: ", user_id)
    for x in top_items_for_user:
        print("     %s" % x)

    # Callers assign the result, so hand the list back instead of None.
    return top_items_for_user

   

### Utilisateur Jazz 1001

In [73]:
# Jazz listener: locate the user's position among the fitted ids, then recommend.
user = 1001

inner_user = int(np.flatnonzero(user_features_id == user)[0])
recommendation = sample_recommendation(model, interactions_train, inner_user)

Top Recommended tracks For User:  959
     Etta James--At Last
     Chet Baker--My Funny Valentine
     Aaron Goldberg--Oam'S Blues
     Billie Holiday--Solitude
     Bob Marley & The Wailers--Lively Up Yourself
     Shirley Horn--Here'S To Life
     Billie Holiday--Fine And Mellow
     Dexter Gordon--I Guess I'Ll Hang My Tears Out To Dry
     Benny Goodman--King Porter Stomp
     Billie Holiday--Good Morning Heartache


In [74]:
# Jazz listener again, this time leaving out the tracks already in training.
user = 1001

# Recompute the index here, as the sibling cells do, so this cell does not
# depend on a value left over from a previously executed cell.
inner_user = int(np.where(user_features_id==user)[0][0])
user_tracks_train = set([x for x in df_train[df_train['user_id']==user]['track_id'].unique()])
recommendation = sample_recommendation(model,interactions_train,inner_user,user_tracks_train)

Top Recommended tracks For User:  959
     Etta James--At Last
     Chet Baker--My Funny Valentine
     Aaron Goldberg--Oam'S Blues
     Billie Holiday--Solitude
     Bob Marley & The Wailers--Lively Up Yourself
     Shirley Horn--Here'S To Life
     Billie Holiday--Fine And Mellow
     Dexter Gordon--I Guess I'Ll Hang My Tears Out To Dry
     Billie Holiday--Good Morning Heartache
     João Gilberto--De Conversa Em Conversa


### Utilisateur classic 1002

In [47]:
# Classical listener: locate the user's position among the fitted ids, then recommend.
user = 1002

inner_user = int(np.flatnonzero(user_features_id == user)[0])
recommendation = sample_recommendation(model, interactions_train, inner_user)

Top Recommended tracks For User:  960
     Gnarls Barkley--Crazy
     Queen--Bohemian Rhapsody
     Moby--Porcelain
     Michael Jackson--Beat It
     Gorillaz--Dare
     A-Ha--Take On Me
     Wolfgang Amadeus Mozart--Lacrimosa
     The Beatles--Hey Jude
     Moby--In This World
     Madonna--Hung Up


In [77]:
# Classical listener, leaving out the tracks already in training.
user = 1002

inner_user = int(np.flatnonzero(user_features_id == user)[0])
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
recommendation = sample_recommendation(
    model, interactions_train, inner_user, user_tracks_train)

Top Recommended tracks For User:  960
     Gnarls Barkley--Crazy
     Queen--Bohemian Rhapsody
     Moby--Porcelain
     Michael Jackson--Beat It
     Gorillaz--Dare
     A-Ha--Take On Me
     The Beatles--Hey Jude
     Moby--In This World
     Madonna--Hung Up
     Alanis Morissette--You Oughta Know


### Utilisateur pop 1003

In [48]:
# Pop listener: locate the user's position among the fitted ids, then recommend.
user = 1003

inner_user = int(np.flatnonzero(user_features_id == user)[0])
recommendation = sample_recommendation(model, interactions_train, inner_user)

Top Recommended tracks For User:  961
     Britney Spears--Circus
     P!Nk--So What
     Christina Aguilera--Keeps Gettin' Better
     Ne-Yo--Closer
     Maroon 5--Wake Up Call
     Britney Spears--Hot As Ice
     Jennifer Lopez--Do It Well
     Katy Perry--Hot N Cold
     Nelly Furtado--Say It Right
     Britney Spears--Womanizer


In [78]:
# Pop listener, leaving out the tracks already in training.
user = 1003

inner_user = int(np.flatnonzero(user_features_id == user)[0])
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
recommendation = sample_recommendation(
    model, interactions_train, inner_user, user_tracks_train)

Top Recommended tracks For User:  961
     Britney Spears--Circus
     P!Nk--So What
     Christina Aguilera--Keeps Gettin' Better
     Ne-Yo--Closer
     Maroon 5--Wake Up Call
     Jennifer Lopez--Do It Well
     Nelly Furtado--Say It Right
     Britney Spears--Womanizer
     Gwen Stefani--Wind It Up
     Destiny'S Child--Bootylicious


### Utilisateur rock 1004

In [49]:
# Rock listener: locate the user's position among the fitted ids, then recommend.
user = 1004

inner_user = int(np.flatnonzero(user_features_id == user)[0])
recommendation = sample_recommendation(model, interactions_train, inner_user)

Top Recommended tracks For User:  962
     System Of A Down--B.Y.O.B.
     System Of A Down--Cigaro
     System Of A Down--Tentative
     System Of A Down--Atwa
     System Of A Down--Violent Pornography
     System Of A Down--Old School Hollywood
     System Of A Down--Deer Dance
     System Of A Down--Toxicity
     System Of A Down--Science
     Rammstein--Keine Lust


In [79]:
# Rock listener, leaving out the tracks already in training.
user = 1004

inner_user = int(np.flatnonzero(user_features_id == user)[0])
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
recommendation = sample_recommendation(
    model, interactions_train, inner_user, user_tracks_train)

Top Recommended tracks For User:  962
     System Of A Down--B.Y.O.B.
     System Of A Down--Cigaro
     System Of A Down--Tentative
     System Of A Down--Atwa
     System Of A Down--Violent Pornography
     System Of A Down--Old School Hollywood
     System Of A Down--Deer Dance
     System Of A Down--Toxicity
     System Of A Down--Science
     Rammstein--Keine Lust


### Utilisateur rap 1005

In [50]:
# Rap listener: locate the user's position among the fitted ids, then recommend.
user = 1005

inner_user = int(np.flatnonzero(user_features_id == user)[0])
recommendation = sample_recommendation(model, interactions_train, inner_user)

Top Recommended tracks For User:  963
     Eminem--Kim
     2Pac--Changes
     Cypress Hill--Hits From The Bong
     Dr. Dre--Forgot About Dre
     Sean Kingston--Beautiful Girls
     2Pac--Temptations
     Lil Wayne--Go Dj
     Paramore--Misery Business
     Eminem--Mosh
     Eminem--Just Don'T Give A Fuck


In [81]:
# Rap listener, leaving out the tracks already in training.
user = 1005

inner_user = int(np.flatnonzero(user_features_id == user)[0])
user_tracks_train = set(df_train.loc[df_train['user_id'] == user, 'track_id'].unique())
recommendation = sample_recommendation(
    model, interactions_train, inner_user, user_tracks_train)

Top Recommended tracks For User:  963
     2Pac--Changes
     Cypress Hill--Hits From The Bong
     Dr. Dre--Forgot About Dre
     Sean Kingston--Beautiful Girls
     Lil Wayne--Go Dj
     Paramore--Misery Business
     Eminem--Mosh
     Eminem--Just Don'T Give A Fuck
     Lupe Fiasco--Streets On Fire
     Kanye West--Through The Wire
