# DATA PREPARATION LAST FM (Spotify API)

### Imports 

In [18]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from time import sleep
import json

import pandas as pd

### Class permetant de scraper l'API Spotify 

In [9]:
class SpotifyScraper():
    
    '''
    Class permetant de récuperer les features descriptives d'un track à partir du titre et du l'artiste
    
    Pour fonctionner un compte Spotify for Developpers est necessaire
    '''
    
    def __init__(self, cid, secret):
        
        '''
        Initialisation de la class
        
        cid --> identifiant du compre Spotify for Developpers
        secret --> clée secrete generée pour l'application
        
        cf Spotify for Developpers
        '''
        
        client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
        self.spotify = spotipy.Spotify(client_credentials_manager = client_credentials_manager, requests_timeout=10)
    
    def get_features(self, song_id):
        
        '''
        Récupére les feartures d'un titre à partir de son id Spotify 
        exemple id : '5ghIJDpPoe3CfHMGu71E6T'
        
        retourne un liste de features (float)
        '''
        
        sleep(0.1) # Tenmps d'attente pour eviter de saturer l'API
        
        try:
            features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

            track = self.spotify.audio_features([song_id])
            result = list()

            for feature in features:
                a = track[0][feature]
                result.append(a)

            return result
    
            
        except:
            return None
    
    def search_song(self, txt, artist=None):
        
        '''
        recherhce un titre à partirt du titre de la chanson et d'un nom d'artise
        retourne l'id track Spotify correspondant si il est trouvé
        '''
        
        try:
            ans = self.spotify.search(q='track:' + txt, type='track')
            
            if artist==None:
                return ans['tracks']['items'][0]['id']

            for tr in ans['tracks']['items']:
                
                if tr['name'].lower() in txt.lower():
                    for art in tr['artists']:
                        
                        if artist.lower() in art['name'].lower():
                            return tr['id'], art['name'], tr['name']
            
            return None
        
        except:
            return None
        
    
    def search_song_features(self, txt, artist=None):
        
        '''
        recherhce un titre à partirt du titre de la chanson et d'un nom d'artise
        puis récupère les features associées
        retourne un dictionaire
        '''
        
        sleep(0.1) # Tenmps d'attente pour eviter de saturer l'API
        
        try:
            
            id, name, artist = self.search_song(txt, artist)
            features = self.get_features(id)
        
            return {'artist':artist, 'name': name, 'features': features}
        
        except:
            return None
        

### Creation d'un scraper

In [10]:
with open ('credential/Spotify_credential.json') as f:
    credentials = json.load(f)

In [11]:
# creation d'un scraper
scraper = SpotifyScraper(credentials['cid'], credentials['secret'])

In [12]:
# test de la méthode get_features
scraper.get_features('5ghIJDpPoe3CfHMGu71E6T')

[0.502, 0.912, 1, -4.556, 1, 0.0564, 2.55e-05, 0.000173, 0.106, 0.72, 116.761]

In [13]:
# test de la méthode search_song_features
scraper.search_song_features('The Girl And The Robot', 'Röyksopp')

{'artist': 'The Girl and the Robot',
 'name': 'Röyksopp',
 'features': [0.511,
  0.899,
  9,
  -5.323,
  0,
  0.0601,
  0.115,
  0.000105,
  0.107,
  0.0773,
  121.0]}

## Chargement du dataset 

In [17]:
df = pd.read_csv('filtered data/filtered_fm_dataset.csv', index_col=0)
df = df.groupby(['track_id']).first().reset_index()

  mask |= (ar1 == a)


In [19]:
track_names = df['track_name'].values
artist_names = df['artist_name'].values
track_ids = df['track_id'].values

## Creation de deux fichiers temporaire

Le grand nombre de donnés et le temps de traitement (environ 10h) nous oblige à faire un fichier de dump <br/>
Pour eviter d'être bannie, nous traiton les données en batch de 10k ~ 1h de traitement

**Fichiers:**<br/>
out.csv --> fichier txt où les features sont ajoutés les une aprés les autres <br/>
ou_id.csv --> fichier regroupant les id déjà traités

In [22]:
col = ['artist_name','track_name','track_id','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

In [23]:
with open('out.csv', 'w') as f:
                f.write(';'.join(col)+'\n')

In [24]:
try:
    with open('id_out.csv', 'r') as f:
        out_id = set([int (l) for l in f.readlines()])
    print(out_id)
except:
    out_id=set()

In [25]:
from tqdm import tqdm

col = ['artist_name','track_name','track_id','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

# recuperation des tack_id déjà traités
try:
    with open('id_out.csv', 'r') as f:
        out_id = set([int (l) for l in f.readlines()])
except:
    out_id=set()


# taille du batch
max=100

for artist, track,id in tqdm(zip(artist_names, track_names, track_ids)) :
    
    if id not in out_id:
        
        if max==0:
            break
        
        result = scraper.search_song_features(track, artist)
        max-=1

        try:
            with open('id_out.csv', 'a+') as f:
                f.write(str(id)+'\n')
            
            line = f'{artist};{track};{id};'+';'.join([str(x) for x in result['features']])+'\n'
            with open('out.csv', 'a+') as f:
                f.write(line)
        except:
                pass
    else:
        pass

100it [00:29,  3.44it/s]


## Performance 

In [34]:
with open('out.csv', 'r') as f:
    aa = len(f.readlines())

with open('id_out.csv', 'r') as f:
    bb = len(f.readlines())

print(aa/bb)

0.5283498759305211


## Transformation du fichier dump out.csv en DataFrame

In [26]:
columns = ['artist_name','track_name','track_id','danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

df_2 = pd.DataFrame(columns=columns)

with open('out.csv', 'r') as f:
    temp = dict()
    for line in f.readlines()[1:]:
        for col, val in zip(columns,line[:-1].split(';')):
            temp[col] = val
        df_2 = df_2.append(pd.DataFrame.from_dict([temp]))
        

In [27]:
df_2.head()

Unnamed: 0,artist_name,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Minus 8,Elysian Fields,0,0.627,0.871,9,-8.742,0,0.0328,0.0962,0.358,0.109,0.956,89.102
0,Beanfield,Planetary Deadlock,1,0.775,0.624,7,-9.454,1,0.0475,0.137,0.849,0.101,0.466,89.997
0,Alif Tree,Deadly Species,2,0.583,0.428,9,-12.336,0,0.0271,0.778,0.687,0.172,0.335,140.041
0,4Hero,Dedication To The Horse,6,0.567,0.812,8,-6.049,1,0.0296,1.48e-05,0.0226,0.215,0.599,127.968
0,4Hero,Why Don'T You Talk?,7,0.571,0.711,7,-5.893,1,0.0366,0.00965,0.761,0.435,0.655,141.877


## Rectification des types des variables

In [28]:
columns = ['track_id', 'key', 'mode']

for col in columns:
    df_2[col] = df_2[col].astype('int')

In [29]:
columns = ['danceability', 'energy', 'loudness', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

for col in columns:
        df_2[col] = df_2[col].apply(lambda x: x.replace(',','.'))

In [30]:
columns = ['danceability', 'energy', 'loudness', 'speechiness',
                        'acousticness', 'instrumentalness', 'liveness',
                        'valence', 'tempo']

for col in columns:
    df_2[col] = df_2[col].astype('float')

In [31]:
df_2 = df_2.reset_index().drop(columns=['index'])

In [32]:
df_2.to_csv('filtered data/track_features_fm_dataset.csv')

## Verification du DataFrame

In [33]:
df_2 = pd.read_csv('filtered data/track_features_fm_dataset.csv', index_col=0)
df_2.head(10)

Unnamed: 0,artist_name,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Minus 8,Elysian Fields,0,0.627,0.871,9,-8.742,0,0.0328,0.0962,0.358,0.109,0.956,89.102
1,Beanfield,Planetary Deadlock,1,0.775,0.624,7,-9.454,1,0.0475,0.137,0.849,0.101,0.466,89.997
2,Alif Tree,Deadly Species,2,0.583,0.428,9,-12.336,0,0.0271,0.778,0.687,0.172,0.335,140.041
3,4Hero,Dedication To The Horse,6,0.567,0.812,8,-6.049,1,0.0296,1.5e-05,0.0226,0.215,0.599,127.968
4,4Hero,Why Don'T You Talk?,7,0.571,0.711,7,-5.893,1,0.0366,0.00965,0.761,0.435,0.655,141.877
5,Röyksopp,The Girl And The Robot,10,0.511,0.899,9,-5.323,0,0.0601,0.115,0.000105,0.107,0.0773,121.0
6,Röyksopp,Vision One,11,0.515,0.95,8,-6.025,0,0.0652,0.0548,0.213,0.42,0.68,93.082
7,Röyksopp,Happy Up Here,12,0.676,0.877,10,-3.847,1,0.133,0.0102,0.186,0.255,0.685,103.538
8,Röyksopp,You Don'T Have A Clue,16,0.563,0.857,10,-4.7,0,0.074,0.02,0.187,0.189,0.621,116.106
9,Röyksopp,Miss It So Much,18,0.636,0.851,1,-5.488,0,0.0372,0.00962,0.817,0.363,0.805,134.003


In [34]:
df_2.shape

(58, 14)

In [35]:
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 58 entries, 0 to 57
Data columns (total 14 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   artist_name       58 non-null     object 
 1   track_name        58 non-null     object 
 2   track_id          58 non-null     int64  
 3   danceability      58 non-null     float64
 4   energy            58 non-null     float64
 5   key               58 non-null     int64  
 6   loudness          58 non-null     float64
 7   mode              58 non-null     int64  
 8   speechiness       58 non-null     float64
 9   acousticness      58 non-null     float64
 10  instrumentalness  58 non-null     float64
 11  liveness          58 non-null     float64
 12  valence           58 non-null     float64
 13  tempo             58 non-null     float64
dtypes: float64(9), int64(3), object(2)
memory usage: 6.8+ KB
