In [93]:
import pandas as pd
import time
import json
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [94]:
# Read the Spotify API credentials from a JSON file that sits one directory
# above the notebook; it must provide the two keys used below.
key_file = '../keys.json'
with open(key_file) as f:
    keys = json.load(f)

# Build an API client that authenticates with the client id / secret pair.
client_credentials_manager = SpotifyClientCredentials(
    client_id=keys['spotify_client_id'],
    client_secret=keys['spotify_client_secret'],
)
spotify = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [95]:
# Genre labels, in alphabetical order and grouped by initial letter.
# Each label becomes one indicator column of the genre table built below.
# Note that "hip hop" is spelled with a space while the other multi-word
# labels are hyphenated.
genre_list = [
    # a
    "acoustic", "afrobeat", "alt-rock", "alternative", "ambient", "anime",
    # b
    "black-metal", "bluegrass", "blues", "bossanova", "breakbeat",
    # c
    "cantopop", "chicago-house", "children", "chill", "christian",
    "classical", "club", "comedy", "country",
    # d
    "dance", "dancehall", "death-metal", "deep-house", "disco", "disney",
    "drum-and-bass", "dub", "dubstep",
    # e
    "edm", "electro", "electronic", "emo",
    # f
    "folk", "forro", "funk",
    # g
    "garage", "gospel", "goth", "grindcore", "groove", "grunge", "guitar",
    # h
    "happy", "hard-rock", "hardcore", "hardstyle", "heavy-metal", "hip hop",
    "holidays", "honky-tonk", "house",
    # i
    "idm", "indie", "indie-pop", "industrial",
    # j
    "j-dance", "j-idol", "j-pop", "j-rock", "jazz",
    # k
    "k-pop", "kids",
    # m
    "mandopop", "metal", "metal-misc", "metalcore", "minimal-techno",
    "movies", "mpb",
    # n
    "new-age", "new-release",
    # o
    "opera",
    # p
    "pagode", "party", "piano", "pop", "pop-film", "post-dubstep",
    "power-pop", "progressive-house", "psych-rock", "punk", "punk-rock",
    # r
    "r-n-b", "rainy-day", "rap", "reggae", "reggaeton", "road-trip", "rock",
    "rock-n-roll", "rockabilly", "romance",
    # s
    "sad", "salsa", "samba", "sertanejo", "show-tunes", "singer-songwriter",
    "ska", "sleep", "songwriter", "soul", "soundtracks", "study", "summer",
    "synth-pop",
    # t
    "tango", "techno", "trance", "trip-hop",
    # w
    "work-out", "world-music",
]

In [96]:
# Empty frame (no rows) with one column per genre label; it only serves to
# attach those columns to the song table in a later join.
genres_df = pd.DataFrame(columns=genre_list)
genres_df

Unnamed: 0,acoustic,afrobeat,alt-rock,alternative,ambient,anime,black-metal,bluegrass,blues,bossanova,...,soundtracks,study,summer,synth-pop,tango,techno,trance,trip-hop,work-out,world-music


In [97]:
# Load the per-song table and keep a single row per song_id (the first one
# encountered).  The surviving rows keep their original index labels.
songs = pd.read_csv('../data/lyrics/song_info.txt').drop_duplicates(subset=['song_id'])
print(len(songs))

44769


In [98]:
# Attach the (still empty) genre columns to every song row.  genres_df has no
# rows, so the index-aligned left join leaves the new columns as NaN.
songs = songs.join(genres_df, how='left')

In [99]:
# Default every genre indicator to 0.  Only the genre columns are filled:
# a frame-wide fillna(0) would also replace missing artist / title / lyrics /
# lang values with the integer 0, which would then be used as search text.
songs[genre_list] = songs[genre_list].fillna(0).astype(int)
songs.head()

Unnamed: 0,song_id,artist,title,lyrics,lang,audio_features,acoustic,afrobeat,alt-rock,alternative,...,soundtracks,study,summer,synth-pop,tango,techno,trance,trip-hop,work-out,world-music
0,2bVisOYbfWI29XxWEdTKSX,Grupo Polo Montañes,Un Sueño Y Nada Mas,A veces tengo ganas de volver a verte\r\nY sé ...,es,"{'danceability': 0.607, 'energy': 0.542, 'key'...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0vQfzyjQFHQ208JCNjcE5H,Los Estramboticos,La Herida,Fue esta soledad y no el destino\r\nQuien nos ...,es,"{'danceability': 0.751, 'energy': 0.806, 'key'...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6hLo5Dg74PGHjSbcP6tNOr,La Castañeda,Cenit,Creí cuando dijeron que la luna se dormía cuan...,es,"{'danceability': 0.629, 'energy': 0.325, 'key'...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,284Ztj2XGMvoWKILVJGasd,Maskatesta,Tu Que Pediras,"Tee diiree una vez mass, lo quee pido por tii\...",es,"{'danceability': 0.623, 'energy': 0.79, 'key':...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,6FLSb8CJwrcfXfzfD4ySji,DLD,Por Siempre,Hoy he vuelto a ser quien soy\r\nla vida me re...,es,"{'danceability': 0.452, 'energy': 0.891, 'key'...",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [100]:
# Markets tried, in this order, when the US-market search returns no artist.
available_markets = ["AD","AR","AT","AU","BE","BG","BO","BR","CA","CH","CL","CO","CR","CY","CZ",
      "DE","DK","DO","EC","EE","ES","FI","FR","GB","GR","GT","HK","HN","HU","ID","IE","IL","IS",
      "IT","JP","LI","LT","LU","LV","MC","MT","MX","MY","NI","NL","NO","NZ","PA","PE","PH","PL",
      "PT","PY","RO","SE","SG","SK","SV","TH","TR","TW","US","UY","VN","ZA"]


def _find_artist_genres(artist):
    """Return the genre list of the most popular Spotify artist matching ``artist``.

    The US market is searched first.  If it yields no hits, the other entries
    of ``available_markets`` are tried in order until one does.

    Parameters
    ----------
    artist : str
        Artist name used as the search query.

    Returns
    -------
    list
        The ``genres`` field of the best match, or an empty list when no
        market returns any artist.

    Raises
    ------
    Exception
        Whatever ``spotify.search`` raises is propagated to the caller.
    """
    items = spotify.search(q=artist, type='artist', market='US')['artists']['items']
    if not items:
        for location in available_markets:
            if location == 'US':
                continue  # already queried above
            items = spotify.search(q=artist, type='artist', market=location)['artists']['items']
            if items:
                break
    if not items:
        return []
    # Several artists can share a name: keep the most popular one
    # (max() returns the first hit when popularity is tied).
    best_match = max(items, key=lambda item: int(item['popularity']))
    return best_match['genres']


num_songs = len(songs)
genres_by_artist = {}  # artist name -> genres, so each artist is looked up only once
failed_lookups = []    # (row label, artist, error repr) for lookups that raised

pbar = tqdm_notebook(total=num_songs)
try:
    # Walk the real index labels: after drop_duplicates() they are not
    # guaranteed to be 0..num_songs-1, so range(num_songs) used as .loc labels
    # would miss some rows and hit labels that no longer exist.
    # The pairs are materialised up front because the frame is written to
    # inside the loop.
    for label, artist in list(songs['artist'].items()):
        pbar.update(1)

        # Nothing to search for when the artist is missing (NaN, or a
        # non-string placeholder such as 0) or blank.
        if not isinstance(artist, str) or not artist.strip():
            continue

        try:
            if artist not in genres_by_artist:
                time.sleep(.03)  # brief pause between API requests
                genres_by_artist[artist] = _find_artist_genres(artist)
            genres = genres_by_artist[artist]
        except Exception as err:
            # Best effort: remember the failure and move on instead of
            # silently discarding it.
            failed_lookups.append((label, artist, repr(err)))
            continue

        if not genres:
            continue

        # set one-hot encoding for genres
        # The test is a substring match against the printed genre list, so
        # "pop" also matches e.g. "k-pop".
        # NOTE(review): hyphenated labels such as "alt-rock" only match when
        # the artist genre contains that exact hyphenated text - confirm that
        # this is the intended behaviour.
        genres_text = str(genres)
        matched = [genre for genre in genre_list if genre in genres_text]
        if matched:
            songs.loc[label, matched] = 1
finally:
    pbar.close()

if failed_lookups:
    print('{} artist lookups failed; first failure: {}'.format(len(failed_lookups), failed_lookups[0]))

HBox(children=(IntProgress(value=0, max=44769), HTML(value='')))

In [101]:
# Remove the text / metadata columns that are not part of the exported table.
songs = songs.drop(columns=['artist', 'title', 'lyrics', 'lang', 'audio_features'])

In [102]:
# Export the table as UTF-8 CSV; the DataFrame index is not written.
songs.to_csv('../data/genres.txt', encoding='utf-8', index=False)