# Building the Database

In [8]:
import re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
_
# Spotify API 
import spotipy
from spotipy.oauth2 import SpotifyOAuth
_
# Web scraping
import time
import requests
from bs4 import BeautifulSoup
_
# Misc
import sys
sys.path.append('/Users/chantellefourlze/PycharmProjects/machine-learning/spotify')
import cred

### Spotify API

Instantiate Spotipy object:

In [3]:
SP = spotipy.Spotify(
    auth_manager=SpotifyOAuth(
        redirect_uri=cred.REDIRECT_URI, 
        client_id=cred.CLIENT_ID, 
        client_secret=cred.CLIENT_SECRET
    )
)

Compile Spotify playlist ID numbers for each artist:

In [4]:
ARTIST_ID = {
    'Drake': '37i9dQZF1DX7QOv5kjbU68',
    'Eminem': '37i9dQZF1DZ06evO4gTUOY',
    'Nicki Minaj': '37i9dQZF1DZ06evO0684jS',
    'Nas': '37i9dQZF1DZ06evO18oyZi',
    'Future': '37i9dQZF1DZ06evO133u6s',
    'Dave': '37i9dQZF1DZ06evO3Y4Tg4',
    '2Pac': '37i9dQZF1DZ06evO17QsVi',
    'Kendrick Lamar': '37i9dQZF1DZ06evO1IPOOk',
    'Rapsody': '37i9dQZF1DZ06evO41iwLu',
    'Skepta': '37i9dQZF1DZ06evO1nbU8U'
}

Grab top 50 tracks for each artist and extract track info:

In [28]:
data_dict = {'track_name': [], 'main_artist': [], 'key_artist': [], 'duration_ms': [], 'is_explicit': []}

for key_artist, id in ARTIST_ID.items():
    playlist_tracks = SP.playlist_items(playlist_id=id)
    # Extract track names, track duration and if track is explicit:
    track_names = [track['track']['name'] for track in playlist_tracks['items']][:50]
    main_artists = [track['track']['artists'][0]['name'] for track in playlist_tracks['items']][:50]
    is_explicit = [track['track']['explicit'] for track in playlist_tracks['items']][:50]
    track_duration = [track['track']['duration_ms'] for track in playlist_tracks['items']][:50]
    
    # Store values:
    data_dict['track_name'] += track_names
    data_dict['main_artist'] += main_artists
    data_dict['key_artist'] += [key_artist for _ in range(50)]
    data_dict['duration_ms'] += track_duration
    data_dict['is_explicit'] += is_explicit

Convert data into a Pandas dataframe:

In [29]:
df = pd.DataFrame(data_dict)
_
df.sample(10)

Unnamed: 0,track_name,main_artist,key_artist,duration_ms,is_explicit
217,U Are My High (with Future),DJ Snake,Future,211200,False
132,Ball For Me (feat. Nicki Minaj),Post Malone,Nicki Minaj,206266,True
24,Spin Bout U,Drake,Drake,214588,True
171,Street Dreams,Nas,Nas,279933,True
288,Paper Cuts,Dave,Dave,179626,True
474,Touching My Body,Skepta,Skepta,430727,False
79,Bitch Please II,Eminem,Eminem,288200,True
287,Nothing On You (feat. Paulo Londra & Dave),Ed Sheeran,Dave,200733,False
305,All Eyez On Me (ft. Big Syke),2Pac,2Pac,307773,True
279,Disaster (feat. J Hus),Dave,Dave,240160,True


### Rap Genius API

In [83]:
URL = 'https://genius.com/'

Get lyrics for each track:

In [86]:
for index, row in tqdm(df[df.isnull().any(axis=1)].iterrows(), total=len(df)):
    try:
        main_artist = row['main_artist']
        key_artist = row['key_artist']
        track_name = row['track_name']
    
        # Remove features from track name and reformat strings
        track_name = track_name.lower().split(' (')[0].strip().replace(' ', '-')
        main_artist = main_artist.replace(' ', '-')
    
        # Connect to webpage
        path_extension = f"{main_artist}-{track_name}-lyrics"
        r = requests.get(URL + path_extension)
    
        # Get raw text of lyrics 
        soup = BeautifulSoup(r.content, 'html.parser')
        lyrics = " ".join(
            tag.get_text() + '\n\n' for tag in \
            soup.find_all('div', {'class': 'Lyrics__Container-sc-1ynbvzw-1 kUgSbL'})
        )
        print(lyrics, '\n')
        # Add lyrics to current row
        df.loc[index, 'raw_lyrics'] = lyrics
    except Exception as e:
        print(f"Error occurred at index pos: {index} — {e} — {r.status_code}")

    # Avoid hitting rate limit
    time.sleep(1)

  0%|          | 0/500 [00:00<?, ?it/s]

Drake Search & Rescue
Drake search-&-rescue
 



In [65]:
# Save checkpoint
df.to_csv('raw_data/lyrics_raw.csv', index=False)

In [2]:
# Load checkpoint
df = pd.read_csv('raw_data/lyrics_raw.csv')

Extract lyrics performed by key artists only: