# Compile Data

In [1]:
import re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
_
# Spotify API 
import spotipy
from spotipy.oauth2 import SpotifyOAuth
_
# Genius API
from lyricsgenius import Genius
_
# Web scraping
import time
import requests
from bs4 import BeautifulSoup
_
# Misc
import sys
sys.path.append('/Users/**********')
import cred

### Spotify API

Instantiate Spotipy object:

In [2]:
# Spotify client used by the cells below; the OAuth settings
# (client id, client secret, redirect URI) are read from the local `cred` module.
SP = spotipy.Spotify(auth_manager=SpotifyOAuth(
    client_id=cred.SPOTIFY_CLIENT_ID,
    client_secret=cred.SPOTIFY_CLIENT_SECRET,
    redirect_uri=cred.SPOTIFY_REDIRECT_URI,
))

Compile the Spotify playlist ID for each artist:

In [3]:
# Maps each key artist's name to the ID of the Spotify playlist
# whose tracks are fetched for that artist further down.
ARTIST_ID = {
    'Drake': '37i9dQZF1DX7QOv5kjbU68',
    'Eminem': '37i9dQZF1DZ06evO4gTUOY',
    'Nicki Minaj': '37i9dQZF1DZ06evO0684jS',
    'Nas': '37i9dQZF1DZ06evO18oyZi',
    'Future': '37i9dQZF1DZ06evO133u6s',
    'Dave': '37i9dQZF1DZ06evO3Y4Tg4',
    '2Pac': '37i9dQZF1DZ06evO17QsVi',
    'Kendrick Lamar': '37i9dQZF1DZ06evO1IPOOk',
    'Rapsody': '37i9dQZF1DZ06evO41iwLu',
    'Skepta': '37i9dQZF1DZ06evO1nbU8U',
    'J. Cole': '37i9dQZF1DZ06evO3K21mU'
}

Grab top 50 tracks for each artist and extract track info:<br><br>
**NOTE:** Since these playlists contain songs where the key artist is not the main artist of the song, I'll be adding a `main_artist` column to reference later on so that I can look up all lyrics correctly.

In [4]:
# Maximum number of tracks kept per artist playlist
TRACKS_PER_ARTIST = 50

# Column-oriented accumulator; every list must stay the same length so the
# dict can be turned into a DataFrame afterwards.
data_dict = {'track_name': [], 'main_artist': [], 'key_artist': [], 'duration_ms': [], 'is_explicit': [], 'track_uri': []}

for key_artist, playlist_id in tqdm(ARTIST_ID.items(), total=len(ARTIST_ID)):
    playlist_tracks = SP.playlist_items(playlist_id=playlist_id)

    # Skip entries without track data (they would raise a TypeError below),
    # then keep the first TRACKS_PER_ARTIST usable tracks.
    tracks = [
        item['track'] for item in playlist_tracks['items'] if item.get('track')
    ][:TRACKS_PER_ARTIST]

    # Extract track name, main (first-listed) artist, duration, explicit flag and URI.
    # Appending one track at a time keeps every column the same length even when
    # a playlist yields fewer than TRACKS_PER_ARTIST tracks.
    for track in tracks:
        data_dict['track_name'].append(track['name'])
        data_dict['main_artist'].append(track['artists'][0]['name'])
        data_dict['key_artist'].append(key_artist)
        data_dict['duration_ms'].append(track['duration_ms'])
        data_dict['is_explicit'].append(track['explicit'])
        data_dict['track_uri'].append(track['uri'])

  0%|          | 0/11 [00:00<?, ?it/s]

Convert data into a Pandas dataframe:

In [5]:
# One row per collected track; columns follow the keys of `data_dict`
df = pd.DataFrame(data=data_dict)

# Preview ten randomly chosen rows
df.sample(n=10)

Unnamed: 0,track_name,main_artist,key_artist,duration_ms,is_explicit,track_uri
184,I Love This Feeling,Nas,Nas,197519,True,spotify:track:3UDor25dsBp55meFUeOpx2
404,The Law (feat. Mac Miller & Rapsody),Ab-Soul,Rapsody,329207,True,spotify:track:17vQTNbvhpvi92qOnDOFHB
192,We Are,Justin Bieber,Nas,202960,False,spotify:track:1m6FkAtWVloxIRHKl7pBDP
47,Papi’s Home,Drake,Drake,178623,True,spotify:track:6jy9yJfgCsMHdu2Oz4BGKX
494,Skepta x Fumez The Engineer - Plugged In,Fumez The Engineer,Skepta,158333,True,spotify:track:77LDURvNd8YwpQdynKs7oI
228,Stand On It (with Future),Yeat,Future,180800,True,spotify:track:7H3lr9bsPBGFx5NyTuBBGc
220,LOVE YOU BETTER,Future,Future,129479,True,spotify:track:4XJRpBOG0bU3Nxnvam3FnC
17,STAYING ALIVE (feat. Drake & Lil Baby),DJ Khaled,Drake,178176,True,spotify:track:0g2Bbgy7P41pFjMWJuzsf4
487,Reflecting,Skepta,Skepta,198493,False,spotify:track:7CQRAzkxJoD0thxkixCpSL
502,Work Out,J. Cole,J. Cole,235320,True,spotify:track:2wAJTrFhCnQyNSD3oUgTZO


Add track BPM:

In [6]:
# Spotipy's audio_features accepts at most 100 track IDs/URIs per request,
# so fetch in batches instead of issuing one request per track.
AUDIO_FEATURES_BATCH_SIZE = 100

track_uris = df['track_uri'].tolist()
track_bpm = []

for start in tqdm(range(0, len(track_uris), AUDIO_FEATURES_BATCH_SIZE)):
    batch = track_uris[start:start + AUDIO_FEATURES_BATCH_SIZE]
    batch_features = SP.audio_features(batch)

    # A track with no audio features comes back as None; record NaN for it
    # instead of failing on None['tempo'].
    track_bpm += [
        features['tempo'] if features else np.nan
        for features in batch_features
    ]

# Assigning the whole column at once raises if the API returned fewer
# entries than requested, rather than silently misaligning rows.
df['track_bpm'] = track_bpm

  0%|          | 0/550 [00:00<?, ?it/s]

### Rap Genius API

In [8]:
# Genius API client, authenticated with the access token from the local `cred` module
G = Genius(cred.GENIUS_ACCESS_TOKEN)

Get lyrics for each track:

In [38]:
raw_lyrics = []

for full_track_name, main_artist in tqdm(
    zip(df['track_name'], df['main_artist']), total=len(df)
):
    # Remove features and symbols from track name:
    track_name = full_track_name.split(' (')[0].strip().replace('&', 'and')

    # Fetch track data:
    track_data = G.search_song(track_name, main_artist)

    # search_song returns None when no match is found; store a missing value
    # for that track instead of failing on `None.lyrics`.
    raw_lyrics.append(track_data.lyrics if track_data is not None else None)

# Add lyrics to the dataframe (None marks tracks without lyrics):
df['raw_lyrics'] = raw_lyrics

Drop NaN values:

In [39]:
# Drops every row that has a missing value in ANY column (not only the lyrics)
df = df.dropna()

In [40]:
# SAVE CHECKPOINT: write the current dataframe to CSV (row index omitted)
df.to_csv('test_data/test_lyrics.csv', index=False)

### Restructuring Text

Extract lyrics performed by key artists only:

In [41]:
# Bracketed section headers such as "[Verse 1: Drake]"; the captured text is the credit
SECTION_HEADER = re.compile(r"\[(.*?)\]")


def extract_artist_lyrics(text: str, key_artist: str) -> str:
    """
    Return the part of `text` performed by `key_artist`.

    If any bracketed section header mentions `key_artist`, the text following
    each such header (up to the next "[" or the end of the text) is collected
    and the sections are joined with newlines. Each header occurrence is
    handled at its own position, so a header that appears more than once
    contributes each of its sections rather than the first one repeatedly.

    If no header mentions `key_artist`, all headers are stripped, the first
    line is dropped and empty lines are removed.

    NOTE: unfortunately, not all tracks have verse credits, so it gets difficult
    to identify which verses were performed by key artists.
    """
    credited_headers = [
        header for header in SECTION_HEADER.finditer(text)
        if key_artist in header.group(1)
    ]

    if credited_headers:
        sections = []
        for header in credited_headers:
            # The section body runs from the end of its header to the next "["
            start = header.end()
            stop = text.find('[', start)
            body = text[start:] if stop == -1 else text[start:stop]
            sections.append(body.strip())
        # Join with a newline so the last bar of one section does not run
        # into the first bar of the next.
        return "\n".join(section for section in sections if section)

    # No credits available, so keep all text:
    # remove all credits, the first line and empty lines.
    without_credits = SECTION_HEADER.sub('', text)
    return "\n".join(line for line in without_credits.split('\n')[1:] if line)


df['extracted_lyrics'] = [
    extract_artist_lyrics(text, key_artist)
    for text, key_artist in tqdm(
        zip(df['raw_lyrics'], df['key_artist']), total=len(df)
    )
]

  0%|          | 0/549 [00:00<?, ?it/s]

Drop duplicated rows:

In [42]:
# Keep the first row for each distinct lyric text, then for each distinct track name
df = (
    df
    .drop_duplicates(subset='extracted_lyrics')
    .drop_duplicates(subset='track_name')
)

Add newlines to separate bars accurately:

In [43]:
def format_bars(text: str) -> str:
    """
    Insert a newline wherever two bars appear to have been fused together.

    A boundary is any lowercase letter, digit, "?" or ")" that is immediately
    followed by an uppercase ASCII letter (e.g. "blockI been" becomes
    "block\\nI been"). The purpose of this is to count bars in each track
    later on.

    Note that the rule is purely character-based, so mixed-case words such as
    "McDonald" are split as well.

    Args:
        text: Lyrics text to process.

    Returns:
        The text with a newline inserted at every detected boundary.
    """
    # The lookahead leaves the uppercase letter in place; only the newline is added.
    return re.sub(r"([a-z0-9?)])(?=[A-Z])", r"\1\n", text)


def format_bars_P2(text: str) -> str:
    """
    Second newline pass, for words that start with an apostrophe.

    Wherever a letter is directly followed by an apostrophe and then an
    uppercase ASCII letter (e.g. "nothin'But"), a newline is inserted between
    the letter and the apostrophe ("nothin\\n'But"). The apostrophe itself is
    kept. Matches are consumed left to right and do not overlap.
    """
    return re.sub(r"([a-zA-Z])('[A-Z])", r"\1\n\2", text)

_
# Run both newline passes in sequence over the extracted lyrics
df['corrected_newlines'] = (
    df['extracted_lyrics']
    .apply(format_bars)
    .apply(format_bars_P2)
)

Validate text extracted:

In [44]:
# Keep only rows whose corrected lyrics are at least 100 characters long
df = df.loc[df['corrected_newlines'].map(len) >= 100]

Remove noise:

In [48]:
# Remove headers from raw lyrics for readability:
# drop the first line and flatten the rest onto a single line.
df['raw_lyrics'] = df['raw_lyrics'].apply(lambda x: " ".join(x.split('\n')[1:]))

# -------- ONLY APPLYING FURTHER CHANGES TO CORRECTED_NEWLINES ---------------------
df['corrected_newlines'] = (
    df['corrected_newlines']
    # Replace unicode space characters with a plain space
    .str.replace('\u2005', ' ', regex=False)
    .str.replace('\u205f', ' ', regex=False)
    # Remove "Embed" tags together with the digits directly in front of them
    # (optionally separated by a newline). Digits elsewhere in the lyrics are
    # kept: the previous pattern's first alternative matched every digit, so
    # it stripped all numbers from the text.
    .str.replace(r"[0-9]*\n?Embed", '', regex=True)
)

### Save data

Reformat dataframe:

In [49]:
# Final column layout: expose the cleaned verses as `artist_verses` and the
# key artist as `artist`; drop the intermediate lyric and main-artist columns.
df = (
    df
    .rename(columns={'corrected_newlines': 'artist_verses', 'key_artist': 'artist'})
    .drop(columns=['extracted_lyrics', 'main_artist'])
)

In [50]:
# Write the final dataframe to CSV (row index omitted)
df.to_csv('raw_data/lyrics_raw.csv', index=False)