In [None]:
from bs4 import BeautifulSoup
from IPython.display import clear_output
import requests
import numpy as np
import pandas as pd
import re
import pprint
import time
from tqdm import tqdm
import csv

It looks like the Genius API needs special song IDs that we don't have. So, this web crawler will use the Genius website's URL structure to get the lyrics directly.

The URL structure for Genius is:
https://genius.com/{artist}-{song-name}-lyrics

## Loading the Dataset

In [None]:
# Track metadata to crawl lyrics for (path is relative to the notebook's working directory).
track_data = pd.read_csv('misc/processed_music_info.csv')

# Cache of lyrics fetched so far, keyed by track_id. Loading it up front lets an
# interrupted crawl resume without re-fetching tracks it has already handled.
file_path = 'misc/track_lyrics.csv'
track_lyrics = {}

try:
    # newline='' lets the csv module handle line breaks inside quoted lyrics, and
    # utf-8 matches the default encoding pandas uses when save_lyrics writes the file.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; tolerate a completely empty file
        for row in reader:
            if len(row) != 2:
                continue  # ignore blank/malformed rows instead of aborting the whole load
            track_id, lyrics = row
            track_lyrics[track_id] = lyrics
except FileNotFoundError:
    pass  # first run: no cache file yet, start with an empty mapping

print('{} tracks in dataset, {} already in the lyrics cache'.format(len(track_data), len(track_lyrics)))
track_data.info()

In [None]:
def save_lyrics(lyrics: dict, path=None):
    """Write the track_id -> lyrics mapping to a two-column CSV (track_id, lyrics).

    Args:
        lyrics: mapping of track id to lyrics text (or None when nothing was found).
        path: destination CSV file; when omitted, the notebook-level `file_path` is used.

    The frame is first written to a temporary sibling file and then moved over the
    destination, so interrupting the kernel mid-save does not truncate the existing cache.
    """
    import os  # local import: only needed for the replace step below

    target = file_path if path is None else path

    frame = pd.DataFrame.from_dict(lyrics, orient='index', columns=['lyrics'])
    frame.index.name = 'track_id'

    tmp_path = '{}.tmp'.format(target)
    frame.to_csv(tmp_path)
    os.replace(tmp_path, target)  # swap the finished file into place in one step

In [None]:
separator = "|||"
avg_retrieval_time = 5  # initial guess (seconds) for the running per-track fetch time

# One layout shared by the header and the per-track rows. The runtime is turned into a
# string before it is passed in, so the "Runtime" header label fits the same slot
# (a float format code such as .2f raises ValueError when it is given a str).
row_format = "{:<8} {:<20} {:<8} {:<15} {:<35} {:<30} {:<10} {:<10}"


def _slugify(text):
    """Lower-case `text`, strip punctuation and join the words with single hyphens."""
    return '-'.join(re.sub(r'[^\w\s]', '', str(text).lower()).split())


unsaved = 0  # entries added to track_lyrics since the last checkpoint

# NOTE: `index` is used both for the every-10 cadence and the remaining-work estimate,
# which assumes the default 0..n-1 index produced by read_csv.
for index, row in track_data.iterrows():
    if index % 10 == 0:
        if unsaved:
            save_lyrics(track_lyrics)  # checkpoint; skipped when nothing new was fetched
            unsaved = 0
        clear_output(wait=True)
        print(row_format.format("Index", "Track ID", "Status", "Lyrics Length",
                                "Song Title", "Artist", "Runtime", "Estimated Time"))

    # Keys loaded from the cache file are strings, so compare like with like.
    track_id = str(row['track_id'])
    if track_id in track_lyrics:
        print('Lyrics already retrieved. Skipping track_id:', track_id)
        continue

    # Artist and title go through the same slug rules so punctuation in either one
    # cannot leak into the URL path.
    endpoint = "https://genius.com/{}-{}-lyrics".format(_slugify(row['artist']), _slugify(row['name']))
    lyrics = None
    status = "-"       # default for every failure path; flipped to "+" only on success
    cacheable = False  # whether this outcome should be remembered in track_lyrics

    start_time = time.time()

    try:
        res = requests.get(endpoint, timeout=5)
    except requests.RequestException as e:
        # Timeouts, connection errors, etc. are transient: report them and leave the
        # track out of the cache so a later run tries it again.
        print('Error occurred while fetching lyrics for "{}":'.format(endpoint), str(e))
    else:
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'html.parser')
            # A page may hold more than one lyrics container; collect all of them
            # rather than only the first.
            containers = soup.find_all('div', attrs={'data-lyrics-container': 'true'})
            if containers:
                lyrics = separator.join(c.get_text(separator=separator) for c in containers)
                status = "+"
                cacheable = True
            # A 200 page without a lyrics container is left uncached so it is retried.
        else:
            # Non-200 response: remember the miss (lyrics stays None) so it is skipped next run.
            # NOTE(review): this also caches 429/5xx responses - confirm that is intended.
            cacheable = True

    elapsed_time = time.time() - start_time
    avg_retrieval_time = (avg_retrieval_time + elapsed_time) / 2  # simple running average
    estimated_time = avg_retrieval_time * (len(track_data) - index - 1) / 60 # mins
    hours = int(estimated_time // 60)
    mins = int(estimated_time % 60)
    lyrics_length = len(lyrics) if lyrics else 0

    print(row_format.format(index, track_id, status, lyrics_length, row['name'],
                            row['artist'], "{:.2f}".format(elapsed_time),
                            "{:d} hours and {:d} minutes".format(hours, mins)))

    if cacheable:
        track_lyrics[track_id] = lyrics
        unsaved += 1


In [None]:
# Final flush: the crawl loop only checkpoints on every tenth index, so persist
# whatever was collected after the last checkpoint.
save_lyrics(track_lyrics)