In [None]:
from bs4 import BeautifulSoup
from IPython.display import clear_output
from dotenv import load_dotenv
import os
import requests
import numpy as np
import pandas as pd
import re
import pprint
import time
from tqdm import tqdm
import csv

The Genius API's song endpoints need special song IDs that we don't have. So, this web crawler uses the API's search endpoint to find each song's page on the Genius website and then scrapes the lyrics directly from that page.

The URL structure for Genius is:
https://genius.com/{artist}-{song-name}-lyrics

## Loading the Dataset

In [None]:
# Track metadata to crawl; the crawl loop reads the 'track_id', 'name' and 'artist' columns.
track_data = pd.read_csv('misc/processed_music_info.csv')
env_path = os.path.join('misc', '.env')

# Checkpoint file holding the lyrics collected so far (track_id -> lyrics).
file_path = 'misc/track_lyrics copy.csv'
track_lyrics = {}

load_dotenv(dotenv_path=env_path)

# The token is stored under CLIENT_ACCESS_TOKEN in the .env file / environment.
GENIUS_ACCESS_TOKEN = os.getenv('CLIENT_ACCESS_TOKEN')
if GENIUS_ACCESS_TOKEN is None:
    # Name the variable that is actually looked up so the message is actionable.
    raise ValueError('CLIENT_ACCESS_TOKEN is not set in the environment variables')

try:
    # newline='' lets the csv module handle newlines embedded in quoted lyrics.
    with open(file_path, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row; tolerate an empty file
        for row in reader:
            if not row:
                continue  # blank line, nothing to restore
            track_id, lyrics = row
            track_lyrics[track_id] = lyrics
except FileNotFoundError:
    # No checkpoint yet: start with the empty mapping created above.
    pass

len(track_lyrics)


In [None]:
# A bare expression is only displayed when it is the last line of a cell,
# so print the row count explicitly before the column summary.
print(len(track_data))
track_data.info()

In [None]:
def save_lyrics(lyrics: dict) -> None:
    """Write the track-id -> lyrics mapping to ``file_path`` as a CSV checkpoint.

    The CSV has a ``track_id`` index column followed by a ``lyrics`` column.
    The data is first written to a sibling temporary file and then moved over
    ``file_path`` with ``os.replace``, so an interrupt during the write does
    not truncate the previous checkpoint.

    Args:
        lyrics: Mapping of track id to lyrics text.
    """
    frame = pd.DataFrame.from_dict(lyrics, orient='index', columns=['lyrics'])
    frame.index.name = 'track_id'

    tmp_path = f"{file_path}.tmp"
    frame.to_csv(tmp_path)
    os.replace(tmp_path, file_path)

In [None]:
# Genius search endpoint queried once per track.
genius_api_endpoint = "https://api.genius.com/search"
# Delimiter placed between text fragments when the lyrics HTML is flattened.
separator = "|||"

# Progress-estimate state used by the crawl loop:
#   avg_minutes        - recent remaining-time estimates (minutes) averaged for display
#   avg_retrieval_time - running per-track retrieval time in seconds, seeded at 5
avg_minutes = list()
avg_retrieval_time = 5

In [None]:
# Crawl loop: for every track without stored lyrics, look the song up through the
# Genius search endpoint, download the page the top hit points to, and extract
# the lyrics text from it. Results accumulate in `track_lyrics`.
headers = {
    "Authorization": f"Bearer {GENIUS_ACCESS_TOKEN}",
    "User-Agent": "CompuServe Classic/1.22",
    "Accept": "application/json",
}
total_tracks = len(track_data)

for index, row in track_data.iterrows():
    track_id = row['track_id']
    # Resume support: tracks that already have non-empty lyrics are not re-fetched.
    if len(track_lyrics.get(track_id, "")) > 0:
        continue
    if index % 10 == 0:
        save_lyrics(track_lyrics)  # checkpoint roughly every 10 tracks
        clear_output(wait=True)
        print("Completion: [{}/{}, {:.2f}%]".format(index, total_tracks, (index / total_tracks) * 100))
        print("{:<8} {:<20} {:<8} {:<15} {:<35} {:<30} {:<10} {:<10}".format(
            "Index", "Track ID", "Status", "Lyrics Length",
            "Song Title", "Artist", "Runtime", "Estimated Time"))
    if index % 100 == 0:
        # Restart the smoothing window of the remaining-time estimate.
        avg_minutes.clear()

    title = row['name']
    artist = row['artist']
    # A missing title/artist cannot be searched for (and NaN has no .lower()).
    if pd.isna(title) or pd.isna(artist):
        continue

    lyrics = ""
    status = 200
    # URL of the request currently in flight, so the error message below is
    # always defined and always refers to the request that actually failed.
    request_url = genius_api_endpoint
    start_time = time.time()

    try:
        search_response = requests.get(genius_api_endpoint, headers=headers,
                                       params={"q": f"{title},{artist}"}, timeout=10)
        payload = search_response.json()
        if payload['meta']['status'] != 200:
            continue
        hits = payload['response']['hits']
        if len(hits) == 0:
            continue
        song = hits[0]['result']
        # Only accept the top hit when its primary artist matches the dataset's artist.
        if song['primary_artist']['name'].lower() != str(artist).lower():
            continue
        request_url = song['url']
        page = requests.get(request_url, timeout=10)
    except (requests.RequestException, ValueError) as e:
        # RequestException covers timeouts and connection errors; ValueError covers
        # a non-JSON search response. The track is still recorded with empty lyrics
        # (as before for timeouts), so it is retried on the next run.
        status = "ERR"
        print('Error occurred while fetching lyrics for "{}":'.format(request_url), str(e))
    else:
        status = page.status_code
        if status == 200:
            soup = BeautifulSoup(page.text, 'html.parser')
            # Collect every lyrics container, not just the first one.
            # NOTE(review): Genius pages appear to split long lyrics across several
            # such divs - confirm against the current page markup.
            containers = soup.find_all('div', attrs={'data-lyrics-container': 'true'})
            if containers:
                lyrics = separator.join(
                    container.get_text(separator=separator) for container in containers
                )
            else:
                instrumental = soup.find('div', string=re.compile('This song is an instrumental'))
                if instrumental:
                    lyrics = "This song is instrumental."

    # Smooth the per-track retrieval time and derive the remaining-time estimate.
    elapsed_time = time.time() - start_time
    avg_retrieval_time = (avg_retrieval_time + elapsed_time) / 2
    estimated_time = avg_retrieval_time * (total_tracks - index - 1) / 60  # mins
    avg_minutes.append(estimated_time)
    estimated_time = np.mean(avg_minutes)
    hours = int(estimated_time // 60)
    mins = int(estimated_time % 60)
    lyrics_length = len(lyrics) if lyrics != "This song is instrumental." else "No lyrics."

    print("{:<8} {:<20} {:<8} {:<15} {:<35} {:<30} {:<10.2f} {:<10}".format(
        index, track_id, status, lyrics_length, row['name'],
        row['artist'], elapsed_time,
        "{:d} hours and {:d} minutes".format(hours, mins)))

    track_lyrics[track_id] = lyrics


In [None]:
# cleanup: every track whose stored lyrics are an empty string is relabelled
# with the instrumental marker (the dict is updated in place).
track_lyrics.update({
    key: "This song is instrumental."
    for key, text in track_lyrics.items()
    if text == ""
})

In [None]:
# Persist the final lyrics mapping to the CSV at `file_path`.
save_lyrics(track_lyrics)