In [None]:
from bs4 import BeautifulSoup
from IPython.display import clear_output
from dotenv import load_dotenv
import os
import requests
import numpy as np
import pandas as pd
import re
import pprint
import time
from tqdm import tqdm
import csv

It looks like the Genius API needs special song IDs that we don't have. So, instead of requesting songs by ID, this web crawler looks each track up through the Genius search endpoint and then scrapes the lyrics directly from the song page on the Genius website.

For reference, Genius song pages use the following URL structure:
`https://genius.com/{artist}-{song-name}-lyrics`

## Loading the Dataset

In [None]:
# Inputs: the extended track metadata and the lyrics collected by an earlier run.
track_data = pd.read_csv('misc/processed_music_info_extended.csv')
old_track_lyrics = pd.read_csv('misc/track_lyrics.csv')
env_path = os.path.join('misc', '.env')

# Size of the unfiltered dataset (the progress cell reassigns this after filtering).
outer_bound = len(track_data)
# Drop every track whose lyrics are already present in the base lyrics file.
track_data = track_data[~track_data['track_id'].isin(old_track_lyrics['track_id'])]

# Checkpoint file that save_lyrics() writes to while the crawler runs.
file_path = 'misc/track_lyrics_extended.csv'

load_dotenv(dotenv_path=env_path)

# The token is stored under CLIENT_ACCESS_TOKEN in the .env file.
GENIUS_ACCESS_TOKEN = os.getenv('CLIENT_ACCESS_TOKEN')
if GENIUS_ACCESS_TOKEN is None:
    # Name the variable that is actually read so the user knows what to set.
    raise ValueError('CLIENT_ACCESS_TOKEN is not set in the environment variables')

# Resume from the checkpoint if one exists; on a first run start with an empty table
# instead of failing with FileNotFoundError.
if os.path.exists(file_path):
    track_lyrics = pd.read_csv(file_path)
else:
    track_lyrics = pd.DataFrame(columns=['track_id', 'lyrics'])

# Rows whose lyrics are missing in the checkpoint, kept for inspection.
null_lyrics = track_lyrics[track_lyrics['lyrics'].isnull()]
track_lyrics = track_lyrics.drop_duplicates(subset='track_id')
# Only tracks that are in neither lyrics file still need to be fetched.
track_data = track_data[~track_data['track_id'].isin(track_lyrics['track_id'])]

# The crawler looks tracks up with `track_id in track_lyrics`, stores results with
# `track_lyrics[track_id] = ...` and save_lyrics() expects a mapping, so keep the
# checkpoint as a {track_id: lyrics} dict rather than a list of records.
track_lyrics = dict(zip(track_lyrics['track_id'], track_lyrics['lyrics']))
track_data = track_data.to_dict('records')
print(len(track_data))
print(len(track_lyrics))


In [None]:
def save_lyrics(lyrics: dict, path=None):
    """Write a ``{track_id: lyrics}`` mapping to a CSV with columns ``track_id, lyrics``.

    Args:
        lyrics: Mapping from track id to the lyrics text collected for that track.
        path: Destination CSV path. Defaults to the module-level ``file_path``,
            which is what existing callers rely on.

    The table is first written to ``<path>.tmp`` and then moved over the target
    with ``os.replace`` so that an interruption during the write cannot leave a
    truncated checkpoint behind.
    """
    if path is None:
        path = file_path
    frame = pd.DataFrame.from_dict(lyrics, orient='index', columns=['lyrics'])
    frame.index.name = 'track_id'

    tmp_path = f"{path}.tmp"
    try:
        frame.to_csv(tmp_path)
        os.replace(tmp_path, path)
    finally:
        # Only present if the write or the rename failed part-way through.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def clean_text(text):
    """Split ``text`` into its ASCII alphanumeric words.

    Returns an empty string when ``text`` is the instrumental placeholder or is
    not a string at all; otherwise returns the list of words that remain after
    every character other than A-Z, a-z, 0-9 and space is replaced by a space.
    """
    if text == "This song is instrumental.":
        return ""
    if not isinstance(text, str):
        return ""

    # Collapse whitespace runs first, then blank out everything non-alphanumeric.
    collapsed = re.sub(r'\s+', ' ', text)
    alnum_only = re.sub(r'[^A-Za-z0-9 ]+', ' ', collapsed)
    words = alnum_only.strip().split()
    return words

def _name_tokens(name):
    """Return the lower-cased word tokens of ``name``; empty list for non-text input."""
    if not isinstance(name, str):
        # e.g. NaN coming out of a pandas frame: treat as "no words" instead of
        # failing on ``.lower()``.
        return []
    return list(clean_text(name.lower()))


def validate_track(track_names, artist_names, threshold=0.8):
    """Check that a candidate track/artist pair matches the reference pair.

    Args:
        track_names: Two-item sequence ``[candidate, reference]`` of track titles.
        artist_names: Two-item sequence ``[candidate, reference]`` of artist names.
        threshold: Fraction of reference words that must be exceeded (strictly)
            for a name to count as matching. Defaults to 0.8.

    Returns:
        True when, for both the title and the artist, more than ``threshold`` of
        the reference's words also occur in the candidate. False otherwise,
        including when a reference name has no words or a name is not a string.
    """
    candidate_track = set(_name_tokens(track_names[0]))
    reference_track = _name_tokens(track_names[1])
    candidate_artist = set(_name_tokens(artist_names[0]))
    reference_artist = _name_tokens(artist_names[1])

    # Nothing to compare against: cannot be validated.
    if not reference_track or not reference_artist:
        return False

    track_ratio = sum(word in candidate_track for word in reference_track) / len(reference_track)
    artist_ratio = sum(word in candidate_artist for word in reference_artist) / len(reference_artist)
    return track_ratio > threshold and artist_ratio > threshold


In [None]:
# Genius search endpoint queried once per track.
genius_api_endpoint = "https://api.genius.com/search"

# Delimiter inserted between text fragments when the lyrics HTML is flattened.
separator = "|||"

# Number of tracks still to process; denominator of the progress display.
outer_bound = len(track_data)

# Running average of seconds spent per track (seeded with 5) and the list of
# recent ETA estimates, in minutes, that the progress display averages over.
avg_retrieval_time, avg_minutes = 5, list()

In [None]:
# Headers sent with every Genius search API request.
headers = {
    "Authorization": f"Bearer {GENIUS_ACCESS_TOKEN}",
    "User-Agent": "CompuServe Classic/1.22",
    "Accept": "application/json",
}
update_n = 0  # tracks stored so far; controls when the table header is redrawn
save_n = 0    # tracks stored since the last checkpoint write

# Position in track_data at which the crawl starts.
# NOTE(review): presumably the resume point of an earlier run — confirm, or set to 0
# to walk the whole list (already stored tracks are skipped anyway).
START_INDEX = 1194

# The loop needs a {track_id: lyrics} mapping (membership test, item assignment,
# save_lyrics). Accept a list of {'track_id', 'lyrics'} records as well.
if isinstance(track_lyrics, list):
    track_lyrics = {record['track_id']: record['lyrics'] for record in track_lyrics}

try:
    # start=START_INDEX keeps `index` absolute so progress and ETA agree with outer_bound.
    for index, row in enumerate(track_data[START_INDEX:], start=START_INDEX):
        track_id = row['track_id']
        if track_id in track_lyrics: continue
        if update_n % 10 == 0:
            clear_output(wait=True)
            print("Completion: [{}/{}, {:.2f}%]".format(index, outer_bound, (index / outer_bound) * 100))
            print("{:<8} {:<20} {:<8} {:<15} {:<35} {:<30} {:<10} {:<10}".format("Index", "Track ID", "Status", "Lyrics Length", 
                                                                                   "Song Title", "Arist", "Runtime", "Estimated Time"))
        # Restart the ETA smoothing window once it holds more than 10 estimates.
        if len(avg_minutes) > 10:
            avg_minutes.clear()

        title = row['name']
        artist = row['artist']
        lyrics = ""
        url = None  # reset per track so the error message never shows a stale URL
        start_time = time.time()

        try:
            # Step 1: find the song through the search API.
            res = requests.get(genius_api_endpoint, headers=headers, params={"q": f"{title},{artist}"}, timeout=10)
            res = res.json()
            if res['meta']['status'] != 200: continue
            hits = res['response']['hits']
            if len(hits) == 0: continue
            if hits[0]['type'] != 'song': continue
            res = hits[0]['result']
            # Titles go in the first argument, artists in the second ([found, wanted]).
            if not validate_track([res['title'], title], [res['primary_artist']['name'], artist]): continue
            # Step 2: download the song page itself.
            url = res['url']
            res = requests.get(url, timeout=10)
        except Exception as e:
            print('Error occurred while fetching lyrics for "{}":'.format(url if url is not None else title), str(e))
            # Do not record a failed request as empty lyrics: leaving the track out
            # of track_lyrics lets the next run retry it, like the skips above.
            continue

        status = res.status_code
        if status == 200:
            soup = BeautifulSoup(res.text, 'html.parser')
            # NOTE(review): find() returns only the first matching container; if a
            # page spreads its lyrics over several such divs the rest is dropped.
            lyrics_div = soup.find('div', attrs={'data-lyrics-container': 'true'})
            if lyrics_div is None:
                instrumental = soup.find('div', string=re.compile('This song is an instrumental'))
                if instrumental: lyrics = "This song is instrumental."
            else:
                lyrics = lyrics_div.get_text(separator=separator)

        # Update the smoothed per-track time and derive the remaining-time estimate.
        elapsed_time = time.time() - start_time
        avg_retrieval_time = (avg_retrieval_time + elapsed_time) / 2
        estimated_time = avg_retrieval_time * (outer_bound - index - 1) / 60 # mins
        avg_minutes.append(estimated_time)
        estimated_time = np.mean(avg_minutes)
        hours = int(estimated_time // 60)
        mins = int(estimated_time % 60)
        lyrics_length = len(lyrics) if lyrics != "This song is instrumental." else "No lyrics."

        print("{:<8} {:<20} {:<8} {:<15} {:<35} {:<30} {:<10.2f} {:<10}".format(index, track_id, status, lyrics_length, row['name'], 
                                                                    row['artist'], elapsed_time, 
                                                                    "{:d} hours and {:d} minutes".format(hours, mins)))

        track_lyrics[track_id] = lyrics
        update_n += 1
        save_n += 1
        # Checkpoint every 50 stored tracks.
        if save_n % 50 == 0:
            save_lyrics(track_lyrics)
            save_n = 0
finally:
    # Flush whatever was collected since the last checkpoint, both when the loop
    # finishes and when it is interrupted.
    if save_n:
        save_lyrics(track_lyrics)
        save_n = 0


In [None]:
# cleanup: replace empty or missing lyrics with the instrumental placeholder.
# NOTE(review): this labels every track without lyrics as instrumental, including
# pages where no lyrics container was found — confirm that is intended.
if isinstance(track_lyrics, pd.DataFrame):
    # Table form: update the 'lyrics' column in place for blank/NaN rows.
    blank_rows = track_lyrics['lyrics'].isna() | track_lyrics['lyrics'].eq("")
    track_lyrics.loc[blank_rows, 'lyrics'] = "This song is instrumental."
else:
    if isinstance(track_lyrics, list):
        # List of {'track_id', 'lyrics'} records -> {track_id: lyrics} mapping.
        track_lyrics = {record['track_id']: record['lyrics'] for record in track_lyrics}
    # Iterate over a snapshot so values can be reassigned while looping; use each
    # entry's own key rather than a variable left over from the crawl loop.
    for track_id, lyrics in list(track_lyrics.items()):
        # pd.isna covers both None and the NaN that read_csv yields for empty cells.
        if pd.isna(lyrics) or lyrics == "":
            track_lyrics[track_id] = "This song is instrumental."

In [None]:
# Merge the base lyrics file with the newly crawled (extended) lyrics.
track_lyrics = pd.read_csv('misc/track_lyrics.csv')
track_lyrics_extended = pd.read_csv('misc/track_lyrics_extended.csv')
# The merged table is written back over misc/track_lyrics.csv further down, so
# re-running the notebook would append the extended rows again. Keep one row per
# track_id (the first, i.e. the base file's) and rebuild a clean 0..n-1 index.
track_lyrics = (
    pd.concat([track_lyrics, track_lyrics_extended], ignore_index=True)
    .drop_duplicates(subset='track_id')
    .reset_index(drop=True)
)

In [None]:
# Print a summary of the lyrics table (columns, dtypes, non-null counts) as a
# quick sanity check.
track_lyrics.info()

In [None]:
# Overwrite the base lyrics file with the current table; index=False keeps the
# DataFrame's row index out of the CSV.
track_lyrics.to_csv('misc/track_lyrics.csv', index=False)

In [None]:
# NOTE(review): when the cells run top to bottom, track_lyrics is a DataFrame here
# (it was rebuilt with pd.concat), whereas save_lyrics is annotated to take a dict
# and feeds its argument to DataFrame.from_dict(orient='index'). This call looks
# like it belongs right after the crawl loop — confirm intent before relying on it.
save_lyrics(track_lyrics)