# Scrape Eurobeat lyrics
This notebook scrapes the lyrics from the Eurobeat-Prime lyric URLs.<br>
<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">Read more on data scraping with beautiful soup</a>.

In [8]:
import urllib.request
import csv
from bs4 import BeautifulSoup

## Parameters

In [6]:
#Text file to get the lyric URLs from
LYRIC_URLS_LOCATION = "data/lyric_urls.txt"

#CSV file to write the lyric data to
LYRIC_DATASET_LOCATION = "data/lyric_dataset.csv"

#Text file to write the bad URLs to
BAD_URLS_LOCATION = "data/bad_urls.txt"

#Column names written as the first row of the CSV file
HEADER = [
    "Artist",
    "Title",
    "Lyrics",
    "URL",
]

#URLs whose title could not be split into an artist and a title
bad_urls = []

## Load the lyric URLs

In [None]:
#Read the lyric URLs, one per line.
#Iterating over a file yields every line including its trailing newline, so
#strip it; otherwise the newline ends up in the URL column of the CSV file and
#in the bad URLs text file. Blank lines are skipped because they are not URLs.
with open(LYRIC_URLS_LOCATION, "r", encoding="UTF8") as file:
    lyric_urls = [line.strip() for line in file if line.strip()]

dataset_length = len(lyric_urls)
print(f"Amount of lyric URLs in the text file: {dataset_length}")

## Scrape the lyric data
Read the lyrics from the lyric URLs and write them to the CSV file.
Add URLs with multiple " - " in the title to the bad_urls list.<br>
Data structure:<br>
<img src="images/lyric_page_html.PNG" width="500">

In [None]:
#Start from an empty list so re-running this cell doesn't record the same URLs twice
bad_urls = []

with open(LYRIC_DATASET_LOCATION, "w", encoding="UTF8", newline='') as file:
    writer = csv.writer(file)

    #Write the header
    writer.writerow(HEADER)

    #Loop through URLs; the index is only used for printing the progress
    for index, url in enumerate(lyric_urls, start=1):

        #BeautifulSoup reads the whole response while parsing, so the
        #connection can be closed as soon as the soup has been built
        with urllib.request.urlopen(url) as response:
            soup = BeautifulSoup(response, "html.parser",
                from_encoding=response.info().get_param('charset'))

        #Find the div with the 'mmids' class, which contains the lyrics
        lyrics_div = soup.find("div", {"class": "mmids"})

        #Find the child 'b' element, which contains the artist and the title
        #Split the element from ' - '
        title_parts = lyrics_div.find("b", recursive=False).string.split(" - ")

        artist, title = "", ""

        #Assume the first element is the artist and the second the title
        #Naive approach
        if len(title_parts) == 2:
            artist, title = title_parts

        #With no ' - ' or more than one ' - ' the split doesn't give exactly
        #an artist and a title, so leave both empty
        #Add this URL to the bad_urls list
        #Uncomment the #continue if you don't want these elements in the dataset
        else:
            bad_urls.append(url)
            #continue

        #Get the lyrics from the <div> object
        #Very naive approach: keep what is between the link at the top of the
        #div and the closing tag, then drop the line break tags
        lyrics_text = str(lyrics_div).split("</a><br/><br/>", 1)[1].split("\n</div>", 1)[0].replace("<br/>", "").replace("\r", "")

        #Write the lyrics to the CSV file
        writer.writerow([artist, title, lyrics_text, url])

        #Print the progress
        print(f"{index}/{dataset_length}")

## Write the URLs with the wrong format
You will have to manually fix these entries in the dataset if you choose to leave them in.

In [14]:
#Dump every collected bad URL to the text file, one URL per line
with open(BAD_URLS_LOCATION, "w", encoding="UTF8") as bad_urls_file:
    bad_urls_file.writelines(f"{bad_url}\n" for bad_url in bad_urls)