## Python Lyrics Data Collection Script Overview

This script gathers lyrics data from the Genius API. For details on how to register for the Genius API and obtain an API token, visit the [Genius API documentation](https://docs.genius.com/).



In [1]:
!pip install lyricsgenius

Collecting lyricsgenius
  Downloading lyricsgenius-3.0.1-py3-none-any.whl.metadata (6.0 kB)
Downloading lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
   ---------------------------------------- 0.0/59.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/59.4 kB ? eta -:--:--
   ------ --------------------------------- 10.2/59.4 kB ? eta -:--:--
   ------ --------------------------------- 10.2/59.4 kB ? eta -:--:--
   ------------- -------------------------- 20.5/59.4 kB 108.9 kB/s eta 0:00:01
   ------------- -------------------------- 20.5/59.4 kB 108.9 kB/s eta 0:00:01
   ------------- -------------------------- 20.5/59.4 kB 108.9 kB/s eta 0:00:01
   ------------- -------------------------- 20.5/59.4 kB 108.9 kB/s eta 0:00:01
   -------------------- ------------------- 30.7/59.4 kB 72.9 kB/s eta 0:00:01
   -------------------- ------------------- 30.7/59.4 kB 72.9 kB/s eta 0:00:01
   -------------------- ------------------- 30.7/59.4 kB 72.9 kB/s eta 0:00:01
   ----------

In [2]:
import re

def process_thestring(input_string):
    # Cut the beginning of the string till "Lyrics[Verse 1]\n"
    start_phrase = "Lyrics[Verse 1]\n"
    start_index = input_string.find(start_phrase)
    if start_index != -1:
        processed_string = input_string[start_index + len(start_phrase):]
    else:
        processed_string = input_string 

    # Remove all instances of text enclosed in square brackets
    processed_string = re.sub(r"\[[^\]]*\]", "", processed_string)

    # Cut the end of the string that contains a number followed by "Embed"
    # Also removing the number before "Embed"
    end_index = processed_string.rfind("Embed")
    if end_index != -1:
        # Find the start of the number preceding "Embed"
        number_start = end_index
        while number_start > 0 and processed_string[number_start-1].isdigit():
            number_start -= 1
        processed_string = processed_string[:number_start].rstrip()

    return processed_string

In [4]:
import re
from lyricsgenius import Genius

def sanitize_filename(name):
    # Replace special characters and spaces with underscores
    return re.sub(r'[^\w\-_\. ]', '_', name)

genius = Genius('_4Q_dzEnahi4dcflTwo2QXpOItAVYsaSX8WGtTLgd1aAZYiEVSzkoa9zM27p2ugh')
albums = ["Made in the A.M. (Deluxe)", "FOUR (The Ultimate Edition)", "Midnight Memories (The Ultimate Edition)", 'Take Me Home (Expanded Edition)', "Up All Night (The Souvenir Edition)"]
for name in albums:
    album = genius.search_album(name, "One Direction")
    for track in album.tracks:
        print(track.song.title)
        result = process_thestring(track.song.lyrics)
        song_lyrics = process_thestring(result)
        # Create a sanitized file name
        filename = f"{sanitize_filename(name)}_%_{album.release_date_components.year}_%_{sanitize_filename(track.song.title)}.txt"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(song_lyrics)


Searching for "Made in the A.M. (Deluxe)" by One Direction...
Hey Angel
Drag Me Down
Perfect
Infinity
End of the Day
If I Could Fly
Long Way Down
Never Enough
Olivia
What a Feeling
Love You Goodbye
I Want to Write You a Song
History
Temporary Fix
Walking in the Wind
Wolves
A.M.
Searching for "FOUR (The Ultimate Edition)" by One Direction...
Steal My Girl
Ready to Run
Where Do Broken Hearts Go
18
Girl Almighty
Fool’s Gold
Night Changes
No Control
Fireproof
Spaces
Stockholm Syndrome
Clouds
Change Your Ticket
Illusion
Once in a Lifetime
Act My Age
Searching for "Midnight Memories (The Ultimate Edition)" by One Direction...
Best Song Ever
Story of My Life
Diana
Midnight Memories
You & I
Don’t Forget Where You Belong
Strong
Happily
Right Now
Little Black Dress
Through the Dark
Something Great
Little White Lies
Better Than Words
Why Don’t We Go There?
Does He Know?
Alive
Half a Heart
Searching for "Take Me Home (Expanded Edition)" by One Direction...
Live While We’re Young
Kiss You
Little Th

In [7]:
import os
print("Current working directory:", os.getcwd())


Current working directory: c:\Users\Ananya\anaconda3\Dissertation - UL\Music-Generation-with-AI\Project\LyricsScrapper


In [11]:
%pip install chardet


Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/199.4 kB ? eta -:--:--
   ------ --------------------------------- 30.7/199.4 kB 72.9 kB/s eta 0:00:03
   ------ --------------------------------- 30.7/199.4 kB 72.9 kB/s eta 0:00:03
   ------ --------------------------------- 30.7/199.4 kB 72.

In [12]:
import os
import pandas as pd
import chardet  # Import the chardet library

directory = r"c:\Users\Ananya\anaconda3\Dissertation - UL\Music-Generation-with-AI\Project\LyricsScrapper"

data = []

# Process each file in the directory
i = 0
save_album = None
files_list = os.listdir(directory)
files_list.sort(reverse=True)

skipped_files = []

for filename in files_list:
    # Extract album, year, and track_title from the filename
    parts = filename.split('_%_')
    if len(parts) != 3:  # Check if the filename format is correct
        skipped_files.append(filename)  # Log skipped file
        continue
    album, year, track_title = parts[0], parts[1], parts[2]
    if save_album and save_album != album:
        i = 0
    save_album = album

    file_path = os.path.join(directory, filename)

    # Detect the file encoding
    with open(file_path, 'rb') as rawdata:
        result = chardet.detect(rawdata.read(10000))  # Read enough sample for detection
        encoding = result['encoding']

    # Open the file with the detected encoding
    with open(file_path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    # Process each line in the file
    for line_number, line in enumerate(lines, start=1):
        if line.strip():  # Exclude empty lines
            data.append({
                'artist': 'One Direction',
                'album': album,
                'track_title': track_title,
                'track_n': i + 1,
                'lyric': line.strip(),
                'line': line_number,
                'year': year
            })
    i += 1

df_1d = pd.DataFrame(data)
df_1d['track_title'] = df_1d['track_title'].str.replace('\u200b', '', regex=False)

# Output the skipped files for review
if skipped_files:
    print("Skipped files:")
    for file in skipped_files:
        print(file)

df_1d.head()


Skipped files:
ss
scrapper.ipynb


Unnamed: 0,artist,album,track_title,track_n,lyric,line,year
0,One Direction,Up All Night _The Souvenir Edition_,What Makes You Beautiful.txt,1,134 ContributorsTranslationsEspañolPortuguêsFr...,1,2011
1,One Direction,Up All Night _The Souvenir Edition_,What Makes You Beautiful.txt,1,"You're insecure, don't know what for",2,2011
2,One Direction,Up All Night _The Souvenir Edition_,What Makes You Beautiful.txt,1,You're turnin' heads when you walk through the...,3,2011
3,One Direction,Up All Night _The Souvenir Edition_,What Makes You Beautiful.txt,1,Don't need makeup to cover up (Huh),4,2011
4,One Direction,Up All Night _The Souvenir Edition_,What Makes You Beautiful.txt,1,Bein' the way that you are is enough,5,2011


In [15]:
%pip install pyarrow
%pip install fastparquet



Collecting pyarrow
  Downloading pyarrow-16.0.0-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Downloading pyarrow-16.0.0-cp311-cp311-win_amd64.whl (25.9 MB)
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/25.9 MB 62.4 kB/s eta 0:06:54
   ---------------------------------------- 0.0/25.9 MB 62.4 kB/s eta 0:06:54
   ---------------------------------------- 0.0/25.9 MB 62.

In [17]:
import os

# Specify the directory where you want to save the Parquet file
directory = '../data'

# Check if the directory exists, and create it if it does not
if not os.path.exists(directory):
    os.makedirs(directory)

# Now try saving the DataFrame as a Parquet file
df_1d.to_parquet(os.path.join(directory, 'One_Direction_all_albums.parquet'))


#df_1d.to_parquet('../data/One_Direction_all_albums.parquet')