# Scrape Arctic Monkeys tunes

In [1]:
import os
import re

import pandas as pd
from lyricsgenius import Genius

### Functions

In [2]:
def get_songs(album):
    """Yield one flat record per track of a scraped album.

    Parameters
    ----------
    album : object exposing ``name`` and ``tracks``; every track must expose
        ``number`` and ``song``, and every song ``title`` and ``lyrics``.

    Yields
    ------
    dict
        Keys ``album``, ``song_nr``, ``song_title`` and ``lyrics``.
    """
    album_title = album.name

    for entry in album.tracks:
        position, scraped_song = entry.number, entry.song
        yield {
            "album": album_title,
            "song_nr": position,
            "song_title": scraped_song.title,
            "lyrics": scraped_song.lyrics,
        }
        
def clean_lyrics(lyrics, song_title):
    """Strip scraping artefacts from raw lyrics and flatten them to one line.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text as scraped.
    song_title : str
        Title of the song; used to remove the '{title} Lyrics' header.

    Returns
    -------
    str
        The lyrics without the '{numbers}Embed' marker, the '{title} Lyrics'
        header and 'Related Songs' fragments, with newlines turned into
        spaces, whitespace runs collapsed and outer whitespace stripped.
    """
    substitutions = [  # (regex pattern, replacement), applied in order
        (r"[0-9]+Embed", ""),  # drop the '{numbers}Embed' marker
        # The title is literal text, not a pattern: escape it so titles such
        # as 'Do I Wanna Know?' or 'No. 1 Party Anthem' match themselves.
        (re.escape(f"{song_title} Lyrics"), ""),
        # Removed before whitespace is normalised so that no double space is
        # left behind where the fragment used to be.
        (r"Related Songs", ""),
        (r"\n", " "),  # remove new lines
        (r"\s\s+", " "),  # bring multiple spaces back to a single space
    ]

    for pattern, replacement in substitutions:
        lyrics = re.sub(pattern, replacement, lyrics)

    # strip away beginning and ending whitespace
    return lyrics.strip()

def get_song_sections(lyrics):
    """Return the section labels found in the lyrics, in order of appearance.

    A section label is the text between a '[' and the next ']' on the same
    line, e.g. 'Verse 1' for '[Verse 1]'. Repeated labels are returned once
    per occurrence; an empty list is returned when there are none.
    """
    # Raw string: '\[' in a normal string literal is an invalid escape sequence.
    return re.findall(r"\[(.*?)\]", lyrics)

def harmonize_song_sections(song_sections):
    """Clean and harmonize naming of song sections. Other harmonization is done manually.

    For every label: keep only the part before the first ':' (e.g.
    'Verse 1: Alex Turner' -> 'Verse 1'), drop section numbers, strip
    surrounding whitespace and upper-case the result.

    Returns a new list with one harmonized label per input label.
    """
    harmonized = []
    for section in song_sections:
        label = section.split(":", 1)[0]  # drop after ':'
        # '+' so multi-digit numbers vanish completely ('Verse 10' -> 'Verse',
        # not 'Verse0').
        label = re.sub(r"\s[0-9]+", "", label)
        harmonized.append(label.strip().upper())
    return harmonized

def split_lyrics_into_sections(lyrics, song_sections):
    """Split lyrics at their '[section]' markers into one row per section.

    Parameters
    ----------
    lyrics : str
        Lyrics containing the bracketed section markers.
    song_sections : list of str
        Section labels (without brackets), one per marker occurrence and in
        order of appearance, as returned by ``get_song_sections``.

    Returns
    -------
    pandas.DataFrame
        Columns ``song_section`` (harmonized label) and
        ``song_section_lyrics`` (text following that marker). Any text in
        front of the first marker is discarded.

    Raises
    ------
    ValueError
        If ``song_sections`` is empty: there is nothing to split on.
    """
    if not song_sections:
        # An empty alternation would be the empty pattern, which splits
        # between every character and ends in an opaque length-mismatch error.
        raise ValueError("Cannot split lyrics into sections: no '[section]' markers given.")

    # Labels are literal text: escape them so markers such as '[?]' or
    # '[Chorus (x2)]' are matched as written instead of being read as regex.
    regex_split_pattern = "|".join(
        r"\s?\[" + re.escape(sc) + r"\]\s?" for sc in song_sections
    )
    lyrics_split = re.split(regex_split_pattern, lyrics)[1:]  # first piece precedes the first marker

    df_sections = pd.DataFrame({"song_section": harmonize_song_sections(song_sections),
                                "song_section_lyrics": lyrics_split})

    return df_sections

### Scrape albums

In [3]:
# Read the token from the GENIUS_ACCESS_TOKEN environment variable so the real
# secret never has to be saved with the notebook; the literal below is only a
# placeholder fallback.
ACCESS_TOKEN_GENIUS = os.environ.get("GENIUS_ACCESS_TOKEN", "myLittleSecret")

In [4]:
ARTIST = "Arctic Monkeys"

# Album title as searched on Genius -> short numbered label.
ALBUM_KEYS = {
    "Whatever People Say I Am, That’s What I’m Not": "1. WPSIATWIN",
    "Favourite Worst Nightmare": "2. Favourite WN",
    "Humbug": "3. Humbug",
    "Suck It and See": "4. Suck It and See",
    "AM": "5. AM",
    "Tranquility Base Hotel & Casino": "6. TBH & Casino",
    "The Car": "7. The Car",
}

# Titles to scrape, in the order they are listed above.
ALBUMS = list(ALBUM_KEYS)

In [5]:
# Client object used by the scraping cell below for all album searches.
genius = Genius(access_token=ACCESS_TOKEN_GENIUS)

In [6]:
scraped_albums = []
for album_title in ALBUMS:
    album = genius.search_album(album_title, ARTIST)
    # lyricsgenius documents a None result when nothing is found; stop here
    # with a clear message instead of failing later on a missing attribute.
    if album is None:
        raise ValueError(f"No album found for {album_title!r} by {ARTIST!r}")
    scraped_albums.append(album)

Searching for "Whatever People Say I Am, That’s What I’m Not" by Arctic Monkeys...
Searching for "Favourite Worst Nightmare" by Arctic Monkeys...
Searching for "Humbug" by Arctic Monkeys...
Searching for "Suck It and See" by Arctic Monkeys...
Searching for "AM" by Arctic Monkeys...
Searching for "Tranquility Base Hotel & Casino" by Arctic Monkeys...
Searching for "The Car" by Arctic Monkeys...


### Reformat scraped data

In [7]:
# One frame per album (one row per track), stacked into a single frame
# with a fresh 0..n-1 index.
dfs = [pd.DataFrame(list(get_songs(album))) for album in scraped_albums]
df_lyrics = pd.concat(dfs, ignore_index=True)

In [8]:
# Look up the short label of every row's album; an album name that is not a
# key of ALBUM_KEYS raises KeyError.
df_lyrics["album_key"] = df_lyrics["album"].apply(ALBUM_KEYS.__getitem__)

In [9]:
# Keep these five columns, in this order.
df_lyrics = df_lyrics.loc[:, ["album", "album_key", "song_nr", "song_title", "lyrics"]]

In [10]:
# (number of scraped tracks, number of columns)
df_lyrics.shape

(81, 5)

In [11]:
# Preview the first rows; the lyrics are still in their raw scraped form here.
df_lyrics.head()

Unnamed: 0,album,album_key,song_nr,song_title,lyrics
0,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,1.0,The View from the Afternoon,The View from the Afternoon Lyrics[Verse 1]\nA...
1,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,2.0,I Bet You Look Good on the Dancefloor,I Bet You Look Good on the Dancefloor Lyrics[V...
2,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,3.0,Fake Tales of San Francisco,Fake Tales of San Francisco Lyrics[Verse 1]\nF...
3,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,4.0,Dancing Shoes,Dancing Shoes Lyrics[Verse 1]\nGet on your dan...
4,"Whatever People Say I Am, That’s What I’m Not",1. WPSIATWIN,5.0,You Probably Couldn’t See for the Lights But Y...,You Probably Couldn’t See for the Lights But Y...


In [12]:
# For every song: clean the raw lyrics, find the '[section]' markers, split
# the text at them and tag the resulting rows with the song title.
list_df_sections = []
for title, raw_lyrics in zip(df_lyrics["song_title"], df_lyrics["lyrics"]):
    cleaned = clean_lyrics(raw_lyrics, title)
    labels = get_song_sections(cleaned)
    sections_frame = split_lyrics_into_sections(cleaned, labels)
    list_df_sections.append(sections_frame.assign(song_title=title))

df_sections_all = pd.concat(list_df_sections)

In [13]:
# Attach each section row to the metadata of its song.
# NOTE(review): the join key is the title alone, so two songs sharing a title
# would each receive the other's sections - confirm titles are unique.
data = pd.merge(df_lyrics, df_sections_all, on="song_title").copy()

In [14]:
# Built as one chain without inplace=True so the cell can be re-run:
# errors="ignore" tolerates 'lyrics' having been dropped by an earlier run.
data = (
    data
    .drop(columns="lyrics", errors="ignore")  # full lyrics are superseded by the per-section text
    .dropna(subset=["song_nr"])  # e.g. non-song "Humbug [Booklet]" was also scraped
    .astype({"song_nr": int})  # whole numbers once the missing values are gone
)

In [15]:
# (number of song sections, number of columns)
data.shape

(494, 6)

In [16]:
# Inspect the last ten section rows as a spot check of the split.
data.tail(10)

Unnamed: 0,album,album_key,song_nr,song_title,song_section,song_section_lyrics
485,The Car,7. The Car,9,Mr Schwartz,VERSE,Put your heavy metal to the test There might b...
486,The Car,7. The Car,9,Mr Schwartz,CHORUS,Mr. Schwartz is stayin' strong for the crew Wa...
487,The Car,7. The Car,9,Mr Schwartz,VERSE,"Gradually, it's coming into view It's like you..."
488,The Car,7. The Car,9,Mr Schwartz,CHORUS,Mr. Schwartz is stayin' strong for the crew Wa...
489,The Car,7. The Car,9,Mr Schwartz,BRIDGE,And if wе guess who I'm pretending to be Do we...
490,The Car,7. The Car,9,Mr Schwartz,VERSE,The gloved hand's reachin' in to hit the switc...
491,The Car,7. The Car,9,Mr Schwartz,CHORUS,Mr. Schwartz is havin' tea with the grips Aski...
492,The Car,7. The Car,10,Perfect Sense,VERSE,"Richard of York, the Executive Branch Having s..."
493,The Car,7. The Car,10,Perfect Sense,VERSE,A four-figure sum on a hotel notepad A revelat...
494,The Car,7. The Car,10,Perfect Sense,VERSE,Keep remindin' me that it ain't a race When my...


### Store clean scraped data for manual processing

In [17]:
# Write the section-level table to an Excel file, without the index column.
# The path is relative to the notebook's working directory.
# NOTE(review): presumably ../data must already exist - confirm before running.
data.to_excel("../data/data_lyrics_arctic_monkeys_scraped.xlsx", index=False)