### Import Libraries

In [1]:
import re
import yaml
import pandas as pd
from lyricsgenius import Genius
from lyricsgenius.song import Song

### Data Extraction

In [2]:
# Get Genius API credentials
with open("../config/genius-credentials.yml") as file:
    # Load YAML data
    config = yaml.load(file, Loader=yaml.FullLoader)

In [3]:
genius = Genius(config["CLIENT-ACCESS-TOKEN"], timeout=7200)

##### Get songs in "retro" genre - Method 1

In [None]:
page = 1
retro_songs_df = pd.DataFrame(columns=['url', 'title_with_artists', 'title', 'artists', 'featured_artists', 'num_artists', 'lyrics', 'release_date'])

for page_num in range(10):
    print("\n\nReading page {}...".format(page))
    res = genius.tag('retro', page=page)
    
    temp_df = pd.DataFrame(res['hits'])
    temp_df['num_artists'] = temp_df['artists'].apply(len)
    
    for indx, hit in enumerate(res['hits']):
        try:
            print("\nExtracting details of song {}: {}".format(indx, hit['title']))
            song_lyrics = genius.lyrics(song_url=hit['url'])
            temp_df.loc[temp_df['url']==hit['url'], ['lyrics']] = song_lyrics
            
            song_response = genius.search_songs(search_term=hit['title'])
            song_response_title = song_response['hits'][0]['result']['title']
            song_response_artist = song_response['hits'][0]['result']['primary_artist']['name']
            
            if (song_response_title==hit['title']) and (song_response_artist==hit['artists'][0]):
                print("\t Title and primary artist match: SUCCESSFUL")
                temp_df.loc[temp_df['url']==song_response['hits'][0]['result']['url'], ['release_date']] = song_response['hits'][0]['result']['release_date_components']['year']
            else:
                print("\t Title and primary artist match: FAILED")
            
        except Exception as e:
            print(str(e))
    
    retro_songs_df = pd.concat([retro_songs_df, temp_df], ignore_index=True)
    
    page = res['next_page']

In [None]:
retro_songs_df.head(5)

In [None]:
retro_songs_df.to_csv("../data/retro_songs.csv", index=False)

In [None]:
retro_songs_df.isnull().sum()

##### Get songs in "retro" genre - Method 2

In [4]:
genre = 'retro'
page = 1
retro_songs_df = pd.DataFrame(columns=['url', 'title_with_artists', 'title', 'artists', 'featured_artists', 'num_artists', 'genre', 'lyrics', 'release_date', 'primary_artist'])

for page_num in range(30):
    print("\n\nReading page {}...".format(page))
    res = genius.tag(genre, page=page)
    
    temp_df = pd.DataFrame(res['hits'])
    temp_df['num_artists'] = temp_df['artists'].apply(len)
    temp_df['genre'] = genre
    
    for indx, hit in enumerate(res['hits']):
        try:
            print("\n\nExtracting details of song {}: {}".format(indx, hit['title']))
            # Validate the title, artist and language
            songs_response = genius.search_songs(search_term=hit['title'])
            for song in songs_response['hits']:
                if (song['result']['title']==hit['title']) and (song['result']['url']==hit['url']) and (song['result']['language']=="en"):
                    print("Valid English song found!")
                    released_year_from_songs_api = song['result']['release_date_components']['year']
                    search_song_response = genius.search_song(hit['title'], get_full_info=True)
                    song_id = search_song_response.id
                    lyrics = search_song_response.lyrics
                    if lyrics.strip():
                        # Split each line to a list and remove the first line
                        lines = lyrics.splitlines()[1:]

                        # Remove empty lines
                        lines = [line for line in lines if line.strip()]

                        # Remove whole line where it contains '[]' brackets and some alphanumeric characters inside the []
                        lines = [line for line in lines if not re.match(r'(^.*\[\w+\].*$)|(\[.*\])', line)]

                        # Join the remaining lines back into a single string
                        modified_lyrics = ' '.join(lines)
                    else:
                        modified_lyrics = ''
                    if song_id:
                        print("Song ID: {}".format(song_id))
                        song_response = genius.song(song_id=song_id)
                        song_response_details = Song(song_response)

                        if hit['url']==song_response_details.url:
                            print("URLs are matching!")
                            temp_df.loc[temp_df['url']==hit['url'], ['lyrics']] = modified_lyrics
                            temp_df.loc[temp_df['url']==hit['url'], ['release_date']] = int(released_year_from_songs_api) or int(song_response_details.year[:4])
                            temp_df.loc[temp_df['url']==hit['url'], ['primary_artist']] = song_response_details.artist
                        else:
                            print("URLs are NOT matching!")
                    else:
                        print("No valid song ID found!")
                    break
                else:
                    print("Not an English song!")
        except Exception as e:
            print(str(e))
    
    retro_songs_df = pd.concat([retro_songs_df, temp_df], ignore_index=True)
    
    page = res['next_page']



Reading page 1...


Extracting details of song 0: Over the Rainbow
Valid English song found!
Searching for "Over the Rainbow"...
Done.
Song ID: 67962
URLs are matching!


Extracting details of song 1: Strange Fruit
Valid English song found!
Searching for "Strange Fruit"...
Done.
Song ID: 152651
URLs are matching!


Extracting details of song 2: Day-O (Banana Boat Song)
Valid English song found!
Searching for "Day-O (Banana Boat Song)"...
Done.
Song ID: 2643
URLs are matching!


Extracting details of song 3: La Bamba
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!
Not an English song!


Extracting details of song 4: Lovesick Blues
Valid English song found!
Searching for "Lovesick Blues"...
Done.
Song ID: 191597
URLs are matching!


Extracting details of song 5: That's Life
Not an English song!
Not an English song!
Not an English song!
Not an Engl

In [5]:
retro_songs_df.head(20)

Unnamed: 0,url,title_with_artists,title,artists,featured_artists,num_artists,genre,lyrics,release_date,primary_artist
0,https://genius.com/Judy-garland-over-the-rainb...,Over the Rainbow by Judy Garland,Over the Rainbow,[Judy Garland],[],1,retro,Somewhere over the rainbow Way up high There's...,1939.0,Judy Garland
1,https://genius.com/Billie-holiday-strange-frui...,Strange Fruit by Billie Holiday,Strange Fruit,[Billie Holiday],[],1,retro,Southern trees bear a strange fruit Blood on t...,1939.0,Billie Holiday
2,https://genius.com/Harry-belafonte-day-o-banan...,Day-O (Banana Boat Song) by Harry Belafonte,Day-O (Banana Boat Song),[Harry Belafonte],[],1,retro,"Daylight come and me wan' go home Day, me say ...",1956.0,Harry Belafonte
3,https://genius.com/Ritchie-valens-la-bamba-lyrics,La Bamba by Ritchie Valens,La Bamba,[Ritchie Valens],[],1,retro,,,
4,https://genius.com/Hank-williams-lovesick-blue...,Lovesick Blues by Hank Williams,Lovesick Blues,[Hank Williams],[],1,retro,"I got a feeling called the blues, oh Lord Sinc...",1950.0,Hank Williams
5,https://genius.com/Frank-sinatra-thats-life-ly...,That's Life by Frank Sinatra,That's Life,[Frank Sinatra],[],1,retro,,,
6,https://genius.com/Johnny-cash-folsom-prison-b...,Folsom Prison Blues by Johnny Cash,Folsom Prison Blues,[Johnny Cash],[],1,retro,"I hear the train a-comin', it's rolling 'round...",1955.0,Johnny Cash
7,https://genius.com/Dean-martin-baby-its-cold-o...,"Baby, It's Cold Outside by Dean Martin","Baby, It's Cold Outside",[Dean Martin],[],1,retro,,,
8,https://genius.com/Paul-anka-put-your-head-on-...,Put Your Head on My Shoulder by Paul Anka,Put Your Head on My Shoulder,[Paul Anka],[],1,retro,Put your head on my shoulder Hold me in your a...,1959.0,Paul Anka
9,https://genius.com/Johnny-cash-i-walk-the-line...,I Walk The Line by Johnny Cash,I Walk The Line,[Johnny Cash],[],1,retro,I keep a close watch on this heart of mine I k...,1956.0,Johnny Cash


In [6]:
final_cols = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']

retro_songs_df.rename(columns={'title': 'track_name', 'primary_artist': 'artist_name'}, inplace=True)

students_dataset_df = retro_songs_df[final_cols]
students_dataset_df = students_dataset_df[students_dataset_df.notnull().all(axis=1)]

students_dataset_df.drop_duplicates(subset=['artist_name', 'track_name'], keep='first', inplace=True)

students_dataset_df.to_csv("../data/Student_dataset.csv", index=False)

##### Create merged dataset

In [4]:
mendeley_df = pd.read_csv("../data/mendeley_dataset.csv")
mendeley_df = mendeley_df[['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']]

print("shape: {}\n".format(mendeley_df.shape))

mendeley_df.head()

shape: (28372, 5)



Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...


In [5]:
student_df = pd.read_csv("../data/Student_dataset.csv")

print("shape: {}\n".format(student_df.shape))

student_df.head()

shape: (187, 5)



Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,Judy Garland,Over the Rainbow,1939,retro,Somewhere over the rainbow Way up high There's...
1,Billie Holiday,Strange Fruit,1939,retro,Southern trees bear a strange fruit Blood on t...
2,Harry Belafonte,Day-O (Banana Boat Song),1956,retro,"Daylight come and me wan' go home Day, me say ..."
3,Hank Williams,Lovesick Blues,1950,retro,"I got a feeling called the blues, oh Lord Sinc..."
4,Johnny Cash,Folsom Prison Blues,1955,retro,"I hear the train a-comin', it's rolling 'round..."


In [6]:
merged_df = pd.concat([mendeley_df, student_df], ignore_index=True)

merged_df.shape

(28559, 5)

In [7]:
merged_df.to_csv("../data/Merged_dataset.csv", index=False)