### Import Libraries

In [1]:
import re
import yaml
import pandas as pd
from lyricsgenius import Genius
from lyricsgenius.song import Song

### Data Extraction

In [None]:
# Get Genius API credentials from a local YAML file kept outside the notebook.
# The encoding is pinned so the file is decoded the same way on every platform
# instead of depending on the locale default.
with open("../config/genius-credentials.yml", encoding="utf-8") as file:
    # Load YAML data
    # NOTE(review): FullLoader resolves more than plain YAML types; if this file could
    # ever come from an untrusted source, switch to yaml.safe_load(file).
    config = yaml.load(file, Loader=yaml.FullLoader)

In [None]:
# Create the Genius API client from the access token loaded from the credentials file.
# NOTE(review): timeout=7200 is presumably in seconds (two hours per request) — confirm this is intended.
genius = Genius(config["CLIENT-ACCESS-TOKEN"], timeout=7200)

##### Get songs in "retro" genre - Method 1

In [None]:
# Method 1: walk the "retro" tag listing page by page, fetch the lyrics of every
# hit, and take the release year from the top search result when its title and
# primary artist agree with the tag hit.
page = 1
retro_songs_df = pd.DataFrame(columns=['url', 'title_with_artists', 'title', 'artists', 'featured_artists', 'num_artists', 'lyrics', 'release_date'])

# At most 10 pages are read; the loop ends earlier when the listing runs out.
for _ in range(10):
    print("\n\nReading page {}...".format(page))
    res = genius.tag('retro', page=page)

    # An empty page produces a frame without the 'artists'/'url' columns used
    # below, so stop instead of failing on the column lookup.
    if not res['hits']:
        print("No hits returned; stopping.")
        break

    temp_df = pd.DataFrame(res['hits'])
    temp_df['num_artists'] = temp_df['artists'].apply(len)

    for indx, hit in enumerate(res['hits']):
        try:
            print("\nExtracting details of song {}: {}".format(indx, hit['title']))
            song_lyrics = genius.lyrics(song_url=hit['url'])
            temp_df.loc[temp_df['url']==hit['url'], ['lyrics']] = song_lyrics

            song_response = genius.search_songs(search_term=hit['title'])
            search_hits = song_response['hits']
            # Only the top search result is compared; no results counts as a failed match.
            top_result = search_hits[0]['result'] if search_hits else None

            if top_result and (top_result['title']==hit['title']) and (top_result['primary_artist']['name']==hit['artists'][0]):
                print("\t Title and primary artist match: SUCCESSFUL")
                # Tolerate an absent or null release_date_components entry rather
                # than raising into the blanket except below.
                release_year = (top_result.get('release_date_components') or {}).get('year')
                if release_year is not None:
                    temp_df.loc[temp_df['url']==top_result['url'], ['release_date']] = release_year
            else:
                print("\t Title and primary artist match: FAILED")

        except Exception as e:
            # Best-effort scraping: report the failure and continue with the next song.
            print(str(e))

    retro_songs_df = pd.concat([retro_songs_df, temp_df], ignore_index=True)

    # Stop once the listing reports no further page, so the loop never
    # issues another request with page=None.
    page = res['next_page']
    if page is None:
        break

In [None]:
# Preview the first five scraped rows.
retro_songs_df.head(5)

In [None]:
# Persist the Method 1 scrape; index=False keeps the row index out of the CSV.
retro_songs_df.to_csv("../data/retro_songs.csv", index=False)

In [None]:
# Count missing values per column to see how many songs lack lyrics or a release date.
retro_songs_df.isnull().sum()

##### Get songs in "retro" genre - Method 2

In [None]:
# Method 2: walk the tag listing for `genre` and keep a song only when the
# search result (title, URL, English language) and the full song record
# (URL) both agree with the tag hit. Lyrics are flattened to a single line.
genre = 'retro'
page = 1
retro_songs_df = pd.DataFrame(columns=['url', 'title_with_artists', 'title', 'artists', 'featured_artists', 'num_artists', 'genre', 'lyrics', 'release_date', 'primary_artist'])

# At most 30 pages are read; the loop ends earlier when the listing runs out.
for _ in range(30):
    print("\n\nReading page {}...".format(page))
    res = genius.tag(genre, page=page)

    # An empty page produces a frame without the 'artists'/'url' columns used
    # below, so stop instead of failing on the column lookup.
    if not res['hits']:
        print("No hits returned; stopping.")
        break

    temp_df = pd.DataFrame(res['hits'])
    temp_df['num_artists'] = temp_df['artists'].apply(len)
    temp_df['genre'] = genre

    for indx, hit in enumerate(res['hits']):
        try:
            print("\n\nExtracting details of song {}: {}".format(indx, hit['title']))
            # Validate the title, artist and language
            songs_response = genius.search_songs(search_term=hit['title'])
            for song in songs_response['hits']:
                result = song['result']
                if not ((result['title']==hit['title']) and (result['url']==hit['url']) and (result['language']=="en")):
                    # The mismatch may be in the title or URL, not only the language.
                    print("Search result does not match this song (title, URL or language); trying the next one.")
                    continue

                print("Valid English song found!")
                # Tolerate an absent or null release_date_components entry so the
                # fallback to the full song record further down can actually run.
                released_year_from_songs_api = (result.get('release_date_components') or {}).get('year')

                search_song_response = genius.search_song(hit['title'], get_full_info=True)
                if search_song_response is None:
                    print("No valid song ID found!")
                    break
                song_id = search_song_response.id
                lyrics = search_song_response.lyrics or ''
                if lyrics.strip():
                    # Split each line to a list and remove the first line
                    lines = lyrics.splitlines()[1:]

                    # Remove empty lines
                    lines = [line for line in lines if line.strip()]

                    # re.match anchors at the start of the line: drop lines that contain a
                    # single-word [tag] anywhere, or that start with any [...] group
                    lines = [line for line in lines if not re.match(r'(^.*\[\w+\].*$)|(\[.*\])', line)]

                    # Join the remaining lines back into a single string
                    modified_lyrics = ' '.join(lines)
                else:
                    modified_lyrics = ''

                if song_id:
                    print("Song ID: {}".format(song_id))
                    song_response = genius.song(song_id=song_id)
                    song_response_details = Song(song_response)

                    if hit['url']==song_response_details.url:
                        print("URLs are matching!")
                        # Prefer the year from the search result; otherwise use the first
                        # four characters of the full record's date, when it has one.
                        release_year = released_year_from_songs_api
                        if not release_year:
                            fallback_date = getattr(song_response_details, 'year', None)
                            release_year = fallback_date[:4] if fallback_date else None

                        row_mask = temp_df['url']==hit['url']
                        temp_df.loc[row_mask, ['lyrics']] = modified_lyrics
                        # Without a year the cell is left unset (missing).
                        if release_year:
                            temp_df.loc[row_mask, ['release_date']] = int(release_year)
                        temp_df.loc[row_mask, ['primary_artist']] = song_response_details.artist
                    else:
                        print("URLs are NOT matching!")
                else:
                    print("No valid song ID found!")
                # The matching search result has been handled; ignore the remaining ones.
                break
        except Exception as e:
            # Best-effort scraping: report the failure and continue with the next song.
            print(str(e))

    retro_songs_df = pd.concat([retro_songs_df, temp_df], ignore_index=True)

    # Stop once the listing reports no further page, so the loop never
    # issues another request with page=None.
    page = res['next_page']
    if page is None:
        break

In [None]:
# Preview the first twenty scraped rows.
retro_songs_df.head(20)

In [None]:
# Build the student dataset: rename to the target schema, keep only complete
# rows with non-empty lyrics, and de-duplicate on (artist, track).
final_cols = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']

# Rebind the renamed frame instead of mutating it in place.
retro_songs_df = retro_songs_df.rename(columns={'title': 'track_name', 'primary_artist': 'artist_name'})

# Chained, non-inplace operations: an inplace drop_duplicates on a filtered
# slice can raise SettingWithCopyWarning.
students_dataset_df = retro_songs_df[final_cols].dropna(how='any')

# The scraping cell stores '' when a song has no usable lyrics; those rows are
# not null, so they need an explicit filter.
students_dataset_df = students_dataset_df[students_dataset_df['lyrics'].astype(str).str.strip() != '']

students_dataset_df = students_dataset_df.drop_duplicates(subset=['artist_name', 'track_name'], keep='first')

students_dataset_df.to_csv("../data/Student_dataset.csv", index=False)

##### Get songs in "soul" genre - same approach as "retro" Method 2

In [None]:
# Walk the tag listing for `genre` and keep a song only when the search result
# (title, URL, English language) and the full song record (URL) both agree
# with the tag hit. Lyrics are flattened to a single line.
# Note: the accumulator keeps the name retro_songs_df because later cells read it.
genre = 'soul'
page = 1
retro_songs_df = pd.DataFrame(columns=['url', 'title_with_artists', 'title', 'artists', 'featured_artists', 'num_artists', 'genre', 'lyrics', 'release_date', 'primary_artist'])

# At most 30 pages are read; the loop ends earlier when the listing runs out.
for _ in range(30):
    print("\n\nReading page {}...".format(page))
    res = genius.tag(genre, page=page)

    # An empty page produces a frame without the 'artists'/'url' columns used
    # below, so stop instead of failing on the column lookup.
    if not res['hits']:
        print("No hits returned; stopping.")
        break

    temp_df = pd.DataFrame(res['hits'])
    temp_df['num_artists'] = temp_df['artists'].apply(len)
    temp_df['genre'] = genre

    for indx, hit in enumerate(res['hits']):
        try:
            print("\n\nExtracting details of song {}: {}".format(indx, hit['title']))
            # Validate the title, artist and language
            songs_response = genius.search_songs(search_term=hit['title'])
            for song in songs_response['hits']:
                result = song['result']
                if not ((result['title']==hit['title']) and (result['url']==hit['url']) and (result['language']=="en")):
                    # The mismatch may be in the title or URL, not only the language.
                    print("Search result does not match this song (title, URL or language); trying the next one.")
                    continue

                print("Valid English song found!")
                # Tolerate an absent or null release_date_components entry so the
                # fallback to the full song record further down can actually run.
                released_year_from_songs_api = (result.get('release_date_components') or {}).get('year')

                search_song_response = genius.search_song(hit['title'], get_full_info=True)
                if search_song_response is None:
                    print("No valid song ID found!")
                    break
                song_id = search_song_response.id
                lyrics = search_song_response.lyrics or ''
                if lyrics.strip():
                    # Split each line to a list and remove the first line
                    lines = lyrics.splitlines()[1:]

                    # Remove empty lines
                    lines = [line for line in lines if line.strip()]

                    # re.match anchors at the start of the line: drop lines that contain a
                    # single-word [tag] anywhere, or that start with any [...] group
                    lines = [line for line in lines if not re.match(r'(^.*\[\w+\].*$)|(\[.*\])', line)]

                    # Join the remaining lines back into a single string
                    modified_lyrics = ' '.join(lines)
                else:
                    modified_lyrics = ''

                if song_id:
                    print("Song ID: {}".format(song_id))
                    song_response = genius.song(song_id=song_id)
                    song_response_details = Song(song_response)

                    if hit['url']==song_response_details.url:
                        print("URLs are matching!")
                        # Prefer the year from the search result; otherwise use the first
                        # four characters of the full record's date, when it has one.
                        release_year = released_year_from_songs_api
                        if not release_year:
                            fallback_date = getattr(song_response_details, 'year', None)
                            release_year = fallback_date[:4] if fallback_date else None

                        row_mask = temp_df['url']==hit['url']
                        temp_df.loc[row_mask, ['lyrics']] = modified_lyrics
                        # Without a year the cell is left unset (missing).
                        if release_year:
                            temp_df.loc[row_mask, ['release_date']] = int(release_year)
                        temp_df.loc[row_mask, ['primary_artist']] = song_response_details.artist
                    else:
                        print("URLs are NOT matching!")
                else:
                    print("No valid song ID found!")
                # The matching search result has been handled; ignore the remaining ones.
                break
        except Exception as e:
            # Best-effort scraping: report the failure and continue with the next song.
            print(str(e))

    retro_songs_df = pd.concat([retro_songs_df, temp_df], ignore_index=True)

    # Stop once the listing reports no further page, so the loop never
    # issues another request with page=None.
    page = res['next_page']
    if page is None:
        break

In [None]:
# Preview the first twenty scraped rows.
retro_songs_df.head(20)

In [None]:
# Build the student dataset: rename to the target schema, keep only complete
# rows with non-empty lyrics, and de-duplicate on (artist, track).
final_cols = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']

# Rebind the renamed frame instead of mutating it in place.
retro_songs_df = retro_songs_df.rename(columns={'title': 'track_name', 'primary_artist': 'artist_name'})

# Chained, non-inplace operations: an inplace drop_duplicates on a filtered
# slice can raise SettingWithCopyWarning.
students_dataset_df = retro_songs_df[final_cols].dropna(how='any')

# The scraping cell stores '' when a song has no usable lyrics; those rows are
# not null, so they need an explicit filter.
students_dataset_df = students_dataset_df[students_dataset_df['lyrics'].astype(str).str.strip() != '']

students_dataset_df = students_dataset_df.drop_duplicates(subset=['artist_name', 'track_name'], keep='first')

students_dataset_df.to_csv("../data/Student_dataset_Shadhini.csv", index=False)

##### Create merged dataset

In [20]:
# Read the Mendeley lyrics dataset and keep only the five columns that the
# student dataset also provides, in the same order.
shared_columns = ['artist_name', 'track_name', 'release_date', 'genre', 'lyrics']
mendeley_raw = pd.read_csv("../data/mendeley_dataset.csv")
mendeley_df = mendeley_raw[shared_columns]

print("shape: {}\n".format(mendeley_df.shape))

# Last expression of the cell, so the preview is rendered as rich output.
mendeley_df.head()

shape: (28372, 5)



Unnamed: 0,artist_name,track_name,release_date,genre,lyrics
0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...
1,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...
2,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...
3,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...
4,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...


In [3]:
# Load the scraped "soul" student dataset.
# NOTE(review): no cell visible in this notebook writes Student_dataset_soul_org.csv —
# confirm how this file is produced (presumably a copy of the scraped student dataset).
student_df = pd.read_csv("../data/Student_dataset_soul_org.csv")

# Pre-process student dataset: strip a trailing "Embed" marker from the end of the
# lyrics, together with the digits (and "You might also like" text) directly before it.
student_df['lyrics'] = student_df['lyrics'].str.replace(r'(You might also like\d{1,4}Embed$)|(\d+Embed$)|(Embed$)', '', regex=True)

print("shape: {}\n".format(student_df.shape))

# Write the cleaned data first so the preview can be the cell's last expression;
# a bare .head() in the middle of a cell is evaluated but never displayed.
student_df.to_csv("../data/Student_dataset_soul_cleaned.csv", header=True, index=False)

student_df.head()

shape: (436, 5)



In [35]:
# Stack the Mendeley rows on top of the student rows; ignore_index=True
# discards both original indexes and renumbers the combined rows.
frames_to_merge = [mendeley_df, student_df]
merged_df = pd.concat(frames_to_merge, ignore_index=True)

# Displayed as the cell output: (rows, columns) of the merged frame.
merged_df.shape

(28808, 5)

In [36]:
# Persist the merged dataset; index=False keeps the row index out of the CSV.
merged_df.to_csv("../data/Merged_dataset_soul.csv", index=False)