In [43]:
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from unidecode import unidecode
pd.set_option('display.max_rows', 500)
import warnings
warnings.filterwarnings('ignore')
from env import api_token

import lyricsgenius as genius
def get_lyrics(title, artist):
    api = genius.Genius(api_token, verbose=False)
    song = api.search_song(title, artist)
    lyrics = song.lyrics.replace('\n', ' ')
    return lyrics

from nltk.corpus import stopwords
def remove_stopwords(text, 
                     stopword_en=stopwords.words('english'),
                     stopword_es=stopwords.words('spanish')):
    stopword_list = stopword_en + stopword_es
    words = text.split()
    filtered_words = [word for word in words if word not in stopword_list]
    return ' '.join(filtered_words)

In [2]:
songs = pd.read_csv('songs.csv').drop(columns=['Unnamed: 0'])
songs = songs[~songs.lyrics.isna()]

songs_2 = pd.read_csv('songs_2.csv').drop(columns=['Unnamed: 0'])
songs_2 = songs_2[~songs_2.lyrics.isna()]

songs_3 = pd.read_csv('songs_3.csv').drop(columns=['Unnamed: 0'])
songs_3 = songs_3[~songs_3.lyrics.isna()]

songs_4 = pd.read_csv('songs_4.csv').drop(columns=['Unnamed: 0'])
songs_4 = songs_4[~songs_4.lyrics.isna()]

songs_5 = pd.read_csv('songs_5.csv')
songs_5 = songs_5[~songs_5.lyrics.isna()]

df = pd.concat([songs, songs_2, songs_3, songs_4, songs_5])

In [39]:
phrases = ['part 1', 'part 2', 'parts 1', 'part I', 'part II', 'parts I', 
           'part one', 'part two', 'parts one & two', 'parts one and two',
           'radio version', 'single version', 'original version', 
           'live version', 'solo version', 'album version']

weird_apostrophe = '’'

# pull the title portion out of the lyrics string
df['test_title'] = df.lyrics.apply(lambda x: x.split('Lyrics')[0])

# clean it up

# lowercase
df['test_title'] = df.test_title.str.lower()
# remove 'Part 1', 'Part 2', 'Live Version', etc
for phrase in phrases:
    df['test_title'] = df.test_title.str.replace(phrase, '')
# remove everything in parentheses
df['test_title'] = df.test_title.str.replace(r'\((.*)\)', '')
# remove everything in square brackets
df['test_title'] = df.test_title.str.replace(r'\[(.*)\]', '')
# remove everything after ' - '
df['test_title'] = df.test_title.str.replace(r'\s\-\s(.+)', '')
# remove periods
df['test_title'] = df.test_title.str.replace(".", '', regex=True)
# replace accented characters
df['test_title'] = df.test_title.apply(unidecode)
# replace weird apostrophe character
df['test_title'] = df.test_title.str.replace(weird_apostrophe, "'")
# replace other special characters with spaces
df['test_title'] = df.test_title.str.replace(r'[^a-zA-Z\s\:]', ' ', regex=True)
# strip leading/trailing whitespace
df['test_title'] = df.test_title.str.strip()
# remove stopwords
df['test_title'] = df.test_title.apply(remove_stopwords)

# clean up the title to match the test title

# lowercase
df['clean_title'] = df.title.str.lower()
# remove 'Part 1', 'Part 2', 'Live Version', etc
for phrase in phrases:
    df['clean_title'] = df.clean_title.str.replace(phrase, '')
# remove everything in parentheses
df['clean_title'] = df.clean_title.str.replace(r'\((.*)\)', '')
# remove everything in square brackets
df['clean_title'] = df.clean_title.str.replace(r'\[(.*)\]', '')
# remove everything after ' - '
df['clean_title'] = df.clean_title.str.replace(r'\s\-\s(.+)', '')
# remove periods
df['clean_title'] = df.clean_title.str.replace(".", '', regex=True)
# replace accented characters
df['test_title'] = df.test_title.apply(unidecode)
# replace weird apostrophe character
df['clean_title'] = df.clean_title.str.replace(weird_apostrophe, "'")
# replace other special characters with spaces
df['clean_title'] = df.clean_title.str.replace(r'[^a-zA-Z\s\:]', ' ', regex=True)
# remove leading/trailing whitespace
df['clean_title'] = df.clean_title.str.strip()
# remove stopwords
df['clean_title'] = df.clean_title.apply(remove_stopwords)

In [23]:
df[['title', 'clean_title', 'test_title', 'lyrics']][df.test_title != df.clean_title].shape#sample(100, random_state=42)

(4037, 4)

In [40]:
df[['title', 'clean_title', 'test_title', 'lyrics']][df.test_title != df.clean_title].shape#sample(100, random_state=42)

(4037, 4)

In [41]:
df[['title', 'lyrics', 'clean_title', 'test_title']].loc[[4138]]

Unnamed: 0,title,lyrics,clean_title,test_title
4138,Childs Play,"Child’s Play Lyrics[Intro] Breaking news, my n...",childs play,child play


In [87]:
df.loc[15728].lyrics

"Lovergirl Lyrics[Intro] Hee...  Shoop-de-bop Funky pops  [Verse 1] Coffee, tea or me, baby, touché au lait My opening line might be a bit passé, yes But don't think that I don't know what I'm feeling for you 'Cause I got a vibe on you the first time that I saw you, saw you [Pre-Chorus] I need your love and I won't bring no pain A little birdie told me that you feel the same I'm for the real and for you I'm true blue Let's make a deal, sugar, all I wanna do is be your one and only lover  [Chorus] I just want to be your lovergirl I just want to rock your world Hey, hey, hey  [Verse 2] Hook, line and sinker baby, that's how you caught me My second verse might be a bit old hat But don't think that I don't know what it's doing to me 'Cause I got a vibe on you the first time you saw through me, through me  [Pre-Chorus] I need your love and I won't bring no pain A little birdie told me that you feel the same I'm for the real and for you I'm true blue Let's make a deal, sugar, all I wanna do 

In [72]:
print(df[['title', 'lyrics']].loc[[8866]].lyrics.values)

['DJ Khaled - GREECE ft. Drake (Traducción al Español) Lyrics[Letra de "DJ Khaled - GREECE ft. Drake (Traducción al Español)"]  [Intro: DJ Khaled & Drake] We The Best Music Another one DJ Khaled  [Coro: Drake] Ven conmigo, deja todas tus cosas, sí Podemos hacer una parada\u2005en\u2005Gucci, una parada\u2005en Louis V, sí Ven conmigo, volarte\u2005a Grecia A toda velocidad, volar sobre París, sí Ven conmigo, deja todas tus cosas, sí Podemos hacer una parada en Gucci, una parada en Louis V, sí Ven conmigo, volarte a Grecia A toda velocidad, volar sobre París, sí [Verso 1: Drake] Motoras, bebé, en Playa Nikki Olas en mis orejas, fumando hierba (Wi, wi) Andando por la arena en una Jeep (Wi, wi) Todo por lo que hice con el ritmo, bebé La vida es dulce, bebé, con muchas cadenas, bebé Tú sólo vete a alistarte, salimos, bebé Por mucho tiempo buscando el rebote, sí OZ tenía el rebote, sí  [Coro: Drake] Ven conmigo, deja todas tus cosas, sí Podemos hacer una parada en Gucci, una parada en Louis

In [88]:
(df[['title', 'lyrics', 'artist', ]][df.test_title != df.clean_title]).iloc[2500:2999, :]


Unnamed: 0,title,lyrics,artist
19544,Pump It (Nice An' Hard),Good Country People LyricsBesides the neutral ...,Icy Blu
19555,Puppet Man/Resurrection Shuffle,Names of People (Chapter 1) Lyrics ...,Tom Jones
19579,Push Push,June 2019 Singles Release Calendar Lyrics6/1 J...,Austin Taylor
19583,"Pushin' Inside You (From ""How Bout It"")",96 Freestyle Lyrics[Cormega](Nas) Ayo check t...,Sons Of Funk
19587,Pushin' Your Luck,Illipsis vs. Krome Lyrics[Round 1: Krome] Aigh...,Sleepy King
19590,Pussy Cat,Behn Grym vs. Adam Lyrics[Round 1: Behn Grym] ...,The Ames Brothers
19597,Put Away Your Love,Spotify Singles: Complete Collection LyricsThe...,Alessi
19599,Put It In A Magazine,Angela’s Ashes LyricsAngela's Ashes A Memoir ...,Sonny Charles
19610,Put Your Arms Around Me Honey,October 2020 Singles Release Calendar Lyrics10...,Ray Smith
19617,Put Your Mind At Ease,"The Bad Seed, Icon, Meat Pie, and Pumpkinhead ...",Every Mothers' Son


In [89]:
df.shape

(28190, 13)