# 1. LastFM API calls to get top love songs

In [None]:
#Dependencies
import json
from pprint import pprint

import matplotlib.pyplot as plt
import pandas as pd
import requests
from scipy import stats

from config import lastfm_api

In [None]:
#Empty Top Love Songs DataFrame: one (still empty) column per field retrieved from LastFM
top2000_lovesongs_df=pd.DataFrame(
    {column: [] for column in ("Rank", "Song Name", "Artist", "Duration", "Album")}
)


In [None]:
#Set the parameters for the LastFM API calls. "method" and "limit" start empty so they can be
#adjusted for specific calls like tag.gettoptracks or track.getInfo

params={
    "tag": "love",
    "api_key": lastfm_api,
    "format": "json",
    "method":"",
    "limit": ""
}

base_url= "http://ws.audioscrobbler.com/2.0/?"
example_url= "/2.0/?method=tag.gettoptracks&tag=disco&api_key=YOUR_API_KEY&format=json"    #taken from website


#API call for 2000 songs using tag.getTopTracks
params["limit"]=2000
params["method"]= "tag.gettoptracks"

#The timeout keeps the cell from hanging forever on a dead connection; raise_for_status makes an
#HTTP failure show up here instead of as a confusing KeyError on response["tracks"] further down
api_reply=requests.get(base_url, params=params, timeout=30)
api_reply.raise_for_status()
response=api_reply.json()
# pprint(response)

#NOTE(review): LastFM appears to report API-level failures (e.g. a bad key) as a JSON body with
#"error" and "message" fields - confirm against the API docs
if "error" in response:
    raise RuntimeError(f'LastFM error {response["error"]}: {response.get("message")}')

#The 2000 songs are in a list under response["tracks"]["track"]
results_df=pd.DataFrame(response["tracks"]["track"])
results_df.head()

In [None]:
#The 2000 songs are in a list under response["tracks"]["track"]
results=response["tracks"]["track"]

for index, track in enumerate(results):
    song= track["name"]
    artist= track["artist"]["name"]
    print(f'Info retrieved for song:{song} by {artist}')

    #Retrieve the album name with track.getInfo.
    #Reset for every track so a failed lookup can never reuse the previous song's album.
    album=None
    try:
        #Passing the fields through params lets requests URL-encode the song/artist names
        #(names containing "&", "#" or spaces would otherwise corrupt the query string)
        song_params={
            "method": "track.getInfo",
            "api_key": lastfm_api,
            "track": song,
            "artist": artist,
            "format": "json"
        }
        song_response=requests.get("http://ws.audioscrobbler.com/2.0/", params=song_params, timeout=30).json()
        print(f'-->Retrieving album name for {song} by {artist}')
        album=song_response["track"]["album"]["title"]

    except (KeyError, IndexError, ValueError):
        #The message names the song: no album is known at this point
        print(f'----Missing field/result for {song} by {artist}. Skipping----')

    except requests.RequestException as err:
        #A network failure for one song should not abort the remaining lookups
        print(f'----Request failed for {song} by {artist}: {err}. Skipping----')

    #Populate df (the Album cell is left empty when the lookup above did not produce one)
    try:
        top2000_lovesongs_df.loc[index, "Rank"]=track["@attr"]["rank"]
        top2000_lovesongs_df.loc[index, "Song Name"]=track["name"]
        top2000_lovesongs_df.loc[index, "Artist"]=track["artist"]["name"]
        top2000_lovesongs_df.loc[index, "Duration"]=track["duration"]
        if album is not None:
            top2000_lovesongs_df.loc[index, "Album"]=album

    except (KeyError, IndexError):
        print(f'----Missing field/result for {song} by {artist}. Skipping----')


In [None]:
#Save the dataframe as csv, then display it
#NOTE: to_csv is called without index=False, so the row index is written as an extra first column
#(presumably the "Unnamed: 0" column that is dropped after the file is read back)
top2000_lovesongs_df.to_csv("csv/top2000_lovesongs_df.csv")
top2000_lovesongs_df

In [None]:
#Check if all rows of the df have been populated: count() gives the number of non-null values per column
#NOTE: not all songs have duration info (0). If we want to use duration to answer questions, we need to fill in the gaps
#NOTE: some albums are missing? (Album is only filled when the track.getInfo lookup returned an album title)
top2000_lovesongs_df.count()  

In [None]:
#Presentation helper only: durations stay in seconds for the analysis, MM:SS is just easier to read
def convert(seconds):
    """Format a duration given in seconds as a zero-padded "MM:SS" string.

    Whole days and whole hours are discarded, so only the minutes and
    seconds within the current hour are shown (3725 seconds -> "02:05").
    """
    within_hour = (seconds % (24 * 3600)) % 3600
    minutes, remainder = divmod(within_hour, 60)
    return "{:02d}:{:02d}".format(int(minutes), int(remainder))

# Example: 259 seconds -> 04:19
n = 259
print(convert(n))

# 2. Retrieve release date and lyrics from Genius
* Find instructions here https://github.com/johnwmillr/LyricsGenius/blob/master/README.md

In [None]:
# #Install lyrics genius module
# !pip install lyricsgenius

In [None]:
#Import lyricsgenius module and config with token
import lyricsgenius
from config import genius_token
#Genius client built from the token; it is reused for every search_song call in the lookup loop
genius = lyricsgenius.Genius(genius_token)

In [None]:
#Reload the LastFM results from csv and append two empty-string columns, "Year" and "Lyrics",
#to be filled in from Genius
top2000_lovesongs_df=pd.read_csv("csv/top2000_lovesongs_df.csv").assign(Year="", Lyrics="")
# top2000_lovesongs_df

In [None]:
#Search year and lyrics for TOP2000 songs:

for index, row in top2000_lovesongs_df.iterrows():
    title=row["Song Name"]
    artist=row["Artist"]
    print(f'Retrieving info for Index {index}: {title} by {artist}')

    #Best effort: one failed lookup must not abort the run. Exception (not a bare except) is
    #caught so that a keyboard/kernel interrupt can still stop this long-running loop.
    try:
        song = genius.search_song(title, artist=artist)

        #NOTE(review): search_song is expected to give back None when Genius has no match - confirm
        if song is None:
            print(f'----- missing info for {title} by {artist}')
            continue

        year, lyrics = song.year, song.lyrics

    except Exception as err:
        #Say why the lookup failed instead of hiding every error behind the same message
        print(f'----- missing info for {title} by {artist} ({type(err).__name__}: {err})')
        continue

    #Only write the row once both values were read successfully
    top2000_lovesongs_df.loc[index, "Year"]= year
    top2000_lovesongs_df.loc[index, "Lyrics"]= lyrics

print("FIN")

In [None]:
# store new csv to prevent re-running the Genius API calls
# NOTE: this file is written to the working directory, not to the csv/ folder used for the other files
top2000_lovesongs_df.to_csv("top2000_yearlyrics.csv")

In [None]:
# Create dataframe from the csv written in the previous step (so the analysis can restart from the file)
song_year_df=pd.read_csv("top2000_yearlyrics.csv")
song_year_df.head()

### Retrieve year and create bins for decades

In [None]:
# Keep only the text before the first "-" of the release date as the year.
# Taking the first piece works for any number of dash-separated parts; assigning the split to
# exactly three columns raised whenever the split did not produce exactly three parts.
song_year_df["Split Year"] = song_year_df["Year"].str.split("-").str[0]

# drop the raw date column now that the year has been extracted
songs_df=song_year_df.drop(["Year"], axis=1)

# drop na years
song_years_df = songs_df[songs_df['Split Year'].notna()]

song_years_df.head()

In [None]:
#Drop the "Unnamed: 0" column (presumably the row index that to_csv wrote and read_csv brought
#back as a regular column) and give "Split Year" its final name "Year"
year_df=(
    song_years_df
    .drop(columns=["Unnamed: 0"])
    .rename(columns={"Split Year": "Year"})
)

year_df

### ANALYSIS: Number of top love songs per decade

In [None]:
# Year is still text at this point; make it an integer so it can be binned
year_df['Year'] = year_df['Year'].astype(int)

year_df.dtypes

# Decade edges: every bin is (previous edge, edge], so 1960-1969 lands in "60s".
# The first bin collects everything up to 1949; years after 2020 fall outside the bins (NaN).
bins = [0] + list(range(1949, 2010, 10)) + [2020]

# One label per bin, oldest first
labels = ["40s", "50s", "60s", "70s", "80s", "90s", "2000s", "2010s"]

year_df["bins"] = pd.cut(x=year_df["Year"], bins=bins, labels=labels, include_lowest=True)

# songs per decade
year_df["bins"].value_counts()

### Clean data 
* drop data from 40s & 50s, remove songs from 2000s not ranked in top 300

In [None]:
# Drop data from 40s & 50s, remove songs from 2000s not ranked in top 300
# Step 1: every row whose bin is not "40s", "50s" or "2000s" is kept as-is
final_songs_df = year_df.loc[(year_df["bins"] != "40s") & (year_df["bins"] != "50s") & (year_df["bins"] != "2000s")]

# Step 2: from the "2000s" bin keep only the best-ranked songs
# NOTE(review): "Rank" < 300 keeps ranks up to 299; if "top 300" should include rank 300 this needs <= - confirm
songs_df = year_df.loc[(year_df["bins"]=='2000s') & (year_df["Rank"]<300)]

# merge songs from 2000s back into dataframe
# The two subsets share no rows, so the outer merge on all listed columns effectively stacks them.
# NOTE(review): a column present in both frames but not listed in `on` would come out as _x/_y copies - verify
# NOTE(review): later cells look rows up by hard-coded index (753, 1022, 551, 990) in the data saved
# from final_df, so changing the resulting row order would presumably invalidate them - verify
final_df = pd.merge(songs_df, final_songs_df, on=["Song Name", "Artist", "Album", "Duration", "Rank", "Year", "bins", "Lyrics"], how="outer")
final_df


In [None]:
#Store the cleaned-up selection in a new csv (to_csv is called without index=False, so the row index is saved too)
final_df.to_csv("csv/final_data.csv")

In [None]:
#Count the number of rows (songs) in each bin (decade); duplicates, if any, are counted too
final_df["bins"].value_counts()

# 3. Lyrics Cleanup
* Modules needed: Natural Language Toolkit (NLTK)
* NLTK is a Python package for natural language processing
* for info: https://www.nltk.org/data.html

In [None]:
# #Natural Language Toolkit (NLTK) is a Python package for natural language processing
# #Install nltk module (for info: https://www.nltk.org/data.html) *takes a while to download*
# !pip install nltk

In [None]:
#Import Dependencies 

#for nltk (remove stop words)    ---> all available datasets/models:CORPORA: http://www.nltk.org/nltk_data/
import nltk
nltk.download("stopwords") 

#The lyrics cleaning cell calls stopwords.words('english'); without this import it fails with a NameError
from nltk.corpus import stopwords

# for punctuation, import string library function  
import string  

In [None]:
#Open file with songs from top populated eras
song_with_lyrics_df=pd.read_csv("csv/final_data.csv")

#Drop the leftover "Unnamed: ..." columns (presumably row indexes saved by earlier to_csv calls).
#Their exact names depend on how often the data went through a csv round trip, and drop() raises
#a KeyError for any listed column that is absent, so select them by prefix instead of by name.
weird_columns=[column for column in song_with_lyrics_df.columns if str(column).startswith("Unnamed")]
song_list_df=song_with_lyrics_df.drop(weird_columns, axis=1)


song_list_df.head()

In [None]:
#Check for duplicates in the list of songs: rows sharing the same (Song Name, Artist) pair.
#keep=False flags every member of a duplicated group, not only the repeats
duplicates=song_list_df.duplicated(subset=["Song Name", "Artist"], keep=False)
song_list_df[duplicates]

## Cleaning lyrics
* stop words (a, about, above, after, again, against, all....)
* punctuation characters (. ; : [] ? ...)
* words related to song structure (intro, chorus, verse)
* numbers
#### Create lists for:
* word count for each song
* unique words for each song
* unique words count for each song

In [None]:
#Split lyrics into words, collect per-song word statistics and export the cleaned words of each song

#punctuation= [',', '.', ';', ':', '[', ']', '?', '!', '(', ')', '"', '%', "&", "-", "--"]  #old punctuation
punctuation=list(string.punctuation)

#Every punctuation character except the apostrophe gets deleted (the apostrophe is kept so that
#contractions stay one word). The translation table removes them all in a single pass per song.
punctuation_table=str.maketrans("", "", "".join(x for x in punctuation if x != "'"))

#(for info: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/)
stop_words= set(stopwords.words('english')) 

#possible verse number (as strings); a set makes each lookup O(1) instead of scanning 10000 items
numbers={str(n) for n in range(10000)}

#possible song structure terms and artist names
song_structure_words={"intro", "verse", "chorus", "bridge", "outro", "hook"}  #"chris", "martin", 

#Everything that has to be filtered out of the lyrics, merged once outside the loop
words_to_remove=stop_words | numbers | song_structure_words

#list of total words, unique words (arrays) and unique_word_counts (series)
tot_words_list=[]            #list of total number of words per song
unique_words_list=[]         #list of unique words per song
unique_words_count_list=[]   #list of count of unique words per song


for index, row in song_list_df.iterrows():

    #Store song lyrics in a variable; a missing lyric comes out of the csv as NaN (a float),
    #so treat anything that is not text as an empty lyric instead of crashing
    song_lyrics=row["Lyrics"]
    if not isinstance(song_lyrics, str):
        song_lyrics=""

    #Before splitting lyrics into words, remove punctuation characters
    song_lyrics_clean=song_lyrics.translate(punctuation_table)

    #Split into lowercase words (stop words are all lowercase)
    words_list_lower=[word.lower() for word in song_lyrics_clean.split()]

    #Remove stop words such as "the", "a", "an", "in", verse numbers and song structure words
    filtered_lyrics_index=[word for word in words_list_lower if word not in words_to_remove]

    #Cleaned words of this song as a Series named after the row index
    song_words=pd.Series(filtered_lyrics_index, name=index, dtype="object")

    #Number of words
    tot_words_list.append(len(song_words))

    #Unique words
    unique_words_list.append(song_words.unique())

    #Count of Unique words
    unique_words_count_list.append(song_words.value_counts())

    #One-column dataframe named "<song>_<artist>", exported as Lyrics/lyrics_<index>_df.csv
    column_name= f'{row["Song Name"]}_{row["Artist"]}'
    lyrics_index_df=song_words.to_frame(name=column_name)
    lyrics_index_df.to_csv(f'Lyrics/lyrics_{index}_df.csv')

#Show the cleaned words of the last song processed
lyrics_index_df

In [None]:
#list of word count for each song (one entry per row of song_list_df, in row order, after cleaning)
tot_words_list

In [None]:
#list of unique words for each song (one array of distinct cleaned words per row of song_list_df)
unique_words_list

In [None]:
#list of unique words counts for each song (one value_counts Series per row of song_list_df)
unique_words_count_list

## ANALYSIS: Word count per decade

In [None]:
#New df with song, artist, word count, year and bin category ("bins" is renamed to "Bin")
words_count_df=song_list_df[["Song Name", "Artist", "Year", "bins"]].rename(columns={"bins": "Bin"})

#Word counts go in as the third column, right after "Artist"
words_count_df.insert(2, "Words count", tot_words_list)

words_count_df

In [None]:
#Boxplot of the word count of every song, one box per decade bin, saved as png
axis_label_style={"fontsize": 20}

words_count_df.boxplot(column="Words count", by="Bin", figsize=(20,10))
plt.title("Song word count per decade (era)", fontsize=25, fontweight="bold")
plt.xlabel("Decade", **axis_label_style)
plt.ylabel("Number of words", **axis_label_style)
plt.savefig("Plots/word_count_boxplot_original.png")

### Check outliers

In [None]:
#Songs of words_count_df whose word count is above 2000 are treated as outliers
outliers=words_count_df[words_count_df["Words count"].gt(2000)]
outliers

In [None]:
#Full song_list_df rows (lyrics included) for the two outliers, looked up by row index
outliers_df=song_list_df.loc[song_list_df.index.isin([753, 1022])]
outliers_df

In [None]:
#CHECK OUTLIERS LYRICS

#A Rocket to the Moon - shhh.. Just listEn :)
#(row index 1022 is hard-coded; re-check it if the data is regenerated)
outliers_df["Lyrics"][1022]

#Comment: A lot of extra info in the lyrics, like " CAMERA EXPLORES, ATTIC - NEW ANGLE - DAY" -----> DROP!

In [None]:
#Beck - Everybody's Gotta Learn Sometimes
#(row index 753 is hard-coded; re-check it if the data is regenerated)
outliers_df["Lyrics"][753]

#Comment: the song does exist but the lyrics belong to another song (Genius problem): The Devil Glitch by Chris Butler,
#also known as the longest song ever! Full version is 1h long. https://www.youtube.com/watch?v=10SnNfxjAI8  -----> DROP!

In [None]:
#Remove the two outlier rows (753 and 1022) and redo the boxplot on the cleaned data
words_count_clean_df=words_count_df.drop(index=[753, 1022])
words_count_clean_df

#Plot
axis_label_style={"fontsize": 20}

words_count_clean_df.boxplot(column="Words count", by="Bin", figsize=(20,10))
plt.title("Song word count per decade (Clean dataset)", fontsize=25, fontweight="bold")
plt.xlabel("Decade", **axis_label_style)
plt.ylabel("Number of words", **axis_label_style)
plt.savefig("Plots/word_count_boxplot_clean.png")


### Check outliers: round 2

In [None]:
#Second round: songs of words_count_clean_df whose word count is still above 600
outliers_2=words_count_clean_df[words_count_clean_df["Words count"].gt(600)]
outliers_2

In [None]:
#Full song_list_df rows (lyrics included) for the second-round outliers, looked up by row index
outliers_2_df=song_list_df.loc[song_list_df.index.isin([551, 990])]
outliers_2_df

In [None]:
#Ellie Goulding - Love Me Like You Do - From "Fifty Shades of Grey"
#(row index 551 is hard-coded; re-check it if the data is regenerated)
outliers_2_df["Lyrics"][551]

#Comment: the lyrics match from Genius is the list of 2016 Grammys Nominees (https://genius.com/Grammys-2016-nominees-lyrics)
# ----> DROP

In [None]:
#Black Star - Brown Skin Lady
#(row index 990 is hard-coded; re-check it if the data is regenerated)
outliers_2_df["Lyrics"][990]

#Comment: Song really has long lyrics, both members of the band (Talib Kweli and Mos Def) sing at the same time 
#(https://genius.com/Black-star-brown-skin-lady-lyrics)  ----> Could keep but it's indeed an outlier

In [None]:
#Remove the second-round outliers (rows 551 and 990); this is the dataset used for the statistics
words_count_clean_2_df=words_count_clean_df.drop(index=[551, 990])
words_count_clean_2_df

# #Plot
# words_count_clean_2_df.boxplot("Words count", by="Bin", figsize=(20,10))
# plt.title("Song word count per decade (final)", fontsize=25, fontweight="bold")
# plt.xlabel("Decade", fontsize=20)
# plt.ylabel("Number of words", fontsize=20)
# plt.savefig("Plots/word_count_boxplot_final_raw.png")

### Statistical analysis and plot

In [None]:
#Statistical analysis: one-way ANOVA of the word count across the six decade bins

def _words_in_bin(bin_label):
    """Return the "Words count" values of the cleaned songs that fall in the given bin."""
    in_bin=words_count_clean_2_df["Bin"] == bin_label
    return words_count_clean_2_df.loc[in_bin, "Words count"]

#one subset per bin
words_2010s=_words_in_bin("2010s")
words_2000s=_words_in_bin("2000s")
words_90s=_words_in_bin("90s")
words_80s=_words_in_bin("80s")
words_70s=_words_in_bin("70s")
words_60s=_words_in_bin("60s")

#run the test and unpack its (statistic, pvalue) result
anova_result=stats.f_oneway(words_2010s, words_2000s, words_90s, words_80s, words_70s, words_60s)
statistic, pvalue=anova_result
pvalue

In [None]:
#PLOT USING MATPLOTLIB, CHANGE COLORS
#plt.plot() draws lines and has no figsize option, so calling it with figsize never sized the figure.
#Open the figure explicitly instead; the default size is kept on purpose, since that is what the
#plot was effectively drawn at and the font sizes below are chosen for it.
fig= plt.figure()
labels=["60s", "70s", "80s", "90s", "2000s", "2010s"]
data=[words_60s, words_70s,words_80s, words_90s, words_2000s, words_2010s]

# rectangular box plot
bplot = plt.boxplot(data,patch_artist=True,  # fill with color,
                    labels=labels)  # will be used to label x-ticks

plt.title("Song word count per decade", fontsize=15, fontweight="bold")
plt.xlabel("Decade", fontsize=12)
plt.ylabel("Number of words", fontsize=12)

# fill with colors, one per decade box
#Note: bplot is a dict, these are the keys: ['whiskers', 'caps', 'boxes', 'medians', 'fliers', 'means']
colors=["lightblue", "pink", "lightgreen", "darkorchid", "tan", "silver"]

for patch, color in zip(bplot["boxes"], colors):
    patch.set_facecolor(color)

#Annotate pvalue: formatted from the ANOVA result so the label cannot go stale when the data changes
string_pvalue= f'pvalue={pvalue:.2e}'
plt.annotate(string_pvalue, (1, 400), fontsize=12, color="black")


plt.savefig("Plots/word_count_boxplot_final_colors.png")