In [None]:
#Class: IST 652  
#Section: M403  
#Assignment: Final Project Predicting Song Popularity 
#Authors: Tyler Gigot, Annie Titus, Daniel Stalica
#Due Date: 6/13/2021

In [None]:
## import the libraries used in this notebook
import pandas as pd
from pandas import DataFrame
import matplotlib.pyplot as plt
import numpy as np
from mlxtend.preprocessing import TransactionEncoder

In [None]:
# Load the four CSV files used in this project.

# Kaggle datasets, downloadable from:
# https://www.kaggle.com/edalrami/19000-spotify-songs/discussion/73524
song_data = pd.read_csv('song_data.csv')
song_info = pd.read_csv('song_info.csv')

# Datasets provided separately; they were produced by web scraping Wikipedia.
master_artist = pd.read_csv('master_artist_T.csv')

# master_song_T.csv is read as ISO-8859-1 because the default UTF-8 decoding
# raised an error. ISO-8859-1 is a single-byte encoding that covers the first
# 256 Unicode code points.
master_song = pd.read_csv('master_song_T.csv', encoding='ISO-8859-1')

In [None]:
# The song_data table is an original dataset from Kaggle. It consists of a
# collection of songs that were parsed via the Organize Your Music tool.

# Note: many song names appear more than once and there is no other column that
# uniquely identifies a song, so songs with duplicated names are removed in the
# next cell.

# Dimensions (rows, columns) of the dataset as loaded, before any cleaning.
song_data.shape

In [None]:
# Clean up the song_data DataFrame.

# Discard every row whose song_name occurs more than once. keep=False marks all
# occurrences of a repeated name as duplicates, so none of the ambiguous rows
# survive (not even the first occurrence).
song_data = song_data[~song_data.duplicated(subset="song_name", keep=False)]

# Dimensions after cleaning.
song_data.shape

In [None]:
# Preview the first rows of the song_data DataFrame (head() shows 5 rows by default).
song_data.head()

In [None]:
# The song_info table is an original dataset from Kaggle. It contains the
# corresponding artist, album, and playlist for each song in the song_data table.

# Note: as in the song_data table, song names are duplicated in this table too.
# This is resolved the same way, by removing songs with duplicated names
# (done in the next cell).

# Dimensions (rows, columns) of the dataset as loaded, before any cleaning.
song_info.shape

In [None]:
# Clean up the song_info DataFrame.

# Discard every row whose song_name occurs more than once. keep=False marks all
# occurrences of a repeated name as duplicates, so none of the ambiguous rows
# survive (not even the first occurrence).
song_info = song_info[~song_info.duplicated(subset="song_name", keep=False)]

# Dimensions after cleaning.
song_info.shape

In [None]:
# Preview the first rows of the song_info DataFrame (head() shows 5 rows by default).
song_info.head()

In [None]:
# The master_song_T table (loaded as master_song) was collected by web scraping
# Wikipedia pages for additional information about the songs.

# Dimensions (rows, columns) of the dataset as loaded, before any cleaning.
master_song.shape

In [None]:
# Clean up the master_song DataFrame.

# Outer-merge master_song with the song names that survived the song_data
# clean-up. indicator=True adds a `_merge` column whose value is 'left_only'
# (only in master_song), 'right_only' (only in song_data) or 'both'.
# NOTE: re-running this cell without re-loading master_song fails, because the
# `_merge` indicator column would already exist.
master_song = master_song.merge(song_data['song_name'], indicator = True, how = 'outer', on = 'song_name')

# Keep only rows that exist in both dataframes. Filtering on `!= 'left_only'`
# would also keep 'right_only' rows, i.e. songs that have no scraped
# information at all (every master_song column is NaN for them).
# The `_merge` column is kept on purpose; it is dropped later when song_main is cleaned.
master_song = master_song[master_song._merge == 'both']

# Dimensions after cleaning.
master_song.shape

In [None]:
# Preview the first rows of the master_song DataFrame (head() shows 5 rows by default).
master_song.head()

In [None]:
# The master_artist_T table (loaded as master_artist) was constructed by web
# scraping Wikipedia pages for additional artist information.
# NOTE(review): master_artist is only inspected in the cells shown here; it is
# not merged into song_main below. Confirm whether it is used later.

# Dimensions (rows, columns) of the dataset as loaded.
master_artist.shape

In [None]:
# Preview the first rows of the master_artist DataFrame (head() shows 5 rows by default).
master_artist.head()

In [None]:
# Combine the song-level dataframes into one table keyed on song_name.
# Both joins are outer joins, so no song is lost at this stage; rows with
# missing information are removed later when song_main is cleaned.
song_main = (
    master_song
    .merge(song_info, how='outer', on='song_name')   # master_song + song_info
    .merge(song_data, how='outer', on='song_name')   # ... + song_data
)

# Dimensions of the combined table.
song_main.shape

In [None]:
# Clean up the song_main DataFrame in two steps:
#   1. drop the columns that are not needed for the analysis;
#   2. drop every row that still contains a missing value, so that only songs
#      with information available from Wikipedia are kept.
song_main = (
    song_main
    .drop(columns=['song_id',
                   'artist_name_x',
                   'song_label',
                   'song_songwriter',
                   'song_producer',
                   '_merge'])
    .dropna()
)

# Dimensions of the cleaned table.
song_main.shape

In [None]:
# Preview the first rows of the cleaned song_main DataFrame (head() shows 5 rows by default).
song_main.head()

In [None]:
# Initial data exploration: distribution of release years in master_song.
# Rows without a release year are dropped explicitly, and the bins are limited
# to 1900-2020, so values outside that range are not counted.
plt.hist(master_song.song_released.dropna(), range = (1900, 2020))
plt.title('Frequency of Number of Songs by Year')
# Label both axes so the figure can be read on its own.
plt.xlabel('Release year')
plt.ylabel('Number of songs')
# show() renders the figure and keeps the Text repr out of the cell output.
plt.show()

In [None]:
# Observations
## the dataset contains many more recent songs than older songs
## songs released between 2010 and 2020 are especially over-represented
## this skew is probably due to the preferences of whoever assembled the dataset

In [None]:
# unpacking the song genres

# create a function which searches for a substring
# this will be used to unpack the song genres

def check(string, sub_str):
    """Return 1 if `sub_str` occurs anywhere in `string`, otherwise 0.

    The search uses `str.find`, so it is case-sensitive and matches plain
    substrings (for example, 'rap' is found inside 'trap').
    """
    return int(string.find(sub_str) != -1)

# Keywords for all of the parent genres searched for in each song's genre string.
# 'hip' and 'hop' are separate keywords here; they are combined into a single
# 'hiphop' flag when the genre columns are built.
# NOTE(review): this list is not referenced by the flagging code in this cell,
# which spells out each keyword again. Keep the two in sync.

parent_genre_list = [
    'alt', 'rock', 'metal', 'punk',
    'pop',
    'hip', 'hop', 'r&b', 'rap',
    'jazz', 'blues',
    'folk', 'country',
    'elect',
    'other',
]

# Build one list of 0/1 flags per parent genre, aligned with the rows of song_main.
# A flag is 1 when the song's genre string contains the genre keyword and 0 otherwise.

genre_strings = list(song_main['song_genre'])

altcount = [check(genre, 'alt') for genre in genre_strings]
rockcount = [check(genre, 'rock') for genre in genre_strings]
metalcount = [check(genre, 'metal') for genre in genre_strings]
punkcount = [check(genre, 'punk') for genre in genre_strings]
popcount = [check(genre, 'pop') for genre in genre_strings]
# 'hip' and 'hop' are merged: the flag is 1 when either keyword is present
hiphopcount = [max(check(genre, 'hip'), check(genre, 'hop')) for genre in genre_strings]
rbcount = [check(genre, 'r&b') for genre in genre_strings]
rapcount = [check(genre, 'rap') for genre in genre_strings]
jazzcount = [check(genre, 'jazz') for genre in genre_strings]
bluescount = [check(genre, 'blues') for genre in genre_strings]
folkcount = [check(genre, 'folk') for genre in genre_strings]
countrycount = [check(genre, 'country') for genre in genre_strings]
electcount = [check(genre, 'elect') for genre in genre_strings]
othercount = [check(genre, 'other') for genre in genre_strings]

# Attach each flag list to song_main as a new column, in this order.
for column_name, flags in (('alt', altcount),
                           ('rock', rockcount),
                           ('metal', metalcount),
                           ('punk', punkcount),
                           ('pop', popcount),
                           ('hiphop', hiphopcount),
                           ('rb', rbcount),
                           ('rap', rapcount),
                           ('jazz', jazzcount),
                           ('blues', bluescount),
                           ('folk', folkcount),
                           ('country', countrycount),
                           ('elect', electcount),
                           ('other', othercount)):
    song_main[column_name] = flags

# view the resulting dataframe
song_main.head()

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as alternative.
# Raises ZeroDivisionError if no song carries the 'alt' flag.
alt_scores = song_main.loc[song_main['alt'] == 1, 'song_popularity']
altpopularity = round(sum(alt_scores) / len(alt_scores))
altpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as rock.
# Raises ZeroDivisionError if no song carries the 'rock' flag.
rock_scores = song_main.loc[song_main['rock'] == 1, 'song_popularity']
rockpopularity = round(sum(rock_scores) / len(rock_scores))
rockpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as metal.
# Raises ZeroDivisionError if no song carries the 'metal' flag.
metal_scores = song_main.loc[song_main['metal'] == 1, 'song_popularity']
metalpopularity = round(sum(metal_scores) / len(metal_scores))
metalpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as punk.
# Raises ZeroDivisionError if no song carries the 'punk' flag.
punk_scores = song_main.loc[song_main['punk'] == 1, 'song_popularity']
punkpopularity = round(sum(punk_scores) / len(punk_scores))
punkpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as pop.
# The 'pop' column is selected with bracket indexing because attribute access
# (`.pop`) would resolve to the DataFrame.pop method instead of the column.
# Raises ZeroDivisionError if no song carries the 'pop' flag.
pop_scores = song_main.loc[song_main['pop'] == 1, 'song_popularity']
poppopularity = round(sum(pop_scores) / len(pop_scores))
poppopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as hiphop.
# Raises ZeroDivisionError if no song carries the 'hiphop' flag.
hiphop_scores = song_main.loc[song_main['hiphop'] == 1, 'song_popularity']
hiphoppopularity = round(sum(hiphop_scores) / len(hiphop_scores))
hiphoppopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as rb (R&B).
# Raises ZeroDivisionError if no song carries the 'rb' flag.
rb_scores = song_main.loc[song_main['rb'] == 1, 'song_popularity']
rbpopularity = round(sum(rb_scores) / len(rb_scores))
rbpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as rap.
# Raises ZeroDivisionError if no song carries the 'rap' flag.
rap_scores = song_main.loc[song_main['rap'] == 1, 'song_popularity']
rappopularity = round(sum(rap_scores) / len(rap_scores))
rappopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as jazz.
# Raises ZeroDivisionError if no song carries the 'jazz' flag.
jazz_scores = song_main.loc[song_main['jazz'] == 1, 'song_popularity']
jazzpopularity = round(sum(jazz_scores) / len(jazz_scores))
jazzpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as blues.
# Raises ZeroDivisionError if no song carries the 'blues' flag.
blues_scores = song_main.loc[song_main['blues'] == 1, 'song_popularity']
bluespopularity = round(sum(blues_scores) / len(blues_scores))
bluespopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as folk.
# Raises ZeroDivisionError if no song carries the 'folk' flag.
folk_scores = song_main.loc[song_main['folk'] == 1, 'song_popularity']
folkpopularity = round(sum(folk_scores) / len(folk_scores))
folkpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as country.
# Raises ZeroDivisionError if no song carries the 'country' flag.
country_scores = song_main.loc[song_main['country'] == 1, 'song_popularity']
countrypopularity = round(sum(country_scores) / len(country_scores))
countrypopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as elect (electronic).
# Raises ZeroDivisionError if no song carries the 'elect' flag.
elect_scores = song_main.loc[song_main['elect'] == 1, 'song_popularity']
electpopularity = round(sum(elect_scores) / len(elect_scores))
electpopularity

In [None]:
# Average song popularity, rounded to an integer, over songs flagged as other.
# Raises ZeroDivisionError if no song carries the 'other' flag.
other_scores = song_main.loc[song_main['other'] == 1, 'song_popularity']
otherpopularity = round(sum(other_scores) / len(other_scores))
otherpopularity

In [None]:
# Question 1: What song genre has the most popular songs?
# Gather the rounded average popularity computed for each genre into a single
# dictionary keyed by a display label, so the genres can be compared and plotted.

popularitybygenre = dict([
    ('Alternative', altpopularity),
    ('Rock', rockpopularity),
    ('Metal', metalpopularity),
    ('Punk', punkpopularity),
    ('Pop', poppopularity),
    ('HipHop', hiphoppopularity),
    ('R&B', rbpopularity),
    ('Rap', rappopularity),
    ('Jazz', jazzpopularity),
    ('Blues', bluespopularity),
    ('Folk', folkpopularity),
    ('Country', countrypopularity),
    ('Electronic', electpopularity),
    ('Other', otherpopularity),
])

# show the resulting dictionary
popularitybygenre

In [None]:
# Plot the average song popularity by genre, with genres in alphabetical order.
# A bar chart is used because genres are unordered categories; a connected line
# would suggest a trend between neighbouring genres that does not exist.
genre_labels, genre_averages = zip(*sorted(popularitybygenre.items()))
plt.bar(genre_labels, genre_averages)
# Title and axis labels so the figure can be read on its own.
plt.title('Average Song Popularity by Genre')
plt.xlabel('Genre')
plt.ylabel('Average song popularity')
plt.xticks(rotation=45)
plt.show()