In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import operator
import warnings
warnings.simplefilter('ignore')
%matplotlib inline

# Some info about what our questions are

## First we will clean the data

In [2]:
columns = ['Artist', 'Title', 'Album', '#_of_views', 'Release_date', '#_feat_artists', 
           'Artist_pop', 'Genre', 'Followers', 'Danceability', 'Track_pop', 'Explicit', 
           'Energy', 'Valence', 'Duration', 'Loudness', '#_words', 'Lyrics']
data = pd.read_csv('lyricDatasetUpdated.csv', names = columns)

Based on the greatest similarity between track titles, the data from Spotify was concatenated to the current Genius dataset. Now the dataset has additional features from Spotify. Some of the feature descriptions are grabbed straight from the Spotify API. The full list of features are as follow:<br>

1. **`Artist`** - Song artist<br>
DESCRIPTION: Name of artist
2. **`Title`** - Song title<br>
DESCRIPTION: Name of song
3. **`Album`** - Album Title<br>
DESCRIPTION: Name of album the song is from.
4. **`#_of_views`** - Number of pageviews for the lyric<br>
DESCRIPTION: Amount of views the lyric page got on the Genius website.
5. **`Release_date`** - Release date of song<br>
DESCRIPTION: The date the song was released
6. **`#_feat_artists`** - How many featured artists<br>
DESCRIPTION: The amount of artists featured on song
7. **`Artist_pop`** - Artist Popularity<br>
DESCRIPTION: How popular an artist is on Spotify. Range is 0-100.
8. **`Genre`** - Genre(s)<br>
DESCRIPTION: Genre the artist is considered as, separated by ' / '.
9. **`Followers`**<br>
DESCRIPTION: The amount of users following an artist on Spotify
10. **`Danceability`**<br>
DESCRIPTION: Danceability describes how suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity. A value of 0.0 is least danceable and 1.0 is most danceable. (Gotten from Spotify)
11. **`Track_pop`**<br>
DESCRIPTION: Track popularity is separate from artist popularity. It's the metric grabbed from Spotify on how popular a track is.
12. **`Explicit`**<br>
DESCRIPTION: If a song has lyrics on mature themes, or foul language, it is considered explicit. This is a binary value.
13. **`Energy`**<br>
DESCRIPTION: Energy is a measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity. Typically, energetic tracks feel fast, loud, and noisy. For example, death metal has high energy, while a Bach prelude scores low on the scale. Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.
14. **`Valence`**<br>
DESCRIPTION: A measure from 0.0 to 1.0 describing the musical positiveness conveyed by a track. Tracks with high valence sound more positive (e.g. happy, cheerful, euphoric), while tracks with low valence sound more negative (e.g. sad, depressed, angry).
15. **`Duration`** - Duration (in Milliseconds)<br>
DESCRIPTION: The length of the song in milliseconds
16. **`Loudness`**<br>
DESCRIPTION: The overall loudness of a track in decibels (dB). Loudness values are averaged across the entire track and are useful for comparing relative loudness of tracks. Loudness is the quality of a sound that is the primary psychological correlate of physical strength (amplitude). Values typical range between -60 and 0 db.
17. **`#_words`** - Total words in lyrics<br>
DESCRIPTION: How many words each lyric has.
18. **`Lyrics`**<br>
DESCRIPTION: The lyrics for the specific song.

In [3]:
data['Genre'].fillna('hip hop', inplace = True)

In [4]:
data.dropna(subset = ['Lyrics'], inplace = True)
data.reset_index(drop = True, inplace = True)

In [5]:
genres = data['Genre']
g_list = list(genres)
genre_list = ['country', 'hip hop', 'pop', 'rock', 'punk', 'metal', 
              'australian'] #Australian reggae is outlier

In [6]:
correct_guesses = {}
for index, genre in enumerate(g_list):
    if type(genre) == float:
        continue
    else:
        if len(genre) == 1:
            print(genre)
            split_genres = genre
        else:
            split_genres = genre.split(' / ') 
    for g in genre_list:
        correct = 0
        if len(split_genres) == 1 and split_genres[0] == 'rap':
            correct_guesses['hip hop'] = correct
        else:
            for split in split_genres:
                if g in split:
                    correct += 1
            correct_guesses[g] = correct
    
    correct_genre = max(correct_guesses, key=lambda key: correct_guesses[key])
    if correct_genre == 'australian':
        g_list[index] = 'pop'
    elif correct_genre == 'punk':
        g_list[index] = 'rock'
    else:
        g_list[index] = correct_genre
    correct_guesses.clear()

In [7]:
data['Genre'] = g_list

In [8]:
data['Lyrics'] = data['Lyrics'].astype(str)
data['Lyrics'] = data['Lyrics'].str.lower()
data['Lyrics'] = data['Lyrics'].str.translate(string.punctuation)
data['Lyrics'] = data['Lyrics'].str.replace('+', ' ').replace('\\', '').replace('"', '')

In [9]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = []
analyzer = SentimentIntensityAnalyzer()
for label, content in data['Lyrics'].items():
    sentiment.append(analyzer.polarity_scores(content))

In [10]:
neg = []
neu = []
pos = []
compound = []

In [11]:
for sentiments in sentiment:
    neg.append(sentiments['neg'])
    neu.append(sentiments['neu'])
    pos.append(sentiments['pos'])
    compound.append(sentiments['compound'])

In [12]:
data['negative_score'] = neg
data['neutral_score'] = neu
data['positive_score'] = pos
data['compound_score'] = compound

In [13]:
data.head(1)

Unnamed: 0,Artist,Title,Album,#_of_views,Release_date,#_feat_artists,Artist_pop,Genre,Followers,Danceability,...,Energy,Valence,Duration,Loudness,#_words,Lyrics,negative_score,neutral_score,positive_score,compound_score
0,Miranda Lambert,Vice,The Weight of These Wings,42261,2016-07-18,0,72,country,2205152,0.568,...,0.562,0.247,240280,-7.775,233,sting of the needle dropping on a vinyl neon s...,0.034,0.868,0.098,0.9311


# Classification goes below here
We will conduct some experiments on which models to use, and which feature combinations work best with these models.

# Predict Popularity

In [14]:
data_pop = data.loc[:,'Danceability':'compound_score']
#data_pop = data.loc[:,'Danceability':'Loudness']

In [15]:
data_pop['Genre'] = data['Genre']

In [16]:
data_pop.isnull().sum()

Danceability       0
Track_pop          0
Explicit          16
Energy             0
Valence            0
Duration           0
Loudness           0
#_words            0
Lyrics             0
negative_score     0
neutral_score      0
positive_score     0
compound_score     0
Genre              0
dtype: int64

In [17]:
data_pop.dropna(subset = ['Explicit'], inplace = True)
data_pop.reset_index(drop = True, inplace = True)

In [18]:
data_pop['Explicit'] = data_pop['Explicit'].astype(int)

In [19]:
data_pop = pd.concat([data_pop, pd.get_dummies(data_pop['Genre'], 
                    dummy_na=True)], axis = 1).drop(['Genre'], axis = 1)

In [20]:
data_pop.drop(columns = ['Lyrics'], inplace = True)
data_pop.head(1)

Unnamed: 0,Danceability,Track_pop,Explicit,Energy,Valence,Duration,Loudness,#_words,negative_score,neutral_score,positive_score,compound_score,country,hip hop,metal,pop,rock,NaN
0,0.568,57,0,0.562,0.247,240280,-7.775,233,0.034,0.868,0.098,0.9311,1,0,0,0,0,0


In [24]:
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

<br><br>
Linear Regression for track popularity

In [25]:
X = data_pop.drop(['Track_pop'], axis = 1)
y = data_pop['Track_pop']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [27]:
pipeline = make_pipeline(StandardScaler(), SGDRegressor())

pipeline.fit(X = X_train, y = y_train)
y_predict = pipeline.predict(X = X_test)
np.absolute(y_predict - y_test).mean()

16.323284886464826

<br><br>
AdaBoostRegressor and DecisionTreeRegressor

In [28]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

regr_1 = DecisionTreeRegressor(max_depth=4)
regr_2 = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=300)

pipeline = make_pipeline(StandardScaler(), regr_2)
pipeline.fit(X = X_train, y = y_train)
    
y_predict = pipeline.predict(X = X_test)
np.absolute(y_predict - y_test).mean()

16.53934650262033

<br><br>
GradientBoostingRegressor & RandomForestRegressor & LinearRegression<br>
VotingRegressor

In [29]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import VotingRegressor

reg1 = GradientBoostingRegressor(random_state=1, n_estimators=10)
reg2 = RandomForestRegressor(random_state=1, n_estimators=10)
reg3 = LinearRegression()
ereg = VotingRegressor(estimators=[('gb', reg1), ('rf', reg2), ('lr', reg3)])

pipeline = make_pipeline(StandardScaler(), ereg)
pipeline.fit(X = X_train, y = y_train)

y_predict = pipeline.predict(X = X_test)
np.absolute(y_predict - y_test).mean()

16.078295259508778

<br><br>
Break popularity into <70 and >=70

In [30]:
y = np.where(data_pop['Track_pop'] < 70, 'looser', 'winner')

In [31]:
np.unique(y, return_counts = True)

(array(['looser', 'winner'], dtype='<U6'), array([2766,  227]))

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

<br>
AdaBoost Classifier

In [33]:
from sklearn.ensemble import AdaBoostClassifier

clf = AdaBoostClassifier(n_estimators = 100, random_state = 0)
pipeline = make_pipeline(StandardScaler(), clf)
pipeline.fit(X = X_train, y = y_train)
pipeline.score(X = X_test, y = y_test)

0.9225634178905207

<br>
Random Forest Classifier

In [34]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth = 4, random_state=0)
pipeline = make_pipeline(StandardScaler(), clf)
pipeline.fit(X = X_train, y = y_train)
pipeline.score(X = X_test, y = y_test)

0.9265687583444593