In [1]:
import numpy as np
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import re
import os
import zlib
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import sys
import seaborn as sns
from sklearn.cluster import KMeans
import nltk
nltk.downloader.download('vader_lexicon')
from langdetect import detect

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/thomasdorveaux/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


ModuleNotFoundError: No module named 'langdetect'

In [2]:
# Read the two input tables from CSV files located in the current working directory:
# the Spotify track/audio-feature table and the lyrics table.
Spotify = pd.read_csv('SpotifyFeatures.csv')
lyrics = pd.read_csv('only_lyrics.csv')

## Merging lyrics and the songs

In [3]:
# Normalise the join keys of both tables (lower-case, surrounding whitespace
# removed) so that track titles and artist names can be matched exactly.
for frame, key_columns in ((Spotify, ('track_name', 'artist_name')),
                           (lyrics, ('song', 'artist'))):
    for column in key_columns:
        frame[column] = frame[column].str.lower().str.strip()

In [4]:
# Inner join: keep only the tracks whose (title, artist) pair exists in both tables.
song_lyrics = pd.merge(
    Spotify,
    lyrics,
    how='inner',
    left_on=['track_name', 'artist_name'],
    right_on=['song', 'artist'],
)

In [5]:
# Most popular tracks first. reset_index() renumbers the rows and keeps the
# previous index as an 'index' column.
song_lyrics = song_lyrics.sort_values('popularity', ascending=False).reset_index()

In [6]:
# Remove the saved old index and the lyrics-side columns that are not used further.
song_lyrics = song_lyrics.drop(columns=['index', 'artist', 'song', 'link'])

In [7]:
#clean_lyrics(song_lyrics.iloc[675,-1])

## Pre-processing lyrics

In [8]:
def clean_lyrics(lyrics):
    """Return ``lyrics`` as one line of text without bracketed annotations.

    Processing steps:
      1. Every ``(...)`` or ``[...]`` span that sits on a single line
         (for example ``[Chorus]`` or ``(x2)``) is replaced by a space.
      2. Every run of whitespace - spaces, tabs, newlines, carriage returns -
         is collapsed into exactly one space.
      3. Leading and trailing whitespace is removed.

    Parameters
    ----------
    lyrics : str
        Raw lyrics text.

    Returns
    -------
    str
        The cleaned text. It is empty when the input holds nothing but
        whitespace and bracketed annotations.
    """
    without_annotations = re.sub(r'[\(\[].*?[\)\]]', ' ', lyrics)
    # A single regex pass handles runs of any length and every kind of line
    # break, so the result no longer depends on the platform's line separator
    # and never contains doubled spaces.
    return re.sub(r'\s+', ' ', without_annotations).strip()

In [9]:
def get_compression_rate(lyrics):
    """Return the fraction of bytes saved when ``lyrics`` is zlib-compressed.

    The rate is ``(original_size - compressed_size) / original_size`` where both
    sizes are measured in bytes of the UTF-8 encoded text.

    Parameters
    ----------
    lyrics : str
        Text to measure.

    Returns
    -------
    float
        The compression rate. It is below 1, and negative whenever the
        compressed form is larger than the input (which happens for very
        short texts). Empty text yields ``0.0``.
    """
    original = lyrics.encode('utf-8')
    if not original:
        # Nothing to compress; avoids dividing by a zero length.
        return 0.0
    compressed = zlib.compress(original)
    return (len(original) - len(compressed)) / len(original)

In [10]:
# Clean every lyric in place, then store the compression rate of the cleaned
# text in a new 'compression_rate' column.
song_lyrics['text'] = song_lyrics['text'].apply(clean_lyrics)
song_lyrics['compression_rate'] = song_lyrics['text'].apply(get_compression_rate)

In [11]:
# De-duplicate: when a track_id occurs several times, keep its most popular row.
song_lyrics = (
    song_lyrics
    .sort_values(by='popularity', ascending=False)
    .drop_duplicates(subset='track_id', keep="first")
)

# Tag each lyric with the language code returned by detect() and retain only
# the rows tagged "en".
song_lyrics['language'] = song_lyrics['text'].map(detect)
is_english = song_lyrics['language'] == "en"
song_lyrics = song_lyrics[is_english]

NameError: name 'detect' is not defined

### Addition of sentiment analysis (+, -, =) values

In [12]:
def count_word_sentiments(text, analyzer, score_cache):
    """Count the words of ``text`` rated positive, neutral and negative.

    Each whitespace-separated word is scored on its own with
    ``analyzer.polarity_scores(word)['compound']`` and classified as:
      * positive when the compound score is >= 0.5,
      * neutral when it lies strictly between -0.5 and 0.5,
      * negative otherwise.

    Parameters
    ----------
    text : str
        Lyrics to analyse.
    analyzer : object
        Anything exposing ``polarity_scores(str) -> dict`` with a 'compound' key.
    score_cache : dict
        Maps a word to its compound score. It is updated in place so that a
        word repeated within or across songs is scored only once.

    Returns
    -------
    tuple of int
        ``(positive, neutral, negative)`` word counts.
    """
    positive = neutral = negative = 0
    for word in text.split():
        compound = score_cache.get(word)
        if compound is None:
            compound = analyzer.polarity_scores(word)['compound']
            score_cache[word] = compound
        if compound >= 0.5:
            positive += 1
        elif compound > -0.5:
            neutral += 1
        else:
            negative += 1
    return positive, neutral, negative


sia = SentimentIntensityAnalyzer()
word_scores = {}
sentiment_records = []
# Columns are addressed by name: positional lookups break as soon as a column
# (for example 'language') is added to or removed from song_lyrics.
for track_id, lyric in zip(song_lyrics['track_id'], song_lyrics['text']):
    num_positive, num_neutral, num_negative = count_word_sentiments(lyric, sia, word_scores)
    sentiment_records.append({
        'id': track_id,
        'senti_positive': num_positive,
        'senti_neutral': num_neutral,
        'senti_negative': num_negative,
    })

# Build the frame once from the collected records; the count columns get an
# integer dtype instead of object.
sentiment = pd.DataFrame(
    sentiment_records,
    columns=['id', 'senti_positive', 'senti_neutral', 'senti_negative'],
)

In [None]:
# Attach the per-track word counts; after the join 'id' repeats 'track_id',
# so it is dropped again.
song_lyrics = (
    song_lyrics
    .merge(sentiment, left_on='track_id', right_on='id')
    .drop(columns='id')
)
# Net sentiment: positive word count minus negative word count.
song_lyrics['senti_total'] = song_lyrics['senti_positive'].sub(song_lyrics['senti_negative'])

## Addition of classes

1. Popularity

In [None]:
# Popularity value at every percentile (0 %, 1 %, ..., 100 %), as a two-column
# frame: 'quantile' (the probability) and 'popularity' (the value).
quantile = (
    song_lyrics['popularity']
    .quantile(np.arange(0, 1.01, 0.01))
    .reset_index()
    .rename(columns={'index': 'quantile'})
)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors as positional arguments, and older versions accept keywords too.
sns.lineplot(x=quantile['popularity'], y=quantile['quantile'])

In [None]:
# Cluster the percentile curve into 5 groups.
# - random_state makes the clustering reproducible; the label thresholds
#   hard-coded in a later cell were read off one particular run.
# - The feature columns are selected explicitly so that re-running this cell
#   does not feed the previously added 'cluster' column back into the fit.
# NOTE(review): 'popularity' has a much larger numeric range than 'quantile',
# so it presumably dominates the distance computation - confirm this is intended.
kmeans = KMeans(n_clusters=5, random_state=42)
quantile['cluster'] = kmeans.fit_predict(quantile[['quantile', 'popularity']])

In [None]:
# Visual check: popularity value of each percentile against the k-means label it received.
plt.scatter(quantile['popularity'],quantile['cluster'])

In [None]:
def popularity_range(cluster_rows):
    """Return (lowest, highest) popularity among the given percentile rows."""
    values = cluster_rows['popularity']
    return np.min(values), np.max(values)


# Split the percentile table by k-means label (0..4).
zero, one, two, three, four = (
    quantile[quantile['cluster'] == label] for label in range(5)
)

min_zero, max_zero = popularity_range(zero)
min_one, max_one = popularity_range(one)
min_two, max_two = popularity_range(two)
min_three, max_three = popularity_range(three)
min_four, max_four = popularity_range(four)

# Report the popularity interval covered by each cluster.
for label, lowest, highest in (
    (1, min_one, max_one),
    (2, min_two, max_two),
    (3, min_three, max_three),
    (4, min_four, max_four),
    (0, min_zero, max_zero),
):
    print(label, lowest, highest)

In [None]:
# Map popularity to an ordinal class (4 = most popular ... 0 = least popular).
# The interval bounds are inclusive on both sides.
# NOTE(review): the bounds assume integer popularity values in 0..100; a
# fractional value such as 77.5 matches no interval and gets the default 0.
popularity_values = song_lyrics['popularity']
conditions = [
    popularity_values.between(78, 100),
    popularity_values.between(55, 77),
    popularity_values.between(39, 54),
    popularity_values.between(19, 38),
    # Was ">= 18 and <= 0", which no value can satisfy; class 0 only came out
    # right because np.select falls back to 0.
    popularity_values.between(0, 18),
]
choices = [4, 3, 2, 1, 0]
song_lyrics['label'] = np.select(conditions, choices, default=0)
song_lyrics.tail(1)

2. Acousticness

In [None]:
# Acousticness value at every percentile (0 %, 1 %, ..., 100 %), as a
# two-column frame: 'quantile' (the probability) and 'acousticness' (the value).
quantile_acou = (
    song_lyrics['acousticness']
    .quantile(np.arange(0, 1.01, 0.01))
    .reset_index()
    .rename(columns={'index': 'quantile'})
)
# x and y are passed by keyword: seaborn 0.12+ no longer accepts the data
# vectors as positional arguments, and older versions accept keywords too.
sns.lineplot(x=quantile_acou['acousticness'], y=quantile_acou['quantile'])

In [None]:
# Cluster the acousticness percentile curve into 4 groups.
# - random_state makes the clustering reproducible; the label thresholds
#   hard-coded in a later cell were read off one particular run.
# - The feature columns are selected explicitly so that re-running this cell
#   does not feed the previously added 'cluster' column back into the fit.
kmeans = KMeans(n_clusters=4, random_state=42)
quantile_acou['cluster'] = kmeans.fit_predict(quantile_acou[['quantile', 'acousticness']])
plt.scatter(quantile_acou['acousticness'], quantile_acou['cluster'])

In [None]:
def acousticness_range(cluster_rows):
    """Return (lowest, highest) acousticness among the given percentile rows."""
    values = cluster_rows['acousticness']
    return np.min(values), np.max(values)


# Split the percentile table by k-means label. The preceding cell fits 4
# clusters, so 'four' (label 4) selects no rows; it is not used below.
zero, one, two, three, four = (
    quantile_acou[quantile_acou['cluster'] == label] for label in range(5)
)

min_zero, max_zero = acousticness_range(zero)
min_one, max_one = acousticness_range(one)
min_two, max_two = acousticness_range(two)
min_three, max_three = acousticness_range(three)

# Report the acousticness interval covered by each cluster.
for label, lowest, highest in (
    (0, min_zero, max_zero),
    (1, min_one, max_one),
    (2, min_two, max_two),
    (3, min_three, max_three),
):
    print(label, lowest, highest)

In [None]:
# Map acousticness to an ordinal class (3 = most acoustic ... 0 = least acoustic).
# The classes are contiguous: each one starts at the lower bound observed for
# its cluster and runs up to the next class's lower bound. The former closed
# intervals left gaps (for example 0.594-0.6159) whose values, like anything
# above 0.995, silently fell into class 0.
acousticness_values = song_lyrics['acousticness']
conditions_acou = [
    acousticness_values >= 0.6158599999999996,
    (acousticness_values >= 0.251) & (acousticness_values < 0.6158599999999996),
    (acousticness_values >= 0.0363) & (acousticness_values < 0.251),
    acousticness_values < 0.0363,
]
choices_acou = [3, 2, 1, 0]
# default=0 only applies to rows matching no condition (missing values).
song_lyrics['label_acou'] = np.select(conditions_acou, choices_acou, default=0)

# Feature Engineering

1. Analysing feature distributions

In [None]:
# Distribution of the raw 'duration_ms' column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases - confirm the installed version still provides it.
sns.distplot(song_lyrics['duration_ms'])

In [None]:
# One distribution plot per numeric feature on a 2 x 5 grid, filled row by row.
f, axes = plt.subplots(2, 5, figsize=(14, 8), sharex=False)
feature_columns = [
    'popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness',
    'liveness', 'loudness', 'speechiness', 'valence', 'tempo',
]
for feature, ax in zip(feature_columns, axes.flat):
    sns.distplot(song_lyrics[feature], ax=ax)

In [None]:
# Natural-log versions of 'duration_ms' and 'liveness', kept as separate Series
# (the columns of song_lyrics themselves are left unchanged).
ln_duration_ms, ln_liveness = (
    np.log(song_lyrics[column]) for column in ('duration_ms', 'liveness')
)
#sns.distplot(ln_duration_ms)
#sns.distplot(ln_liveness)

In [None]:
# Histogram of track popularity using 20 bins.
plt.hist(song_lyrics['popularity'], bins=20)

In [None]:
# Distribution of the lyric compression rate computed during pre-processing.
sns.distplot(song_lyrics['compression_rate'])

In [None]:
#plt.scatter(song_lyrics['popularity'],song_lyrics['senti_positive'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['senti_neutral'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['senti_total'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['speechiness'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['speechiness'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['valence'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['compression_rate'] )
#plt.scatter(song_lyrics['popularity'],song_lyrics['acousticness'] )

2. Encoding

In [None]:
# Give every column of song_lyrics its own module-level name (one Series per
# feature) before classification.
# NOTE(review): several of these names are very generic (key, mode, text) and
# will shadow any same-named variable used elsewhere in the notebook.

# track features (identification / metadata)
genre = song_lyrics['genre']
artist_name = song_lyrics['artist_name']
track_name = song_lyrics['track_name']
track_id = song_lyrics['track_id']

# spotify features (audio descriptors plus the derived acousticness class)
popularity = song_lyrics['popularity']
acousticness = song_lyrics['acousticness']
class_acou = song_lyrics['label_acou']
danceability = song_lyrics['danceability']
duration_ms = song_lyrics['duration_ms']  # prefer ln_duration_ms in models (more centred distribution)
energy = song_lyrics ['energy']
instrumentalness = song_lyrics['instrumentalness']
key = song_lyrics['key']
liveness = song_lyrics['liveness']  # ln_liveness is expected to work better as well
loudness = song_lyrics['loudness']
mode = song_lyrics['mode']
speechiness = song_lyrics['speechiness']
tempo = song_lyrics['tempo']
time_signature = song_lyrics['time_signature']
valence = song_lyrics['valence']

# sentiment features (lyrics text and the values derived from it)
text = song_lyrics['text']
compression_rate = song_lyrics['compression_rate']
senti_positive = song_lyrics['senti_positive']
senti_neutral = song_lyrics['senti_neutral']
senti_negative = song_lyrics['senti_negative']
senti_total = song_lyrics['senti_total']

In [None]:
# One-hot encode the discrete features: one indicator column per distinct
# value, named '<prefix>_<value>'; missing values get no column of their own.

# track features
genre_encoded = pd.get_dummies(genre, prefix='genre', dummy_na=False)
# Encodes the artist names (previously this encoded 'genre' a second time).
artist_name_encoded = pd.get_dummies(artist_name, prefix='artist_name', dummy_na=False)

# spotify features
key_encoded = pd.get_dummies(key, prefix='key', dummy_na=False)
mode_encoded = pd.get_dummies(mode, prefix='mode', dummy_na=False)
time_signature_encoded = pd.get_dummies(time_signature, prefix='time_signature', dummy_na=False)
# Encodes the acousticness class (previously this encoded 'time_signature').
class_acou_encoded = pd.get_dummies(class_acou, prefix='class_acou', dummy_na=False)

# sentiment features
senti_positive_encoded = pd.get_dummies(senti_positive, prefix='senti_positive', dummy_na=False)
# Misspelled name kept as an alias so existing references keep working.
senti_positive_enoded = senti_positive_encoded
senti_neutral_encoded = pd.get_dummies(senti_neutral, prefix='senti_neutral', dummy_na=False)
senti_negative_encoded = pd.get_dummies(senti_negative, prefix='senti_negative', dummy_na=False)
senti_total_encoded = pd.get_dummies(senti_total, prefix='senti_total', dummy_na=False)