# Sentiment Analysis on Lyrics by Genre
2023-04-21<br>
Evangeline Chang

In [1]:
%pip install xlrd

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import re
import xlrd
import numpy as np
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Load the lyrics workbook; the path is relative to the notebook's working directory.
lyrics_path = 'data/RQ1/lyrics1.xlsx'
all_lyrics = pd.read_excel(lyrics_path)

In [4]:
# data cleaning
def _scrub_lyrics(raw):
    """Return `raw` as a string with the scraping artifacts removed.

    Substitutions are applied in this order:
      1. drop everything up to and including the first "Lyrics" on the opening line,
      2. drop [bracketed] tags,
      3. turn newlines into spaces,
      4. drop "<digits>Embed" markers.
    """
    text = str(raw)  # non-string cells (e.g. NaN) are converted to their string form
    for pattern, replacement in (
        (r'^.*?Lyrics', ''),
        (r'\[(.*?)\]', ''),
        (r'\n', ' '),
        (r'\d+Embed', ''),
    ):
        text = re.sub(pattern, replacement, text)
    return text


all_lyrics['lyrics'] = all_lyrics['lyrics'].map(_scrub_lyrics)

# Missing genres become '' and every label is lower-cased so later substring matching is case-consistent.
all_lyrics['genres'] = all_lyrics['genres'].fillna('').map(lambda g: str(g).lower())

all_lyrics.head()

Unnamed: 0,year,No.,Title,Artist(s),lyrics,genres,subgenres,time
0,2003,1,"""In da Club""",50 Cent,"Go, go Go, go, go, go Go, shawty, it's your...",hip hop,"east coast hip hop, gangster rap, hip hop, pop...",03:13:00
1,2003,2,"""Ignition""",R. Kelly,You remind me of something I just can't think...,,,03:06:00
2,2003,3,"""Get Busy""",Sean Paul,"Shake dat ting, miss Cana, Cana Shake dat tin...","pop, world/traditional, hip hop","dance pop, dancehall, pop rap",03:31:00
3,2003,4,"""Crazy in Love""",Beyoncé featuring Jay-Z,"Yes! (Whoo, ow!) So crazy right now Most incr...","pop, hip hop, r&b","dance pop, pop, r&b, east coast hip hop, hip h...",03:56:00
4,2003,5,"""When I'm Gone""",3 Doors Down,"Yeah, it's my life In my own words, I guess ...","metal, pop","alternative metal, nu metal, pop rock, post-gr...",04:20:00


In [5]:
# Keep only the lyrics text; selecting with a list yields a one-column DataFrame rather than a Series.
df = all_lyrics.loc[:, ["lyrics"]]
df.head(n=5)

Unnamed: 0,lyrics
0,"Go, go Go, go, go, go Go, shawty, it's your..."
1,You remind me of something I just can't think...
2,"Shake dat ting, miss Cana, Cana Shake dat tin..."
3,"Yes! (Whoo, ow!) So crazy right now Most incr..."
4,"Yeah, it's my life In my own words, I guess ..."


In [6]:
# Independent copies: df1 receives the TextBlob columns, df2 the VADER columns.
df1, df2 = df.copy(), df.copy()

## TextBlob

In [7]:
from textblob import TextBlob

In [8]:
def _textblob_label(polarity):
    """Label a TextBlob polarity score by its sign: >0 Positive, <0 Negative, otherwise Neutral."""
    if polarity > 0:
        return "Positive"
    if polarity < 0:
        return "Negative"
    return "Neutral"


# Score every lyric once, then attach each result column in a single assignment
# instead of growing the frame cell-by-cell inside iterrows(). The columns are
# created even when df1 has no rows, so later rename/selection cells still work.
tb_sentiments = [TextBlob(text).sentiment for text in df1['lyrics']]
tb_polarity = [sentiment[0] for sentiment in tb_sentiments]      # sentiment[0]: polarity
tb_subjectivity = [sentiment[1] for sentiment in tb_sentiments]  # sentiment[1]: subjectivity

df1['polarity'] = pd.Series(tb_polarity, index=df1.index, dtype='float64')
df1['subjectivity'] = pd.Series(tb_subjectivity, index=df1.index, dtype='float64')
df1['Sentiment'] = pd.Series(
    [_textblob_label(p) for p in tb_polarity], index=df1.index, dtype='object'
)

In [9]:
# Preview the TextBlob result columns on the first five rows.
df1.head(n=5)

Unnamed: 0,lyrics,polarity,subjectivity,Sentiment
0,"Go, go Go, go, go, go Go, shawty, it's your...",0.144514,0.577083,Positive
1,You remind me of something I just can't think...,-0.08384,0.408799,Negative
2,"Shake dat ting, miss Cana, Cana Shake dat tin...",0.316818,0.4575,Positive
3,"Yes! (Whoo, ow!) So crazy right now Most incr...",0.018005,0.643786,Positive
4,"Yeah, it's my life In my own words, I guess ...",0.095136,0.385112,Positive


## VADER

In [10]:
import nltk
# Fetch the VADER lexicon resource; per the recorded output below, the download
# is skipped when the package is already up to date.
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/evangeline/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [11]:
def _vader_label(score):
    """Label a VADER score dict by comparing its 'neg' and 'pos' shares.

    The 'compound' value is not consulted, so the label can disagree with the
    sign of the compound score.
    """
    if score['neg'] > score['pos']:
        return "Negative"
    if score['pos'] > score['neg']:
        return "Positive"
    return "Neutral"


# Build the analyzer once; constructing it inside the loop repeated its setup for every row.
vader_analyzer = SentimentIntensityAnalyzer()
vader_scores = [vader_analyzer.polarity_scores(text) for text in df2['lyrics']]

# Column order (Sentiment, neg, neu, pos, compound) is the order the row-wise loop produced.
df2['Sentiment'] = pd.Series(
    [_vader_label(score) for score in vader_scores], index=df2.index, dtype='object'
)
for score_key in ('neg', 'neu', 'pos', 'compound'):
    df2[score_key] = pd.Series(
        [score[score_key] for score in vader_scores], index=df2.index, dtype='float64'
    )

In [12]:
# Preview the VADER result columns on the first five rows.
df2.head(n=5)

Unnamed: 0,lyrics,Sentiment,neg,neu,pos,compound
0,"Go, go Go, go, go, go Go, shawty, it's your...",Negative,0.11,0.794,0.096,-0.9618
1,You remind me of something I just can't think...,Negative,0.054,0.911,0.035,-0.9375
2,"Shake dat ting, miss Cana, Cana Shake dat tin...",Positive,0.082,0.807,0.111,0.9873
3,"Yes! (Whoo, ow!) So crazy right now Most incr...",Negative,0.161,0.717,0.121,-0.9654
4,"Yeah, it's my life In my own words, I guess ...",Positive,0.086,0.788,0.125,0.9896


## Comparison

In [13]:
# Give the TextBlob columns tool-specific names so they stay distinguishable
# once they sit next to the VADER columns.
df1 = df1.rename(columns={
    'Sentiment': 'Textblob',
    'polarity': 'TB_Polarity',
    'subjectivity': 'TB_Subjectivity',
})
df1.head()

Unnamed: 0,lyrics,TB_Polarity,TB_Subjectivity,Textblob
0,"Go, go Go, go, go, go Go, shawty, it's your...",0.144514,0.577083,Positive
1,You remind me of something I just can't think...,-0.08384,0.408799,Negative
2,"Shake dat ting, miss Cana, Cana Shake dat tin...",0.316818,0.4575,Positive
3,"Yes! (Whoo, ow!) So crazy right now Most incr...",0.018005,0.643786,Positive
4,"Yeah, it's my life In my own words, I guess ...",0.095136,0.385112,Positive


In [14]:
# Rename the VADER label/compound columns and drop the lyrics text (df1 already carries it).
# Done without in-place mutation and with errors='ignore' so re-running this cell
# does not raise KeyError once 'lyrics' has already been removed.
df2 = (
    df2
    .rename(columns={'Sentiment': 'Vader', 'compound': 'VD_Polarity'})
    .drop(columns=['lyrics'], errors='ignore')
)
df2.head()

Unnamed: 0,Vader,neg,neu,pos,VD_Polarity
0,Negative,0.11,0.794,0.096,-0.9618
1,Negative,0.054,0.911,0.035,-0.9375
2,Positive,0.082,0.807,0.111,0.9873
3,Negative,0.161,0.717,0.121,-0.9654
4,Positive,0.086,0.788,0.125,0.9896


In [15]:
# Place the TextBlob and VADER frames side by side (column-wise concat aligns on the row index),
# then keep only the columns used for the comparison.
pdList = [df1, df2]
comparison_columns = ["lyrics", "Textblob", "Vader", "TB_Polarity", "VD_Polarity", "neg"]
df_comp = pd.concat(pdList, axis=1)[comparison_columns]
df_comp.head()

Unnamed: 0,lyrics,Textblob,Vader,TB_Polarity,VD_Polarity,neg
0,"Go, go Go, go, go, go Go, shawty, it's your...",Positive,Negative,0.144514,-0.9618,0.11
1,You remind me of something I just can't think...,Negative,Negative,-0.08384,-0.9375,0.054
2,"Shake dat ting, miss Cana, Cana Shake dat tin...",Positive,Positive,0.316818,0.9873,0.082
3,"Yes! (Whoo, ow!) So crazy right now Most incr...",Positive,Negative,0.018005,-0.9654,0.161
4,"Yeah, it's my life In my own words, I guess ...",Positive,Positive,0.095136,0.9896,0.086


In [16]:
# all_lyrics already holds the lyrics text, so drop it from the comparison frame before joining.
# A non-in-place drop with errors='ignore' avoids mutating a derived frame and keeps the
# cell re-runnable (the in-place version raised KeyError on a second run).
df_comp = df_comp.drop(columns=['lyrics'], errors='ignore')
merged_df = all_lyrics.join(df_comp)  # index-aligned join adds the sentiment columns to the song metadata
merged_df.head()

Unnamed: 0,year,No.,Title,Artist(s),lyrics,genres,subgenres,time,Textblob,Vader,TB_Polarity,VD_Polarity,neg
0,2003,1,"""In da Club""",50 Cent,"Go, go Go, go, go, go Go, shawty, it's your...",hip hop,"east coast hip hop, gangster rap, hip hop, pop...",03:13:00,Positive,Negative,0.144514,-0.9618,0.11
1,2003,2,"""Ignition""",R. Kelly,You remind me of something I just can't think...,,,03:06:00,Negative,Negative,-0.08384,-0.9375,0.054
2,2003,3,"""Get Busy""",Sean Paul,"Shake dat ting, miss Cana, Cana Shake dat tin...","pop, world/traditional, hip hop","dance pop, dancehall, pop rap",03:31:00,Positive,Positive,0.316818,0.9873,0.082
3,2003,4,"""Crazy in Love""",Beyoncé featuring Jay-Z,"Yes! (Whoo, ow!) So crazy right now Most incr...","pop, hip hop, r&b","dance pop, pop, r&b, east coast hip hop, hip h...",03:56:00,Positive,Negative,0.018005,-0.9654,0.161
4,2003,5,"""When I'm Gone""",3 Doors Down,"Yeah, it's my life In my own words, I guess ...","metal, pop","alternative metal, nu metal, pop rock, post-gr...",04:20:00,Positive,Positive,0.095136,0.9896,0.086


In [17]:
# calculate count of each genre
genres_list = [
    'pop', 'hip hop', 'rock', 'latin', 'dance/electronic', 'r&b', 'country',
    'folk/acoustic', 'metal', 'jazz', 'easy listening', 'blues', 'world/traditional',
]

# A song whose genres string mentions several genres is counted once under each of them.
genre_counts = []
for genre_label in genres_list:
    genre_counts.append(merged_df['genres'].str.contains(genre_label).sum())

# Rank genres from most to least frequent.
ranked_counts = sorted(zip(genres_list, genre_counts), key=lambda pair: pair[1], reverse=True)
genre_count_dict = dict(ranked_counts)

for genre_label, n_songs in genre_count_dict.items():
    print(f"{genre_label}: {n_songs}")
print(f"\nSum: {sum(genre_counts)}")

pop: 1427
hip hop: 913
r&b: 545
country: 210
rock: 152
folk/acoustic: 134
dance/electronic: 133
metal: 67
latin: 57
world/traditional: 29
easy listening: 18
blues: 10
jazz: 2

Sum: 3697


In [18]:
# Spot check: TextBlob polarity summary for songs whose genres mention 'world/traditional'.
is_world_traditional = merged_df['genres'].str.contains('world/traditional')
a = merged_df.loc[is_world_traditional]
a['TB_Polarity'].describe()

count    29.000000
mean      0.176388
std       0.183263
min      -0.098958
25%       0.081852
50%       0.156250
75%       0.282791
max       0.750000
Name: TB_Polarity, dtype: float64

In [19]:
# sentiment score for all genres from the past 20 years
genres_mean = {}

for genre in genres_list:
    # Build the genre membership mask once and reuse it for the mean, the count and the spread.
    genre_scores = merged_df.loc[merged_df['genres'].str.contains(genre), 'TB_Polarity']
    genres_mean[genre] = (genre_scores.mean(), len(genre_scores), genre_scores.std())

# Rank on the mean score only. Sorting the raw (mean, count, std) tuples is unreliable
# when a genre has no songs (its mean is NaN); here such genres are placed last.
score_columns = ['Average Sentiment Score', 'Count', 'STD']
genres_sentiment_df = (
    pd.DataFrame.from_dict(genres_mean, orient='index', columns=score_columns)
    .sort_values('Average Sentiment Score', ascending=False, na_position='last')
)

# Same ranking as a plain dict of (mean, count, std) tuples.
sorted_genres = {genre: genres_mean[genre] for genre in genres_sentiment_df.index}

display(genres_sentiment_df)
genres_sentiment_df.to_csv('data/RQ1/genres_sentiment.csv')

Unnamed: 0,Average Sentiment Score,Count,STD
world/traditional,0.176388,29,0.183263
easy listening,0.167961,18,0.14941
jazz,0.148346,2,0.27334
folk/acoustic,0.110295,134,0.175103
dance/electronic,0.092778,133,0.198814
metal,0.092507,67,0.174314
r&b,0.08866,545,0.161038
pop,0.087315,1427,0.173498
country,0.079891,210,0.149796
rock,0.078955,152,0.162191


In [20]:
# keeping only the songs that belong to the three genres
keep = ['pop', 'hip hop', 'r&b']
keep_pattern = '|'.join(keep)  # regex alternation: a row matches if it mentions any kept genre
in_kept_genre = merged_df['genres'].fillna('').str.contains(keep_pattern)

# Keep the matching rows and discard the metadata columns not needed for the yearly analysis.
select_genre_df = merged_df[in_kept_genre].drop(
    columns=['No.', 'Artist(s)', 'Title', 'subgenres', 'time']
)
display(select_genre_df)

Unnamed: 0,year,lyrics,genres,Textblob,Vader,TB_Polarity,VD_Polarity,neg
0,2003,"Go, go Go, go, go, go Go, shawty, it's your...",hip hop,Positive,Negative,0.144514,-0.9618,0.110
2,2003,"Shake dat ting, miss Cana, Cana Shake dat tin...","pop, world/traditional, hip hop",Positive,Positive,0.316818,0.9873,0.082
3,2003,"Yes! (Whoo, ow!) So crazy right now Most incr...","pop, hip hop, r&b",Positive,Negative,0.018005,-0.9654,0.161
4,2003,"Yeah, it's my life In my own words, I guess ...","metal, pop",Positive,Positive,0.095136,0.9896,0.086
5,2003,"All day Starin' at the ceilin', makin' Friend...","folk/acoustic, pop, metal",Positive,Positive,0.082143,0.9396,0.041
...,...,...,...,...,...,...,...,...
1989,2022,"We hug and, yes, we make love And always just...","pop, r&b",Positive,Positive,0.171171,0.9953,0.062
1990,2022,"She don't like it, catching cabs downtown The...","country, folk/acoustic, pop",Positive,Positive,0.026492,0.9855,0.078
1993,2022,"(Ooh, ooh-ooh, ooh-ooh) (Ooh, ooh-ooh, ooh-oo...",pop,Negative,Negative,-0.354428,-0.9959,0.180
1997,2022,Mummy don't know Daddy's getting hot At the B...,pop,Positive,Positive,0.053283,0.9911,0.021


In [21]:
# all songs from the three chosen genres
years = list(range(2003, 2023))


def _songs_by_year(genre_label):
    """Map each year in `years` to that year's rows of select_genre_df whose genres contain `genre_label`."""
    per_year = {}
    for yr in years:
        in_year = select_genre_df[select_genre_df['year'] == yr]
        per_year[yr] = in_year[in_year['genres'].str.contains(genre_label)]
    return per_year


pop_dict = _songs_by_year('pop')
hip_dict = _songs_by_year('hip hop')
rb_dict = _songs_by_year('r&b')

In [22]:
# Average score of the top three genres by year
avg_dict = {}

for genre_dict, genre_name in zip([pop_dict, hip_dict, rb_dict], ['pop', 'hip hop', 'r&b']):
    # genre_dict already maps each year to that genre's songs, so average those frames
    # directly instead of re-filtering select_genre_df with a second hard-coded year range.
    # A year with no songs yields NaN.
    avg_dict[genre_name] = {
        year: year_genre_df['TB_Polarity'].mean()
        for year, year_genre_df in genre_dict.items()
    }

In [23]:
# Average sentiment score for each year
# Computed over every song in merged_df, not only the three selected genres.
all_mean = {
    year: merged_df.loc[merged_df['year'] == year, 'TB_Polarity'].mean()
    for year in range(2003, 2023)
}

all_pol = {'All': all_mean}

In [24]:
# Add the all-songs yearly means alongside the per-genre ones.
for series_name, yearly_means in all_pol.items():
    avg_dict[series_name] = yearly_means

# Rows become years and columns become the series names after the transpose.
genre_year_df = pd.DataFrame.from_dict(avg_dict, orient='index').transpose()
genre_year_df.to_csv('data/RQ1/genre_year.csv')
genre_year_df.style.background_gradient(cmap='PiYG')

Unnamed: 0,pop,hip hop,r&b,All
2003,0.088982,0.083506,0.079589,0.09787
2004,0.096085,0.084949,0.077571,0.083725
2005,0.063687,0.083028,0.078962,0.06839
2006,0.07902,0.106982,0.087766,0.071945
2007,0.102858,0.095703,0.123575,0.102561
2008,0.110697,0.114913,0.108469,0.120089
2009,0.094357,0.112534,0.154958,0.094003
2010,0.123186,0.108619,0.119068,0.123576
2011,0.097761,0.074568,0.090422,0.102527
2012,0.087864,0.066245,0.111625,0.088902


In [25]:
# Summarise TextBlob polarity across all songs and report its range (max - min).
# describe() is computed once and read by label: integer keys such as [7] / [3] on this
# label-indexed Series rely on a deprecated positional fallback.
tb_polarity_summary = merged_df['TB_Polarity'].describe()
print(tb_polarity_summary)
all_range = tb_polarity_summary['max'] - tb_polarity_summary['min']
print(all_range)

count    2000.000000
mean        0.082474
std         0.168582
min        -0.700000
25%        -0.021370
50%         0.071078
75%         0.184148
max         0.764229
Name: TB_Polarity, dtype: float64
1.464229009983727
