In [None]:
!pip install transformers

In [None]:
import json
import math
import operator
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from transformers import pipeline

In [None]:
# Load the lyrics of every album; each CSV file holds exactly one album.
album_csv_files = [
    "01-taylor_swift.csv",
    "02-fearless_taylors_version.csv",
    "03-speak_now_deluxe_package.csv",
    "04-red_deluxe_edition.csv",
    "05-1989_deluxe.csv",
    "06-reputation.csv",
    "07-lover.csv",
    "08-folklore_deluxe_version.csv",
    "09-evermore_deluxe_version.csv",
]
rough_datasets = [pd.read_csv(csv_file) for csv_file in album_csv_files]

# Keep the per-album names available for the cells that refer to them directly.
(
    rough_data1,
    rough_data2,
    rough_data3,
    rough_data4,
    rough_data5,
    rough_data6,
    rough_data7,
    rough_data8,
    rough_data9,
) = rough_datasets

In [None]:
# Using pre-trained classifier to label the lyrics.
# The label_* helpers further down read each classifier result as result[0]: a list of
# {'label': ..., 'score': ...} entries which they iterate over per emotion.
# NOTE(review): return_all_scores=True presumably makes the pipeline return a score for every
# emotion instead of only the best one — confirm against the installed transformers version.
classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

def label_data(data, classify=None):
    """Classify every distinct lyric line of every track of one album.

    Parameters
    ----------
    data : pandas.DataFrame
        Album lyrics; must provide the columns ``track_n`` and ``lyric``.
    classify : callable, optional
        Called as ``classify(lyric)`` for each lyric that has not been seen yet
        in its track. Defaults to the module-level ``classifier`` pipeline;
        passing another callable allows running without loading the model.

    Returns
    -------
    collections.defaultdict
        ``{track_n: {lyric: classify(lyric)}}``. A lyric that repeats within a
        track is classified only once.
    """
    if classify is None:
        classify = classifier

    labels = defaultdict(dict)
    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for track_n, lyric in zip(data["track_n"], data["lyric"]):
        track_labels = labels[track_n]
        if lyric not in track_labels:
            track_labels[lyric] = classify(lyric)
    return labels

In [None]:
# Run the emotion classifier over every album, in the same order as rough_datasets.
labelled_datasets = [label_data(album_lyrics) for album_lyrics in rough_datasets]

# Individual names are kept so each album's labels can still be addressed directly.
(
    labelled_data1,
    labelled_data2,
    labelled_data3,
    labelled_data4,
    labelled_data5,
    labelled_data6,
    labelled_data7,
    labelled_data8,
    labelled_data9,
) = labelled_datasets

In [None]:
def label_songs_one_emotion_per_lyric(data):
    """Keep only the top-scoring emotion of every lyric line (all songs of one album).

    ``data`` maps ``song -> lyric -> classifier result``, where ``result[0]`` is a
    list of ``{'label', 'score'}`` entries. The return value maps
    ``song -> lyric -> (label, score)``.
    """

    def strongest(scores):
        # Strict '>' keeps the first entry on ties; ("", 0) is the result when
        # no entry has a positive score.
        winner, winner_score = "", 0
        for entry in scores:
            if entry['score'] > winner_score:
                winner, winner_score = entry['label'], entry['score']
        return (winner, winner_score)

    return {
        song: {text: strongest(result[0]) for text, result in lyrics.items()}
        for song, lyrics in data.items()
    }


def label_songs_summed_emotions(data):
    """Average the score of each emotion over the lyric lines of every song of one album.

    ``data`` maps ``song -> lyric -> classifier result``, where ``result[0]`` is a
    list of ``{'label', 'score'}`` entries. For each song the scores are summed
    per emotion and divided by the number of lyric entries of that song, giving
    ``song -> {emotion: mean score}``. A song without lyric entries raises
    ``ZeroDivisionError``.
    """
    emotion_names = ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    averaged = {}
    for song, lyrics in data.items():
        totals = dict.fromkeys(emotion_names, 0)
        for result in lyrics.values():
            for entry in result[0]:
                totals[entry['label']] += entry['score']
        line_count = len(lyrics)
        averaged[song] = {name: total / line_count for name, total in totals.items()}
    return averaged


def label_songs_overview(data):
    """Count, per song, how many lyric lines carry each emotion (all songs of one album).

    ``data`` maps ``song -> lyric -> (emotion, score)``, i.e. the output of
    ``label_songs_one_emotion_per_lyric``. Returns ``song -> {emotion: count}``
    with an entry for every one of the seven emotions.
    """
    emotion_names = ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise")
    overview = {}
    for song, lyrics in data.items():
        tally = dict.fromkeys(emotion_names, 0)
        for emotion, _score in lyrics.values():
            tally[emotion] += 1
        overview[song] = tally
    return overview


def label_album_one_emotion_per_song(data):
    """Pick, for every song of one album, the emotion with the most occurrences.

    ``data`` maps ``song -> {emotion: count}``. Returns ``song -> (emotion, count)``;
    when several emotions share the highest count, the one listed first in the
    song's mapping is kept.
    """
    return {
        song: max(counts.items(), key=lambda pair: pair[1])
        for song, counts in data.items()
    }


def label_album_overview(data):
    """Count how many songs of an album carry each emotion.

    ``data`` maps ``song -> (emotion, count)``, i.e. the output of
    ``label_album_one_emotion_per_song``. Returns ``{emotion: number of songs}``
    with an entry for every one of the seven emotions.
    """
    overview = dict.fromkeys(
        ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"), 0
    )
    for emotion, _count in data.values():
        overview[emotion] += 1
    return overview


def label_album_summed_emotions(data):
    """Average the per-song emotion scores over all songs of one album.

    ``data`` maps ``song -> {emotion: score}``. The scores are summed per emotion
    and divided by the number of songs, giving ``{emotion: mean score}``.
    An empty ``data`` raises ``ZeroDivisionError``.
    """
    totals = dict.fromkeys(
        ("anger", "disgust", "fear", "joy", "neutral", "sadness", "surprise"), 0
    )
    for song_scores in data.values():
        for emotion, score in song_scores.items():
            totals[emotion] += score
    song_count = len(data)
    return {emotion: total / song_count for emotion, total in totals.items()}


def plot_emotion_distribution(data, ax=None, subplot=False, title=""):
    """Draw a bar chart with one bar per entry of ``data``.

    Parameters
    ----------
    data : dict
        ``{label: value}``; keys become the tick labels, values the bar heights.
    ax : matplotlib Axes, optional
        Axes to draw on. Only used (and then required) when ``subplot`` is True.
    subplot : bool
        When True, draw on ``ax``; otherwise draw through ``pyplot`` on the
        current figure.
    title : str
        Chart title. In subplot mode it is applied only when non-empty, so a
        title set by the caller is never overwritten by the default.

    Raises
    ------
    ValueError
        If ``subplot`` is True but no ``ax`` was given.
    """
    values = list(data.values())
    labels = list(data.keys())
    positions = range(len(values))

    if subplot:
        if ax is None:
            raise ValueError("subplot=True requires an `ax` to draw on")
        ax.bar(positions, values, tick_label=labels)
        if title:
            ax.set_title(title)
    else:
        plt.bar(positions, values, tick_label=labels)
        plt.title(title)

In [None]:
# Per album: number of songs per emotion, where a song's emotion is the one assigned to most
# of its lyric lines (each lyric line carries only its top-scoring label).
# Per song: number of lyric lines per emotion.
for album_idx, labelled_data in enumerate(labelled_datasets):
    rough_data = rough_datasets[album_idx]
    album_name = rough_data.iloc[0]["album_name"]
    N_songs = max(rough_data["track_n"])

    one_emotion_lyric = label_songs_one_emotion_per_lyric(labelled_data)
    overview_emotion_song = label_songs_overview(one_emotion_lyric)
    one_emotion_song = label_album_one_emotion_per_song(overview_emotion_song)
    overview_emotions_album = label_album_overview(one_emotion_song)

    # Give the album-level chart its own figure instead of relying on whatever
    # figure happens to be current.
    plt.figure()
    plot_emotion_distribution(overview_emotions_album, title=album_name.upper())

    width = 5
    height = math.ceil(N_songs/width)
    fig, axes = plt.subplots(height, width, figsize=(15, height*3), sharey=True)

    # A separate loop variable keeps the album index above from being shadowed.
    for panel_idx, ax in enumerate(axes.flatten()):
        if panel_idx >= N_songs:
            # The grid is padded to a full last row; hide the panels without a song.
            ax.set_visible(False)
            continue
        song_n = panel_idx + 1  # track numbers start at 1
        song_title = rough_data.loc[rough_data["track_n"]==song_n].iloc[0]["track_title"]
        plot_emotion_distribution(overview_emotion_song[song_n], ax, subplot=True)
        # Strip bracketed suffixes such as "(...)" or "[...]" from the title.
        # Raw string: "\(" and "\[" are not valid escapes in a normal string literal.
        short_title = re.sub(r"[\(\[].*?[\)\]]", "", song_title).strip()
        ax.set_title(short_title)
        ax.tick_params(labelrotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Per album: number of songs per emotion, where a song's emotion is the one assigned to most
# of its lyric lines (each lyric line carries only its top-scoring label).
fig, axes = plt.subplots(3, 3, figsize=(10, 10), sharey=True)
panels = axes.flatten()

for i in range(len(panels)):
    ax = panels[i]
    rough_data = rough_datasets[i]
    album_name = rough_data.iloc[0]["album_name"]

    # lyric labels -> per-song counts -> per-song winner -> per-album counts
    one_emotion_lyric = label_songs_one_emotion_per_lyric(labelled_datasets[i])
    overview_emotion_song = label_songs_overview(one_emotion_lyric)
    one_emotion_song = label_album_one_emotion_per_song(overview_emotion_song)
    overview_emotions_album = label_album_overview(one_emotion_song)

    plot_emotion_distribution(overview_emotions_album, ax, subplot=True)
    ax.set_title(album_name)
    ax.tick_params(labelrotation=45)

plt.tight_layout()

In [None]:
# Per album: emotion scores averaged over the album's songs.
# Per song: emotion scores averaged over the song's lyric lines.
for album_idx, labelled_data in enumerate(labelled_datasets):
    rough_data = rough_datasets[album_idx]
    album_name = rough_data.iloc[0]["album_name"]
    N_songs = max(rough_data["track_n"])

    summed_emotions_song = label_songs_summed_emotions(labelled_data)
    overview_summed_album = label_album_summed_emotions(summed_emotions_song)

    # Give the album-level chart its own figure instead of relying on whatever
    # figure happens to be current.
    plt.figure()
    plot_emotion_distribution(overview_summed_album, title=album_name.upper())

    width = 5
    height = math.ceil(N_songs/width)
    fig, axes = plt.subplots(height, width, figsize=(15, height*3), sharey=True)

    # A separate loop variable keeps the album index above from being shadowed.
    for panel_idx, ax in enumerate(axes.flatten()):
        if panel_idx >= N_songs:
            # The grid is padded to a full last row; hide the panels without a song.
            ax.set_visible(False)
            continue
        song_n = panel_idx + 1  # track numbers start at 1
        song_title = rough_data.loc[rough_data["track_n"]==song_n].iloc[0]["track_title"]
        plot_emotion_distribution(summed_emotions_song[song_n], ax, subplot=True)
        # Strip bracketed suffixes such as "(...)" or "[...]" from the title.
        # Raw string: "\(" and "\[" are not valid escapes in a normal string literal.
        short_title = re.sub(r"[\(\[].*?[\)\]]", "", song_title).strip()
        ax.set_title(short_title)
        ax.tick_params(labelrotation=45)

    plt.tight_layout()
    plt.show()

In [None]:
# Per album: emotion scores averaged over lyric lines within each song and then over the
# album's songs, plotted without the "neutral" class.
fig, axes = plt.subplots(3, 3, figsize=(10, 10), sharey=True)

for i, ax in enumerate(axes.flatten()):
    rough_data = rough_datasets[i]
    album_name = rough_data.iloc[0]["album_name"]

    summed_emotions_song = label_songs_summed_emotions(labelled_datasets[i])
    overview_summed_album = label_album_summed_emotions(summed_emotions_song)
    # Leave "neutral" out of the chart; the remaining scores are not re-normalised.
    overview_summed_album.pop("neutral", None)

    plot_emotion_distribution(overview_summed_album, ax, subplot=True)
    ax.set_title(album_name)
    ax.tick_params(labelrotation=45)

plt.tight_layout()

In [None]:
# Pick one random song per album for manual annotation.
SAMPLE_SEED = 42  # fixed so that the same songs are selected on every run
rng = np.random.default_rng(SAMPLE_SEED)

index_list = []
for data in rough_datasets:
    # Sample from the track numbers that actually occur in the album, so every track,
    # including the last one, can be drawn and a one-track album does not fail.
    track_numbers = np.sort(data['track_n'].unique())
    idx = rng.choice(track_numbers, 1, replace=False)
    index_list.append(idx)
    print(data.iloc[0]['album_name'], ": ", sorted(idx))

In [None]:
# Ask for a manual label for every distinct lyric line of the sampled song of each album.
# Answers are stored as {UPPER-CASED ALBUM NAME: {lyric: answer}}.
replies = {}
for i, data in enumerate(rough_datasets):
    album_name = data.iloc[0]['album_name'].upper()
    sampled_song = data.loc[data['track_n']==index_list[i][0]]

    print(album_name)
    print(sampled_song.iloc[0]['track_title'])

    # The lyric itself is used as the input prompt.
    emotions = {lyric: input(lyric) for lyric in sampled_song['lyric'].drop_duplicates()}
    replies[album_name] = emotions
    print('\n')

In [None]:
# Persist the manual annotations, then read them back from disk.
with open('manually_annotated_lyrics.json', 'w') as file:
    json.dump(replies, file)

# A context manager closes the handle even if json.load raises.
with open('manually_annotated_lyrics.json') as file:
    manually_annotated_lyrics = json.load(file)

In [None]:
# Build {album name: {lyric: (annotation, 1)}} for the sampled song of each album,
# skipping lyrics that were annotated with "-".
all_emotions = {}
for i, data in enumerate(rough_datasets):
    album_name = data.iloc[0]['album_name']
    # The annotations were stored per album under the upper-cased album name,
    # so select the album's mapping first and look the lyric up inside it.
    album_annotations = manually_annotated_lyrics[album_name.upper()]

    emotions = {}
    for lyric in data.loc[data['track_n']==index_list[i][0]]['lyric'].drop_duplicates():
        annotation = album_annotations[lyric]
        if annotation != "-":
            # Score 1 gives the pair the same (emotion, score) shape as classifier labels.
            emotions[lyric] = (annotation, 1)
    all_emotions[album_name] = emotions

with open('manually_annotated_lyrics_per_album.json', 'w') as file:
    json.dump(all_emotions, file)

# A context manager closes the handle even if json.load raises.
with open('manually_annotated_lyrics_per_album.json') as file:
    manually_annotated_lyrics_per_album = json.load(file)

In [None]:
# The (annotation, score) pairs came back from the JSON file as lists; turn them into
# tuples again so the same plotting helpers as above can be used.
for lyrics in manually_annotated_lyrics_per_album.values():
    # Only values of existing keys are replaced, so iterating over items() is safe.
    for lyric, annotation in lyrics.items():
        lyrics[lyric] = (annotation[0], annotation[1])

In [None]:
# WITH OWN LABELLING

# Per album: number of manually annotated lyric lines per emotion.
fig, axes = plt.subplots(3, 3, figsize=(10, 10), sharey=True)
panels = axes.flatten()

for i in range(len(panels)):
    ax = panels[i]
    rough_data = rough_datasets[i]
    album_name = rough_data.iloc[0]["album_name"]

    # Wrap the album's annotations in a one-entry mapping so that label_songs_overview
    # can count them the same way it counts the lyrics of a song.
    dic = {album_name: manually_annotated_lyrics_per_album[album_name]}
    overview_emotion_song = label_songs_overview(dic)

    plot_emotion_distribution(overview_emotion_song[album_name], ax, subplot=True)
    ax.set_title(album_name)
    ax.tick_params(labelrotation=45)

plt.tight_layout()