# Topic Modeling

## Import

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import matplotlib.dates

import seaborn as sns

from src.progress_bar import printProgressBar

from ast import literal_eval

import gensim
import gensim.corpora as corpora
from gensim.models import TfidfModel

import pyLDAvis
import pyLDAvis.gensim_models

import spacy

import json

import os
for dirname, _, filenames in os.walk('../../data/lyrics'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

print(50 * '-')

for dirname, _, filenames in os.walk('../../data/billboard'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

../../data/lyrics/artist_song_lyrics.csv
../../data/lyrics/artist_song_lyrics_new.csv
../../data/lyrics/bb-t100-lyrics.csv
../../data/lyrics/bb_t100_lyrics_en.csv
../../data/lyrics/bb_t100_lyrics_en_v2.csv
../../data/lyrics/corona-lyrics.csv
../../data/lyrics/covid_lyrics_bb.csv
../../data/lyrics/lyrics_invalid.json
../../data/lyrics/lyrics_invalid_updated.csv
../../data/lyrics/backups/artist_song_lyrics_bak.csv
../../data/lyrics/backups/bb-t100-lyrics.csv
../../data/lyrics/backups/bb-t100-lyrics_new.csv
../../data/lyrics/backups/bb-t100-lyrics_old.csv
../../data/lyrics/backups/bb_t100_lyrics_en.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v1.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v2.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v3.csv
../../data/lyrics/backups/corona-lyrics.csv
../../data/lyrics/backups/lyrics_invalid_updated.csv
--------------------------------------------------
../../data/billboard/bb_t100_en.csv
../../data/billboard/billboard.csv
../../data/billboard/

## Loading Data

### Lyrics

In [2]:
lyrics = pd.read_csv('../../data/lyrics/bb_t100_lyrics_en.csv', index_col=0, encoding='utf-8')
lyrics['first_appearance'] = pd.to_datetime(lyrics['first_appearance'], format='%Y-%m-%d')
lyrics['release_date'] = pd.to_datetime(lyrics['release_date'], format='%Y-%m-%d')
lyrics['genius_annotations'] = lyrics['genius_annotations'].apply(literal_eval)
lyrics['genius_comments'] = lyrics['genius_comments'].apply(literal_eval)
lyrics.reset_index(inplace=True)
lyrics.head()

Unnamed: 0,index,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,genius_id,lyrics,...,word_count,language,language_score,first_appearance,genius_primary_artist,genius_description,genius_annotations,genius_comments,release_date,annotation_ids
0,0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,4063065,Thought I'd end up with Sean\nBut he wasn't a ...,...,460,en,0.999997,2019-01-05,Ariana Grande,On the lead single and titular track to her fi...,[(One taught me love\n One taught me patience\...,"[The Mac shoutout has me fully in tears, this ...",2018-11-03,"['15720075', '15720076', '15720054', '15720247..."
1,1,1,1,Halsey,Halsey,Without Me,52.0,1.0,3977187,Found you when your heart was broke\nI filled ...,...,435,en,0.999995,2019-01-05,Halsey,“Without Me” is the first new song released by...,[(Gave love ’bout a hundred tries (Hundred tri...,[The queen is ready to snatch our wigs once ag...,2018-10-04,"['15517989', '15520369', '15518283', '15518820..."
2,2,2,2,Mariah Carey,Mariah Carey,All I Want For Christmas Is You,43.0,1.0,204233,I don't want a lot for Christmas\nThere is jus...,...,388,en,0.999996,2019-01-05,Mariah Carey,“All I Want For Christmas Is You” is an uptemp...,[(I don’t need to hang my stocking\n There upo...,"[i really like this song, it’s about that time...",1994-11-01,"['8393500', '8393500', '21611023', '8393500', ..."
3,3,3,3,Travis Scott,Travis Scott,Sicko Mode,52.0,1.0,3876994,"Astro, yeah\nSun is down, freezin' cold\nThat'...",...,771,en,0.999998,2019-01-05,Travis Scott,“SICKO MODE” refers to Travis and Drake’s work...,"[(She’s in love with who I am, [['Since Drake ...",[HAD ME OUT LIKE A LIGHT (ayy) LIKE A LIGHT (y...,2018-08-03,"['15114078', '17948214', '15113868', '15113778..."
4,4,4,4,Post Malone & Swae Lee,Post Malone,Sunflower (Spider-Man: Into The Spider-Verse),53.0,1.0,3993850,"Ayy, ayy, ayy, ayy (Ooh)\nOoh, ooh, ooh, ooh (...",...,305,en,0.999997,2019-01-05,Post Malone & Swae Lee,“Sunflower” marks the second collaboration by ...,"[(Then you’re left in the dust, unless I stuck...",[Me enjoying “Sunflower” and someone then inte...,2018-10-18,"['16057378', '16057378']"


### Music Match - Million Songs Data Set (MXM)

In [3]:
with open('../../data/input/mxm_dataset_train.txt','r') as f:
    mxm_lines = [next(f) for i in range(2018)]
f.close()

track_ids = []
mxm_track_ids = []
mxm_corpus = []
i = 0
l = len(mxm_lines[18:])
printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
for line in mxm_lines[18:]:
    line = line[:-1].split(",")
    track_ids.append(line[0])
    mxm_track_ids.append(line[1])
    bow = []
    for item in line[2:]:
        bow.append(tuple((int(item.split(":")[0])-1, int(item.split(":")[1]))))
    mxm_corpus.append(bow)
    i+=1
    printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [4]:
mxm_df = pd.DataFrame()
mxm_df['track_id'] = track_ids
mxm_df['mxm_track_id'] = mxm_track_ids
mxm_df['corpus'] = mxm_corpus
mxm_df.head()

Unnamed: 0,track_id,mxm_track_id,corpus
0,TRAAAAV128F421A322,4623710,"[(0, 6), (1, 4), (2, 2), (3, 2), (4, 5), (5, 3..."
1,TRAAABD128F429CF47,6477168,"[(0, 10), (2, 17), (3, 8), (4, 2), (5, 2), (6,..."
2,TRAAAED128E0783FAB,2516445,"[(0, 28), (1, 15), (2, 2), (3, 12), (4, 22), (..."
3,TRAAAEF128F4273421,3759847,"[(0, 5), (1, 4), (2, 3), (3, 2), (4, 1), (5, 1..."
4,TRAAAEW128F42930C0,3783760,"[(0, 4), (3, 5), (4, 7), (5, 2), (6, 4), (8, 1..."


## Tokenization

### Lyrics

In [5]:
nlp = spacy.load("en_core_web_lg")
nlp.Defaults.stop_words |= {'ai', 'gon', '\u2005', 'ooh', 'let', 've', 'gonna', 'woah', 'gotta', 'll', 'mmm', 'maybe',
                            'got', 'oh', 'uh', 'want', 'huh', 'ah', 'ma', 'hey', 'woo', 'ain', 'come', 'goin', 'ya',
                            'la', 'bah', 'yeah', 'ayy', 'em', 'tryna', 'goes', 'damn', 'ooo', 'comin', 'ran', 'wanna',
                            'okay', 'til', 'didn'}

In [6]:
def create_doc(text, pos_tags=["NOUN", 'PROPN', "VERB", "ADJ"]):
    lines = text.split('\n')
    lines_unique_list = []
    for line in lines:
        if len(line) == 0:
            continue
        if line not in lines_unique_list:
            lines_unique_list.append(line)
    unique_lines = " ".join(lines_unique_list)

    lemmas = []
    doc = nlp(unique_lines)
    for token in doc:
        if token.is_stop or token.is_punct:
            continue

        if token.pos_ in pos_tags:
            if token.lemma_ not in nlp.Defaults.stop_words:
                lemmas.append(token.lemma_.lower())
            else:
                continue

    return lemmas

def make_bigrams(docs):
    return [bigram_model[doc] for doc in docs]

def make_trigrams(docs):
    return [trigram_model[bigram_model[doc]] for doc in docs]


In [7]:
try:
    with open("../../data/cache/topic_docs.txt", "r") as f:
        docs = json.load(f)
    f.close()
    with open("../../data/cache/topic_docs_bigrams.txt", "r") as f:
        docs_bigrams = json.load(f)
    f.close()
    with open("../../data/cache/topic_docs_bigrams_trigrams.txt", "r") as f:
        docs_bigrams_trigrams = json.load(f)
    f.close()
except FileNotFoundError:
    docs = []
    i = 0
    l = len(lyrics['lyrics'])
    printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
    for text in lyrics['lyrics']:
        docs.append(create_doc(text))
        i+=1
        printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)

    bigram = gensim.models.Phrases(docs, min_count=5, threshold=100) # higher threshold fewer phrases.
    bigram_model = gensim.models.phrases.Phraser(bigram)
    trigram = gensim.models.Phrases(bigram[docs], threshold=100)
    trigram_model = gensim.models.phrases.Phraser(trigram)

    docs_bigrams = make_bigrams(docs)
    docs_bigrams_trigrams = make_trigrams(docs_bigrams)
    with open("../../data/cache/topic_docs.txt", "w") as f:
        json.dump(docs, f)
    f.close()
    with open("../../data/cache/topic_docs_bigrams.txt", "w") as f:
        json.dump(docs_bigrams, f)
    f.close()
    with open("../../data/cache/topic_docs_bigrams_trigrams.txt", "w") as f:
        json.dump(docs_bigrams_trigrams, f)
    f.close()


The MXM data is already in BOW-format and therefore does not need to be tokenized.

## Generate Corpus and Id2Word-Dictionary

### Lyrics

In [8]:
def optimize_corpus(corpus, id2word, words_missing_in_tfidf = [], low_value= 0.03):
    tfidf = TfidfModel(corpus, id2word=id2word)
    dropped_words = []
    l = len(corpus)
    for i in range(0, l):
        bow = corpus[i]
        tfidf_ids = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow]]
        bow_ids = [bow_id for bow_id, bow_value in bow]
        low_value_words = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow] if tfidf_value < low_value]
        drops = low_value_words+words_missing_in_tfidf
        for word_id in drops:
            dropped_words.append(id2word[word_id])
        words_missing_in_tfidf = [bow_id for bow_id in bow_ids if bow_id not in tfidf_ids]
        new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
        corpus[i] = new_bow
        printProgressBar(i+1, l, prefix='Progress:', suffix='Complete', length=50)
    return corpus, dropped_words, words_missing_in_tfidf

texts = docs
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

corpus = optimize_corpus(corpus, id2word)[0]

Progress: |██████████████████████████████████████████████████| 100.0% Complete


### MXM

The 17th line of the MXM-file includes the 5000 most used words. This is used to create the Id2Word-dictionary.

In [9]:
mxm_words = mxm_lines[17][:-1].split(',') #ignore last 2 characters that indicate linebreak
mxm_words[0] = 'i' #remove % from first word
mxm_words_dct = {mxm_words.index(word) : word for word in mxm_words}
mxm_id2word = gensim.corpora.Dictionary.from_corpus(mxm_corpus, id2word=mxm_words_dct)

In [10]:
with open('../../data/input/stemmed_words.txt', 'r') as f:
    word2stem = {line.split('\t')[0] : line.split('\t')[1][:-1] for line in f.readlines()}
    # stem2word = {line.split('\t')[1][:-1] : line.split('\t')[0] for line in f.readlines()}



In [11]:
stop_word_stems = []
for stop_word in nlp.Defaults.stop_words:
    if stop_word in word2stem.keys():
        stop_word_stems.append(word2stem[stop_word])

stop_word_ids = []
for stop_word_stem in stop_word_stems:
    if stop_word_stem in mxm_id2word.token2id.keys():
        stop_word_ids.append(mxm_id2word.token2id[stop_word_stem])

stop_word_ids.sort()
stop_word_ids = list(set(stop_word_ids))
stop_word_ids

[1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 513,
 27,
 533,
 29,
 30,
 31,
 32,
 33,
 34,
 543,
 36,
 545,
 1568,
 39,
 40,
 553,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 61,
 62,
 63,
 575,
 65,
 66,
 67,
 68,
 69,
 576,
 71,
 1031,
 2117,
 75,
 77,
 78,
 79,
 81,
 82,
 83,
 2132,
 1109,
 86,
 88,
 3674,
 3164,
 93,
 528,
 98,
 612,
 613,
 102,
 104,
 616,
 617,
 107,
 108,
 109,
 110,
 2155,
 114,
 115,
 117,
 1141,
 119,
 1550,
 121,
 1145,
 123,
 124,
 125,
 2172,
 641,
 132,
 644,
 134,
 135,
 136,
 140,
 141,
 2189,
 144,
 145,
 146,
 147,
 148,
 149,
 657,
 4759,
 154,
 2203,
 1694,
 159,
 160,
 673,
 675,
 677,
 1189,
 168,
 170,
 172,
 1708,
 174,
 175,
 686,
 177,
 3246,
 181,
 1717,
 1207,
 2229,
 4793,
 186,
 698,
 191,
 2752,
 193,
 706,
 2241,
 196,
 3269,
 2246,
 3780,
 200,
 201,
 1738,
 206,
 722,
 212,
 215,
 219,
 220,
 226,
 228,
 231,
 236,
 23

The already imported corpus is optimized using the TFID method. (very time consuming!)

In [12]:
mxm_corpus = optimize_corpus(mxm_corpus, mxm_id2word, words_missing_in_tfidf=stop_word_ids)[0]

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [13]:
mxm_id2word = gensim.corpora.Dictionary.from_corpus(mxm_corpus, id2word=mxm_words_dct)

In [14]:
mxm_df['corpus'] = mxm_corpus
mxm_df.to_csv('../../data/topic_modeling/mxm.csv')

## Modeling

### LDA Model for Billboard Lyrics (don't use!)

In [15]:
# lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
#                                             id2word=id2word,
#                                             num_topics=10,
#                                             update_every=1,
#                                             chunksize=100,
#                                             passes=10,
#                                             alpha='auto',
#                                             random_state=1)

Topic Visualization

In [16]:
# import pyLDAvis
# import pyLDAvis.gensim_models
#
# pyLDAvis.enable_notebook()
# pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds")

### LDA Model for MXM

In [17]:
from gensim.models.callbacks import PerplexityMetric

perplexity_logger = PerplexityMetric(corpus=mxm_corpus, logger='shell')

In [None]:
try:
    mxm_lda_model = gensim.models.ldamodel.LdaModel.load('../../gensim/models/mxm_lda')
except FileNotFoundError:
    mxm_lda_model = gensim.models.ldamodel.LdaModel(corpus=mxm_corpus,
                                            id2word=mxm_id2word,
                                            num_topics=20,
                                            update_every=1,
                                            chunksize=10,
                                            passes=10,
                                            alpha='auto',
                                            random_state=1)


Topic Visualization

In [19]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(mxm_lda_model, mxm_corpus, mxm_lda_model.id2word)

  default_term_info = default_term_info.sort_values(


## Calculate Topics for Lyrics

Topic interpreatation based on LDA visualisation.

In [20]:
topic_names = {
    0 : 'Love',
    1 : 'Crime',
    2 : 'Crime / Party',
    3 : 'Christmas',
    4 : '',
    5 : '',
    6 : '',
    7 : '',
    8 : '',
    9 : '',
}

In [21]:
def get_topics(index, lda_corpus = lda_model[corpus]):
    topics = {}
    for lda_tuple in lda_corpus[index]:
        topics[lda_tuple[0]] = lda_tuple[1]
    return dict(sorted(topics.items(), key=lambda item: item[1], reverse=True))

NameError: name 'lda_model' is not defined

In [None]:
def get_topics_str(topics):
    return [topic_names[topic] for topic in topics.keys()]

In [None]:
lyrics['topics'] = [get_topics(i) for i in range(0, len(lyrics))]
lyrics['top_topic'] = lyrics['topics'].apply(lambda topic: list(topic.keys())[0])
# lyrics['topics_str'] = lyrics['topics'].apply(lambda topics: [topic_names[topic] for topic in topics.keys()])
# lyrics['top_topic_str'] = lyrics['top_topic'].apply(lambda topic: topic_names[topic])
# lyrics[['topics', 'topics_str', 'top_topic', 'top_topic_str']]
lyrics[['topics', 'top_topic']]

In [None]:
# lyrics_topics = lyrics[
#     ['billboard_id', 'lyrics_id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'lyrics', 'url', 'first_appearance',
#      'release_date', 'topics', 'top_topic', 'top_topic_str', 'topics_str']]
lyrics_topics = lyrics[
    ['billboard_id', 'lyrics_id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'lyrics', 'url', 'first_appearance',
     'release_date', 'topics', 'top_topic']]
lyrics_topics.to_excel('../../data/topic_modeling/bb_t100_sentiment.xlsx')