# Topic Modeling

## Import

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.dates
import seaborn as sns
import os
from src.progress_bar import printProgressBar
from ast import literal_eval
import gensim

for dirname, _, filenames in os.walk('../../data/lyrics'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

print(50 * '-')

for dirname, _, filenames in os.walk('../../data/billboard'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

../../data/lyrics/artist_song_lyrics.csv
../../data/lyrics/artist_song_lyrics_new.csv
../../data/lyrics/bb-t100-lyrics.csv
../../data/lyrics/bb_t100_lyrics_en.csv
../../data/lyrics/bb_t100_lyrics_en_v2.csv
../../data/lyrics/corona-lyrics.csv
../../data/lyrics/covid_lyrics_bb.csv
../../data/lyrics/lyrics_invalid.json
../../data/lyrics/lyrics_invalid_updated.csv
../../data/lyrics/backups/artist_song_lyrics_bak.csv
../../data/lyrics/backups/bb-t100-lyrics.csv
../../data/lyrics/backups/bb-t100-lyrics_new.csv
../../data/lyrics/backups/bb-t100-lyrics_old.csv
../../data/lyrics/backups/bb_t100_lyrics_en.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v1.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v2.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v3.csv
../../data/lyrics/backups/corona-lyrics.csv
../../data/lyrics/backups/lyrics_invalid_updated.csv
--------------------------------------------------
../../data/billboard/bb_t100_en.csv
../../data/billboard/billboard.csv
../../data/billboard/

## Loading Data

## Lyrics

In [2]:
lyrics = pd.read_csv('../../data/lyrics/bb_t100_lyrics_en.csv', index_col=0, encoding='utf-8')
lyrics['first_appearance'] = pd.to_datetime(lyrics['first_appearance'], format='%Y-%m-%d')
lyrics['release_date'] = pd.to_datetime(lyrics['release_date'], format='%Y-%m-%d')
lyrics['genius_annotations'] = lyrics['genius_annotations'].apply(literal_eval)
lyrics['genius_comments'] = lyrics['genius_comments'].apply(literal_eval)
lyrics.reset_index(inplace=True)
lyrics.head()

Unnamed: 0,index,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,genius_id,lyrics,...,word_count,language,language_score,first_appearance,genius_primary_artist,genius_description,genius_annotations,genius_comments,release_date,annotation_ids
0,0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,4063065,Thought I'd end up with Sean\nBut he wasn't a ...,...,460,en,0.999997,2019-01-05,Ariana Grande,On the lead single and titular track to her fi...,[(One taught me love\n One taught me patience\...,"[The Mac shoutout has me fully in tears, this ...",2018-11-03,"['15720075', '15720076', '15720054', '15720247..."
1,1,1,1,Halsey,Halsey,Without Me,52.0,1.0,3977187,Found you when your heart was broke\nI filled ...,...,435,en,0.999995,2019-01-05,Halsey,“Without Me” is the first new song released by...,[(Gave love ’bout a hundred tries (Hundred tri...,[The queen is ready to snatch our wigs once ag...,2018-10-04,"['15517989', '15520369', '15518283', '15518820..."
2,2,2,2,Mariah Carey,Mariah Carey,All I Want For Christmas Is You,43.0,1.0,204233,I don't want a lot for Christmas\nThere is jus...,...,388,en,0.999996,2019-01-05,Mariah Carey,“All I Want For Christmas Is You” is an uptemp...,[(I don’t need to hang my stocking\n There upo...,"[i really like this song, it’s about that time...",1994-11-01,"['8393500', '8393500', '21611023', '8393500', ..."
3,3,3,3,Travis Scott,Travis Scott,Sicko Mode,52.0,1.0,3876994,"Astro, yeah\nSun is down, freezin' cold\nThat'...",...,771,en,0.999998,2019-01-05,Travis Scott,“SICKO MODE” refers to Travis and Drake’s work...,"[(She’s in love with who I am, [['Since Drake ...",[HAD ME OUT LIKE A LIGHT (ayy) LIKE A LIGHT (y...,2018-08-03,"['15114078', '17948214', '15113868', '15113778..."
4,4,4,4,Post Malone & Swae Lee,Post Malone,Sunflower (Spider-Man: Into The Spider-Verse),53.0,1.0,3993850,"Ayy, ayy, ayy, ayy (Ooh)\nOoh, ooh, ooh, ooh (...",...,305,en,0.999997,2019-01-05,Post Malone & Swae Lee,“Sunflower” marks the second collaboration by ...,"[(Then you’re left in the dust, unless I stuck...",[Me enjoying “Sunflower” and someone then inte...,2018-10-18,"['16057378', '16057378']"


In [3]:
import spacy

nlp = spacy.load("en_core_web_lg")
nlp.Defaults.stop_words |= {'ai', 'gon', '\u2005', 'ooh', 'let', 've', 'gonna', 'woah', 'gotta', 'll', 'mmm', 'maybe',
                            'got', 'oh', 'uh', 'want', 'huh', 'ah', 'ma', 'hey', 'woo', 'ain', 'come', 'goin', 'ya',
                            'la', 'bah', 'yeah', 'ayy', 'em', 'tryna', 'goes', 'damn', 'ooo', 'comin', 'ran', 'wanna',
                            'okay', 'til', 'didn'}

In [4]:
def create_doc(text, pos_tags=["NOUN", 'PROPN', "VERB", "ADJ"]):
    lines = text.split('\n')
    lines_unique_list = []
    for line in lines:
        if len(line) == 0:
            continue
        if line not in lines_unique_list:
            lines_unique_list.append(line)
    unique_lines = " ".join(lines_unique_list)

    lemmas = []
    doc = nlp(unique_lines)
    for token in doc:
        if token.is_stop or token.is_punct:
            continue

        if token.pos_ in pos_tags:
            if token.lemma_ not in nlp.Defaults.stop_words:
                lemmas.append(token.lemma_.lower())
            else:
                continue

    return lemmas

def make_bigrams(docs):
    return [bigram_model[doc] for doc in docs]

def make_trigrams(docs):
    return [trigram_model[bigram_model[doc]] for doc in docs]


In [5]:
docs = []
i = 0
l = len(lyrics['lyrics'])
printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
for text in lyrics['lyrics']:
    docs.append(create_doc(text))
    i+=1
    printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)

bigram = gensim.models.Phrases(docs, min_count=5, threshold=100) # higher threshold fewer phrases.
bigram_model = gensim.models.phrases.Phraser(bigram)
trigram = gensim.models.Phrases(bigram[docs], threshold=100)
trigram_model = gensim.models.phrases.Phraser(trigram)

docs_bigrams = make_bigrams(docs)
docs_bigrams_trigrams = make_trigrams(docs_bigrams)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [12]:
import gensim.corpora as corpora
from gensim.models import TfidfModel

texts = docs
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value= 0.1
words = []
words_missing_in_tfidf = []

for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow]]
    bow_ids = [bow_id for bow_id, bow_value in bow]
    low_value_words = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow] if tfidf_value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [bow_id for bow_id in bow_ids if bow_id not in tfidf_ids]
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            random_state=1)

import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds")

  default_term_info = default_term_info.sort_values(


In [23]:
type(id2word)

gensim.corpora.dictionary.Dictionary

In [7]:
topic_names = {
    0 : 'Love',
    1 : 'Crime',
    2 : 'Crime / Party',
    3 : 'Christmas',
}

In [8]:
topic_words = []
for words in lda_model.show_topics(formatted=False, num_words=30):
    for word in words[1]:
        if word[0] not in topic_words:
            topic_words.append(word[0])

print(topic_words)


['christmas', 'bad', 'big', 'stop', 'like', 'year', 'merry', 'ice', 'doo', "nothin'", 'dream', 'young', 'fight', 'rich', 'mean', 'hard', 'happy', 'remind', 'die', 'dope', 'party', 'war', 'drip', 'change', 'gettin', 'grow', 'snow', 'leavin', 'que', 'drinkin', 'good', 'shoot', 'town', 'thing', 'livin', 'ask', 'hurt', 'beer', 'cold', 'live', 'round', 'sun', 'surf', 'sure', 'marni', 'pour', 'button', 'saint', 'rock', 'lord', 'foot', 'bae', 'old', 'sky', 'send', 'second', 'believe', 'sweet', 'wake', 'road', 'bitch', 'time', 'ass', 'da', 'boom', 'look', 'fall', 'runnin', 'hate', 'stick', 'slide', 'mad', 'try', 'close', 'lonely', 'break', 'vibe', 'bring', 'house', 'smile', 'heart', 'sign', 'fade', 'catch', 'spin', 'wrist', 'soul', 'callin', 'ex', 'peace', 'love', 'baby', 'little', 'day', 'ba', 'dum', 'mm', 'long', 'holy', 'wet', 'scared', 'country', 'breathe', 'mistletoe', 'thug', 'singe', 'candy', 'eatin', 'mom', 'sight', 'read', 'steady', 'boot', 'bottle', 'dog', 'toss', 'pocket', 'find', '

In [9]:
def get_topics(index, lda_corpus = lda_model[corpus]):
    topics = {}
    for lda_tuple in lda_corpus[index]:
        topics[lda_tuple[0]] = lda_tuple[1]
    return dict(sorted(topics.items(), key=lambda item: item[1], reverse=True))

In [10]:
def get_topics_str(topics):
    return [topic_names[topic] for topic in topics.keys()]

In [11]:
lyrics['topics'] = [get_topics(i) for i in range(0, len(lyrics))]
lyrics['topics_str'] = lyrics['topics'].apply(lambda topics: [topic_names[topic] for topic in topics.keys()])
lyrics['top_topic'] = lyrics['topics'].apply(lambda topic: list(topic.keys())[0])
lyrics['top_topic_str'] = lyrics['top_topic'].apply(lambda topic: topic_names[topic])
lyrics[['topics', 'topics_str', 'top_topic', 'top_topic_str']]

KeyError: 5

In [None]:
lyrics_topics = lyrics[
    ['billboard_id', 'lyrics_id', 'artist', 'song', 'weeks_on_chart', 'peak_rank', 'lyrics', 'url', 'first_appearance',
     'release_date', 'topics', 'top_topic', 'top_topic_str', 'topics_str']]
lyrics_topics.to_excel('../../data/topic_modeling/bb_t100_sentiment.xlsx')

## MXM Data

In [51]:
with open('../../data/input/mxm_dataset_train.txt','r') as f:
    lines = f.readlines()
f.close()

track_ids = []
mxm_track_ids = []
mxm_corpus = []
i = 0
l = len(lines[18:1000])
printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
for line in lines[18:1000]:
    line = line[:-1].split(",")
    track_ids.append(line[0])
    mxm_track_ids.append(line[1])
    bow = []
    for item in line[2:]:
        bow.append(tuple((int(item.split(":")[0]), int(item.split(":")[1]))))
    mxm_corpus.append(bow)
    i+=1
    printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [84]:
stems = lines[17][:-1].split(',')
stems[0] = 'i'
stems_dct = {stems.index(stem) : stem for stem in stems}

In [83]:
mxm_corpus

[[(1, 6),
  (2, 4),
  (3, 2),
  (4, 2),
  (5, 5),
  (6, 3),
  (7, 1),
  (8, 1),
  (11, 1),
  (12, 2),
  (13, 3),
  (14, 1),
  (15, 1),
  (18, 2),
  (19, 2),
  (20, 2),
  (21, 2),
  (23, 4),
  (25, 1),
  (26, 2),
  (28, 1),
  (30, 1),
  (36, 2),
  (42, 1),
  (45, 1),
  (54, 2),
  (56, 1),
  (57, 1),
  (68, 1),
  (99, 1),
  (192, 2),
  (249, 1),
  (264, 1),
  (356, 1),
  (389, 1),
  (561, 1),
  (639, 1),
  (656, 1),
  (687, 1),
  (761, 1),
  (773, 1),
  (804, 1),
  (869, 2),
  (914, 1),
  (1035, 1),
  (1156, 1),
  (1221, 1),
  (1287, 1),
  (1364, 1),
  (1407, 1),
  (1533, 2),
  (1857, 1),
  (2096, 1),
  (2117, 1),
  (2482, 2),
  (2548, 1),
  (2705, 1),
  (2723, 1),
  (2868, 2),
  (2992, 2),
  (3455, 1),
  (3717, 1),
  (3851, 1),
  (4322, 1),
  (4382, 1),
  (4613, 1),
  (4713, 1),
  (4906, 1)],
 [(1, 10),
  (3, 17),
  (4, 8),
  (5, 2),
  (6, 2),
  (7, 1),
  (8, 3),
  (9, 2),
  (10, 3),
  (11, 4),
  (12, 3),
  (14, 7),
  (15, 5),
  (16, 5),
  (18, 6),
  (23, 4),
  (24, 1),
  (26, 6),
  (28

In [85]:
mxm_id2word = gensim.corpora.Dictionary.from_corpus(mxm_corpus, id2word=stems_dct)

In [87]:
mxm_id2word[1]

'the'

In [88]:
lda_model_2 = gensim.models.ldamodel.LdaModel(corpus=mxm_corpus,
                                            id2word=mxm_id2word,
                                            num_topics=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            random_state=1)

import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model_2, mxm_corpus, mxm_id2word, mds="mmds")

  default_term_info = default_term_info.sort_values(
