# Topic Modeling

## Import

In [1]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.dates
import seaborn as sns
import os
from src.progress_bar import printProgressBar
from ast import literal_eval
import gensim

for dirname, _, filenames in os.walk('../../data/lyrics'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

print(50 * '-')

for dirname, _, filenames in os.walk('../../data/billboard'):
    for filename in filenames:
        print(os.path.join(dirname, filename).replace("\\", "/"))

../../data/lyrics/artist_song_lyrics.csv
../../data/lyrics/artist_song_lyrics_new.csv
../../data/lyrics/bb-t100-lyrics.csv
../../data/lyrics/bb_t100_lyrics_en.csv
../../data/lyrics/bb_t100_lyrics_en_v2.csv
../../data/lyrics/corona-lyrics.csv
../../data/lyrics/covid_lyrics_bb.csv
../../data/lyrics/lyrics_invalid.json
../../data/lyrics/lyrics_invalid_updated.csv
../../data/lyrics/backups/artist_song_lyrics_bak.csv
../../data/lyrics/backups/bb-t100-lyrics.csv
../../data/lyrics/backups/bb-t100-lyrics_new.csv
../../data/lyrics/backups/bb-t100-lyrics_old.csv
../../data/lyrics/backups/bb_t100_lyrics_en.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v1.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v2.csv
../../data/lyrics/backups/bb_t100_lyrics_en_v3.csv
../../data/lyrics/backups/corona-lyrics.csv
../../data/lyrics/backups/lyrics_invalid_updated.csv
--------------------------------------------------
../../data/billboard/bb_t100_en.csv
../../data/billboard/billboard.csv
../../data/billboard/

## Loading Data

## Lyrics

In [6]:
lyrics = pd.read_csv('../../data/lyrics/bb_t100_lyrics_en.csv', index_col=0, encoding='utf-8')
lyrics['first_appearance'] = pd.to_datetime(lyrics['first_appearance'], format='%Y-%m-%d')
lyrics['release_date'] = pd.to_datetime(lyrics['release_date'], format='%Y-%m-%d')
lyrics['genius_annotations'] = lyrics['genius_annotations'].apply(literal_eval)
lyrics['genius_comments'] = lyrics['genius_comments'].apply(literal_eval)
lyrics.head()

Unnamed: 0,billboard_id,lyrics_id,artist,first_artist,song,weeks_on_chart,peak_rank,genius_id,lyrics,url,...,word_count,language,language_score,first_appearance,genius_primary_artist,genius_description,genius_annotations,genius_comments,release_date,annotation_ids
0,0,0,Ariana Grande,Ariana Grande,"Thank U, Next",28.0,1.0,4063065,Thought I'd end up with Sean\nBut he wasn't a ...,https://genius.com/Ariana-grande-thank-u-next-...,...,460,en,0.999997,2019-01-05,Ariana Grande,On the lead single and titular track to her fi...,[(One taught me love\n One taught me patience\...,"[The Mac shoutout has me fully in tears, this ...",2018-11-03,"['15720075', '15720076', '15720054', '15720247..."
1,1,1,Halsey,Halsey,Without Me,52.0,1.0,3977187,Found you when your heart was broke\nI filled ...,https://genius.com/Halsey-without-me-lyrics,...,435,en,0.999995,2019-01-05,Halsey,“Without Me” is the first new song released by...,[(Gave love ’bout a hundred tries (Hundred tri...,[The queen is ready to snatch our wigs once ag...,2018-10-04,"['15517989', '15520369', '15518283', '15518820..."
2,2,2,Mariah Carey,Mariah Carey,All I Want For Christmas Is You,43.0,1.0,204233,I don't want a lot for Christmas\nThere is jus...,https://genius.com/Mariah-carey-all-i-want-for...,...,388,en,0.999996,2019-01-05,Mariah Carey,“All I Want For Christmas Is You” is an uptemp...,[(I don’t need to hang my stocking\n There upo...,"[i really like this song, it’s about that time...",1994-11-01,"['8393500', '8393500', '21611023', '8393500', ..."
3,3,3,Travis Scott,Travis Scott,Sicko Mode,52.0,1.0,3876994,"Astro, yeah\nSun is down, freezin' cold\nThat'...",https://genius.com/Travis-scott-sicko-mode-lyrics,...,771,en,0.999998,2019-01-05,Travis Scott,“SICKO MODE” refers to Travis and Drake’s work...,"[(She’s in love with who I am, [['Since Drake ...",[HAD ME OUT LIKE A LIGHT (ayy) LIKE A LIGHT (y...,2018-08-03,"['15114078', '17948214', '15113868', '15113778..."
4,4,4,Post Malone & Swae Lee,Post Malone,Sunflower (Spider-Man: Into The Spider-Verse),53.0,1.0,3993850,"Ayy, ayy, ayy, ayy (Ooh)\nOoh, ooh, ooh, ooh (...",https://genius.com/Post-malone-and-swae-lee-su...,...,305,en,0.999997,2019-01-05,Post Malone & Swae Lee,“Sunflower” marks the second collaboration by ...,"[(Then you’re left in the dust, unless I stuck...",[Me enjoying “Sunflower” and someone then inte...,2018-10-18,"['16057378', '16057378']"


In [7]:
import spacy
nlp = spacy.load("en_core_web_lg")
nlp.Defaults.stop_words |= {'ai', 'gon', '\u2005'}

In [103]:
def preprocessing(text):
    lines = text.split('\n')
    lines_unique_list = []
    for line in lines:
        if len(line) == 0:
            continue
        if line not in lines_unique_list:
            lines_unique_list.append(line)
    lines_unique = " ".join(lines_unique_list)
    return gensim.utils.simple_preprocess(lines_unique, deacc=True)

def remove_stop_words(docs):
    docs_new = []
    for doc in docs:
        doc_new = []
        for token in doc:
            if token not in nlp.Defaults.stop_words:
                doc_new.append(token)
        docs_new.append(doc_new)
    return docs_new

bigram = gensim.models.Phrases(docs, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[docs], threshold=100)

bigram_model = gensim.models.phrases.Phraser(bigram)
trigram_model = gensim.models.phrases.Phraser(trigram)

def make_bigrams(docs):
    return [bigram_model[doc] for doc in docs]

def make_trigrams(docs):
    return [trigram_model[bigram_model[doc]] for doc in docs]

def lemmatize(docs, pos_tags=["NOUN", "ADJ", "VERB", "ADV"]):
    i = 0
    l = len(docs)
    printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
    new_docs = []
    for doc in docs:
        new_doc = []
        text = " ".join(doc)
        doc = nlp(text)
        for token in doc:
            if token.pos_ in pos_tags:
                new_doc.append(token.lemma_)
        new_docs.append(new_doc)
        i+=1
        printProgressBar(i, l, prefix='Progress:', suffix='Complete', length=50)
    return new_docs


In [110]:
docs = [preprocessing(text) for text in lyrics['lyrics']]
docs = remove_stop_words(docs)
docs_bigrams = make_bigrams(docs)
docs_bigrams_trigrams = make_trigrams(docs_bigrams)
docs_lemma = lemmatize(docs_bigrams)

Progress: |██████████████████████████████████████████████████| 100.0% Complete


In [111]:
import pyperclip
pyperclip.copy(str(docs_lemma))

In [163]:
import gensim.corpora as corpora

id2word_old = corpora.Dictionary(docs_bigrams_trigrams)
corpus_old = []
for doc in docs_bigrams_trigrams:
    corpus_old.append((id2word_old.doc2bow(doc)))

In [208]:
from gensim.models import TfidfModel

texts = docs_bigrams_trigrams
id2word = corpora.Dictionary(texts)
corpus = [id2word.doc2bow(text) for text in texts]

tfidf = TfidfModel(corpus, id2word=id2word)

low_value= 0.05
words = []
words_missing_in_tfidf = []

for i in range(0, len(corpus)):
    bow = corpus[i]
    low_value_words = []
    tfidf_ids = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow]]
    bow_ids = [bow_id for bow_id, bow_value in bow]
    low_value_words = [tfidf_id for tfidf_id, tfidf_value in tfidf[bow] if tfidf_value < low_value]
    drops = low_value_words+words_missing_in_tfidf
    for item in drops:
        words.append(id2word[item])
    words_missing_in_tfidf = [bow_id for bow_id in bow_ids if bow_id not in tfidf_ids]
    new_bow = [b for b in bow if b[0] not in low_value_words and b[0] not in words_missing_in_tfidf]
    corpus[i] = new_bow

output = []
for word_id in drops:
    output.append(id2word[word_id])
print(", ".join(output))

ain, cause, good, got, know, said, wish, yeah, don, feel, feelin, live, oh, right, come, lot, send, stay, want, big, bitches, like, ma, niggas, think, way, gotta, left, couldn, hate, okay, kiss, hard, help, car, nigga, ass, bet, blow, money, whip, drop, black, neck, low, hand, pretty, game, miss, racks, sit


In [211]:
id2word.filter_tokens(bad_ids=drops)


AttributeError: 'list' object has no attribute 'num_terms'

In [210]:
lda_model_new = gensim.models.ldamodel.LdaModel(corpus=new_corpus,
                                            id2word=id2word,
                                            num_topics=10,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto')

IndexError: index 16140 is out of bounds for axis 1 with size 16140

In [204]:
import pyLDAvis
import pyLDAvis.gensim_models

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model_new, corpus, id2word, mds="mmds")

vis

  default_term_info = default_term_info.sort_values(
