## Topic Modelling

In [2]:
import pandas as pd
lyrics = pd.read_csv('song_parts.csv')

In [3]:
from nltk.tokenize import RegexpTokenizer
lyric_corpus = lyrics['lyrics']
lyric_corpus_tokenized = []
tokenizer = RegexpTokenizer(r'\w+')
for lyric in lyric_corpus:
    tokenized_lyric = tokenizer.tokenize(lyric.lower())
    lyric_corpus_tokenized.append(tokenized_lyric)

In [4]:
for s,song in enumerate(lyric_corpus_tokenized):
    filtered_song = []
    for token in song:
        if len(token) > 2 and not token.isnumeric():
            filtered_song.append(token)
    lyric_corpus_tokenized[s] = filtered_song

In [5]:
from nltk.stem.wordnet import WordNetLemmatizer
from tqdm import tqdm
lemmatizer = WordNetLemmatizer()
for s, song in enumerate(tqdm(lyric_corpus_tokenized, desc="Lemmatizing lyrics")):
    lemmatized_tokens = []
    for token in song:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    lyric_corpus_tokenized[s] = lemmatized_tokens

Lemmatizing lyrics: 100%|██████████| 5334078/5334078 [15:42<00:00, 5659.74it/s] 


In [6]:
print(lyric_corpus_tokenized[0])

['while', 'watchin', 'every', 'nigga', 'watchin', 'closely', 'shit', 'butter', 'for', 'the', 'bread', 'they', 'wanna', 'toast', 'keep', 'head', 'both', 'them', 'where', 'they', 'supposed', 'hoe', 'get', 'you', 'sidetracked', 'then', 'clapped', 'from', 'close', 'foot', 'don', 'sleep', 'tired', 'feel', 'wired', 'like', 'codeine', 'these', 'day', 'brother', 'gotta', 'admire', 'from', 'four', 'fiend', 'away', 'pain', 'wish', 'wa', 'quick', 'see', 'from', 'sellin', 'caine', 'til', 'brain', 'wa', 'fried', 'fricassee', 'can', 'lie', 'the', 'time', 'never', 'bothered', 'the', 'bar', 'gettin', 'thug', 'properly', 'squad', 'and', 'lack', 'respect', 'for', 'authority', 'laughin', 'hard', 'happy', 'escapin', 'poverty', 'however', 'brief', 'know', 'this', 'game', 'got', 'valley', 'and', 'peak', 'expectation', 'for', 'dip', 'for', 'precipitation', 'stack', 'chip', 'hardly', 'the', 'youth', 'used', 'soon', 'see', 'mill', 'more', 'big', 'willie', 'game', 'ha', 'grown', 'prefer', 'you', 'call', 'willia

In [12]:
lyrics['lemmatized'] = lyric_corpus_tokenized

In [20]:
lyrics

Unnamed: 0,artist,title,tag,year,views,part,lyrics,explicitness
0,JAY-Z,Can I Live,rap,1996,468624,[Verse 1],While I'm watchin' every nigga watchin' me clo...,Explicit content
1,JAY-Z,Can I Live,rap,1996,468624,[Chorus],Ge-ge-geyeahhh\nCan I live?\nCan I live?,Normal
2,JAY-Z,Can I Live,rap,1996,468624,[Verse 2],My mind is infested with sick thoughts that ci...,Normal
3,JAY-Z,Can I Live,rap,1996,468624,[Chorus],Can I live?\nCan I live?\nCan I live?\nCan I l...,Normal
4,Fabolous,Forgive Me Father,rap,2003,4743,[Hook],Forgive me father for I have sinned\nBut look ...,Explicit content
...,...,...,...,...,...,...,...,...
5334073,Alana Springsteen,New Number,country,2022,1,[Chorus],One that I ain't dial at least a couple thousa...,Normal
5334074,Alana Springsteen,New Number,country,2022,1,[Verse 2],You need a new number and you can't get it fas...,Normal
5334075,Alana Springsteen,New Number,country,2022,1,[Chorus],One that I ain't dial at least a couple thousa...,Normal
5334076,Alana Springsteen,New Number,country,2022,1,[Bridge],"Oh, if you wanna help me out\nIf you wanna let...",Normal


In [26]:
# lyrics['tokenized, lemmatized, no bad words'] = lyric_corpus_tokenized
lyrics.to_csv('song_lyrics_lemmatized.csv', index=False)

In [25]:
lyrics

Unnamed: 0,artist,title,tag,year,views,part,lyrics,explicitness,"tokenized, lemmatized, no bad words"
0,JAY-Z,Can I Live,rap,1996,468624,[Verse 1],While I'm watchin' every nigga watchin' me clo...,Explicit content,"[watchin, every, watchin, closely, butter, bre..."
1,JAY-Z,Can I Live,rap,1996,468624,[Chorus],Ge-ge-geyeahhh\nCan I live?\nCan I live?,Normal,"[geyeahhh, live, live]"
2,JAY-Z,Can I Live,rap,1996,468624,[Verse 2],My mind is infested with sick thoughts that ci...,Normal,"[mind, infested, sick, thought, circle, like, ..."
3,JAY-Z,Can I Live,rap,1996,468624,[Chorus],Can I live?\nCan I live?\nCan I live?\nCan I l...,Normal,"[live, live, live, live]"
4,Fabolous,Forgive Me Father,rap,2003,4743,[Hook],Forgive me father for I have sinned\nBut look ...,Explicit content,"[forgive, father, sinned, look, money, spend, ..."
...,...,...,...,...,...,...,...,...,...
5334073,Alana Springsteen,New Number,country,2022,1,[Chorus],One that I ain't dial at least a couple thousa...,Normal,"[one, dial, least, couple, thousand, time, hea..."
5334074,Alana Springsteen,New Number,country,2022,1,[Verse 2],You need a new number and you can't get it fas...,Normal,"[need, new, number, fast, enough, cause, wanna..."
5334075,Alana Springsteen,New Number,country,2022,1,[Chorus],One that I ain't dial at least a couple thousa...,Normal,"[one, dial, least, couple, thousand, time, hea..."
5334076,Alana Springsteen,New Number,country,2022,1,[Bridge],"Oh, if you wanna help me out\nIf you wanna let...",Normal,"[wanna, help, wanna, let, easy, care, way, eve..."


In [13]:
lyrics.to_csv('song_lyrics_lematized.csv', index=False) # the stopwords and very frequent and infrequent words are not removed here

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
for i in range(10):
    print(lyric_corpus_tokenized[i])

In [6]:
import pandas as pd
lyrics = pd.read_csv('song_lyrics_lematized.csv')

In [16]:
from tqdm import tqdm
lyric_corpus_tokenized = list()
for song in tqdm(lyrics['lemmatized'], desc="writing to the list"):
    lyric_corpus_tokenized.append(song.strip("[]").replace("'", "").split(", "))
print(type(lyric_corpus_tokenized[0]))
print(lyric_corpus_tokenized[0])

writing to the list: 100%|██████████| 2797631/2797631 [02:04<00:00, 22473.51it/s]

<class 'list'>
['produced', 'irv', 'gotti', 'intro', 'yeah', 'hah', 'yeah', 'roc', 'fella', 'invite', 'you', 'somethin', 'epic', 'you', 'know', 'well', 'hustle', 'out', 'sense', 'hopelessness', 'sort', 'desperation', 'through', 'that', 'desperation', 'become', 'addicted', 'sort', 'like', 'the', 'fiend', 'accustomed', 'servin', 'but', 'feel', 'have', 'nothin', 'lose', 'offer', 'you', 'well', 'offer', 'our', 'life', 'right', 'what', 'you', 'bring', 'the', 'table', 'verse', 'while', 'watchin', 'every', 'nigga', 'watchin', 'closely', 'shit', 'butter', 'for', 'the', 'bread', 'they', 'wanna', 'toast', 'keep', 'head', 'both', 'them', 'where', 'they', 'supposed', 'hoe', 'get', 'you', 'sidetracked', 'then', 'clapped', 'from', 'close', 'foot', 'don', 'sleep', 'tired', 'feel', 'wired', 'like', 'codeine', 'these', 'day', 'brother', 'gotta', 'admire', 'from', 'four', 'fiend', 'away', 'pain', 'wish', 'wa', 'quick', 'see', 'from', 'sellin', 'caine', 'til', 'brain', 'wa', 'fried', 'fricassee', 'can', 




In [10]:
with open('profanities.txt', 'r') as file:
    prof_string = file.read().replace('\n', '')
    profanities = set(prof_string.split(", "))




In [11]:
from tqdm import tqdm

def filter_song(song):
    return [token for token in song if token not in profanities]

lyric_corpus_tokenized = [
    filter_song(song) for song in tqdm(lyric_corpus_tokenized, desc="removing profanities")
]

removing profanities: 100%|██████████| 5334078/5334078 [01:03<00:00, 83929.94it/s] 


In [9]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
new_stop_words = ['ooh','yeah','hey','whoa','woah', 'ohh', 'was', 'mmm', 'oooh','yah','yeh','mmm', 'hmm','deh','doh','jah','wa']
stop_words.extend(new_stop_words)
print(len(stop_words))
stop_words = set(stop_words)
print(len(stop_words))
print(type(stop_words))

for s,song in enumerate(tqdm(lyric_corpus_tokenized, desc="removing stop words")):
    filtered_text = []
    for token in song:
        if token not in stop_words:
            filtered_text.append(token)
    lyric_corpus_tokenized[s] = filtered_text

# for i in range(10):
#     print(lyric_corpus_tokenized[i])



196
194
<class 'set'>


removing stop words: 100%|██████████| 5334078/5334078 [00:56<00:00, 93736.36it/s] 


In [12]:
# lyric_corpus_tokenized = lyrics['no profanities and no stop words'].tolist()
lyrics.to_csv('song_lyrics_lematized.csv', index=False) # the stopwords and very frequent and infrequent words are not removed here


In [21]:
# lyrics_corpus_tokenized = list()
# for song in tqdm(lyrics['no profanities and no stop words'], desc="writing to the list"):
#     lyrics_corpus_tokenized.append(song.strip("[]").replace("'", "").split(", "))
# print(type(lyrics_corpus_tokenized[0]))
# print(lyrics_corpus_tokenized[0])

writing to the list: 100%|██████████| 2797631/2797631 [01:07<00:00, 41462.24it/s]

<class 'list'>
['produced', 'irv', 'gotti', 'intro', 'hah', 'roc', 'fella', 'invite', 'somethin', 'epic', 'know', 'well', 'hustle', 'sense', 'hopelessness', 'sort', 'desperation', 'desperation', 'become', 'addicted', 'sort', 'like', 'fiend', 'accustomed', 'servin', 'feel', 'nothin', 'lose', 'offer', 'well', 'offer', 'life', 'right', 'bring', 'table', 'verse', 'watchin', 'every', 'watchin', 'closely', 'butter', 'bread', 'wanna', 'toast', 'keep', 'head', 'supposed', 'get', 'sidetracked', 'clapped', 'close', 'foot', 'sleep', 'tired', 'feel', 'wired', 'like', 'codeine', 'day', 'brother', 'gotta', 'admire', 'four', 'fiend', 'away', 'pain', 'wish', 'quick', 'see', 'sellin', 'caine', 'til', 'brain', 'fried', 'fricassee', 'lie', 'time', 'never', 'bothered', 'bar', 'gettin', 'properly', 'squad', 'lack', 'respect', 'authority', 'laughin', 'hard', 'happy', 'escapin', 'poverty', 'however', 'brief', 'know', 'game', 'got', 'valley', 'peak', 'expectation', 'dip', 'precipitation', 'stack', 'chip', 'ha




In [12]:
words_to_remove = {'verse', 'chorus', 'hook', 'bridge', 'intro', 'outro', 'pre-chorus', 'pre-hook', 'get', 'got'}

In [14]:
lyric_corpus_tokenized = [
    [token for token in song if token not in words_to_remove]
    for song in tqdm(lyric_corpus_tokenized, desc="removing")
]

removing: 100%|██████████| 5334078/5334078 [00:51<00:00, 104157.25it/s]


In [15]:
#!pip install gensim

Defaulting to user installation because normal site-packages is not writeable


In [54]:
# lyrics_corpus_tokenized

In [16]:
from gensim.corpora import Dictionary
from tqdm import tqdm
import random

# Assuming you have a big list called 'big_list'
  # Number of items to choose

# Randomly select 20,000 items from the big list
# lyric_corpus_tokenized = lyric_corpus_tokenized.tolist()

tqdm_corpus = tqdm(lyric_corpus_tokenized, desc="Creating dictionary")
dictionary = Dictionary(tqdm_corpus)


Creating dictionary: 100%|██████████| 5334078/5334078 [03:38<00:00, 24416.18it/s]


In [17]:
dictionary.filter_extremes(no_below=100, no_above=0.8)

In [18]:
from gensim.corpora import MmCorpus
gensim_corpus = []
with tqdm(total=len(lyric_corpus_tokenized), desc="Creating Gensim Corpus") as pbar:
    for song in lyric_corpus_tokenized:
        doc = dictionary.doc2bow(song)
        gensim_corpus.append(doc)
        pbar.update(1)
temp = dictionary[0]
id2word = dictionary.id2token

Creating Gensim Corpus: 100%|██████████| 5334078/5334078 [02:21<00:00, 37624.35it/s]


accustomed


In [19]:
print(len(gensim_corpus))

5334078


In [32]:
import time
from gensim.models import LdaModel

start_time = time.time()
chunksize = 2000
passes = 20
iterations = 400
num_topics = 6
sample_size = 2000000


gensim2 = random.sample(gensim_corpus, sample_size)

lda_model = LdaModel(
    corpus=gensim_corpus[],
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes
)

end_time = time.time()
training_time = end_time - start_time
print("Training time: {:.2f} seconds".format(training_time))

Training time: 1313.68 seconds


In [30]:
predicted_topics = []
with tqdm(total=len(gensim_corpus), desc="Predicting topics") as pbar:
    for song in gensim_corpus:
        topic_dist = lda_model.get_document_topics(song)
        predicted_topics.append(topic_dist)
        pbar.update(1)

Predicting topics:   0%|          | 837/2797631 [00:00<18:53, 2468.12it/s]


IndexError: index 16583 is out of bounds for axis 1 with size 16583

In [73]:
lda_model.save("lda_model400")

In [65]:
!pip install pyLDAvis

Defaulting to user installation because normal site-packages is not writeable


In [46]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
vis_data = gensimvis.prepare(lda_model, gensim_corpus, dictionary)
pyLDAvis.display(vis_data)


TypeError: drop() takes from 1 to 2 positional arguments but 3 were given

In [51]:
pyLDAvis.save_html(vis_data, './Lyrics_LDA_k_'+ str(num_topics) +'.html')

NameError: name 'vis_data' is not defined

In [55]:
#print(pyLDAvis.__version__)
!pip install pyLDAvis==3.4.1
print(pyLDAvis.__version__)
!pip install pandas==1.5.3
import pandas as pd
print(pd.__version__)

Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Ignored the following versions that require a different python version: 3.4.1 Requires-Python >=3.9[0m[31m
[0m[31mERROR: Could not find a version that satisfies the requirement pyLDAvis==3.4.1 (from versions: 1.0.0, 1.1.0, 1.2.0, 1.3.0, 1.3.1, 1.3.2, 1.3.3, 1.3.4, 1.3.5, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 2.0.0, 2.1.0, 2.1.1, 2.1.2, 3.0.0, 3.1.0, 3.2.0, 3.2.1, 3.2.2, 3.3.0, 3.3.1, 3.4.0)[0m[31m
[0m[31mERROR: No matching distribution found for pyLDAvis==3.4.1[0m[31m
[0m3.4.0
Defaulting to user installation because normal site-packages is not writeable
2.0.1


In [50]:
import pandas as pd
print(pd.__version__)

2.0.1


In [44]:
import pandas as pd

In [27]:
from gensim.corpora import Dictionary
from gensim.corpora import MmCorpus
from gensim.models import LdaModel

lda_model = LdaModel.load('lda_model400')


In [45]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
vis_data = gensimvis.prepare(lda_model, gensim_corpus, dictionary)
pyLDAvis.display(vis_data)
#pyLDAvis.save_html(vis_data, './Lyrics_LDA_k_'+ str(num_topics) +'.html')

KeyboardInterrupt: 

In [27]:
lyrics = pd.read_csv('song_topics.csv')

In [28]:

count = lyrics['topic'].value_counts()['life and relationships']
print(count, " life and relationships")
count = lyrics['topic'].value_counts()['money and authority']
print(count, " money and authority")
count = lyrics['topic'].value_counts()['general']
print(count, " general")
count = lyrics['topic'].value_counts()['religion and society']
print(count, " religion and society")


1708442  life and relationships
1980937  money and authority
1266192  general
378507  religion and society
