In [1]:
import pandas as pd
import gensim
from gensim import corpora,models
from gensim.models import LdaModel, LsiModel
import warnings
warnings.filterwarnings("ignore")

# 1. Preprocessing

In [2]:
# Read the word list CSV and preview its first rows. The path uses a forward
# slash: pandas accepts it on every OS, and it avoids the invalid "\w" escape
# that the backslash form placed inside a regular (non-raw) string literal.
word = pd.read_csv('CleanData/word_list.csv', index_col=None)
word.head()

Unnamed: 0.1,Unnamed: 0,word
0,0,i
1,1,the
2,2,you
3,3,to
4,4,and


In [4]:
# Read the lyrics CSV, drop its first column by position, and preview the
# result. The forward slash keeps the path portable across operating systems
# and avoids the invalid "\l" escape of the backslash form.
mxm = pd.read_csv('CleanData/lyrics.csv', index_col=None).iloc[:, 1:]
mxm.head()

Unnamed: 0,track_id,mxm_tid,word,count,is_test
0,TRAAAAV128F421A322,4623710,i,6,0
1,TRAAAAV128F421A322,4623710,the,4,0
2,TRAAAAV128F421A322,4623710,you,2,0
3,TRAAAAV128F421A322,4623710,to,2,0
4,TRAAAAV128F421A322,4623710,and,5,0


In [5]:
# Plain Python list holding every value of the 'word' column, in row order.
data = mxm.loc[:, 'word'].to_list()

In [6]:
# Repeat each word (followed by a space) as many times as its count,
# e.g. the word 'the' with count 4 becomes 'the the the the '.
mxm['full_word'] = mxm['word'].add(' ').mul(mxm['count'])

In [7]:
# Gather the repeated-word strings of each track into one list per track_id.
# The result has one row per track, with that list in 'full_lyrics'.
df = (
    mxm.groupby('track_id')['full_word']
    .apply(list)
    .reset_index(name='full_lyrics')
)

In [8]:
# Build one lyrics string per track by joining that track's repeated-word
# chunks with a space. Every chunk already ends in a space, so different
# words end up separated by two spaces in the joined string.
lyrics = [' '.join(chunks) for chunks in df['full_lyrics']]

# Preview only the first two documents; echoing the whole list would flood
# the cell output with the entire corpus.
lyrics[:2]

['i i i i i i  the the the the  you you  to to  and and and and and  a a a  me  it  my  is is  of of of  your  that  are are  we we  am am  will will  for for for for  be  have have  so  this  like like  de  up  was was  if  got  would  been  these these  seem  someon  understand  pass  river  met  piec  damn  worth  flesh  grace  poor poor  somehow  ignor  passion  tide  season  seed  resist  order order  piti  fashion  grant  captur captur  ici  soil  patienc  social social  highest highest  slice  leaf  lifeless  arrang  wilder  shark  devast  element ',
 'i i i i i i i i i i  you you you you you you you you you you you you you you you you you  to to to to to to to to  and and  a a  me  it it it  not not  in in in  my my my my  is is is  your your your your your your your  that that that that that  do do do do do  are are are are are are  for for for for  no  have have have have have have  so so  know know know know know  but but but  what what what  when when  time time time  can  

In [17]:
# Remove stop words and tokenize
from nltk.corpus import stopwords
mystopwords = stopwords.words('english')

# Set copy of the stop-word list: membership tests become O(1) instead of a
# linear scan of the list for every single token in the corpus.
stopword_set = set(mystopwords)

# Split each lyrics string on single spaces, discard the empty strings left
# by the double spaces between word groups, and drop stop words.
# NOTE(review): the lyric tokens look stemmed (e.g. 'someon', 'piec') while
# this stop-word list is unstemmed - confirm that mismatch is intended.
token_list = [[w for w in line.split(' ') if w != '' and w not in stopword_set]
              for line in lyrics]

In [18]:
from collections import defaultdict

# Count how many times each token occurs across the whole corpus.
frequency = defaultdict(int)
for tok in (t for doc in token_list for t in doc):
    frequency[tok] += 1

# Keep only tokens whose corpus-wide count exceeds one; tokens that occur a
# single time in total are removed from every document.
token_list = [
    [tok for tok in doc if frequency[tok] > 1]
    for doc in token_list
]


# 2. Generate Term Document Matrix

1. Build a token dictionary (gensim `corpora.Dictionary`)
2. Generate a list of unique tokens
3. Build a bag-of-words corpus
4. Save the corpus as a term-document matrix (using gensim)

In [19]:
# Build the gensim Dictionary from the tokenized documents, then print its
# summary line.
dictionary = corpora.Dictionary(documents=token_list)
print(dictionary)

Dictionary(4884 unique tokens: ['arrang', 'captur', 'damn', 'de', 'devast']...)


In [20]:
# Order the dictionary's (id, token) pairs by ascending id and keep only the
# tokens, so unique_token follows the id order of the dictionary.
sort_token = sorted(dictionary.items(), key=lambda pair: pair[0])
unique_token = [pair[1] for pair in sort_token]

In [21]:
# Convert every tokenized document with the dictionary's doc2bow method;
# the corpus is the list of those per-document results.
corpus = list(map(dictionary.doc2bow, token_list))

In [None]:
# Save a term document matrix

#import numpy as np
#matrix = gensim.matutils.corpus2dense(corpus,num_terms=len(dictionary),dtype='int')
#matrix = matrix.T #transpose the matrix

# Convert the numpy matrix into pandas data frame
#matrix_df = pd.DataFrame(matrix,columns=unique_token)

# Write matrix df to csv
#matrix_df.to_csv('Term_Document_matrix_Lyrics.csv')

# 3. Topic Modeling (LDA)

### 10 topics

In [22]:
# Fit LDA model
# Train a 10-topic LDA model on the corpus. LDA training is stochastic, so
# random_state is pinned (the value itself is arbitrary) to make the topics
# reproducible after a kernel restart.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, random_state=42)
lda.print_topics(10)

[(0,
  '0.064*"que" + 0.047*"de" + 0.032*"la" + 0.028*"el" + 0.023*"en" + 0.022*"te" + 0.020*"mi" + 0.018*"se" + 0.017*"tu" + 0.014*"un"'),
 (1,
  '0.061*"la" + 0.033*"de" + 0.026*"le" + 0.023*"je" + 0.021*"et" + 0.019*"un" + 0.019*"les" + 0.019*"e" + 0.016*"il" + 0.015*"che"'),
 (2,
  '0.340*"babi" + 0.093*"na" + 0.026*"u" + 0.021*"babe" + 0.019*"wo" + 0.015*"lovin" + 0.014*"doo" + 0.011*"n" + 0.011*"free" + 0.011*"mine"'),
 (3,
  '0.128*"love" + 0.022*"night" + 0.021*"like" + 0.019*"feel" + 0.016*"heart" + 0.014*"littl" + 0.014*"want" + 0.014*"tonight" + 0.012*"sing" + 0.011*"make"'),
 (4,
  '0.020*"god" + 0.013*"die" + 0.012*"us" + 0.012*"lord" + 0.012*"soul" + 0.010*"kill" + 0.010*"live" + 0.009*"dead" + 0.009*"blood" + 0.009*"death"'),
 (5,
  '0.024*"get" + 0.021*"got" + 0.021*"like" + 0.010*"man" + 0.008*"ya" + 0.008*"back" + 0.008*"go" + 0.008*"know" + 0.007*"caus" + 0.007*"fuck"'),
 (6,
  '0.010*"time" + 0.010*"eye" + 0.010*"see" + 0.009*"away" + 0.009*"come" + 0.008*"fall" + 0

In [None]:
# Generate U Matrix for LDA model
# Applying the fitted model to the corpus gives each document's topic weights.
corpus_lda = lda[corpus]

# Convert corpus_lda to numpy matrix.
# The number of rows is read from the fitted model instead of being
# hard-coded, so it cannot drift from the num_topics used at training time.
U_matrix_lda = gensim.matutils.corpus2dense(corpus_lda, num_terms=lda.num_topics)

# Write U_matrix into panda dataframe and output
# NOTE(review): corpus2dense places documents in columns, so this frame is
# topics x documents; transpose it if a documents x topics layout is wanted.
U_matrix_lda_df = pd.DataFrame(U_matrix_lda)
#U_matrix_lda_df.to_csv('U_matrix_lda.csv')

In [None]:
# Shape of the term-document data as (documents, unique tokens). It is derived
# from the corpus and dictionary because matrix_df is only created by code
# that is commented out, so referencing it fails on a fresh kernel.
print((len(corpus), len(dictionary)))
print(U_matrix_lda_df.shape)

# 4. LDA Topic Visualization (pyLDAvis)

In [15]:
import pyLDAvis.gensim

In [23]:
# Enable pyLDAvis notebook display, then prepare the visualization for the
# fitted model; leaving it as the cell's last expression shows it.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
lda_vis