# Topic modeling 

In [106]:
from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import random

In [54]:
df = pd.read_csv("michael_jackson_lyrics_corpus.csv", encoding='utf-8')
df.columns = ['text']
data = list(df['text'])

In [104]:
vectorizer = CountVectorizer(ngram_range=(1,1))
matrix = vectorizer.fit_transform(data)
vocabulary = vectorizer.get_feature_names()
random.shuffle(vocabulary)

In [135]:
num_topics = 5
passes = 40
tm_model = LdaModel(matutils.Sparse2Corpus(matrix), 
                   num_topics=num_topics, passes=passes,id2word=dict([(i, s) for i, s in enumerate(vocabulary)]))

In [136]:
tm_model.show_topics()

[(0,
  '0.153*"mountain" + 0.131*"woo" + 0.078*"excuses" + 0.078*"young" + 0.059*"homeless" + 0.044*"called" + 0.044*"erase" + 0.042*"begins" + 0.021*"standing" + 0.016*"bathe"'),
 (1,
  '0.047*"written" + 0.034*"understands" + 0.034*"lived" + 0.029*"abuse" + 0.027*"their" + 0.026*"begins" + 0.026*"hold" + 0.024*"cursed" + 0.024*"he" + 0.024*"castle"'),
 (2,
  '0.119*"taking" + 0.077*"each" + 0.075*"makes" + 0.053*"closet" + 0.049*"gave" + 0.039*"disappear" + 0.037*"anything" + 0.029*"that" + 0.029*"funky" + 0.024*"nigh"'),
 (3,
  '0.094*"voice" + 0.071*"him" + 0.061*"diaaaaaana" + 0.060*"as" + 0.045*"final" + 0.045*"left" + 0.041*"more" + 0.034*"answered" + 0.032*"within" + 0.031*"excuses"'),
 (4,
  '0.155*"childhood" + 0.145*"kissing" + 0.065*"hurting" + 0.063*"madness" + 0.061*"promise" + 0.054*"stop" + 0.042*"everyday" + 0.040*"counting" + 0.034*"youre" + 0.034*"waiting"')]

выглядит супер, но есть стоп-слова, а их бы убрать...🤔 и вообще причесать датасет, мы запустили "как есть"

In [124]:
from nltk.corpus import stopwords
english_stopwords = set(stopwords.words("english")) | set(["im","dont"])

In [125]:
clean_data = []
for song in data:
    tokens = list(set(song.split(" ")))
    words = [x for x in tokens if x not in english_stopwords]
    clean_song = " ".join(words) 
    clean_data.append(clean_song)

In [126]:
vectorizer_clean = CountVectorizer(ngram_range=(1,1))
matrix_clean =  vectorizer_clean.fit_transform(clean_data)
vocabulary_clean = vectorizer_clean.get_feature_names()
random.shuffle(vocabulary_clean)

In [128]:
num_topics = 5
passes = 40
tm_model_clean = LdaModel(matutils.Sparse2Corpus(matrix_clean), 
                    num_topics=num_topics, 
                    passes=passes,
                    id2word=dict([(i, s) for i, s in enumerate(vocabulary_clean)]))

In [129]:
tm_model_clean.show_topics()

[(0,
  '0.023*"lam" + 0.023*"look" + 0.021*"tired" + 0.020*"knows" + 0.019*"speaks" + 0.019*"grandmas" + 0.019*"dreams" + 0.018*"would" + 0.018*"runs" + 0.018*"happiness"'),
 (1,
  '0.090*"school" + 0.082*"sandy" + 0.072*"songbird" + 0.072*"serenading" + 0.062*"enough" + 0.044*"shines" + 0.035*"carousel" + 0.031*"yearning" + 0.028*"wouldnt" + 0.025*"looking"'),
 (2,
  '0.073*"ahwalkin" + 0.065*"tired" + 0.062*"certain" + 0.055*"role" + 0.052*"big" + 0.045*"sail" + 0.031*"sense" + 0.027*"castle" + 0.026*"problems" + 0.025*"would"'),
 (3,
  '0.111*"happen" + 0.111*"house" + 0.079*"education" + 0.059*"talkin" + 0.052*"ravaging" + 0.038*"see" + 0.018*"monkey" + 0.017*"vain" + 0.016*"doomed" + 0.015*"young"'),
 (4,
  '0.079*"coz" + 0.071*"gods" + 0.064*"cramp" + 0.051*"vain" + 0.045*"help" + 0.036*"ones" + 0.034*"shores" + 0.031*"cast" + 0.028*"preacher" + 0.026*"following"')]

 а теперь еще пройдемся стеммингом!

In [130]:
from nltk.stem.porter import *
stemmer = PorterStemmer()

In [131]:
stem_data = []
for song in clean_data:
    tokens = list(set(song.split(" ")))
    words = [stemmer.stem(t) for t in tokens]
    stem_song = " ".join(words) 
    stem_data.append(stem_song)

In [132]:
vectorizer_stem = CountVectorizer(ngram_range=(1,1))
matrix_stem =  vectorizer_stem.fit_transform(clean_data)
vocabulary_stem = vectorizer_stem.get_feature_names()
random.shuffle(vocabulary_stem)

In [133]:
num_topics = 5
passes = 40
tm_model_stem = LdaModel(matutils.Sparse2Corpus(matrix_stem), 
                    num_topics=num_topics, 
                    passes=passes,
                    id2word=dict([(i, s) for i, s in enumerate(vocabulary_stem)]))

In [134]:
tm_model_stem.show_topics()

[(0,
  '0.073*"sometime" + 0.072*"boyfriend" + 0.065*"mojo" + 0.056*"meeting" + 0.053*"wrap" + 0.049*"waking" + 0.040*"thick" + 0.037*"blue" + 0.035*"gang" + 0.030*"holy"'),
 (1,
  '0.152*"flee" + 0.047*"unfold" + 0.046*"painful" + 0.038*"heartaches" + 0.037*"raise" + 0.025*"macho" + 0.021*"stare" + 0.019*"open" + 0.010*"counting" + 0.003*"choice"'),
 (2,
  '0.068*"earth" + 0.063*"ear" + 0.053*"fifteen" + 0.051*"product" + 0.041*"wifes" + 0.040*"gettin" + 0.037*"burnt" + 0.036*"thighs" + 0.036*"sang" + 0.030*"daylight"'),
 (3,
  '0.119*"see" + 0.084*"standing" + 0.081*"we" + 0.073*"felt" + 0.040*"name" + 0.037*"perfect" + 0.028*"women" + 0.026*"beginning" + 0.023*"yes" + 0.019*"abandoned"'),
 (4,
  '0.039*"motormouth" + 0.031*"counting" + 0.028*"whatever" + 0.028*"little" + 0.028*"shit" + 0.027*"helpless" + 0.027*"tonight" + 0.024*"candy" + 0.022*"lord" + 0.022*"choice"')]