### Import text collections

In [80]:
with open("texts_amateur.txt", 'r', encoding = "utf-8-sig") as f:
    lay = f.read()
with open("texts_children.txt", 'r', encoding = "utf-8-sig") as f:
    children = f.read()
with open("texts_professional.txt", 'r', encoding = "utf-8-sig") as f:
    prof = f.read()

Clean from punctuation marks

In [2]:
import re 
children_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", children)
lay_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", lay)
prof_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", prof)

### Lemmatization

In [5]:
import nltk

In [10]:
from nltk import WordNetLemmatizer

In [7]:
from nltk import word_tokenize

In [11]:
def lemmatizer(poem):
    tokens = word_tokenize(poem)
    tokens_tags = nltk.pos_tag(tokens)
    matching_tags = {'NN':'n', 'VB':'v', 'JJ':'a', 'RB':'r'}

    lemmatizer = WordNetLemmatizer()
    tokens_lemma = []
    for (token, tag) in tokens_tags:

        if not(token[0].isalpha()):
            continue
        token = token.lower()
        if tag[:2] in matching_tags:
            token = lemmatizer.lemmatize(token, pos=matching_tags[tag[:2]])
            tokens_lemma.append(token)
        
    return ' '.join(tokens_lemma)

In [12]:
lemmas_child = lemmatizer(children)

In [13]:
lemmas_lay = lemmatizer(lay)

In [14]:
lemmas_prof = lemmatizer(prof)

### Tokenize and clean

In [15]:
tokens_children = lemmas_child.split()
tokens_lay = lemmas_lay.split()
tokens_prof = lemmas_prof.split()

In [16]:
small_words = {elem for elem in tokens_lay if len(elem) < 3} | {elem for elem in tokens_prof if len(elem) < 3} | {elem for elem in tokens_children if len(elem) < 3}

In [17]:
tinies = ['a',
 'ah',
 'am',
 'an',
 'as',
 'at',
 'be',
 'do',
 'go',
 'ha',
 'he',
 'hi',
 'i',
 'in',
 'is',
 'it',
 'me',
 'my',
 'no',
 'of',
 'oh',
 'ok',
 'or',
 'so',
 'to']

In [18]:
len(tinies)

25

In [19]:
for arr in [tokens_lay, tokens_prof, tokens_children]:
    for elem in arr:
        if len(elem) < 3 and elem not in tinies:
            arr.remove(elem)

### Find the most frequent words

In [20]:
from collections import Counter

freq_children = Counter(tokens_children)
frequency_children = freq_children.most_common(20)
print(frequency_children)
print()
freq_lay = Counter(tokens_lay)
frequency_lay = freq_lay.most_common(20)
print(frequency_lay)
print()
freq_prof = Counter(tokens_prof)
frequency_prof = freq_prof.most_common(20)
print(frequency_prof)

[('be', 3325), ('have', 593), ('go', 445), ('love', 411), ('do', 380), ('not', 314), ('friend', 282), ('get', 282), ('so', 280), ('day', 267), ('see', 256), ('make', 241), ('know', 215), ('say', 202), ('don', 196), ('come', 193), ('look', 180), ('time', 173), ('play', 169), ('just', 162)]

[('be', 2051), ('do', 667), ('have', 478), ('so', 432), ("n't", 419), ('know', 417), ('love', 395), ('just', 373), ('not', 365), ('never', 266), ('see', 261), ('go', 257), ('feel', 235), ('i', 225), ('say', 220), ('now', 210), ('day', 201), ('want', 198), ('make', 197), ('heart', 195)]

[('be', 1820), ('have', 488), ('not', 328), ('do', 252), ('go', 192), ('say', 190), ('come', 184), ('now', 168), ('make', 162), ('know', 161), ('love', 151), ('so', 151), ('night', 147), ('eye', 126), ('then', 125), ('see', 122), ('old', 118), ('time', 114), ('man', 110), ('still', 110)]


In [21]:
with open('Lay_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_lay:
            f.write(word + '\t' + str(fr) + '\n')

In [22]:
with open('Children_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_children:
            f.write(word + '\t' + str(fr) + '\n')

In [84]:
with open('Prof_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_prof:
            f.write(word + '\t' + str(fr) + '\n')

### Find the most frequent bigrams

In [24]:
n = 2
bigrams_children = []
    
for i in range(len(tokens_children)-n+1):
    bigrams_children.append(tokens_children[i:i+n])
    
ngrams_children = [' '.join(tokens_children[i:i+n]) for i in range(len(tokens_children)-n+1)]

In [25]:
n = 2
bigrams_lay = []
    
for i in range(len(tokens_lay)-n+1):
    bigrams_lay.append(tokens_lay[i:i+n])
    
ngrams_lay = [' '.join(tokens_lay[i:i+n]) for i in range(len(tokens_lay)-n+1)]

In [26]:
n = 2
bigrams_prof = []
    
for i in range(len(tokens_prof)-n+1):
    bigrams_prof.append(tokens_prof[i:i+n])
    
ngrams_prof = [' '.join(tokens_prof[i:i+n]) for i in range(len(tokens_prof)-n+1)]

In [27]:
bigram_freq = Counter(ngrams_children)
freq_bigram_children = bigram_freq.most_common(15)
freq_bigram_children

[('friend be', 94),
 ('be best', 83),
 ('be not', 73),
 ('do not', 58),
 ('be be', 56),
 ('be so', 56),
 ('be very', 55),
 ('best friend', 48),
 ('be great', 44),
 ('be fun', 42),
 ('be as', 40),
 ('love be', 40),
 ('want be', 39),
 ('be good', 38),
 ('be go', 36)]

In [85]:
with open('Bigram_frequency_child.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_children:
            f.write(bigr + '\t' + str(fr) + '\n')

In [28]:
bigram_freq_lay = Counter(ngrams_lay)
freq_bigram_lay = bigram_freq_lay.most_common(15)
freq_bigram_lay

[("do n't", 252),
 ("n't know", 88),
 ('be so', 65),
 ('so much', 57),
 ('be not', 56),
 ('love be', 50),
 ('know be', 47),
 ('life be', 44),
 ('be just', 42),
 ('be be', 42),
 ('have be', 38),
 ('know do', 36),
 ('be there', 35),
 ("be n't", 33),
 ('heart be', 33)]

In [86]:
with open('Bigram_frequency_lay.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_lay:
            f.write(bigr + '\t' + str(fr) + '\n')

In [29]:
bigram_freq_prof = Counter(ngrams_prof)
freq_bigram_prof = bigram_freq_prof.most_common(15)
freq_bigram_prof

[('be not', 59),
 ("do n't", 53),
 ('have be', 50),
 ('do not', 43),
 ('love be', 23),
 ('be be', 22),
 ('say be', 21),
 ('be go', 19),
 ('life be', 19),
 ('not be', 18),
 ('be so', 18),
 ('be make', 17),
 ('here be', 17),
 ('now be', 16),
 ('too much', 16)]

In [87]:
with open('Bigram_frequency_prof.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_prof:
            f.write(bigr + '\t' + str(fr) + '\n')

### POS (part of speech) - tagging. 
Tag the words in each corpus, count POS tags to compare the corpora with regard to part of speech frequency

1. Remove stopwords using NLTK

In [92]:
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avvrik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
tokens_children_clean = [word for word in tokens_children if word not in stopwords.words('english')]

In [94]:
tokens_lay_clean = [word for word in tokens_lay if word not in stopwords.words('english')]

In [95]:
tokens_prof_clean = [word for word in tokens_prof if word not in stopwords.words('english')]

2. Tag the words with NLTK and count the tags, then calculate percentage of each part of speech in the text collections

In [128]:
pos_tags_children = nltk.pos_tag(tokens_children_clean)
    
from collections import Counter
counts_children = Counter(tag for word,tag in pos_tags_children)
print(counts_children)
total = sum(counts_children.values())
print(total)


Counter({'NN': 16507, 'JJ': 7420, 'VBP': 2679, 'RB': 2187, 'VB': 1456, 'NNS': 618, 'VBD': 597, 'IN': 478, 'VBG': 441, 'VBN': 245, 'JJS': 177, 'VBZ': 114, 'CD': 90, 'JJR': 80, 'NNP': 40, 'MD': 36, 'RP': 35, 'RBS': 29, 'RBR': 29, 'DT': 26, 'WP': 25, 'FW': 23, 'UH': 18, 'CC': 11, 'WRB': 9, 'PRP': 5, 'WDT': 2, 'WP$': 2, 'PRP$': 2})
33381


In [97]:
pos_tags_lay = nltk.pos_tag(tokens_lay_clean)
    
from collections import Counter
counts_lay = Counter(tag for word,tag in pos_tags_lay)
print(counts_lay)
total = sum(counts_lay.values())
dict((word, float(n)/total) for word,n in counts_lay.items())

Counter({'NN': 12073, 'JJ': 5709, 'RB': 2745, 'VBP': 2630, 'VB': 1972, 'VBD': 558, 'NNS': 391, 'VBG': 361, 'IN': 303, 'VBN': 263, 'CD': 116, 'VBZ': 90, 'JJR': 72, 'JJS': 54, 'RP': 40, 'RBR': 38, 'MD': 32, 'DT': 23, 'FW': 22, 'NNP': 14, 'RBS': 8, 'CC': 8, 'WP': 6, 'PRP': 6, 'WDT': 4, 'UH': 2, 'WRB': 2, 'WP$': 2})


{'CC': 0.0002904443799012489,
 'CD': 0.004211443508568109,
 'DT': 0.0008350275922160906,
 'FW': 0.0007987220447284345,
 'IN': 0.011000580888759803,
 'JJ': 0.20726837060702874,
 'JJR': 0.00261399941911124,
 'JJS': 0.0019604995643334303,
 'MD': 0.0011617775196049957,
 'NN': 0.43831687481847226,
 'NNP': 0.0005082776648271856,
 'NNS': 0.014195469067673541,
 'PRP': 0.00021783328492593667,
 'RB': 0.09965872785361603,
 'RBR': 0.0013796108045309324,
 'RBS': 0.0002904443799012489,
 'RP': 0.0014522218995062445,
 'UH': 7.261109497531223e-05,
 'VB': 0.07159453964565786,
 'VBD': 0.020258495498112112,
 'VBG': 0.013106302643043856,
 'VBN': 0.009548358989253557,
 'VBP': 0.09548358989253558,
 'VBZ': 0.0032674992738890504,
 'WDT': 0.00014522218995062446,
 'WP': 0.00021783328492593667,
 'WP$': 7.261109497531223e-05,
 'WRB': 7.261109497531223e-05}

In [98]:
pos_tags_prof = nltk.pos_tag(tokens_prof_clean)
    
from collections import Counter
counts_prof = Counter(tag for word,tag in pos_tags_prof)
print(counts_prof)
total = sum(counts_prof.values())
dict((word, float(n)/total) for word,n in counts_prof.items())

Counter({'NN': 17796, 'JJ': 8160, 'VBP': 2205, 'RB': 2031, 'VB': 1064, 'VBD': 979, 'NNS': 601, 'VBG': 500, 'IN': 434, 'VBN': 348, 'VBZ': 169, 'JJR': 97, 'CD': 69, 'JJS': 67, 'RBR': 58, 'MD': 48, 'NNP': 30, 'RP': 29, 'FW': 26, 'WP': 18, 'CC': 18, 'WP$': 12, 'DT': 11, 'PRP': 6, 'RBS': 5, 'WDT': 4, 'UH': 3, 'WRB': 3, 'PDT': 2, 'PRP$': 1})


{'CC': 0.0005173305742369374,
 'CD': 0.0019831005345749267,
 'DT': 0.000316146462033684,
 'FW': 0.0007472552738977985,
 'IN': 0.012473414956601713,
 'JJ': 0.23452319365407828,
 'JJR': 0.0027878369833879407,
 'JJS': 0.0019256193596597115,
 'MD': 0.0013795481979651664,
 'NN': 0.5114674943955855,
 'NNP': 0.000862217623728229,
 'NNS': 0.01727309306202219,
 'PDT': 5.748117491521527e-05,
 'PRP': 0.0001724435247456458,
 'PRP$': 2.8740587457607634e-05,
 'RB': 0.058372133126401105,
 'RBR': 0.0016669540725412428,
 'RBS': 0.00014370293728803817,
 'RP': 0.0008334770362706214,
 'UH': 8.62217623728229e-05,
 'VB': 0.030579985054894523,
 'VBD': 0.028137035120997873,
 'VBG': 0.014370293728803817,
 'VBN': 0.010001724435247456,
 'VBP': 0.06337299534402484,
 'VBZ': 0.00485715928033569,
 'WDT': 0.00011496234983043054,
 'WP': 0.0005173305742369374,
 'WP$': 0.0003448870494912916,
 'WRB': 8.62217623728229e-05}

### Tf-idf of each corpus with regard to other two corpora

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [33]:
three_texts = [lemmas_child, lemmas_lay, lemmas_prof]

In [36]:
count = CountVectorizer(min_df=1, max_df=3, stop_words = 'english')
tf = count.fit_transform(three_texts).toarray()
tf.shape

(3, 12740)

In [63]:
vect = TfidfVectorizer(min_df=1, max_df=3, stop_words = 'english')
tfidf = vect.fit_transform(three_texts).toarray()
tfidf.shape

(3, 12740)

In [64]:
vect.stop_words_

set()

In [65]:
vect.vocabulary_

{'friend': 4386,
 'hang': 5017,
 'shop': 9831,
 'ice': 5490,
 'cream': 2459,
 'oreo': 7676,
 'money': 7090,
 'happy': 5031,
 'mother': 7151,
 'day': 2718,
 'mum': 7225,
 'love': 6553,
 'miss': 7028,
 'new': 7378,
 'zealand': 12714,
 'great': 4822,
 'really': 8844,
 'good': 4720,
 'place': 8191,
 'country': 2396,
 've': 12061,
 'house': 5402,
 'lot': 6541,
 'poisonous': 8292,
 'spider': 10371,
 'want': 12243,
 'bite': 1030,
 'rat': 8809,
 'amaze': 313,
 'easy': 3409,
 'chase': 1773,
 'bee': 885,
 'aspire': 567,
 'believe': 921,
 'confidence': 2221,
 'dream': 3262,
 'enjoy': 3602,
 'family': 3900,
 'greatness': 4824,
 'inspire': 5742,
 'joy': 5983,
 'kindness': 6083,
 'mystery': 7269,
 'precious': 8434,
 'question': 8698,
 'rely': 8987,
 'special': 10342,
 'try': 11619,
 'unstoppable': 11938,
 'vacate': 12018,
 'worship': 12593,
 'ray': 8822,
 'yolo': 12687,
 'zone': 12733,
 'drink': 3287,
 'sucker': 10766,
 'enviromental': 3641,
 'dislike': 3094,
 'pointless': 8288,
 'plastic': 8210,
 '

In [66]:
words = vect.get_feature_names()
words

['95',
 'aaaaaaaaaaaaaccccccccccoooooooo',
 'aaaaaaaaaaahhhhhhhhhhhhhhhh',
 'aaaaahhhhhh',
 'aarhus',
 'abacus',
 'abandon',
 'abated',
 'abattoir',
 'abbey',
 'abbie',
 'abbot',
 'abc',
 'abdicate',
 'abdomen',
 'abduct',
 'abe',
 'abed',
 'abel',
 'aberdeen',
 'abide',
 'ability',
 'abject',
 'ablaze',
 'able',
 'abominable',
 'abort',
 'abound',
 'abracadabra',
 'abraham',
 'abroad',
 'abrumptly',
 'abrupt',
 'absconded',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absolutes',
 'absolution',
 'absolves',
 'absorb',
 'absorbed',
 'abstract',
 'abstracted',
 'absurd',
 'absurdities',
 'abundance',
 'abundant',
 'abuse',
 'abused',
 'abusive',
 'abyss',
 'academic',
 'academy',
 'accelerate',
 'accelerated',
 'accept',
 'acceptable',
 'accident',
 'accidentally',
 'acclaim',
 'accompaniment',
 'accomplish',
 'accomplishment',
 'accord',
 'accost',
 'account',
 'accounting',
 'accrue',
 'accumulate',
 'accuracy',
 'accurate',
 'accusation',
 'accuser',
 'ace',
 'ach',
 'achaia'

* Rows represent corpora, columns represent feature words. Therefore, to find tf-idf of each corpus, we need to extract it from the rows one by one.

In [67]:
row_children = tfidf[0]
row_children.shape

(12740,)

In [68]:
import numpy as np

In [69]:
sort_indeces_child = np.argsort(row_children)

In [70]:
top_indeces_child = list(sort_indeces_child)[-20:]

In [71]:
top_words_child = []
for ind in top_indeces_child:
    top_words_child.append(words[ind])
top_words_child

['fun',
 'cat',
 'night',
 'want',
 'sky',
 'best',
 'like',
 'just',
 'play',
 'mum',
 'time',
 'look',
 'come',
 'don',
 'say',
 'know',
 'make',
 'day',
 'friend',
 'love']

In [78]:
with open('children_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_child))

In [72]:
row_lay = tfidf[1]
row_lay.shape
sort_indeces_lay = np.argsort(row_lay) 

top_indeces_lay = list(sort_indeces_lay)[-20:]
  
top_words_lay = []
for ind in top_indeces_lay:
    top_words_lay.append(words[ind])
top_words_lay

['try',
 'tell',
 'come',
 'leave',
 'look',
 'thing',
 'eye',
 'way',
 'time',
 'think',
 'life',
 'heart',
 'want',
 'make',
 'day',
 'say',
 'feel',
 'just',
 'love',
 'know']

In [77]:
with open('lay_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_lay))

In [73]:
row_prof = tfidf[2]
row_prof.shape
sort_indeces_prof = np.argsort(row_prof) 

top_indeces_prof = list(sort_indeces_prof)[-20:]
  
top_words_prof = []
for ind in top_indeces_prof:
    top_words_prof.append(words[ind])
top_words_prof

['thing',
 'life',
 'face',
 'think',
 'leave',
 'white',
 'look',
 'hand',
 'day',
 'light',
 'man',
 'time',
 'old',
 'eye',
 'love',
 'night',
 'make',
 'know',
 'come',
 'say']

In [79]:
with open('prof_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_prof))

### Topic modeling

1. Split the strings into poems
2. Vectorize them and find term frequency for each text collection

In [47]:
n = 5000
poems_children = [lemmas_child[i:i+n] for i in range(0, len(lemmas_child), n)]
poems_lay = [lemmas_lay[i:i+n] for i in range(0, len(lemmas_lay), n)]
poems_prof = [lemmas_prof[i:i+n] for i in range(0, len(lemmas_prof), n)]

In [54]:
count_child = CountVectorizer(stop_words = 'english')
tf_child = count_child.fit_transform(poems_children).toarray()
tf_child.shape

(46, 5084)

In [56]:
count_lay = CountVectorizer(min_df=2, max_df=3, stop_words = 'english')
tf_lay = count_lay.fit_transform(poems_lay).toarray()
tf_lay.shape

(38, 1051)

In [57]:
count_prof = CountVectorizer(min_df=2, max_df=3, stop_words = 'english')
tf_prof = count_prof.fit_transform(poems_prof).toarray()
tf_prof.shape

(49, 1946)

#### Topic modeling with LDA

In [55]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_child = LatentDirichletAllocation(n_components=7)
lda_child.fit(tf_child)

topic_words_child = lda_child.components_
topic_words_child.shape
count_words_child = count_child.get_feature_names()

for topic_ind, topic in enumerate(topic_words_child):
    print('topic', topic_ind)
    top_indeces_child = list(topic.argsort())[-7:]
    lda_top_words_child = []
    for ind in top_indeces_child:
        lda_top_words_child.append(count_words_child[ind])
    print(', '.join(lda_top_words_child))



topic 0
come, make, blue, love, day, cat, friend
topic 1
ripper, fast, loud, eater, hater, runner, lover
topic 2
blue, night, time, day, love, sky, say
topic 3
just, night, christmas, say, love, day, box
topic 4
taste, sound, smell, feel, look, summer, beach
topic 5
don, say, know, make, day, friend, love
topic 6
say, want, day, come, know, friend, love


In [58]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_lay = LatentDirichletAllocation(n_components=7)
lda_lay.fit(tf_lay)

topic_words_lay = lda_lay.components_
topic_words_lay.shape
count_words_lay = count_lay.get_feature_names()

for topic_ind, topic in enumerate(topic_words_lay):
    print('topic', topic_ind)
    top_indeces_lay = list(topic.argsort())[-7:]
    lda_top_words_lay = []
    for ind in top_indeces_lay:
        lda_top_words_lay.append(count_words_lay[ind])
    print(', '.join(lda_top_words_lay))



topic 0
reflection, dollar, sex, muse, spell, misconstrue, pursue
topic 1
extra, distress, forevermore, machine, patter, fairytale, guitar
topic 2
existence, video, debut, pop, funny, misconstrue, spider
topic 3
poetess, horse, crystal, illusion, sneeze, robot, depression
topic 4
expression, discover, pie, feather, fuck, ive, mama
topic 5
design, thy, bother, affection, unrequited, feed, perfume
topic 6
painful, teacher, sigh, toast, risk, willow, fairy


In [59]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_prof = LatentDirichletAllocation(n_components=6)
lda_prof.fit(tf_prof)

topic_words_prof = lda_prof.components_
topic_words_prof.shape
count_words_prof = count_prof.get_feature_names()

for topic_ind, topic in enumerate(topic_words_prof):
    print('topic', topic_ind)
    top_indeces_prof = list(topic.argsort())[-8:]
    lda_top_words_prof = []
    for ind in top_indeces_prof:
        lda_top_words_prof.append(count_words_prof[ind])
    print(', '.join(lda_top_words_prof))



topic 0
limp, mountainside, perch, ain, ram, ma, valley, harlem
topic 1
strict, eh, halt, toddle, spiral, childhood, la, odor
topic 2
hardy, whitewash, byre, elm, lo, guitar, nibble, scum
topic 3
coldly, glist, tier, loneliness, aye, fiesole, warmth, ti
topic 4
upper, princess, depart, hippopotamus, castle, miner, radio, create
topic 5
apollo, decay, riding, ceiling, raft, nonsense, sabbath, frog


#### Topic Modeling with NMF

In [60]:
from sklearn.decomposition import NMF
nmf_child = NMF(n_components=7)
nmf_child.fit(tf_child)
nmf_topic_word_child = nmf_child.components_
nmf_topic_word_child.shape
for topic_ind, topic in enumerate(nmf_topic_word_child):
    print('topic', topic_ind)
    top_indeces_child = list(topic.argsort())[-7:]
    top_words_child = []
    for ind in top_indeces_child:
        top_words_child.append(count_words_child[ind])
    print(', '.join(top_words_child))

topic 0
dog, fortnite, play, school, don, say, love
topic 1
taste, look, sound, feel, smell, summer, beach
topic 2
big, come, dog, sun, make, day, cat
topic 3
cat, play, look, love, best, dad, friend
topic 4
say, beautiful, day, blue, sky, love, wish
topic 5
think, good, like, play, make, want, know
topic 6
winter, sea, christmas, tree, night, day, box


In [61]:
from sklearn.decomposition import NMF
nmf_lay = NMF(n_components=7)
nmf_lay.fit(tf_lay)
nmf_topic_word_lay = nmf_lay.components_
nmf_topic_word_lay.shape
for topic_ind, topic in enumerate(nmf_topic_word_lay):
    print('topic', topic_ind)
    top_indeces_lay = list(topic.argsort())[-7:]
    top_words_lay = []
    for ind in top_indeces_lay:
        top_words_lay.append(count_words_lay[ind])
    print(', '.join(top_words_lay))

topic 0
tan, scene, wreck, doesnt, hoping, horse, depression
topic 1
writer, amidst, spell, cat, expression, poetess, sex
topic 2
glory, wipe, bean, discover, feather, pie, mama
topic 3
road, rail, pouring, sore, patter, fairytale, guitar
topic 4
snowflake, sparkle, beam, spider, flake, crystal, sneeze
topic 5
case, untitled, grim, raven, remorse, careful, robot
topic 6
effort, bridge, masterpiece, allow, ancient, arrive, dime


In [62]:
from sklearn.decomposition import NMF
nmf_prof = NMF(n_components=7)
nmf_prof.fit(tf_prof)
nmf_topic_word_prof = nmf_prof.components_
nmf_topic_word_prof.shape
for topic_ind, topic in enumerate(nmf_topic_word_prof):
    print('topic', topic_ind)
    top_indeces_prof = list(topic.argsort())[-7:]
    top_words_prof = []
    for ind in top_indeces_prof:
        top_words_prof.append(count_words_prof[ind])
    print(', '.join(top_words_prof))

topic 0
unsaid, frog, whitewash, intent, ram, create, perch
topic 1
glist, enamel, dragon, aye, fiesole, warmth, ti
topic 2
nude, thread, handful, eh, childhood, la, odor
topic 3
sphere, dante, telescope, ye, guitar, lo, scum
topic 4
gigantic, thistle, vale, upward, daisy, mountainside, valley
topic 5
elm, insurance, telescope, harlem, nonsense, sabbath, nibble
topic 6
sly, stern, toddle, halt, princess, miner, spiral
