### Import text collections

In [80]:
with open("texts_amateur.txt", 'r', encoding = "utf-8-sig") as f:
    lay = f.read()
with open("texts_children.txt", 'r', encoding = "utf-8-sig") as f:
    children = f.read()
with open("texts_professional.txt", 'r', encoding = "utf-8-sig") as f:
    prof = f.read()

Clean from punctuation marks

In [2]:
import re 
children_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", children)
lay_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", lay)
prof_clean = re.sub("[.,\"?!:;()\[\]{}\-–—]", "", prof)

### Lemmatization

In [5]:
import nltk

In [10]:
from nltk import WordNetLemmatizer

In [7]:
from nltk import word_tokenize

In [11]:
def lemmatizer(poem):
    tokens = word_tokenize(poem)
    tokens_tags = nltk.pos_tag(tokens)
    matching_tags = {'NN':'n', 'VB':'v', 'JJ':'a', 'RB':'r'}

    lemmatizer = WordNetLemmatizer()
    tokens_lemma = []
    for (token, tag) in tokens_tags:

        if not(token[0].isalpha()):
            continue
        token = token.lower()
        if tag[:2] in matching_tags:
            token = lemmatizer.lemmatize(token, pos=matching_tags[tag[:2]])
            tokens_lemma.append(token)
        
    return ' '.join(tokens_lemma)

In [138]:
lemmas_child = lemmatizer(children)
with open('lemmas_children.txt', 'w', encoding = 'utf-8') as f:
    f.write(lemmas_child)

In [13]:
lemmas_lay = lemmatizer(lay)

In [139]:
with open('lemmas_lay.txt', 'w', encoding = 'utf-8') as f:
    f.write(lemmas_lay)

In [14]:
lemmas_prof = lemmatizer(prof)

In [140]:
with open('lemmas_prof.txt', 'w', encoding = 'utf-8') as f:
    f.write(lemmas_prof)

### Tokenize and clean

In [15]:
tokens_children = lemmas_child.split()
tokens_lay = lemmas_lay.split()
tokens_prof = lemmas_prof.split()

In [16]:
small_words = {elem for elem in tokens_lay if len(elem) < 3} | {elem for elem in tokens_prof if len(elem) < 3} | {elem for elem in tokens_children if len(elem) < 3}

In [17]:
tinies = ['a',
 'ah',
 'am',
 'an',
 'as',
 'at',
 'be',
 'do',
 'go',
 'ha',
 'he',
 'hi',
 'i',
 'in',
 'is',
 'it',
 'me',
 'my',
 'no',
 'of',
 'oh',
 'ok',
 'or',
 'so',
 'to']

In [18]:
len(tinies)

25

In [19]:
for arr in [tokens_lay, tokens_prof, tokens_children]:
    for elem in arr:
        if len(elem) < 3 and elem not in tinies:
            arr.remove(elem)

In [170]:
lemmas_child = " ".join(tokens_children)
lemmas_lay = " ".join(tokens_lay)
lemmas_prof = " ".join(tokens_prof)

### Find unique words

In [135]:
unique_child = set(tokens_children)
list(unique_child)[:20]

['jingle',
 'saddest',
 'reluctance',
 'guess',
 'chan',
 'monday',
 'late',
 'upper',
 'uniform',
 'wall',
 'granny',
 'explore',
 'craaash',
 'together…',
 'storage',
 'wareers',
 'formula',
 'tell',
 'crater',
 'tint']

In [136]:
unique_lay = set(tokens_lay)
list(unique_lay)[:20]

['profound',
 'quill',
 'variety',
 'unanswered',
 'cuz',
 'immenently',
 'wrinkle',
 'revelation',
 'willingly',
 'guess',
 'decipher',
 'late',
 'retain',
 'uniform',
 'idiocy',
 'wall',
 'emptiness',
 'regale',
 'wart',
 'medical']

In [137]:
unique_prof = set(tokens_prof)
list(unique_prof)[:20]

['sky-floor',
 'wherefore',
 'oust',
 'monday',
 'emptiness',
 'maremma',
 'flautist',
 'february',
 'moondust',
 'you.',
 'coronado',
 'incetown',
 'infamy',
 'crane',
 'concede',
 'plainly',
 'mounting',
 'blizzard',
 'quaintly',
 'wing']

### Find the most frequent words

In [20]:
from collections import Counter

freq_children = Counter(tokens_children)
frequency_children = freq_children.most_common(20)
print(frequency_children)
print()
freq_lay = Counter(tokens_lay)
frequency_lay = freq_lay.most_common(20)
print(frequency_lay)
print()
freq_prof = Counter(tokens_prof)
frequency_prof = freq_prof.most_common(20)
print(frequency_prof)

[('be', 3325), ('have', 593), ('go', 445), ('love', 411), ('do', 380), ('not', 314), ('friend', 282), ('get', 282), ('so', 280), ('day', 267), ('see', 256), ('make', 241), ('know', 215), ('say', 202), ('don', 196), ('come', 193), ('look', 180), ('time', 173), ('play', 169), ('just', 162)]

[('be', 2051), ('do', 667), ('have', 478), ('so', 432), ("n't", 419), ('know', 417), ('love', 395), ('just', 373), ('not', 365), ('never', 266), ('see', 261), ('go', 257), ('feel', 235), ('i', 225), ('say', 220), ('now', 210), ('day', 201), ('want', 198), ('make', 197), ('heart', 195)]

[('be', 1820), ('have', 488), ('not', 328), ('do', 252), ('go', 192), ('say', 190), ('come', 184), ('now', 168), ('make', 162), ('know', 161), ('love', 151), ('so', 151), ('night', 147), ('eye', 126), ('then', 125), ('see', 122), ('old', 118), ('time', 114), ('man', 110), ('still', 110)]


In [21]:
with open('Lay_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_lay:
            f.write(word + '\t' + str(fr) + '\n')

In [22]:
with open('Children_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_children:
            f.write(word + '\t' + str(fr) + '\n')

In [84]:
with open('Prof_frequency.tsv', 'w') as f:
    f.write('Word\tFrequency\n')
    for word,fr in frequency_prof:
            f.write(word + '\t' + str(fr) + '\n')

### Find the most frequent bigrams

In [24]:
n = 2
bigrams_children = []
    
for i in range(len(tokens_children)-n+1):
    bigrams_children.append(tokens_children[i:i+n])
    
ngrams_children = [' '.join(tokens_children[i:i+n]) for i in range(len(tokens_children)-n+1)]

In [25]:
n = 2
bigrams_lay = []
    
for i in range(len(tokens_lay)-n+1):
    bigrams_lay.append(tokens_lay[i:i+n])
    
ngrams_lay = [' '.join(tokens_lay[i:i+n]) for i in range(len(tokens_lay)-n+1)]

In [26]:
n = 2
bigrams_prof = []
    
for i in range(len(tokens_prof)-n+1):
    bigrams_prof.append(tokens_prof[i:i+n])
    
ngrams_prof = [' '.join(tokens_prof[i:i+n]) for i in range(len(tokens_prof)-n+1)]

In [27]:
bigram_freq = Counter(ngrams_children)
freq_bigram_children = bigram_freq.most_common(15)
freq_bigram_children

[('friend be', 94),
 ('be best', 83),
 ('be not', 73),
 ('do not', 58),
 ('be be', 56),
 ('be so', 56),
 ('be very', 55),
 ('best friend', 48),
 ('be great', 44),
 ('be fun', 42),
 ('be as', 40),
 ('love be', 40),
 ('want be', 39),
 ('be good', 38),
 ('be go', 36)]

In [85]:
with open('Bigram_frequency_child.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_children:
            f.write(bigr + '\t' + str(fr) + '\n')

In [28]:
bigram_freq_lay = Counter(ngrams_lay)
freq_bigram_lay = bigram_freq_lay.most_common(15)
freq_bigram_lay

[("do n't", 252),
 ("n't know", 88),
 ('be so', 65),
 ('so much', 57),
 ('be not', 56),
 ('love be', 50),
 ('know be', 47),
 ('life be', 44),
 ('be just', 42),
 ('be be', 42),
 ('have be', 38),
 ('know do', 36),
 ('be there', 35),
 ("be n't", 33),
 ('heart be', 33)]

In [86]:
with open('Bigram_frequency_lay.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_lay:
            f.write(bigr + '\t' + str(fr) + '\n')

In [29]:
bigram_freq_prof = Counter(ngrams_prof)
freq_bigram_prof = bigram_freq_prof.most_common(15)
freq_bigram_prof

[('be not', 59),
 ("do n't", 53),
 ('have be', 50),
 ('do not', 43),
 ('love be', 23),
 ('be be', 22),
 ('say be', 21),
 ('be go', 19),
 ('life be', 19),
 ('not be', 18),
 ('be so', 18),
 ('be make', 17),
 ('here be', 17),
 ('now be', 16),
 ('too much', 16)]

In [87]:
with open('Bigram_frequency_prof.tsv', 'w') as f:
    f.write('Bigram\tFrequency\n')
    for bigr,fr in freq_bigram_prof:
            f.write(bigr + '\t' + str(fr) + '\n')

### POS (part of speech) - tagging. 
Tag the words in each corpus, count POS tags to compare the corpora with regard to part of speech frequency

1. Remove stopwords using NLTK

In [92]:
from nltk.corpus import stopwords
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avvrik\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
tokens_children_clean = [word for word in tokens_children if word not in stopwords.words('english')]

In [94]:
tokens_lay_clean = [word for word in tokens_lay if word not in stopwords.words('english')]

In [95]:
tokens_prof_clean = [word for word in tokens_prof if word not in stopwords.words('english')]

2. Tag the words with NLTK and count the tags, then calculate percentage of each part of speech in the text collections

In [141]:
pos_tags_children = nltk.pos_tag(tokens_children_clean)
    
from collections import Counter
counts_children = Counter(tag for word,tag in pos_tags_children)
print(counts_children)
total = sum(counts_children.values())
print(total)
dict((word, float(n)/total) for word,n in counts_children.items())

Counter({'NN': 16507, 'JJ': 7420, 'VBP': 2679, 'RB': 2187, 'VB': 1456, 'NNS': 618, 'VBD': 597, 'IN': 478, 'VBG': 441, 'VBN': 245, 'JJS': 177, 'VBZ': 114, 'CD': 90, 'JJR': 80, 'NNP': 40, 'MD': 36, 'RP': 35, 'RBS': 29, 'RBR': 29, 'DT': 26, 'WP': 25, 'FW': 23, 'UH': 18, 'CC': 11, 'WRB': 9, 'PRP': 5, 'WDT': 2, 'WP$': 2, 'PRP$': 2})
33381


{'CC': 0.0003295287738533897,
 'CD': 0.0026961445133459154,
 'DT': 0.0007788861927443756,
 'FW': 0.0006890147089661784,
 'IN': 0.01431952308199275,
 'JJ': 0.22228213654474102,
 'JJR': 0.002396572900751925,
 'JJS': 0.0053024175429136336,
 'MD': 0.0010784578053383662,
 'NN': 0.4945028609089003,
 'NNP': 0.0011982864503759624,
 'NNS': 0.01851352565830862,
 'PRP': 0.0001497858062969953,
 'PRP$': 5.9914322518798116e-05,
 'RB': 0.06551631167430574,
 'RBR': 0.0008687576765225727,
 'RBS': 0.0008687576765225727,
 'RP': 0.001048500644078967,
 'UH': 0.0005392289026691831,
 'VB': 0.04361762679368503,
 'VBD': 0.017884425271861237,
 'VBG': 0.013211108115394984,
 'VBN': 0.0073395045085527695,
 'VBP': 0.08025523501393007,
 'VBZ': 0.003415116383571493,
 'WDT': 5.9914322518798116e-05,
 'WP': 0.0007489290314849765,
 'WP$': 5.9914322518798116e-05,
 'WRB': 0.00026961445133459155}

In [97]:
pos_tags_lay = nltk.pos_tag(tokens_lay_clean)
    
from collections import Counter
counts_lay = Counter(tag for word,tag in pos_tags_lay)
print(counts_lay)
total = sum(counts_lay.values())
dict((word, float(n)/total) for word,n in counts_lay.items())

Counter({'NN': 12073, 'JJ': 5709, 'RB': 2745, 'VBP': 2630, 'VB': 1972, 'VBD': 558, 'NNS': 391, 'VBG': 361, 'IN': 303, 'VBN': 263, 'CD': 116, 'VBZ': 90, 'JJR': 72, 'JJS': 54, 'RP': 40, 'RBR': 38, 'MD': 32, 'DT': 23, 'FW': 22, 'NNP': 14, 'RBS': 8, 'CC': 8, 'WP': 6, 'PRP': 6, 'WDT': 4, 'UH': 2, 'WRB': 2, 'WP$': 2})


{'CC': 0.0002904443799012489,
 'CD': 0.004211443508568109,
 'DT': 0.0008350275922160906,
 'FW': 0.0007987220447284345,
 'IN': 0.011000580888759803,
 'JJ': 0.20726837060702874,
 'JJR': 0.00261399941911124,
 'JJS': 0.0019604995643334303,
 'MD': 0.0011617775196049957,
 'NN': 0.43831687481847226,
 'NNP': 0.0005082776648271856,
 'NNS': 0.014195469067673541,
 'PRP': 0.00021783328492593667,
 'RB': 0.09965872785361603,
 'RBR': 0.0013796108045309324,
 'RBS': 0.0002904443799012489,
 'RP': 0.0014522218995062445,
 'UH': 7.261109497531223e-05,
 'VB': 0.07159453964565786,
 'VBD': 0.020258495498112112,
 'VBG': 0.013106302643043856,
 'VBN': 0.009548358989253557,
 'VBP': 0.09548358989253558,
 'VBZ': 0.0032674992738890504,
 'WDT': 0.00014522218995062446,
 'WP': 0.00021783328492593667,
 'WP$': 7.261109497531223e-05,
 'WRB': 7.261109497531223e-05}

In [98]:
pos_tags_prof = nltk.pos_tag(tokens_prof_clean)
    
from collections import Counter
counts_prof = Counter(tag for word,tag in pos_tags_prof)
print(counts_prof)
total = sum(counts_prof.values())
dict((word, float(n)/total) for word,n in counts_prof.items())

Counter({'NN': 17796, 'JJ': 8160, 'VBP': 2205, 'RB': 2031, 'VB': 1064, 'VBD': 979, 'NNS': 601, 'VBG': 500, 'IN': 434, 'VBN': 348, 'VBZ': 169, 'JJR': 97, 'CD': 69, 'JJS': 67, 'RBR': 58, 'MD': 48, 'NNP': 30, 'RP': 29, 'FW': 26, 'WP': 18, 'CC': 18, 'WP$': 12, 'DT': 11, 'PRP': 6, 'RBS': 5, 'WDT': 4, 'UH': 3, 'WRB': 3, 'PDT': 2, 'PRP$': 1})


{'CC': 0.0005173305742369374,
 'CD': 0.0019831005345749267,
 'DT': 0.000316146462033684,
 'FW': 0.0007472552738977985,
 'IN': 0.012473414956601713,
 'JJ': 0.23452319365407828,
 'JJR': 0.0027878369833879407,
 'JJS': 0.0019256193596597115,
 'MD': 0.0013795481979651664,
 'NN': 0.5114674943955855,
 'NNP': 0.000862217623728229,
 'NNS': 0.01727309306202219,
 'PDT': 5.748117491521527e-05,
 'PRP': 0.0001724435247456458,
 'PRP$': 2.8740587457607634e-05,
 'RB': 0.058372133126401105,
 'RBR': 0.0016669540725412428,
 'RBS': 0.00014370293728803817,
 'RP': 0.0008334770362706214,
 'UH': 8.62217623728229e-05,
 'VB': 0.030579985054894523,
 'VBD': 0.028137035120997873,
 'VBG': 0.014370293728803817,
 'VBN': 0.010001724435247456,
 'VBP': 0.06337299534402484,
 'VBZ': 0.00485715928033569,
 'WDT': 0.00011496234983043054,
 'WP': 0.0005173305742369374,
 'WP$': 0.0003448870494912916,
 'WRB': 8.62217623728229e-05}

### Tf-idf of each corpus with regard to other two corpora

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [171]:
three_texts = [lemmas_child, lemmas_lay, lemmas_prof]

In [172]:
count = CountVectorizer(min_df=1, max_df=3, stop_words = 'english')
tf = count.fit_transform(three_texts).toarray()
tf.shape

(3, 12683)

In [173]:
vect = TfidfVectorizer(min_df=1, max_df=3, stop_words = 'english')
tfidf = vect.fit_transform(three_texts).toarray()
tfidf.shape

(3, 12683)

In [174]:
vect.stop_words_

set()

In [175]:
vect.vocabulary_

{'friend': 4376,
 'hang': 5005,
 'shop': 9789,
 'ice': 5476,
 'cream': 2456,
 'oreo': 7646,
 'money': 7065,
 'happy': 5019,
 'mother': 7126,
 'day': 2715,
 'mum': 7200,
 'love': 6531,
 'miss': 7005,
 'new': 7351,
 'zealand': 12657,
 'great': 4811,
 'really': 8804,
 'good': 4709,
 'place': 8154,
 'country': 2393,
 'house': 5388,
 'lot': 6519,
 'poisonous': 8254,
 'spider': 10327,
 'want': 12189,
 'bite': 1028,
 'rat': 8769,
 'amaze': 311,
 'easy': 3403,
 'chase': 1770,
 'bee': 883,
 'aspire': 565,
 'believe': 919,
 'confidence': 2218,
 'dream': 3257,
 'enjoy': 3596,
 'family': 3890,
 'greatness': 4813,
 'inspire': 5727,
 'joy': 5965,
 'kindness': 6064,
 'mystery': 7244,
 'precious': 8396,
 'question': 8659,
 'rely': 8947,
 'special': 10298,
 'try': 11571,
 'unstoppable': 11888,
 'vacate': 11967,
 'worship': 12538,
 'ray': 8782,
 'yolo': 12630,
 'zone': 12676,
 'drink': 3282,
 'sucker': 10721,
 'enviromental': 3635,
 'dislike': 3090,
 'pointless': 8250,
 'plastic': 8173,
 'ocean': 7551,


In [176]:
words = vect.get_feature_names()
words

['95',
 'aaaaaaaaaaaaaccccccccccoooooooo',
 'aaaaaaaaaaahhhhhhhhhhhhhhhh',
 'aaaaahhhhhh',
 'aarhus',
 'abacus',
 'abandon',
 'abated',
 'abattoir',
 'abbey',
 'abbie',
 'abbot',
 'abc',
 'abdicate',
 'abdomen',
 'abduct',
 'abe',
 'abed',
 'abel',
 'aberdeen',
 'abide',
 'ability',
 'abject',
 'ablaze',
 'able',
 'abominable',
 'abort',
 'abound',
 'abracadabra',
 'abraham',
 'abroad',
 'abrumptly',
 'abrupt',
 'absconded',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absolutes',
 'absolution',
 'absolves',
 'absorb',
 'absorbed',
 'abstract',
 'abstracted',
 'absurd',
 'absurdities',
 'abundance',
 'abundant',
 'abuse',
 'abused',
 'abusive',
 'abyss',
 'academic',
 'academy',
 'accelerate',
 'accelerated',
 'accept',
 'acceptable',
 'accident',
 'accidentally',
 'acclaim',
 'accompaniment',
 'accomplish',
 'accomplishment',
 'accord',
 'accost',
 'account',
 'accounting',
 'accrue',
 'accumulate',
 'accuracy',
 'accurate',
 'accusation',
 'accuser',
 'ace',
 'ach',
 'achaia'

* Rows represent corpora, columns represent feature words. Therefore, to find tf-idf of each corpus, we need to extract it from the rows one by one.

In [177]:
row_children = tfidf[0]
row_children.shape

(12683,)

In [68]:
import numpy as np

In [178]:
sort_indeces_child = np.argsort(row_children)

In [179]:
top_indeces_child = list(sort_indeces_child)[-20:]

In [180]:
top_words_child = []
for ind in top_indeces_child:
    top_words_child.append(words[ind])
top_words_child

['blue',
 'sun',
 'fun',
 'cat',
 'night',
 'want',
 'best',
 'sky',
 'like',
 'play',
 'mum',
 'time',
 'look',
 'come',
 'say',
 'know',
 'make',
 'day',
 'friend',
 'love']

In [181]:
with open('children_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_child))

In [182]:
row_lay = tfidf[1]
row_lay.shape
sort_indeces_lay = np.argsort(row_lay) 

top_indeces_lay = list(sort_indeces_lay)[-20:]
  
top_words_lay = []
for ind in top_indeces_lay:
    top_words_lay.append(words[ind])
top_words_lay

['wish',
 'try',
 'come',
 'tell',
 'leave',
 'look',
 'thing',
 'eye',
 'way',
 'time',
 'think',
 'life',
 'heart',
 'want',
 'make',
 'day',
 'say',
 'feel',
 'love',
 'know']

In [183]:
with open('lay_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_lay))

In [184]:
row_prof = tfidf[2]
row_prof.shape
sort_indeces_prof = np.argsort(row_prof) 

top_indeces_prof = list(sort_indeces_prof)[-20:]
  
top_words_prof = []
for ind in top_indeces_prof:
    top_words_prof.append(words[ind])
top_words_prof

['face',
 'life',
 'think',
 'leave',
 'white',
 'look',
 'hand',
 'day',
 'light',
 'man',
 'time',
 'old',
 'eye',
 'just',
 'love',
 'night',
 'make',
 'know',
 'come',
 'say']

In [185]:
with open('prof_tf_idf.txt', 'w', encoding = 'utf-8') as f:
    f.write(str(top_words_prof))

### Topic modeling

1. Split the strings into poems
2. Vectorize them and find term frequency for each text collection

In [186]:
n = 5000
poems_children = [lemmas_child[i:i+n] for i in range(0, len(lemmas_child), n)]
poems_lay = [lemmas_lay[i:i+n] for i in range(0, len(lemmas_lay), n)]
poems_prof = [lemmas_prof[i:i+n] for i in range(0, len(lemmas_prof), n)]

In [187]:
count_child = CountVectorizer(stop_words = 'english')
tf_child = count_child.fit_transform(poems_children).toarray()
tf_child.shape

(40, 5033)

In [188]:
count_lay = CountVectorizer(min_df=2, max_df=3, stop_words = 'english')
tf_lay = count_lay.fit_transform(poems_lay).toarray()
tf_lay.shape

(33, 1056)

In [189]:
count_prof = CountVectorizer(min_df=2, max_df=3, stop_words = 'english')
tf_prof = count_prof.fit_transform(poems_prof).toarray()
tf_prof.shape

(48, 1935)

#### Topic modeling with LDA

In [199]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_child = LatentDirichletAllocation(n_components=5)
lda_child.fit(tf_child)

topic_words_child = lda_child.components_
topic_words_child.shape
count_words_child = count_child.get_feature_names()

for topic_ind, topic in enumerate(topic_words_child):
    print('topic', topic_ind)
    top_indeces_child = list(topic.argsort())[-7:]
    lda_top_words_child = []
    for ind in top_indeces_child:
        lda_top_words_child.append(count_words_child[ind])
    print(', '.join(lda_top_words_child))



topic 0
night, like, play, day, know, love, friend
topic 1
sky, know, make, friend, come, love, day
topic 2
say, come, know, make, day, friend, love
topic 3
luck, fast, day, lover, say, love, bad
topic 4
know, come, day, make, say, friend, love


In [200]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_lay = LatentDirichletAllocation(n_components=5)
lda_lay.fit(tf_lay)

topic_words_lay = lda_lay.components_
topic_words_lay.shape
count_words_lay = count_lay.get_feature_names()

for topic_ind, topic in enumerate(topic_words_lay):
    print('topic', topic_ind)
    top_indeces_lay = list(topic.argsort())[-7:]
    lda_top_words_lay = []
    for ind in top_indeces_lay:
        lda_top_words_lay.append(count_words_lay[ind])
    print(', '.join(lda_top_words_lay))



topic 0
completely, bean, poetess, dye, princess, eternal, prince
topic 1
okay, painful, warmth, patient, funny, horse, illusion
topic 2
exactly, blanket, christmas, crystal, fuck, shoe, sneeze
topic 3
queen, moss, hunt, imagine, discover, feather, garden
topic 4
misconstrue, pursue, sex, wishing, guitar, spider, depression


In [198]:
from sklearn.decomposition import LatentDirichletAllocation 
lda_prof = LatentDirichletAllocation(n_components=5)
lda_prof.fit(tf_prof)

topic_words_prof = lda_prof.components_
topic_words_prof.shape
count_words_prof = count_prof.get_feature_names()

for topic_ind, topic in enumerate(topic_words_prof):
    print('topic', topic_ind)
    top_indeces_prof = list(topic.argsort())[-7:]
    lda_top_words_prof = []
    for ind in top_indeces_prof:
        lda_top_words_prof.append(count_words_prof[ind])
    print(', '.join(lda_top_words_prof))



topic 0
mountainside, daisy, realize, perch, giant, unsaid, valley
topic 1
princess, goal, tidy, thread, miner, childhood, odor
topic 2
ceiling, aunt, venus, halt, radio, frog, create
topic 3
coldly, sabbath, warmth, fiesole, scum, nibble, harlem
topic 4
taught, brave, jim, tomb, nonsense, negro, ram


#### Topic Modeling with NMF

In [192]:
from sklearn.decomposition import NMF
nmf_child = NMF(n_components=4)
nmf_child.fit(tf_child)
nmf_topic_word_child = nmf_child.components_
nmf_topic_word_child.shape
for topic_ind, topic in enumerate(nmf_topic_word_child):
    print('topic', topic_ind)
    top_indeces_child = list(topic.argsort())[-6:]
    top_words_child = []
    for ind in top_indeces_child:
        top_words_child.append(count_words_child[ind])
    print(', '.join(top_words_child))

topic 0
like, play, make, know, day, say
topic 1
sound, taste, smell, look, summer, beach
topic 2
tree, blue, love, wish, day, sky
topic 3
like, play, dad, best, friend, love


In [193]:
from sklearn.decomposition import NMF
nmf_lay = NMF(n_components=4)
nmf_lay.fit(tf_lay)
nmf_topic_word_lay = nmf_lay.components_
nmf_topic_word_lay.shape
for topic_ind, topic in enumerate(nmf_topic_word_lay):
    print('topic', topic_ind)
    top_indeces_lay = list(topic.argsort())[-6:]
    top_words_lay = []
    for ind in top_indeces_lay:
        top_words_lay.append(count_words_lay[ind])
    print(', '.join(top_words_lay))

topic 0
drag, hunter, sanity, yo, woo, didnt
topic 1
roof, beam, farewell, shudder, proof, professional
topic 2
shes, harp, fighting, daisy, girlfriend, sober
topic 3
slave, distraught, splendor, sharpen, drone, mr


In [194]:
from sklearn.decomposition import NMF
nmf_prof = NMF(n_components=4)
nmf_prof.fit(tf_prof)
nmf_topic_word_prof = nmf_prof.components_
nmf_topic_word_prof.shape
for topic_ind, topic in enumerate(nmf_topic_word_prof):
    print('topic', topic_ind)
    top_indeces_prof = list(topic.argsort())[-6:]
    top_words_prof = []
    for ind in top_indeces_prof:
        top_words_prof.append(count_words_prof[ind])
    print(', '.join(top_words_prof))

topic 0
bough, halt, perch, scum, harlem, nibble
topic 1
tier, coldly, dragon, aye, warmth, fiesole
topic 2
nude, princess, thread, miner, childhood, odor
topic 3
upward, industry, vale, daisy, mountainside, valley
