In [None]:
# Import NLTK, download the Brown corpus data, and expose the corpus reader as `brown`
import nltk
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


## Ref: https://www.nltk.org/book/ch02.html

In [None]:
# examine the categories in the Brown corpus
print(brown.categories())


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [None]:
# Peek at the opening tokens of three genres, with a blank-line gap between them.
for position, genre in enumerate(('government', 'hobbies', 'romance')):
    if position > 0:
        print('\n\n')
    print(brown.words(categories=genre))

['The', 'Office', 'of', 'Business', 'Economics', '(', ...]



['Too', 'often', 'a', 'beginning', 'bodybuilder', ...]



['They', 'neither', 'liked', 'nor', 'disliked', 'the', ...]


In [None]:
# Count every (genre, word) pair in the corpus, then tabulate a few selected
# words for a handful of genres.
genres = ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government']
# NOTE: 'neither' and 'often' are not modal verbs; the name `modals` is kept as-is.
modals = ['can', 'could', 'neither', 'often', 'must', 'will']

genre_word_pairs = (
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)
cfd.tabulate(conditions=genres, samples=modals)

                   can   could neither   often    must    will 
     adventure      46     151       9      10      27      50 
belles_lettres     246     213      22      75     170     236 
     editorial     121      56       7      17      53     233 
       fiction      37     166       9      12      55      52 
    government     117      38       8       9     102     244 


In [None]:
# Build the dataset: one space-joined string per file in the Brown corpus.
data = [' '.join(brown.words(fid)) for fid in brown.fileids()]

print(type(data))
number_of_doc = len(data)
print(number_of_doc)

# Show three sample documents, with a blank-line gap between them.
for position, doc_index in enumerate((10, 100, 300)):
    if position:
        print('\n\n')
    print(data[doc_index])

<class 'list'>
500
Miami , Fla. , March 17 -- The Orioles tonight retained the distinction of being the only winless team among the eighteen Major-League clubs as they dropped their sixth straight spring exhibition decision , this one to the Kansas City Athletics by a score of 5 to 3 . Indications as late as the top of the sixth were that the Birds were to end their victory draught as they coasted along with a 3-to-o advantage . Siebern hits homer Over the first five frames , Jack Fisher , the big righthander who figures to be in the middle of Oriole plans for a drive on the 1961 American League pennant , held the A's scoreless while yielding three scattered hits . Then Dick Hyde , submarine-ball hurler , entered the contest and only five batters needed to face him before there existed a 3-to-3 deadlock . A two-run homer by Norm Siebern and a solo blast by Bill Tuttle tied the game , and single runs in the eighth and ninth gave the Athletics their fifth victory in eight starts . House 

In [None]:
# Pre-process the text data:
#   stemming, stopword removal, punctuation removal, lemmatization
# Vectorization:
#   suitable
#   not suitable

In [None]:
# import packages from scikit-learn
from sklearn.decomposition import LatentDirichletAllocation, NMF, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Turn the documents into a bag-of-words count matrix.
# The token pattern keeps tokens of three or more letters/hyphens. It is written
# as a raw string because '\-' in a normal string literal is an invalid escape
# sequence (it triggers a warning); the regex value itself is unchanged.
vectorizer = CountVectorizer(min_df=5, max_df=0.9,
                             stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)

  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [None]:
# Fit three topic models on the same document-term matrix.
NO_TOPICS = 10     # number of components (= topics) extracted by every model
RANDOM_STATE = 42  # fixed seed so that re-running the cell reproduces the same topics

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NO_TOPICS, max_iter=10,
                                      learning_method='online',
                                      random_state=RANDOM_STATE)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NO_TOPICS, random_state=RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NO_TOPICS, random_state=RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)

(500, 10)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  kwargs['lwork'] = ret[-2][0].real.astype(numpy.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  kwargs['lwork'] = ret[-2][0].real.astype(numpy.int)


(500, 10)
(500, 10)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  kwargs['lwork'] = ret[-2][0].real.astype(numpy.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  kwargs['lwork'] = ret[-2][0].real.astype(numpy.int)


In [None]:
# inspect the inferred topics
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``components_``; each row is iterated as
        one topic and must support ``argsort()`` and integer indexing.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    top_n : int, default 10
        Number of terms shown per topic, in descending weight order.

    Returns
    -------
    None
        For every topic, a ``Topic <idx>:`` header and a list of
        ``(term, weight)`` tuples are printed.
    """
    # Prefer the current accessor; fall back to the legacy one, which newer
    # scikit-learn versions no longer provide.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names

    # Fetch the vocabulary once instead of once per printed term.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])


In [None]:
# Top-weighted terms of every LDA topic, closed off by a separator rule.
print("LDA Model:")
print_topics(model=lda_model, vectorizer=vectorizer)
print("=" * 20)
 



LDA Model:
Topic 0:
[('boats', 24.84601433954905), ('boat', 22.044748752994177), ('boating', 14.767481539539805), ('safety', 4.247583205700865), ('craft', 4.147794653338657), ('afloat', 2.8775975307188713), ('waterfront', 2.478522290476578), ('recreational', 2.3725652990825306), ('sport', 2.212368482476962), ('marine', 1.7002206168735399)]
Topic 1:
[('used', 311.92011666558153), ('use', 201.5876264303709), ('number', 192.17268739999201), ('time', 191.7140595624263), ('water', 168.6210757697908), ('surface', 164.4655414034601), ('small', 134.22465414351367), ('temperature', 131.62987741813362), ('possible', 128.7417106780615), ('large', 128.1516766451174)]
Topic 2:
[('bridge', 42.4874980631903), ('junior', 27.654475570730945), ('class', 25.134993023460463), ('dog', 14.565098032062822), ('westminster', 14.563829576760595), ('bridges', 10.998409549117625), ('palmer', 7.816387207282058), ('judge', 7.746163795318613), ('classes', 7.581041164972548), ('ring', 6.881262398828163)]
Topic 3:
[('

In [None]:
# Top-weighted terms of every NMF topic, closed off by a separator rule.
print("NMF Model:")
print_topics(model=nmf_model, vectorizer=vectorizer)
print("=" * 20)


NMF Model:
Topic 0:
[('like', 5.599400522662841), ('man', 4.23193797643742), ('time', 3.8464191286234994), ('just', 3.4028872115784417), ('did', 3.398575620462638), ('little', 2.9262181259256623), ('said', 2.8230059646653496), ('know', 2.5782662349921552), ('way', 2.4577290616651064), ('don', 2.4219266472319436)]
Topic 1:
[('new', 5.339613937054055), ('world', 3.933227260921321), ('life', 2.854351419479522), ('man', 2.8322348660735623), ('people', 2.550991635316139), ('god', 2.1647755048215207), ('american', 2.104669138678238), ('church', 2.102281623482738), ('great', 2.018705404489502), ('social', 2.000769141344503)]
Topic 2:
[('said', 12.930596044182662), ('city', 1.3877369932145907), ('house', 1.380754272685577), ('new', 1.2733612739823303), ('home', 1.248626398835889), ('told', 1.2448675189443617), ('asked', 1.152483434244877), ('president', 1.012086303139959), ('police', 0.9605094792767774), ('mike', 0.8420131623380697)]
Topic 3:
[('mrs', 11.84323393202126), ('miss', 1.70569164319



[('used', 3.5084429682319573), ('use', 2.043475694472924), ('surface', 1.9304426991552368), ('number', 1.8525964881388641), ('feed', 1.7915655350904172), ('time', 1.7652701303494736), ('temperature', 1.6625256544702682), ('water', 1.617137818135781), ('data', 1.5002037662910226), ('clay', 1.3353393447553155)]
Topic 5:
[('school', 5.688258232470334), ('education', 2.7891937714073847), ('schools', 2.6009611573699862), ('children', 2.2062388767224066), ('college', 2.1681947581953342), ('vocational', 1.8464722971299221), ('training', 1.723645990724342), ('students', 1.7100733072393286), ('high', 1.5771410714035716), ('program', 1.4137433783574405)]
Topic 6:
[('state', 10.749973070179163), ('federal', 3.3662912515877923), ('states', 2.3143180283947125), ('law', 2.0773887251000573), ('policy', 1.5690141550830417), ('vehicles', 1.4470072117540935), ('rhode', 1.401095357677012), ('court', 1.3737449484652666), ('island', 1.3687379350503237), ('congress', 1.2278769249116879)]
Topic 7:
[('united'

In [None]:
# Top-weighted terms of every LSI topic, closed off by a separator rule.
print("LSI Model:")
print_topics(model=lsi_model, vectorizer=vectorizer)
print("=" * 20)

LSI Model:
Topic 0:
[('said', 0.27373900298794424), ('new', 0.19420883959862512), ('time', 0.18634772924368703), ('man', 0.16422113595864946), ('like', 0.1615125389182214), ('did', 0.12741210121192603), ('years', 0.11440149055470453), ('just', 0.10896553841190947), ('way', 0.10626155012190239), ('state', 0.10546670502008623)]
Topic 1:
[('state', 0.22242164807116616), ('states', 0.17248362354291447), ('new', 0.15869849795495292), ('united', 0.12268692777026485), ('government', 0.10980166242617213), ('year', 0.10784151847288366), ('program', 0.09501333571726432), ('development', 0.08703801616173684), ('federal', 0.08450585683097771), ('use', 0.0798765536142257)]
Topic 2:
[('said', 0.3670665912587617), ('state', 0.35949747244610514), ('mrs', 0.17906600181274687), ('states', 0.17562951141499444), ('year', 0.16282139479714242), ('federal', 0.13555833016241256), ('united', 0.11268321992872816), ('government', 0.1103818058388228), ('tax', 0.11002487254891517), ('fiscal', 0.09640566592840484)]



[('mrs', 0.4744053294522762), ('new', 0.2545683202567768), ('church', 0.15145073252097713), ('world', 0.11605518649077162), ('members', 0.11598354797244104), ('john', 0.1049094121181958), ('god', 0.10006749367274284), ('social', 0.09121388864736826), ('president', 0.08507287705538222), ('life', 0.0839326061458425)]
Topic 4:
[('mrs', 0.4996686970088542), ('used', 0.11207978817982224), ('school', 0.10219736550524837), ('year', 0.09288707161538606), ('information', 0.08557844198664077), ('miss', 0.07842086557930537), ('home', 0.07825929735150756), ('small', 0.07441866196105709), ('high', 0.07340250295534122), ('cost', 0.07174793667905552)]
Topic 5:
[('school', 0.39162232701833644), ('said', 0.20986186176625396), ('education', 0.20165354325090865), ('schools', 0.18261190827465543), ('vocational', 0.1414781264966313), ('children', 0.14106689564157177), ('college', 0.14078654027261728), ('training', 0.12499560046642408), ('students', 0.12251244764663763), ('social', 0.10999628319864846)]
Top

In [None]:
!pip install pyLDAvis



In [None]:
# Interactive visualisation of the fitted LDA model.
import pyLDAvis
try:
    # pyLDAvis >= 3.4 renamed its scikit-learn helper module to `lda_model`.
    import pyLDAvis.lda_model as pyldavis_sklearn
except ImportError:
    # Older pyLDAvis releases still ship the helper as `pyLDAvis.sklearn`.
    import pyLDAvis.sklearn as pyldavis_sklearn

pyLDAvis.enable_notebook()
panel = pyldavis_sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# project-2 starter
# Import the submodule explicitly: a bare `import urllib` does not guarantee
# that `urllib.request` has been loaded.
import urllib.request

url = "https://www.gutenberg.org/cache/epub/67425/pg67425.txt"

# The context manager closes the HTTP response even if decoding fails part-way.
with urllib.request.urlopen(url) as file:
    for line in file:
        decoded_line = line.decode("utf-8")
        # Each line already carries its own line ending; suppress print's extra
        # newline so the text is not double-spaced.
        print(decoded_line, end="")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
is fitting that we should receive her joyfully." Then they went in, and

dismounted. And Geraint came to where Arthur was, and saluted him.



"Heaven protect thee," said Arthur, "and the welcome of Heaven be unto

thee! And since Edeyrn the son of Nudd has received his overthrow and

wounds from thy hands, thou hast had a prosperous career."



"Not upon me be the blame!" said Geraint. "It was through the arrogance

of Edeyrn the son of Nudd himself that we were not friends. I would not

quit him until I knew who he was, and until the one had vanquished the

other."



"Now," said Arthur, "where is the maiden for whom I heard thou didst

give challenge?"



"She is gone with Gwenhwyvar to her chamber."



Then went Arthur to see the maiden. And Arthur, and all his companions,

and his whole court, were glad concerning the maiden. And certain were

they all, that, had her array been suitable to her beauty, they had

never