In [1]:
from nltk.corpus import brown
# Report how many categories the Brown corpus is divided into
print("Total Categories:",len(brown.categories()))

Total Categories: 15


In [2]:
# Show the category names themselves
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [3]:
# Tokenized sentences: each sentence of the 'mystery' category is returned
# as a list of word and punctuation tokens
brown.sents(categories='mystery')

[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [4]:
# POS (part-of-speech) tagged sentences: the same 'mystery' sentences, but
# every token is a (word, tag) pair
brown.tagged_sents(categories='mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [5]:
# Rebuild readable sentences by joining each token list with single spaces
# (punctuation therefore stays space-separated, e.g. "Hanover , most")
mystery_token_lists = brown.sents(categories='mystery')
sentences = list(map(' '.join, mystery_token_lists))

In [6]:
# Preview the first five joined sentences
print(sentences[0:5])

['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .', 'An interne , a nurse and two attendants were in charge of us .', "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .", 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .', 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']


In [8]:
# Every (word, tag) pair in the 'mystery' category
tagged_words = brown.tagged_words(categories='mystery')
# Keep the pairs whose tag contains 'NP' or 'NN'; because this is a substring
# test, variants such as 'NNS' are kept as well
nouns = [
    (token, pos)
    for token, pos in tagged_words
    if 'NP' in pos or 'NN' in pos
]
print(nouns[0:10])

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN')]


In [9]:
from nltk.corpus import reuters
# List the categories available in the Reuters corpus
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [11]:
# Fileid-based access: list the ids of the documents filed under the
# 'housing' and 'income' categories
print(reuters.fileids(categories=['housing','income']))

['test/16118', 'test/18534', 'test/18540', 'test/18664', 'test/18665', 'test/18672', 'test/18911', 'test/19875', 'test/20106', 'test/20116', 'training/1035', 'training/1036', 'training/10602', 'training/10604', 'training/11170', 'training/11665', 'training/2618', 'training/29', 'training/3105', 'training/3708', 'training/3720', 'training/3723', 'training/3898', 'training/5883', 'training/5886', 'training/6000', 'training/6067', 'training/6197', 'training/7005', 'training/7006', 'training/7015', 'training/7036', 'training/7098', 'training/7099', 'training/9615']


In [13]:
from nltk.corpus import wordnet as wn
word = 'hike' # Taking hike as our word of interest
# Look up every WordNet synset for this word; the result mixes noun (.n.)
# and verb (.v.) senses
word_synsets = wn.synsets(word)
print(word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [15]:
# Print name, part of speech, definition and usage examples for every synset,
# with a blank line between entries
for synset in word_synsets:
    details = [
        ('Synset Name:', synset.name()),
        ('POS tag:', synset.pos()),
        ('Definition:', synset.definition()),
        ('Example:', synset.examples()),
    ]
    for label, value in details:
        print(label, value)
    print()

Synset Name: hike.n.01
POS tag: n
Definition: a long walk usually for exercise or pleasure
Example: ['she enjoys a hike in her spare time']

Synset Name: rise.n.09
POS tag: n
Definition: an increase in cost
Example: ['they asked for a 10% rise in rates']

Synset Name: raise.n.01
POS tag: n
Definition: the amount a salary is increased
Example: ['he got a 3% raise', 'he got a wage hike']

Synset Name: hike.v.01
POS tag: v
Definition: increase
Example: ['The landlord hiked up the rents']

Synset Name: hike.v.02
POS tag: v
Definition: walk a long way, as for pleasure or physical exercise
Example: ['We were hiking in Colorado', 'hike the Rockies']



In [16]:
# NOTE(review): star import. Per the banner it prints, this loads text1..text9
# and sent1..sent9. Later cells also use `gutenberg` and `FreqDist` without
# importing them, so those names presumably come from here too — confirm, and
# consider replacing this with explicit imports.
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [21]:
# Count token frequencies in 'austen-emma.txt'.
# Here keys are words, and values give their frequency.
# Counting is case-sensitive and punctuation tokens are counted too.
emmawords = gutenberg.words('austen-emma.txt')
fdist = FreqDist(emmawords)
fdist.most_common(50)

[(',', 11454),
 ('.', 6928),
 ('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('I', 3178),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381),
 (';', 2199),
 ('it', 2128),
 ('in', 2118),
 ('not', 2101),
 ('"', 2004),
 ('be', 1970),
 ('she', 1778),
 ('that', 1730),
 ('you', 1677),
 ('had', 1606),
 ('as', 1387),
 ('--', 1382),
 ('he', 1365),
 ('for', 1321),
 ('have', 1301),
 ('is', 1220),
 ('with', 1187),
 ('Mr', 1153),
 ('very', 1151),
 ('but', 1148),
 ('."', 1138),
 ('his', 1088),
 ("'", 1007),
 ('at', 997),
 ('s', 933),
 ('so', 924),
 ('Emma', 865),
 ('all', 835),
 ('could', 825),
 ('would', 815),
 ('been', 759),
 ('him', 758),
 ('Mrs', 699),
 ('.--', 685),
 ('on', 677),
 ('any', 651),
 ('my', 619),
 ('no', 616),
 ('Miss', 592),
 ('were', 591)]

In [22]:
# Lookup is case-sensitive: the lower-case form never occurs as a token
# (the name is counted under 'Emma'), so this yields 0
fdist['emma']

0

In [23]:
# Count for the exact token 'the' (a capitalised 'The' would be counted
# under its own key)
fdist['the']

4844

In [61]:
import re
def makeAlphaFreqDist(words, lowercase=False):
    """Build a frequency distribution over purely alphabetic tokens.

    A token is counted only if none of its characters falls outside
    ``a-z``; tokens containing digits, punctuation, whitespace or
    upper-case letters are skipped.

    Args:
        words: Iterable of string tokens.
        lowercase: If True, fold every token to lower case before
            filtering, so capitalised words ('Emma', 'I') are counted
            under their lower-case form instead of being dropped.
            Defaults to False, which keeps the original case-sensitive
            behaviour.

    Returns:
        A FreqDist mapping each kept token to its number of occurrences.
    """
    # "Contains a character outside a-z" expressed directly with search(),
    # which accepts and rejects the same tokens as the former
    # ".*[^a-z].*" pattern used with match().
    non_lowercase_char = re.compile("[^a-z]")
    if lowercase:
        words = (word.lower() for word in words)
    # Feed the filtered stream straight into FreqDist: one pass, and no
    # throwaway one-element list per token.
    return FreqDist(word for word in words if not non_lowercase_char.search(word))

In [62]:
# Frequency distribution restricted to all-lower-case alphabetic tokens;
# punctuation and capitalised tokens such as 'I' or 'Emma' are filtered out
adist = makeAlphaFreqDist(emmawords)
adist.most_common(50)

[('to', 5183),
 ('the', 4844),
 ('and', 4672),
 ('of', 4279),
 ('a', 3004),
 ('was', 2385),
 ('her', 2381),
 ('it', 2128),
 ('in', 2118),
 ('not', 2101),
 ('be', 1970),
 ('she', 1778),
 ('that', 1730),
 ('you', 1677),
 ('had', 1606),
 ('as', 1387),
 ('he', 1365),
 ('for', 1321),
 ('have', 1301),
 ('is', 1220),
 ('with', 1187),
 ('very', 1151),
 ('but', 1148),
 ('his', 1088),
 ('at', 997),
 ('s', 933),
 ('so', 924),
 ('all', 835),
 ('could', 825),
 ('would', 815),
 ('been', 759),
 ('him', 758),
 ('on', 677),
 ('any', 651),
 ('my', 619),
 ('no', 616),
 ('were', 591),
 ('do', 580),
 ('must', 564),
 ('me', 564),
 ('will', 559),
 ('by', 558),
 ('which', 552),
 ('from', 535),
 ('or', 490),
 ('said', 484),
 ('much', 478),
 ('more', 464),
 ('an', 452),
 ('are', 447)]

In [32]:
# Frequency distribution over text1 (Moby Dick, per the nltk.book banner)
mbdist = FreqDist(text1)

In [36]:
# Pick one Gutenberg file by its position in the fileid list
n = 5
file1 = gutenberg.fileids()[n]
file1

'bryant-stories.txt'

In [40]:
import nltk
# Load the raw text of the chosen file and split it into word tokens and
# punctuation tokens (e.g. "There's" becomes 'There', "'", 's')
text = gutenberg.raw(file1)
tokens = nltk.wordpunct_tokenize(text)
tokens[:50]

['[',
 'Stories',
 'to',
 'Tell',
 'to',
 'Children',
 'by',
 'Sara',
 'Cone',
 'Bryant',
 '1918',
 ']',
 'TWO',
 'LITTLE',
 'RIDDLES',
 'IN',
 'RHYME',
 'There',
 "'",
 's',
 'a',
 'garden',
 'that',
 'I',
 'ken',
 ',',
 'Full',
 'of',
 'little',
 'gentlemen',
 ';',
 'Little',
 'caps',
 'of',
 'blue',
 'they',
 'wear',
 ',',
 'And',
 'green',
 'ribbons',
 ',',
 'very',
 'fair',
 '.',
 '(',
 'Flax',
 '.)',
 'From',
 'house']

In [41]:
# Case-fold every token so that e.g. 'Little' and 'little' become the same word
words = [w.lower() for w in tokens]
words[:50]

['[',
 'stories',
 'to',
 'tell',
 'to',
 'children',
 'by',
 'sara',
 'cone',
 'bryant',
 '1918',
 ']',
 'two',
 'little',
 'riddles',
 'in',
 'rhyme',
 'there',
 "'",
 's',
 'a',
 'garden',
 'that',
 'i',
 'ken',
 ',',
 'full',
 'of',
 'little',
 'gentlemen',
 ';',
 'little',
 'caps',
 'of',
 'blue',
 'they',
 'wear',
 ',',
 'and',
 'green',
 'ribbons',
 ',',
 'very',
 'fair',
 '.',
 '(',
 'flax',
 '.)',
 'from',
 'house']

In [48]:
# Show the first ten lower-cased tokens that follow the bracketed title block
# (stop words are still included at this point). The window is ten tokens
# wide so that re-running the cell reproduces the recorded output.
x = 12  # tokens 0-11 are the "[ Stories to Tell to Children ... 1918 ]" header
words[x:x + 10]

['two', 'little', 'riddles', 'in', 'rhyme', 'there', "'", 's', 'a', 'garden']

In [60]:
import re
# Try the "begins with a lower-case letter or digit" pattern on two
# one-character probes: a digit (matches) and a parenthesis (does not)
pattern = re.compile('^[a-z0-9]')
for probe in ('9', '('):
    print(pattern.match(probe))

<re.Match object; span=(0, 1), match='9'>
None
