## 

## Loading your own Corpus

In [10]:
import nltk

In [11]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/dowdj/OneDrive/Documents/GitHub/NLP-Training/data'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()




['t1.txt', 't2.txt', 't3.txt']

In [12]:
wordlists.words('t1.txt')

['The', 'United', 'Nations', 'General', 'Assembly', ...]

In [13]:
len(wordlists.words('t1.txt'))

609

In [14]:
wordlists.sents('t1.txt')

[['The', 'United', 'Nations', 'General', 'Assembly', '(', 'UNGA', 'or', 'GA', ';', 'French', ':', 'Assemblée', 'générale', ',', 'AG', ')', 'is', 'one', 'of', 'the', 'six', 'principal', 'organs', 'of', 'the', 'United', 'Nations', '(', 'UN', '),', 'serving', 'as', 'the', 'main', 'deliberative', ',', 'policymaking', ',', 'and', 'representative', 'organ', 'of', 'the', 'UN', '.'], ['Its', 'powers', ',', 'composition', ',', 'functions', ',', 'and', 'procedures', 'are', 'set', 'out', 'in', 'Chapter', 'IV', 'of', 'the', 'United', 'Nations', 'Charter', '.'], ...]

## Conditional Frequency Distributions

In [15]:
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))

In [16]:
genre_word = [(genre, word)
for genre in ['news', 'romance']
for word in brown.words(categories=genre)]
len(genre_word)

170576

In [17]:
genre_word[:4]

[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]

In [18]:
genre_word[-4:]

[('romance', 'afraid'),
 ('romance', 'not'),
 ('romance', "''"),
 ('romance', '.')]

In [19]:
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()

['news', 'romance']

In [20]:
print(cfd['news'])
print(cfd['romance'])

<FreqDist with 14394 samples and 100554 outcomes>
<FreqDist with 8452 samples and 70022 outcomes>


In [21]:
cfd['news'].most_common(20)

[('the', 5580),
 (',', 5188),
 ('.', 4030),
 ('of', 2849),
 ('and', 2146),
 ('to', 2116),
 ('a', 1993),
 ('in', 1893),
 ('for', 943),
 ('The', 806),
 ('that', 802),
 ('``', 732),
 ('is', 732),
 ('was', 717),
 ("''", 702),
 ('on', 657),
 ('at', 598),
 ('with', 545),
 ('be', 526),
 ('by', 497)]

In [22]:
cfd['romance']['war']

11

In [23]:
cfd['news']['war']

20

In [24]:
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target))

In [25]:
cfd['america']

FreqDist({'2017': 35, '1993': 33, '1997': 31, '2005': 30, '1921': 24, '1973': 23, '1985': 21, '2001': 20, '2013': 19, '1981': 16, ...})

In [26]:
cfd.tabulate(conditions=['america', 'citizen'],
samples=range(10), cumulative=True)
  

        0 1 2 3 4 5 6 7 8 9 
america 0 0 0 0 0 0 0 0 0 0 
citizen 0 0 0 0 0 0 0 0 0 0 


## Generating Random Text with Bigrams

In [27]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/dowdj/OneDrive/Documents/GitHub/NLP-Training/data'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()


['t1.txt', 't2.txt', 't3.txt']

In [28]:
def generate_model(cfdist, word, num=30):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

In [29]:
word=wordlists.words()
word

['The', 'United', 'Nations', 'General', 'Assembly', ...]

In [30]:
bigrams=list(nltk.bigrams(word))

In [31]:


cfd = nltk.ConditionalFreqDist(bigrams)

In [32]:
generate_model(cfd, 'graph')

graph 

ValueError: A FreqDist must have at least one sample before max is defined.

In [33]:
cfd.conditions()

['The',
 'United',
 'Nations',
 'General',
 'Assembly',
 '(',
 'UNGA',
 'or',
 'GA',
 ';',
 'French',
 ':',
 'Assemblée',
 'générale',
 ',',
 'AG',
 ')',
 'is',
 'one',
 'of',
 'the',
 'six',
 'principal',
 'organs',
 'UN',
 '),',
 'serving',
 'as',
 'main',
 'deliberative',
 'policymaking',
 'and',
 'representative',
 'organ',
 '.',
 'Its',
 'powers',
 'composition',
 'functions',
 'procedures',
 'are',
 'set',
 'out',
 'in',
 'Chapter',
 'IV',
 'Charter',
 'responsible',
 'for',
 'budget',
 'appointing',
 'non',
 '-',
 'permanent',
 'members',
 'to',
 'Security',
 'Council',
 'Secretary',
 'receiving',
 'reports',
 'from',
 'other',
 'parts',
 'system',
 'making',
 'recommendations',
 'through',
 'resolutions',
 '.[',
 '1',
 ']',
 'It',
 'also',
 'establishes',
 'numerous',
 'subsidiary',
 'advance',
 'assist',
 'its',
 'broad',
 'mandate',
 '2',
 'only',
 'wherein',
 'all',
 'member',
 'states',
 'have',
 'equal',
 'representation',
 'meets',
 'under',
 'president',
 'annual',
 'ses