## 

## Loading your own Corpus

In [2]:
import nltk

In [3]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/dowdj/OneDrive/Documents/GitHub/NLP-Training/data'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()




['t1.txt']

In [16]:
wordlists.words('t1.txt')

['identify', 'all', 'the', 'categories', ',', 'types', ...]

In [4]:
len(wordlists.words('t1.txt'))

272

## Conditional Frequency Distributions

In [5]:
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
(genre, word)
for genre in brown.categories()
for word in brown.words(categories=genre))

In [7]:
genre_word = [(genre, word)
for genre in ['news', 'romance']
for word in brown.words(categories=genre)]
len(genre_word)

170576

In [8]:
genre_word[:4]

[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]

In [9]:
genre_word[-4:]

[('romance', 'afraid'),
 ('romance', 'not'),
 ('romance', "''"),
 ('romance', '.')]

In [10]:
cfd = nltk.ConditionalFreqDist(genre_word)
cfd.conditions()

['news', 'romance']

In [11]:
print(cfd['news'])
print(cfd['romance'])

<FreqDist with 14394 samples and 100554 outcomes>
<FreqDist with 8452 samples and 70022 outcomes>


In [12]:
cfd['news'].most_common(20)

[('the', 5580),
 (',', 5188),
 ('.', 4030),
 ('of', 2849),
 ('and', 2146),
 ('to', 2116),
 ('a', 1993),
 ('in', 1893),
 ('for', 943),
 ('The', 806),
 ('that', 802),
 ('``', 732),
 ('is', 732),
 ('was', 717),
 ("''", 702),
 ('on', 657),
 ('at', 598),
 ('with', 545),
 ('be', 526),
 ('by', 497)]

In [15]:
cfd['romance']['war']

11

In [16]:
cfd['news']['war']

20

In [18]:
from nltk.corpus import inaugural
cfd = nltk.ConditionalFreqDist(
(target, fileid[:4])
for fileid in inaugural.fileids()
for w in inaugural.words(fileid)
for target in ['america', 'citizen']
if w.lower().startswith(target))

In [20]:
cfd['america']

FreqDist({'2017': 35, '1993': 33, '1997': 31, '2005': 30, '1921': 24, '1973': 23, '1985': 21, '2001': 20, '2013': 19, '1981': 16, ...})

In [22]:
cfd.tabulate(conditions=['america', 'citizen'],
samples=range(10), cumulative=True)
  

        0 1 2 3 4 5 6 7 8 9 
america 0 0 0 0 0 0 0 0 0 0 
citizen 0 0 0 0 0 0 0 0 0 0 


## Generating Random Text with Bigrams

In [70]:
from nltk.corpus import PlaintextCorpusReader
corpus_root = 'C:/Users/dowdj/OneDrive/Documents/GitHub/NLP-Training/data'
wordlists = PlaintextCorpusReader(corpus_root, '.*')
wordlists.fileids()


['t1.txt']

In [77]:
def generate_model(cfdist, word, num=30):
    for i in range(num):
        print(word, end=' ')
        word = cfdist[word].max()

In [78]:
word=wordlists.words()
word

['identify', 'all', 'the', 'categories', ',', 'types', ...]

In [79]:
bigrams=list(nltk.bigrams(word))

In [81]:


cfd = nltk.ConditionalFreqDist(bigrams)

In [82]:
generate_model(cfd, 'graph')

graph this is available to each other and in a semantic knowledge graph this is available to each other and in a semantic knowledge graph this is available to each 

In [67]:
cfd.conditions()

['identify',
 'all',
 'the',
 'categories',
 ',',
 'types',
 'things',
 'and',
 'objects',
 'that',
 'are',
 'important',
 'for',
 'field',
 'we',
 'then',
 'understand',
 'more',
 'how',
 'they',
 'relate',
 'to',
 'each',
 'other',
 'what',
 'information',
 'is',
 'available',
 'describe',
 'them',
 'even',
 'accurately',
 '.',
 'We',
 'call',
 'this',
 '‘',
 'conceptual',
 'model',
 ',’',
 'in',
 'a',
 'semantic',
 'knowledge',
 'graph',
 'represented',
 'by',
 'schema',
 'or',
 'ontology',
 'Since',
 'express',
 'not',
 'only',
 'schematically',
 'but',
 'also',
 'above',
 'through',
 'human',
 'language',
 'very',
 'individually',
 'different',
 'languages',
 'must',
 'provide',
 'linguistic',
 '’',
 'our',
 'The',
 'serves',
 'label',
 'further',
 'contextualize',
 'individual',
 'elements',
 'of',
 'their',
 'instances',
 'In',
 'made',
 'possible',
 'controlled',
 'vocabularies',
 'such',
 'as',
 'taxonomies',
 'derived',
 'from',
 'analysis',
 'existing',
 'domain',
 'its',
 '