An introduction to conditional frequency distributions (CFD).
A CFD is a collection of frequency distributions, one for each "condition".


In [2]:
import nltk

In [3]:
# Handle to the Brown corpus reader; later cells query it by category (genre).
brown_corpus = nltk.corpus.brown

In [4]:
# One frequency distribution per Brown category: every (category, word) pair
# is one observation of `word` under the condition `category`.
cfd = nltk.ConditionalFreqDist(
    (genre, token)
    for genre in brown_corpus.categories()
    for token in brown_corpus.words(categories=genre)
)

In [6]:
# Compare how often each modal verb occurs within a handful of genres.
genres = [
    'news',
    'religion',
    'hobbies',
    'science_fiction',
    'romance',
    'humor',
]
modals = [
    'can',
    'could',
    'may',
    'might',
    'must',
    'will',
]
# Rows are the selected conditions (genres); columns are the selected samples.
cfd.tabulate(samples=modals, conditions=genres)

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 


In [55]:
# Peek at the word sequence of the 'news' category. Printing it shows only the
# first few tokens followed by an ellipsis.
news_raw = brown_corpus.words(categories='news')
print(news_raw)

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]


In [58]:
# Build the (condition, event) pairs for a single genre. The outer loop over a
# one-element list mirrors the multi-genre pattern used to build a full CFD.
genre_word = [
    (genre, word)
    for genre in ['news']
    for word in brown_corpus.words(categories=genre)
]

# Each element is a (genre, word) pair, i.e. (condition, event).
# A list slice already prints as a list, so it needs no identity comprehension
# or f-string wrapper; the printed text is unchanged.
print(genre_word[:10])
print(f'Total number of pair is: {len(genre_word)}')

[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand'), ('news', 'Jury'), ('news', 'said'), ('news', 'Friday'), ('news', 'an'), ('news', 'investigation'), ('news', 'of')]
Total number of pair is: 100554


In [38]:
# A CFD records how many times each sample occurred under each condition.
# Every pair in `genre_word` carries the condition 'news', so the result holds
# a single frequency distribution keyed by 'news'.
news_cfd = nltk.ConditionalFreqDist(genre_word)

In [62]:
# Summary line for the CFD, then the list of its conditions.
print(news_cfd)
print(news_cfd.conditions())
# Indexing by a condition yields that condition's FreqDist; as the last
# expression of the cell it is rendered as the cell's output.
news_cfd['news']

<ConditionalFreqDist with 1 conditions>
['news']


FreqDist({'the': 5580, ',': 5188, '.': 4030, 'of': 2849, 'and': 2146, 'to': 2116, 'a': 1993, 'in': 1893, 'for': 943, 'The': 806, ...})

In [49]:
# Count how often each weekday name appears under the 'news' condition.
days = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday',
    'Friday', 'Saturday', 'Sunday',
]
news_cfd.tabulate(samples=days)

        Monday   Tuesday Wednesday  Thursday    Friday  Saturday    Sunday 
news        54        43        22        20        41        33        51 


In [31]:
from nltk.corpus import udhr

# Condition = language, sample = word length. Each language's words are read
# from the file id '<language>-Latin1'. Note that this rebinds `cfd`.
languages = [
    'Chickasaw',
    'English',
    'German_Deutsch',
    'Greenlandic_Inuktikut',
    'Hungarian_Magyar',
    'Ibibio_Efik',
]
cfd = nltk.ConditionalFreqDist(
    (language, len(token))
    for language in languages
    for token in udhr.words(f'{language}-Latin1')
)

In [33]:
# Cumulative counts for the samples 0 through 9, one row per condition.
cfd.tabulate(samples=range(10), cumulative=True)

                         0    1    2    3    4    5    6    7    8    9 
            Chickasaw    0  411  510  551  619  710  799  876  946  995 
              English    0  185  525  883  997 1166 1283 1440 1558 1638 
       German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 
Greenlandic_Inuktikut    0  139  150  151  154  175  182  241  259  283 
     Hungarian_Magyar    0  302  431  503  655  767  881  972 1081 1171 
          Ibibio_Efik    0  228  440  915 1418 1705 1867 1974 2049 2074 


In [86]:
# Build the bigrams (pairs of adjacent tokens) of the text.
text = nltk.corpus.genesis.words('english-kjv.txt')
# nltk.bigrams() yields its pairs lazily and can be consumed only once.
# Materialise it a single time so the preview and every later use read the
# same pairs, and so re-running a downstream cell never sees an exhausted
# iterator.
bigrams = list(nltk.bigrams(text))
print(bigrams[:10])


[('In', 'the'), ('the', 'beginning'), ('beginning', 'God'), ('God', 'created'), ('created', 'the'), ('the', 'heaven'), ('heaven', 'and'), ('and', 'the'), ('the', 'earth'), ('earth', '.')]


In [92]:
# Treat each word as a condition: for every word we effectively build a
# frequency distribution over the words that follow it.
cfd = nltk.ConditionalFreqDist(bigrams)  # rebinds `cfd` to the bigram model

In [80]:
# Natural Language Toolkit: code_random_text
def generate_model(cfdist, init_word, n_generate=15):
    """Print a greedy chain of words generated from a bigram CFD.

    Starting at ``init_word``, each step prints the current word followed by
    a space and then moves on to the most frequent successor recorded for it.
    Generation stops early if the current word has no recorded successors.

    Args:
        cfdist: Mapping from a word to a frequency distribution of the words
            that follow it; each distribution must provide ``max()``.
        init_word: Word to start the chain from.
        n_generate: Maximum number of words to print.

    Returns:
        None. The words are written to standard output on a single line.
    """
    word = init_word
    for _ in range(n_generate):
        print(word, end=' ')
        # Stop at a dead end instead of calling max() on an empty
        # distribution. The membership test comes first so that a
        # defaultdict-style mapping is not grown by the lookup.
        if word not in cfdist or not cfdist[word]:
            break
        word = cfdist[word].max()  # most frequent successor becomes the next word

In [91]:
# Greedy generation starting from 'living'. Because the most frequent successor
# is always chosen, the text soon cycles ("the land of the land of ...").
generate_model(cfd, 'living')

living creature that he said , and the land of the land of the land 