## 2.1 Conditions and Events

In [1]:
# A text viewed as a plain sequence of word tokens.  The trailing ... is
# shorthand for "and so on"; Python evaluates it as the Ellipsis object,
# so this cell is illustrative rather than real data.
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
# Each word tagged with a label.  The trailing ... is elision only (Python
# evaluates it as the Ellipsis object), so this cell is illustrative.
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]
# each pair has the form (condition, event): here the condition is the
# label 'news' and the event is the word itself

## 2.2 Counting Words by Genre

In [6]:
import nltk
from nltk.corpus import brown
# Tally every Brown token under the category it appears in: each
# (category, token) pair is one (condition, event) observation.
cfd = nltk.ConditionalFreqDist(
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)

In [7]:
# Materialise the (genre, word) pairs for two genres as a list so the
# pairs themselves can be counted and sliced before building a distribution.
genre_word = [
    (label, token)
    for label in ['news', 'romance']
    for token in brown.words(categories=label)
]
len(genre_word)

170576

In [8]:
genre_word[:4]

[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]

In [9]:
genre_word[-4:]

[('romance', 'afraid'),
 ('romance', 'not'),
 ('romance', "''"),
 ('romance', '.')]

In [10]:
# Rebinds cfd: from here on it holds only the 'news' and 'romance' pairs
# collected in genre_word, replacing the distribution built earlier over
# every Brown category.
cfd = nltk.ConditionalFreqDist(genre_word)
cfd

ConditionalFreqDist(nltk.probability.FreqDist,
                    {'news': FreqDist({'The': 806,
                               'Fulton': 14,
                               'County': 35,
                               'Grand': 6,
                               'Jury': 2,
                               'said': 402,
                               'Friday': 41,
                               'an': 300,
                               'investigation': 9,
                               'of': 2849,
                               "Atlanta's": 4,
                               'recent': 20,
                               'primary': 17,
                               'election': 38,
                               'produced': 6,
                               '``': 732,
                               'no': 109,
                               'evidence': 17,
                               "''": 702,
                               'that': 802,
                               'any': 90,
            

In [11]:
cfd.conditions()

['news', 'romance']

In [12]:
print(cfd['news'])

<FreqDist with 14394 samples and 100554 outcomes>


In [13]:
print(cfd['romance'])

<FreqDist with 8452 samples and 70022 outcomes>


In [14]:
cfd['romance'].most_common(20)

[(',', 3899),
 ('.', 3736),
 ('the', 2758),
 ('and', 1776),
 ('to', 1502),
 ('a', 1335),
 ('of', 1186),
 ('``', 1045),
 ("''", 1044),
 ('was', 993),
 ('I', 951),
 ('in', 875),
 ('he', 702),
 ('had', 692),
 ('?', 690),
 ('her', 651),
 ('that', 583),
 ('it', 573),
 ('his', 559),
 ('she', 496)]

In [15]:
cfd['romance']['could']

193

## 2.3 Plotting and Tabulating Distributions

In [22]:
from nltk.corpus import inaugural
# Condition = target stem, event = first four characters of the file id
# (the year, judging by the tabulated column labels).  A token is counted
# for a stem when its lower-cased form begins with that stem.
icfd = nltk.ConditionalFreqDist(
    (stem, address[:4])
    for address in inaugural.fileids()
    for token in inaugural.words(address)
    for stem in ['america', 'citizen']
    if token.lower().startswith(stem)
)

In [23]:
icfd.conditions()

['citizen', 'america']

In [40]:
# The first four characters of each inaugural file id serve as its year label.
years = [fileid[:4] for fileid in inaugural.fileids()]
# Keep only the last ten labels so a table restricted to them stays narrow.
last10 = years[-10:]
print(last10)

['1973', '1977', '1981', '1985', '1989', '1993', '1997', '2001', '2005', '2009']


In [41]:
# Show only the columns listed in last10; calling icfd.tabulate() with no
# arguments would instead print a column for every sample in the distribution.
icfd.tabulate(samples=last10)

        1973 1977 1981 1985 1989 1993 1997 2001 2005 2009 
america   23    5   16   21   11   33   31   20   30   15 
citizen    1    0    3    6    3    2   10   11    7    2 


In [37]:
from nltk.corpus import udhr
# Word-length distribution per language: condition = language name,
# event = number of characters in each word read from the udhr corpus
# file named '<language>-Latin1'.
languages = [
    'Chickasaw',
    'English',
    'German_Deutsch',
    'Greenlandic_Inuktikut',
    'Hungarian_Magyar',
    'Ibibio_Efik',
]
ucfd = nltk.ConditionalFreqDist(
    (name, len(token))
    for name in languages
    for token in udhr.words(name + '-Latin1')
)

In [19]:
# Compare two of the languages over word lengths 0-9.  With cumulative=True
# each cell is a running total: the number of words of that length or shorter.
ucfd.tabulate(conditions=['English', 'German_Deutsch'],
             samples=range(10), cumulative=True)

                  0    1    2    3    4    5    6    7    8    9 
       English    0  185  525  883  997 1166 1283 1440 1558 1638 
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 


In [20]:
# List the conditions of the UDHR word-length distribution.  This must be
# ucfd: cfd holds the Brown 'news'/'romance' distribution and would not
# return the language names on a fresh top-to-bottom run.
ucfd.conditions()

['Chickasaw',
 'English',
 'German_Deutsch',
 'Greenlandic_Inuktikut',
 'Hungarian_Magyar',
 'Ibibio_Efik']