# [2. Accessing Text Corpora and Lexical Resources](https://www.nltk.org/book/ch02.html)

Run the cell below before running any other code.

In [None]:
import nltk

## 1 - Accessing Text Corpora

### 1.1 - Guterberg Corpus

In [None]:
nltk.corpus.gutenberg.fileids()

In [None]:
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
type(emma)

In [None]:
len(emma)

* notice that emma is a `nltk.corpus.reader.util.StreamBackedCorpusView` object
* in order to use the `.concordance` method on the `emma` text, we need to convert `emma` into a `nltk.text.Text` object, as shown below

In [None]:
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
type(emma)

In [None]:
emma.concordance("surprize")

In [None]:
from nltk.corpus import gutenberg

In [None]:
gutenberg.fileids()

In [None]:
emma = gutenberg.words('austen-emma.txt')

In [None]:
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)

#### Macbeth Sentences

In [None]:
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')

In [None]:
macbeth_sentences

In [None]:
macbeth_sentences[1116]

In [None]:
longest_len = max(len(s) for s in macbeth_sentences)

In [None]:
[s for s in macbeth_sentences if len(s) == longest_len]

### 1.2 - Web and Chat Text

In [None]:
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

In [None]:
from nltk.corpus import nps_chat

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

### 1.3 Brown Corpus

In [None]:
from nltk.corpus import brown

In [None]:
brown.categories()

In [None]:
brown.words(categories='news')

In [None]:
brown.words(fileids=['cg22'])

In [None]:
brown.sents(categories=['news', 'editorial', 'reviews'])

#### Stylistics

In [None]:
from nltk.corpus import brown

In [None]:
news_text = brown.words(categories='news')

In [None]:
fdist = nltk.FreqDist(w.lower() for w in news_text)

In [None]:
modals = ['can', 'could', 'may', 'might', 'must', 'will']

In [None]:
for m in modals:
    print(m + ':', fdist[m], end=' ')

**Your Turn:** Choose a different section of the Brown Corpus, and adapt the previous example to count a selection of wh words, such as what, when, where, who, and why.

#### CFD Sneak Peek

In [48]:
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)

                  can could   may might  must  will 
           news    93    86    66    38    50   389 
       religion    82    59    78    12    54    71 
        hobbies   268    58   131    22    83   264 
science_fiction    16    49     4    12     8    16 
        romance    74   193    11    51    45    43 
          humor    16    30     8     8     9    13 


### 1.4 Reuters Corpus

## Your Turn Solutions

### 1.3

**Your Turn:** Choose a different section of the Brown Corpus, and adapt the previous example to count a selection of wh words, such as what, when, where, who, and why.

In [47]:
from nltk.corpus import brown

humor_text = brown.words(categories='humor')
humor_fdist = nltk.FreqDist(w.lower() for w in humor_text)
wh = ['what', 'when', 'where', 'who', 'why']

for w in wh:
    print(w + ':', fdist[m], end=' ')

what: 389 when: 389 where: 389 who: 389 why: 389 