# [2. Accessing Text Corpora and Lexical Resources](https://www.nltk.org/book/ch02.html)

Run the cell below before running any other code.

In [None]:
import nltk

## 1 - Accessing Text Corpora

### 1.1 - Gutenberg Corpus

In [None]:
# List the file identifiers available in the Gutenberg corpus.
nltk.corpus.gutenberg.fileids()

In [None]:
# Load the words of 'austen-emma.txt' and display the type of the returned object.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')
type(emma)

In [None]:
# Number of items in the emma word sequence.
len(emma)

* notice that emma is a `nltk.corpus.reader.util.StreamBackedCorpusView` object
* in order to use the `.concordance` method on the `emma` text, we need to convert `emma` into a `nltk.text.Text` object, as shown below

In [None]:
# Wrap the word sequence in nltk.Text so that Text methods such as
# .concordance() become available, then confirm the new type.
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
type(emma)

In [None]:
# Look up "surprize" in the emma Text via its concordance method.
emma.concordance("surprize")

In [None]:
from nltk.corpus import gutenberg

In [None]:
# Same listing as before, now through the directly imported `gutenberg` name.
gutenberg.fileids()

In [None]:
# NOTE: this rebinds `emma` to the plain word view returned by gutenberg.words,
# replacing the nltk.Text object created earlier.
emma = gutenberg.words('austen-emma.txt')

In [None]:
# For every Gutenberg text print three rounded ratios followed by the file id:
# characters per word, words per sentence, and words per distinct lowercased word.
for fileid in gutenberg.fileids():
    file_words = gutenberg.words(fileid)
    char_count = len(gutenberg.raw(fileid))
    word_count = len(file_words)
    sent_count = len(gutenberg.sents(fileid))
    vocab_size = len({token.lower() for token in file_words})
    print(f"{round(char_count / word_count)} {round(word_count / sent_count)} {round(word_count / vocab_size)} {fileid}")

#### Macbeth Sentences

In [None]:
# Sentences of 'shakespeare-macbeth.txt' as returned by gutenberg.sents.
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')

In [None]:
# Display the sentence collection.
macbeth_sentences

In [None]:
# Inspect the single sentence at index 1116.
macbeth_sentences[1116]

In [None]:
# Largest len() over all Macbeth sentences.
longest_len = max(len(s) for s in macbeth_sentences)

In [None]:
# Every sentence whose length equals longest_len (there may be more than one).
[s for s in macbeth_sentences if len(s) == longest_len]

### 1.2 - Web and Chat Text

In [None]:
from nltk.corpus import webtext

# Preview each webtext file: its id, the first 65 characters of raw text, then '...'.
for fileid in webtext.fileids():
    preview = webtext.raw(fileid)[:65]
    print(f"{fileid} {preview} ...")

In [None]:
from nltk.corpus import nps_chat

# Load the posts from one NPS chat file and show the post at index 123.
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

### 1.3 - Brown Corpus

In [None]:
from nltk.corpus import brown

In [None]:
# List the categories defined for the Brown Corpus.
brown.categories()

In [None]:
# Words restricted to the 'news' category.
brown.words(categories='news')

In [None]:
# Words restricted to a single file, selected by file id.
brown.words(fileids=['cg22'])

In [None]:
# Sentences drawn from several categories at once.
brown.sents(categories=['news', 'editorial', 'reviews'])

#### Stylistics

In [None]:
from nltk.corpus import brown

In [None]:
# Words of the 'news' category, used as input for the frequency count below.
news_text = brown.words(categories='news')

In [None]:
# Frequency distribution over the lowercased news words, so lookups are case-insensitive.
fdist = nltk.FreqDist(w.lower() for w in news_text)

In [None]:
# Modal verbs to look up in the frequency distribution.
modals = ['can', 'could', 'may', 'might', 'must', 'will']

In [None]:
# Print "<modal>: <count>" pairs on one line, separated by spaces.
for m in modals:
    print(f"{m}: {fdist[m]}", end=' ')

**Your Turn:** Choose a different section of the Brown Corpus, and adapt the previous example to count a selection of wh words, such as what, when, where, who, and why.

#### CFD Sneak Peek

* CFD's will be explained in more detail in Section 2

In [None]:
# Genres (conditions) and modal verbs (samples) to show in the table.
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']

# One (genre, word) pair per word of every Brown category; the distribution
# is built over all categories even though only some are tabulated.
genre_word_pairs = (
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(genre_word_pairs)

cfd.tabulate(conditions=genres, samples=modals)

### 1.4 - Reuters Corpus

In [None]:
from nltk.corpus import reuters

In [None]:
# List every file id in the Reuters corpus.
reuters.fileids()

In [None]:
# List every category in the Reuters corpus.
reuters.categories()

In [None]:
# Categories associated with a single file id.
reuters.categories('training/9865')

In [None]:
# Categories associated with a list of file ids.
reuters.categories(['training/9865', 'training/9880'])

In [None]:
# The reverse lookup: file ids associated with one category.
reuters.fileids('barley')

In [None]:
# File ids associated with a list of categories.
reuters.fileids(['barley', 'corn'])

### 1.5 - Inaugural Address Corpus

In [None]:
from nltk.corpus import inaugural

In [None]:
# List the file ids of the Inaugural Address corpus.
inaugural.fileids()

In [None]:
# First four characters of each file id (presumably the year of the address —
# confirm against the fileids() listing above).
[fileid[:4] for fileid in inaugural.fileids()]

Pay attention to how this graph varies from the graph displayed in the book. NLTK's Inaugural Address Corpus is still updated, so data from United States presidents after 2005 are included in this graph.

* **note:** for this solution, I used matplotlib library functions to change the size of the graph
    * learn more about matplotlib here: [Intro to pyplot Tutorial](https://matplotlib.org/3.3.1/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py)
* CFD's will be explained in more detail in Section 2

In [None]:
import matplotlib.pyplot as plt

# Record a (target, file-id prefix) pair for every word whose lowercase form
# starts with one of the target prefixes.
target_year_pairs = (
    (target, fileid[:4])
    for fileid in inaugural.fileids()
    for w in inaugural.words(fileid)
    for target in ['america', 'citizen']
    if w.lower().startswith(target)
)
cfd = nltk.ConditionalFreqDist(target_year_pairs)

# Open a wider figure before plotting so the x-axis labels have room.
plt.figure(figsize=(16, 6))

cfd.plot()

### 1.7 - Corpora in Other Languages

In [None]:
# Words of the cess_esp corpus.
nltk.corpus.cess_esp.words()

In [None]:
# Words of the floresta corpus.
nltk.corpus.floresta.words()

In [None]:
# Words of the 'hindi.pos' file in the indian corpus.
nltk.corpus.indian.words('hindi.pos')

In [None]:
# List the file ids of the udhr corpus.
nltk.corpus.udhr.fileids()

In [None]:
# Words of the 'Javanese-Latin1' file, skipping the first 11 items.
nltk.corpus.udhr.words('Javanese-Latin1')[11:]

* **note:** for this solution, I used matplotlib library functions to change the size of the graph
    * learn more about matplotlib here: [Intro to pyplot Tutorial](https://matplotlib.org/3.3.1/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py)
* CFD's will be explained in more detail in Section 2

In [None]:
import matplotlib.pyplot as plt
from nltk.corpus import udhr

# Languages to compare; each is read from its '<name>-Latin1' udhr file.
languages = [
    'Chickasaw',
    'English',
    'German_Deutsch',
    'Greenlandic_Inuktikut',
    'Hungarian_Magyar',
    'Ibibio_Efik',
]

# One (language, word length) pair per word, so each language becomes a
# condition whose samples are word lengths.
lang_length_pairs = (
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1')
)
cfd = nltk.ConditionalFreqDist(lang_length_pairs)

# Open a larger figure before plotting.
plt.figure(figsize=(10, 6))

cfd.plot(cumulative=True)

**Your Turn:** Pick a language of interest in `udhr.fileids()`, and define a variable `raw_text = udhr.raw(Language-Latin1)`. Now plot a frequency distribution of the letters of the text using `nltk.FreqDist(raw_text).plot()`.

### 1.8 - Text Corpus Structure

In [34]:
from nltk.corpus import gutenberg

# Raw text of the file as a single string.
raw = gutenberg.raw("burgess-busterbrown.txt")

# Characters at indices 1 through 19 (index 0 is skipped).
raw[1:20]

'The Adventures of B'

In [35]:
# The same file as a sequence of words.
words = gutenberg.words("burgess-busterbrown.txt")

In [36]:
# Words at indices 1 through 19 (index 0 is skipped).
words[1:20]

['The',
 'Adventures',
 'of',
 'Buster',
 'Bear',
 'by',
 'Thornton',
 'W',
 '.',
 'Burgess',
 '1920',
 ']',
 'I',
 'BUSTER',
 'BEAR',
 'GOES',
 'FISHING',
 'Buster',
 'Bear']

In [37]:
# The same file as a sequence of sentences.
sents = gutenberg.sents("burgess-busterbrown.txt")

In [38]:
# Sentences at indices 1 through 19 (index 0 is skipped).
sents[1:20]

[['I'],
 ['BUSTER', 'BEAR', 'GOES', 'FISHING'],
 ['Buster',
  'Bear',
  'yawned',
  'as',
  'he',
  'lay',
  'on',
  'his',
  'comfortable',
  'bed',
  'of',
  'leaves',
  'and',
  'watched',
  'the',
  'first',
  'early',
  'morning',
  'sunbeams',
  'creeping',
  'through',
  'the',
  'Green',
  'Forest',
  'to',
  'chase',
  'out',
  'the',
  'Black',
  'Shadows',
  '.'],
 ['Once',
  'more',
  'he',
  'yawned',
  ',',
  'and',
  'slowly',
  'got',
  'to',
  'his',
  'feet',
  'and',
  'shook',
  'himself',
  '.'],
 ['Then',
  'he',
  'walked',
  'over',
  'to',
  'a',
  'big',
  'pine',
  '-',
  'tree',
  ',',
  'stood',
  'up',
  'on',
  'his',
  'hind',
  'legs',
  ',',
  'reached',
  'as',
  'high',
  'up',
  'on',
  'the',
  'trunk',
  'of',
  'the',
  'tree',
  'as',
  'he',
  'could',
  ',',
  'and',
  'scratched',
  'the',
  'bark',
  'with',
  'his',
  'great',
  'claws',
  '.'],
 ['After',
  'that',
  'he',
  'yawned',
  'until',
  'it',
  'seemed',
  'as',
  'if',
  'his',

### 1.9 - Loading your own Corpus

In this example, we are going to look at the root directory of this repository. The `..` stands for a **parent directory**, or a folder one level higher in the folder hierarchy. See [Section 1.4 of this Unix Tutorial](http://www.ee.surrey.ac.uk/Teaching/Unix/unix1.html) for an in-depth explanation of this.

And instead of looking at all of the files that have a `.` in them, we will observe all of the files that end with `.md`. These are markdown files, which are a type of text file.

In [48]:
from nltk.corpus import PlaintextCorpusReader

# Corpus root: the parent directory (`..`) of the current working directory.
corpus_root = '../'
# The file pattern is a regular expression, so the dot before "md" is escaped
# to match a literal "."; the unescaped form '.*.md' would also accept names
# such as 'notes_md' or 'readme.amd'.
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.md')
wordlists.fileids()

['02/2 - Work Notes.md', 'README.md', 'afterword.md', 'log.md', 'plan.md']

In [50]:
# Words of one file from the custom corpus.
wordlists.words('README.md')

['#', 'NLTK', '-', 'Book', '-', 'Resource', '*', ...]

Unfortunately the Penn Treebank is [not a free resource](https://catalog.ldc.upenn.edu/LDC99T42). Fortunately there are a lot of [free alternatives to use](https://stackoverflow.com/q/8949517/12578069).

* [American National Corpus](http://www.anc.org/data/masc/downloads/data-download/)

## 2 - Conditional Frequency Distributions

## 3

## 4

## 5

## Your Turn Solutions

### 1.3

**Your Turn:** Choose a different section of the Brown Corpus, and adapt the previous example to count a selection of wh words, such as what, when, where, who, and why.

In [None]:
from nltk.corpus import brown

# Count wh-words in the 'humor' section of the Brown Corpus.
humor_text = brown.words(categories='humor')
humor_fdist = nltk.FreqDist(w.lower() for w in humor_text)
wh = ['what', 'when', 'where', 'who', 'why']

# Look each wh-word up in the humor distribution built above. The earlier
# version indexed the news distribution with a stale loop variable
# (`fdist[m]`), so it printed the same unrelated count for every word.
for w in wh:
    print(w + ':', humor_fdist[w], end=' ')

### 1.7

**Your Turn:** Pick a language of interest in `udhr.fileids()`, and define a variable `raw_text = udhr.raw(Language-Latin1)`. Now plot a frequency distribution of the letters of the text using `nltk.FreqDist(raw_text).plot()`.

* in this example, I will choose `Portuguese_Portugues-Latin1`
* **note:** for this solution, I used matplotlib library functions to change the size of the graph
    * learn more about matplotlib here: [Intro to pyplot Tutorial](https://matplotlib.org/3.3.1/tutorials/introductory/pyplot.html#sphx-glr-tutorials-introductory-pyplot-py)

In [51]:
from nltk.corpus import udhr

# List the available udhr file ids to pick a language from.
udhr.fileids()

['Abkhaz-Cyrillic+Abkh',
 'Abkhaz-UTF8',
 'Achehnese-Latin1',
 'Achuar-Shiwiar-Latin1',
 'Adja-UTF8',
 'Afaan_Oromo_Oromiffa-Latin1',
 'Afrikaans-Latin1',
 'Aguaruna-Latin1',
 'Akuapem_Twi-UTF8',
 'Albanian_Shqip-Latin1',
 'Amahuaca',
 'Amahuaca-Latin1',
 'Amarakaeri-Latin1',
 'Amuesha-Yanesha-UTF8',
 'Arabela-Latin1',
 'Arabic_Alarabia-Arabic',
 'Asante-UTF8',
 'Ashaninca-Latin1',
 'Asheninca-Latin1',
 'Asturian_Bable-Latin1',
 'Aymara-Latin1',
 'Balinese-Latin1',
 'Bambara-UTF8',
 'Baoule-UTF8',
 'Basque_Euskara-Latin1',
 'Batonu_Bariba-UTF8',
 'Belorus_Belaruski-Cyrillic',
 'Belorus_Belaruski-UTF8',
 'Bemba-Latin1',
 'Bengali-UTF8',
 'Beti-UTF8',
 'Bichelamar-Latin1',
 'Bikol_Bicolano-Latin1',
 'Bora-Latin1',
 'Bosnian_Bosanski-Cyrillic',
 'Bosnian_Bosanski-Latin2',
 'Bosnian_Bosanski-UTF8',
 'Breton-Latin1',
 'Bugisnese-Latin1',
 'Bulgarian_Balgarski-Cyrillic',
 'Bulgarian_Balgarski-UTF8',
 'Cakchiquel-Latin1',
 'Campa_Pajonalino-Latin1',
 'Candoshi-Shapra-Latin1',
 'Caquinte-Latin

In [None]:
# Raw text of the chosen language file; iterating it yields single characters.
raw_text = udhr.raw('Portuguese_Portugues-Latin1')

In [None]:
import matplotlib.pyplot as plt

# Open a wider figure, then plot how often each character occurs in raw_text.
plt.figure(figsize=(16, 6))

letter_fdist = nltk.FreqDist(raw_text)
letter_fdist.plot()

## Work

### 1.7

In [None]:
import pandas as pd
from nltk.corpus import udhr

# Languages to compare; each is read from its '<name>-Latin1' udhr file.
languages = ['Chickasaw', 'English', 'German_Deutsch', 'Greenlandic_Inuktikut', 'Hungarian_Magyar', 'Ibibio_Efik']

# Conditional frequency distribution keyed by language, counting word lengths.
# plot_freq below reads this notebook-level `cfd`.
cfd = nltk.ConditionalFreqDist(
    (lang, len(word))
    for lang in languages
    for word in udhr.words(lang + '-Latin1'))

def plot_freq(lang):
    """Plot the cumulative distribution of word lengths for one udhr language.

    For every length from 0 up to the longest word in the language's
    '-Latin1' file, the relative frequency is read from the notebook-level
    ``cfd`` and the running total is plotted with pandas.

    Args:
        lang: Language name without the '-Latin1' suffix; also used as the
            series name shown in the legend.
    """
    longest = max(len(word) for word in udhr.words(lang + '-Latin1'))

    # Include lengths that never occur (frequency 0) so the index is contiguous.
    length_freqs = {length: cfd[lang].freq(length) for length in range(longest + 1)}

    series = pd.Series(length_freqs, name=lang)
    series.cumsum().plot(legend=True, title='Cumulative Distribution of Word Lengths')

In [None]:
# Call plot_freq once for every language in the list.
for lang in languages:
    plot_freq(lang)