# Corpus and Lexicon

In [1]:
import nltk
# Fetch the NLTK data packages used in this notebook: the Gutenberg corpus and
# the 'punkt' package. Packages that are already up-to-date are not downloaded again.
nltk.download('gutenberg')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /Users/eva01/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eva01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

#### Exercise 1

- Define a function to compute corpus descriptive statistics

    - input:
        - raw text (Chars)
        - words
        - sentences
    - output (print): 
        - average number of:
            - chars per word
            - words per sentence
            - chars per sentence
        - Size of the longest word and sentence


In [2]:
# Reference tokenization of 'carroll-alice.txt' from NLTK's Gutenberg corpus:
# a flat sequence of tokens and the same text grouped into sentences.
gutenberg = nltk.corpus.gutenberg
alice_words = gutenberg.words('carroll-alice.txt')
alice_sents = gutenberg.sents('carroll-alice.txt')

def statistics(words, sents):
    """
    Compute descriptive statistics of a tokenized corpus.

    :param words: iterable of word tokens (strings)
    :param sents: iterable of sentences, each one a sequence of word tokens
    :return: tuple (word_per_sent, char_per_word, char_per_sent, longest_sentence, longest_word)
        - the three averages are rounded to integers with the built-in round()
        - longest_sentence is measured in tokens, longest_word in characters
        - a value is 0 when the input it is computed from is empty
    """
    def rounded_mean(values):
        # No meaningful average exists for empty input: report 0 instead of dividing by zero
        return round(sum(values) / len(values)) if values else 0

    word_lens = [len(word) for word in words]

    # One pass over the sentences collects both per-sentence measures
    sent_lens = []
    chars_in_sents = []
    for sent in sents:
        sent_lens.append(len(sent))
        # Characters are counted without separators (tokens glued together)
        chars_in_sents.append(sum(len(token) for token in sent))

    word_per_sent = rounded_mean(sent_lens)
    char_per_word = rounded_mean(word_lens)
    char_per_sent = rounded_mean(chars_in_sents)

    # default=0 keeps max() from raising ValueError on empty input
    longest_sentence = max(sent_lens, default=0)
    longest_word = max(word_lens, default=0)

    return word_per_sent, char_per_word, char_per_sent, longest_sentence, longest_word

# Statistics of the Alice corpus, in the order returned by statistics()
alice_stats = statistics(alice_words, alice_sents)
word_per_sent, char_per_word, char_per_sent, longest_sent, longeset_word = alice_stats

# One label per returned value, printed as "<label> <value>"
stat_labels = ('Word per sentence', 'Char per word', 'Char per sentence', 'Longest sentence', 'Longest word')
for stat_label, stat_value in zip(stat_labels, alice_stats):
    print(stat_label, stat_value)

Word per sentence 20
Char per word 3
Char per sentence 68
Longest sentence 204
Longest word 14


## 2. Lexicon



#### Exercise 2

- compute frequency list of __lowercased__ "alice" corpus (you can use either method)
- report the `5` most frequent words (you can use the provided `nbest` function to get a dict of the top N items)
- compare the frequencies to the reference values below

| Word   | Frequency |
|--------|----------:|
| ,      |     1,993 |
| '      |     1,731 |
| the    |     1,642 |
| and    |       872 |
| .      |       764 |


In [5]:
from collections import Counter

In [6]:
def nbest(d, n=1):
    """
    Select the n entries of a dict that have the largest values.

    :param d: input dict (keys are strings, values are numbers)
    :param n: number of entries to keep (int)
    :return: dict with the top n key-value pairs, ordered by decreasing value
    """
    # Stable descending sort of the keys by their value: ties keep the dict's own order
    ranked_keys = sorted(d, key=d.get, reverse=True)
    return {key: d[key] for key in ranked_keys[:n]}

In [7]:
# Frequency list of the lowercased corpus tokens
alice_lowercase_freq_list = Counter(w.lower() for w in alice_words)
# Most frequent entries: the first five are the ones listed in the reference table
nbest(alice_lowercase_freq_list, n=6)

{',': 1993, "'": 1731, 'the': 1642, 'and': 872, '.': 764, 'to': 729}

#### Exercise 3

<!-- - define a function to compute a lexicon from a frequency list applying minimum and maximum frequency cut-offs
    
    - input: frequency list (dict)
    - output: list
    - use default values for min and max
     -->
- Using the function cut_off
    
    - compute lexicon applying:
    
        - minimum cut-off 2 (remove words that appear less than 2 times, i.e. remove [hapax legomena](https://en.wikipedia.org/wiki/Hapax_legomenon))
        - maximum cut-off 100 (remove words that appear more than 100 times)
        - both minimum and maximum thresholds together
        
    - report size for each comparing to the reference values in the table (on the lowercased lexicon)

| Operation  | Min | Max | Size |
|------------|----:|----:|-----:|
| original   | N/A | N/A | 2636 |
| cut-off    |   2 | N/A | 1503 |
| cut-off    | N/A | 100 | 2586 |
| cut-off    |   2 | 100 | 1453 |


In [8]:
def cut_off(vocab, n_min=100, n_max=100):
    """
    Build a lexicon from a frequency list by applying frequency cut-offs.

    A word is kept when n_min <= count <= n_max (both bounds inclusive).
    Passing None for a bound disables it, which gives a one-sided cut-off.
    Note that with the default values only words occurring exactly 100 times
    are kept, so pass the bounds explicitly.

    :param vocab: frequency list (dict mapping word -> count)
    :param n_min: minimum count a word needs to be kept, or None for no lower bound
    :param n_max: maximum count a word may have to be kept, or None for no upper bound
    :return: list of the kept words, in the iteration order of vocab
    """
    return [
        word
        for word, count in vocab.items()
        if (n_min is None or count >= n_min) and (n_max is None or count <= n_max)
    ]

# Frequency thresholds (both inclusive); change these two numbers to compute the required cut-offs
lower_bound, upper_bound = 2, 100

kept_words = cut_off(alice_lowercase_freq_list, n_min=lower_bound, n_max=upper_bound)
lexicon_cut_off = len(kept_words)

print('Original', len(alice_lowercase_freq_list))
print('CutOFF Min:', lower_bound, 'MAX:', upper_bound, ' Lexicon Size:', lexicon_cut_off)

Original 2636
CutOFF Min: 2 MAX: 100  Lexicon Size: 1453


#### Exercise 4
- using Python's built-in `set` [methods](https://docs.python.org/3/library/stdtypes.html#set):
    - compute the intersection between the 100 most frequent words in frequency list of the alice corpus and the list of stopwords (report count)
    - remove stopwords from the lexicon
    - print the size of:
        - original lexicon
        - lexicon without stopwords
        - overlap between 100 most freq. words and stopwords

| Operation       | Size |
|-----------------|-----:|
| original        | 2636 |
| no stop words   | 2490 |
| top 100 overlap |   65 |

In [9]:
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP_WORDS
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as SKLEARN_STOP_WORDS
from nltk.corpus import stopwords

# Make sure the stop-word lists are available before reading them: without the
# 'stopwords' package stopwords.words() fails on a fresh environment.
# The call does nothing when the package is already up-to-date; quiet=True
# suppresses the progress messages.
nltk.download('stopwords', quiet=True)

# English stop words shipped with NLTK, as a set for fast membership tests and set operations
NLTK_STOP_WORDS = set(stopwords.words('english'))

In [10]:
# Set built-in Function
# Built-in set operations on two small example sets (one element per character)
set_a = set('abcde')
set_b = set('abf')

print(set_a & set_b)  # overlap: elements present in both sets (intersection)
print(set_a - set_b)  # elements of set_a that are not in set_b (difference)

{'b', 'a'}
{'c', 'd', 'e'}


In [11]:
# Lowercased vocabulary and the 100 most frequent lowercased tokens
alice_vocab = {w.lower() for w in alice_words}
top100 = list(nbest(alice_lowercase_freq_list, n=100))
stop_words = NLTK_STOP_WORDS

# Stop words among the 100 most frequent tokens
overlap = set(top100) & stop_words
# Vocabulary with every stop word removed
alice_vocab_no_stopwords = alice_vocab - stop_words

print('Original', len(alice_vocab))
print('No stopwords', len(alice_vocab_no_stopwords))
print('To100 overlap', len(overlap))

Original 2636
No stopwords 2490
To100 overlap 65


## Last Exercise
- Load another corpus from Gutenberg (e.g. `milton-paradise.txt`)
- On this, compute the descriptive statistics using the provided sentences and tokens (.raw, .words, etc.) as __reference__ 
    - After this you will get "reference" version 
- Tokenize the provided raw corpus and segment it into sentences using the `spaCy` and `NLTK` libraries. Compute the descriptive statistics on the outcome
    - After this you will get "spaCy" and "NLTK" versions
- Compute lowercased lexicons for all 3 versions (reference, spaCy, NLTK) of the corpus
    - compare lexicon sizes
- Compute frequency distribution for all 3 versions (reference, spaCy, NLTK) of the corpus
    - compare top N frequencies