### Extracting Readily Available Text Files from the Web and Disk
$\Longrightarrow$ Without Source Code Inspection

In [None]:
%matplotlib notebook

In [None]:
from __future__ import division
import nltk, re, pprint

In [None]:
from urllib import request

url = "https://www.gutenberg.org/files/2554/2554-0.txt"
# Open the response as a context manager so the HTTP connection is released
# as soon as the body has been read, even if reading or decoding fails.
with request.urlopen(url) as response:
    # The body arrives as bytes; decode it to text (same as str(data, "utf-8")).
    raw = response.read().decode("utf-8")
type(raw)

In [None]:
len(raw)

In [None]:
raw[:75]

In [None]:
print(raw[:500])

In [None]:
words = nltk.word_tokenize(raw)
type(words)
print(words[4:16])

In [None]:
" ".join(words[1021:1059])

In [None]:
tokens = nltk.Text(nltk.word_tokenize(raw))
tokens.collocations()

In [None]:
tokens.collocation_list()

In [None]:
tokens = tokens[96:300]
print(tokens)

<h3>Reading Local Files</h3>

In [None]:
# Locate the corpus file inside the NLTK data directory, then read it in full.
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# `with` closes the file handle deterministically instead of leaving it to
# the garbage collector.
with open(path) as corpus_file:
    raw = corpus_file.read()

In [None]:
len(raw)

In [None]:
print(raw[:347])

In [None]:
raw = nltk.word_tokenize(raw)
len(raw)

In [None]:
from nltk.book import * 
len(set(raw))
raw = nltk.Text(raw)
fdistr = FreqDist(raw)

In [None]:
len(text1.tokens)

In [None]:
fdistr.plot(50, cumulative=True)

In [None]:
from collections import OrderedDict
fdistri = OrderedDict(sorted(fdistr.items(), key=lambda t: t[1], reverse=True))

In [None]:
# Python 2
# first2pairs = {k: mydict[k] for k in mydict.keys()[:2]}
# Python 3
first20pairs = {k: fdistri[k] for k in list(fdistri)[:42]}
first20pairs

In [None]:
from string import punctuation

# Total number of word tokens, used later to normalise counts.
# A token is skipped when it is made up solely of punctuation characters.
# A plain `k not in punctuation` substring test would let multi-character
# marks such as '--' through and count them as words.
total = sum(
    count
    for token, count in fdistri.items()
    if not all(ch in punctuation for ch in token)
)

In [None]:
%matplotlib notebook
from operator import itemgetter, attrgetter, methodcaller
import string  # needed for string.punctuation below (was missing -> NameError)
import matplotlib.pylab as plt


A1 = fdistr

# (token, count) pairs ordered from most to least frequent.
B1 = sorted(A1.items(), key=itemgetter(1), reverse=True)

# Relative frequency of the most frequent tokens, punctuation excluded.
C1 = {}

for token, count in B1[:18]:
    # Treat a token as punctuation when every character is a punctuation
    # mark, so multi-character marks such as '--' are filtered out as well.
    if not all(ch in string.punctuation for ch in str(token)):
        C1[token] = count / total


plt.bar(range(len(C1)), list(C1.values()), align='center')
plt.xticks(range(len(C1)), list(C1.keys()))
plt.title("Most frequent words in text1: 'Moby Dick by Herman Melville 1851'", color="b")
plt.xlabel("Words", color="r")
plt.ylabel("Words' Frequency", color="r")
plt.show()

In [None]:
plt.bar(range(1,14), [0.06/(i**1.07) for i in range(1,14)], align='center')

### See <a href="https://quantdare.com/benford-law-and-zipf-law/">https://quantdare.com/benford-law-and-zipf-law/</a>

In [None]:
s = input("Enter some text: ")
print("You typed", len(nltk.word_tokenize(s)), "words.")

Notice that each punctuation mark is counted as a token (a word). We can also count all the tokens and then remove the punctuation marks from the count, to get the exact number of words while ignoring punctuation.

In [None]:
import nltk
from nltk.corpus import gutenberg
raw = gutenberg.raw('melville-moby_dick.txt')
fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
fdist.keys()

fdist.plot()

In [None]:
# Locate the corpus file inside the NLTK data directory, then read it in full.
path = nltk.data.find('corpora/gutenberg/melville-moby_dick.txt')
# `with` closes the file handle deterministically instead of leaving it to
# the garbage collector.
with open(path) as corpus_file:
    raw = corpus_file.read()

print(path)

In [None]:
len(raw)

In [None]:
print(raw[:347])

In [None]:
raw = nltk.word_tokenize(raw)
len(raw)

In [None]:
from nltk.book import * 
# Number of distinct tokens; the value is not displayed because it is not
# the last expression of the cell.
len(set(raw))
# Wrap the token list in an nltk.Text and count token frequencies.
# (The leftover `help(sorted)` debugging call was removed: it only dumped
# the built-in's help text into the cell output.)
raw = nltk.Text(raw)
fdistr = FreqDist(raw)

<h4>Extracting Text from PDF, MSWord, and Other Binary Formats</h4>

ASCII text and HTML text are human-readable formats. Text often comes in binary formats—such as PDF and MSWord—that can only be opened using specialized software. Third-party libraries such as pypdf and pywin32 provide access to these formats. Extracting text from multicolumn documents is particularly challenging. For one-off conversion of a few documents, it is simpler to open the document with a suitable application, then save it as text to your local drive, and access it as described below. If the document is already on the Web, you can enter its URL in Google’s search box. The search result often includes a link to an HTML version of the document, which you can save as text.

## Using Basic Metacharacters: Regular Expressions (Regexp)
Some characters are assigned special meanings in regular expressions, where they drive searching, sorting, and matching operations. These characters are called special characters or metacharacters, because they carry a special meaning instead of their literal one.

In [None]:
wordlist = raw.tokens

In [None]:
wordlist_ed = [w for w in wordlist if re.search(r'ed$', w)]
print(wordlist_ed[:50])

In [None]:
wordlist_ed = [w for w in wordlist if re.search(r'ed$', w)]
print(wordlist_ed[:50])

In [None]:
wordlist_j_t_1 = [w for w in wordlist if re.search(r'^..j..t..$', w)]
print(wordlist_j_t_1)

In [None]:
wordlist_j_t_2 = [w for w in wordlist if re.search(r'..j..t..?', w)]
print(wordlist_j_t_2)

### Phone keyboard T9
When using the keys 4653, which words does our phone suggest? (Of course, this depends on the corpus used.)

In [None]:
Key_4653 = [w for w in wordlist if re.search(r'^[ghi][mno][jlk][def]$', w)]
print(list(set(Key_4653)))

<p>
    The first part of the expression, «<font face="courier",size="3">^[ghi]</font>», matches the start of a word followed by <i>g</i>, <i>h</i>, or <i>i</i>. The next part of the expression, «<font face="courier",size="3">[mno]</font>», constrains the second character to be <i>m</i>, <i>n</i>, or <i>o</i>. The third and fourth characters are also constrained. Only four words satisfy all these constraints. Note that the order of characters inside the square brackets is not significant, so we could have written «<font face="courier",size="3">^[hig][nom][ljk][fed]&#36;</font>» and matched the same words.
</p>

<!-- 
<p>
    <i><font color="b" id="Figure_3.2"><b>Figure 3-5.</b></font> T9: Text on 9 keys.</i><br />
    <img src="img/Figure_3.5.png", width="600",height="400", alt="Photo de montagne", id="Figure_2.6" />
</p>
-->

In [None]:
nltk.FreqDist(nltk.Text(Key_4653))

According to the frequencies, T9 could suggest hold first, then gold, then hole. The order of the suggestions can also take into account the words the user has typed before and their frequencies.

In [None]:
Key_456 = [w for w in wordlist if re.search(r'^[ghijklmno]+$', w)]
print(set(Key_456))

**What does it do?**
- This regex checks if the entire string **w** consists only of the letters g to o (inclusive).
- If **w** contains any letter outside this set, re.search() will return None (no match).
- If **w** is made only of these letters, it returns a match.

In [None]:
# The same thing, but using [g-o] which is the same thing as [ghijklmno]
Key_456_bis = [w for w in wordlist if re.search('^[g-o]+$', w)]
print(set(Key_456_bis))

In [None]:
chat_words = sorted(set(w for w in nltk.corpus.nps_chat.words()))
[w for w in chat_words if re.search('^m+i+n+e+$', w)]

```nltk.corpus.nps_chat.words()``` contains the same words contained in ```text5.tokens```

In [None]:
set([w for w in text5.tokens if re.search('^m+i+n+e+$', w)])

In [None]:
[w for w in chat_words if re.search('^m*i*n*e*$', w)]

In [None]:
[w for w in chat_words if re.search('^[ha]+$', w)]

In [None]:
# Distinct tokens of the NLTK treebank corpus, in sorted order.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Decimal numbers: digits, a literal dot, digits. The pattern is a raw string
# so that `\.` reaches the regex engine unchanged; in a normal string `\.` is
# an invalid string escape that Python warns about.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][:10]

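In a regular expression, metacharacters such as '.', '^' and '$' must be preceded by a backslash to match them literally. A raw string (the ```r'...'``` prefix) does not remove that need: it only stops Python itself from interpreting backslashes, so ```r'3\.5'``` reaches the regexp engine unchanged, whereas in a normal ```str``` the safe spelling is ```'3\\.5'```. An unescaped dot, as in ```r'3.5'```, matches any character (for example ```3x5```), not only a literal '.'.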

In [None]:
# Distinct tokens of the NLTK treebank corpus, in sorted order.
wsj = sorted(set(nltk.corpus.treebank.words()))
# Same decimal-number search, written as a raw string. The dot must still be
# escaped: a bare `.` matches any character, so the unescaped pattern would
# also accept tokens such as '1,000' or plain three-digit numbers.
[w for w in wsj if re.search(r'^[0-9]+\.[0-9]+$', w)][:10]

In [None]:
# Upper-case letters followed by a literal dollar sign (e.g. currency codes).
# Raw string so that `\$` is passed to the regex engine as written; in a
# normal string `\$` is an invalid string escape that Python warns about.
[w for w in wsj if re.search(r'^[A-Z]+\$$', w)]

In [None]:
[w for w in wsj if re.search('^[0-9]{4}$', w)][:10]

In [None]:
[w for w in wsj if re.search('^[0-9]+-[a-z]{3,5}$', w)]

In [None]:
[w for w in wsj if re.search('^[a-z]{5,}-[a-z]{2,3}-[a-z]{,6}$', w)]

In [None]:
[w for w in wsj if re.search('(ed|ing)$', w)]

<p>
    <i><font color="b" id="Table_3.3"><b>Table 3-3</b></font>. Basic regular expression metacharacters, including wildcards, ranges, and closures</i><br />
    <img src="img/Table_3.3.png" width="700" height="600" alt="Regexp image" id="Figure_2.6" />
 </p>

Let’s look for all sequences of two or more vowels in some text, and determine their relative frequency:

In [None]:
wsj = sorted(set(nltk.corpus.treebank.words()))
fd = nltk.FreqDist(vs for word in wsj
    for vs in re.findall(r'[aeiou]{2,}', word))
fd.items()

In [None]:
# Keeps a leading run of vowels, a trailing run of vowels, and every
# non-vowel character; vowels in the middle of a word match nothing.
regexp = r'^[AEIOUaeiou]+|[AEIOUaeiou]+$|[^AEIOUaeiou]'


def compress(word):
    """Return ``word`` with its word-internal vowels removed.

    Every non-overlapping match of ``regexp`` is collected left to right
    and the matched pieces are concatenated.
    """
    return ''.join(match.group(0) for match in re.finditer(regexp, word))

In [None]:
english_udhr = nltk.corpus.udhr.words('English-Latin1')
print(english_udhr[:75], "\n", "\n", nltk.tokenwrap(compress(w) for w in english_udhr[:75]))

In [None]:
compress("continental")

In [None]:
compress("easter")

In [None]:
compress("free")

<h4>Finding Word Stems</h4>

In [None]:
re.findall(r'^.*(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

In [None]:
words = ["doing","progressive","obvious","government","normally","various","tired","intelligent","smart"]
[re.findall(r'^.*(?:ing|ly|ed|ious|ies|ive|es|s|ment)$', w) for w in words]

In [None]:
re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', 'processing')

In [None]:
words = ["doing","progressive","obvious","government","normally","various","tired","intelligent","smart"]
[re.findall(r'^(.*)(ing|ly|ed|ious|ies|ive|es|s|ment)$', w) for w in words]

```^.*``` allowing any prefix before the suffix, but the matching group is only the one between parentheses, which means that if we want to get the suffix and also the prefix, we need to add parentheses ```^(.*)``` to say that it's a matching group as well.

```^.*(?:ing|ly|ed...)$``` The pattern still matches words ending in these suffixes, but it does NOT capture the suffix separately (because of this ```(?:```). Since the pattern then contains no capturing group at all, ```re.findall()``` returns the entire matched word, not just the suffix.

```(?i)``` will instruct the regexp engine to ignore case.

In [None]:
def stem(word):
    """Strip the first matching suffix from ``word``.

    The suffixes are tried in the listed order and the first one the word
    ends with is removed. A word that ends with none of them is returned
    unchanged.

    Args:
        word: the token to stem.

    Returns:
        The word without its suffix, or the word itself when no suffix
        matches.
    """
    for suffix in ['ing', 'ly', 'ed', 'ious', 'ies', 'ive', 'es', 's', 'ment']:
        if word.endswith(suffix):
            return word[:-len(suffix)]
    # Fallback at function level: previously this line sat after the `return`
    # inside the `if`, was unreachable, and unsuffixed words yielded None.
    return word

In [None]:
def stem(word):
    """Return ``word`` with one trailing suffix removed, via a regular expression.

    The non-greedy prefix group takes as little as possible and the suffix
    group is optional, so the first group of the single match is the stem
    (the whole word when no listed suffix is present).
    """
    pattern = r'^(.*?)(ing|ly|ed|ious|ies|ive|es|s|ment)?$'
    # findall yields (stem, suffix) tuples; only the stem of the first is used.
    first_match = re.findall(pattern, word)[0]
    return first_match[0]

raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government. Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print(raw,"\n")
print([stem(t) for t in tokens])

<h4>Searching Tokenized Text</h4>

In [None]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"<a> (<.*>) <man>")

In [None]:
moby.findall(r"<a> (<.*>) <man>")

In [None]:
chat = nltk.Text(nps_chat.words())
chat.findall(r"<.*> <.*> <bro>")

In [None]:
from nltk.corpus import gutenberg, nps_chat
moby = nltk.Text(gutenberg.words('melville-moby_dick.txt'))
moby.findall(r"(<.*> <.*> <woman>)")

In [None]:
re.findall(r"a (.*) man", gutenberg.raw('melville-moby_dick.txt'))

In [None]:
chat.findall(r"<l.*>{3,}")

In [None]:
import nltk
from nltk.corpus import brown
hobbies_learned = nltk.Text(brown.words(categories=['hobbies', 'learned']))
hobbies_learned.findall(r"<\w*> <and> <other> <\w*s>")

In [None]:
hobbies_learned.findall(r"<so> <.*> <that> <.*>")

<h3>Normalizing Text</h3>

In [None]:
raw = """DENNIS: Listen, strange women lying in ponds distributing swords
    is no basis for a system of government. Supreme executive power derives from
    a mandate from the masses, not from some farcical aquatic ceremony."""
tokens = nltk.word_tokenize(raw)
print(tokens)

NLTK includes several off-the-shelf stemmers, and if you ever need a stemmer, you should use one of these in preference to crafting your own using regular expressions, since NLTK’s stemmers handle a wide range of irregular cases. The Porter and Lancaster stemmers follow their own rules for stripping affixes. Observe that the Porter stemmer correctly handles the word lying (mapping it to lie), whereas the Lancaster stemmer does not

In [None]:
porter = nltk.PorterStemmer()
print([porter.stem(t) for t in tokens])

In [None]:
lancaster = nltk.LancasterStemmer()
print([lancaster.stem(t) for t in tokens])

<h3>Simple Approaches to Tokenization</h3>

In [None]:
import re
raw = """'When I'M a Duchess,' she said to herself, (not in a very hopeful tone
    though), 'I won't have any pepper in my kitchen AT ALL. Soup does very
    well without--Maybe it's always pepper that makes people hot-tempered,'..."""

In [None]:
print(len(re.split(r' ', raw)), "\n", re.split(r' ', raw))

In [None]:
print(len(re.split(r'[ \t\n]+', raw)), "\n", re.split(r'[ \t\n]+', raw))

In [None]:
print(len(re.split(r'\s+', raw)), "\n", re.split(r'\s+', raw))

<p>Splitting on whitespace gives us tokens like <font face="courier",size="3">'(not'</font> and <font face="courier",size="3">'herself,'</font>. An alternative is to use the fact that Python provides us with a character class <font face="courier",size="3">\w</font> for word characters, equivalent to <font face="courier",size="3">[a-zA-Z0-9_]</font>. It also defines the complement of this class, <font face="courier",size="3">\W</font>, i.e., all characters other than letters, digits, or underscore. We can use <font face="courier",size="3">\W</font> in a simple regular expression to split the input on anything other than a word character:</p>

In [None]:
print(re.split(r'\W+', raw))

In [None]:
print(re.findall(r'\w+|\S\w*', raw))

In [None]:
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", raw))

<h3>Further Issues with Tokenization</h3>

In [None]:
print(nltk.corpus.treebank_raw.raw())

In [None]:
print(nltk.corpus.treebank.words())

<h3>Segmentation</h3>

Tokenization is an instance of a more general problem of <b>segmentation</b>. In this section, we will look at two other instances of this problem, which use radically different techniques to the ones we have seen so far in this chapter.

<h4>Sentence Segmentation</h4>
<p>Manipulating texts at the level of individual words often presupposes the ability to divide a text into individual sentences. As we have seen, some corpora already provide access at the sentence level. In the following example, we compute the average number of words per sentence in the Brown Corpus:</p>

In [None]:
len(nltk.corpus.brown.words()) / len(nltk.corpus.brown.sents())

In [None]:
import pprint
# Load the English sentence tokenizer stored under tokenizers/punkt in the
# NLTK data directory.
# NOTE(review): newer NLTK releases appear to ship Punkt as `punkt_tab`
# rather than a pickle; if this load fails, `nltk.sent_tokenize(text)` is
# presumably the version-independent alternative -- confirm against the
# installed NLTK version.
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
text = nltk.corpus.gutenberg.raw('chesterton-thursday.txt')
# Split the raw text into sentences and pretty-print ten of them.
sents = sent_tokenizer.tokenize(text)
pprint.pprint(sents[171:181])

In [None]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"

In [None]:
def segment(text, segs):
    """Cut ``text`` into words according to the boundary string ``segs``.

    A '1' at position i of ``segs`` marks a word boundary after character
    i of ``text``. The remainder after the last boundary is always
    returned as the final word.
    """
    # Index just past each marked boundary character.
    cuts = [idx + 1 for idx, flag in enumerate(segs) if flag == '1']
    starts = [0] + cuts
    # None as the last end reproduces the open-ended slice text[last:].
    ends = cuts + [None]
    return [text[begin:end] for begin, end in zip(starts, ends)]

In [None]:
print(segment(text, seg1))

In [None]:
print(segment(text, seg2))

<b>Example 3-3</b>. Computing the cost of storing the lexicon and reconstructing the source text.

In [None]:
def evaluate(text, segs):
    """Score a segmentation as text cost plus lexicon cost.

    The text cost is the number of words the segmentation produces. The
    lexicon cost is the length of the distinct words written out once,
    joined by single spaces.
    """
    pieces = segment(text, segs)
    vocabulary = set(pieces)
    lexicon_cost = len(' '.join(vocabulary))
    return len(pieces) + lexicon_cost

text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
seg2 = "0100100100100001001001000010100100010010000100010010000"
seg3 = "0000100100000011001000000110000100010000001100010000001"
print(segment(text, seg3))

In [None]:
evaluate(text, seg1)

In [None]:
print(evaluate(text, seg2),evaluate(text, seg3)) # lower cost is better (anneal() minimises it): seg1=63, seg2=47, seg3=46, so seg3 wins and seg1 is the worst

<i><b>Example 3-4</b>. Non-deterministic search using simulated annealing: Begin searching with phrase
segmentations only; randomly perturb the zeros and ones proportional to the "temperature"; with
each iteration the temperature is lowered and the perturbation of boundaries is reduced.
</i>

In [None]:
from random import randint

def flip(segs, pos):
    """Return a copy of ``segs`` with the bit at index ``pos`` toggled ('0' <-> '1')."""
    toggled = str(1 - int(segs[pos]))
    return segs[:pos] + toggled + segs[pos + 1:]

def flip_n(segs, n):
    """Toggle ``n`` randomly chosen positions of ``segs`` and return the result.

    Positions are drawn independently, so the same index may be picked
    (and toggled back) more than once.
    """
    for _ in range(n):
        position = randint(0, len(segs) - 1)
        segs = flip(segs, position)
    return segs

def anneal(text, segs, iterations, cooling_rate):
    """Search for a low-cost segmentation of ``text`` by simulated annealing.

    Starting from ``segs``, each round tries ``iterations`` random
    perturbations of the current segmentation and keeps the one with the
    lowest ``evaluate`` score. The number of bits flipped per perturbation
    is the rounded temperature, which starts at ``len(segs)`` and is
    divided by ``cooling_rate`` after every round until it drops to 0.5
    or below.

    Args:
        text: the unsegmented character string.
        segs: initial boundary string of '0'/'1' characters.
        iterations: number of random guesses evaluated per round.
        cooling_rate: divisor applied to the temperature after each round.

    Returns:
        The boundary string held after the last round. The score and
        segmentation of every round are printed as a side effect.
    """
    temperature = float(len(segs))
    while temperature > 0.5:
        # The current segmentation is the baseline of the round, so a round
        # can only keep it or replace it with a strictly cheaper guess.
        best_segs, best = segs, evaluate(text, segs)
        for i in range(iterations):
            # Each guess perturbs the round's starting point, not the best so far.
            guess = flip_n(segs, int(round(temperature)))
            score = evaluate(text, guess)
            if score < best:
                best, best_segs = score, guess
        score, segs = best, best_segs
        # Cool down: fewer bits are flipped per guess in the next round.
        temperature = temperature / cooling_rate
        print(evaluate(text, segs), segment(text, segs))
    print()
    return(segs)

In [None]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000010000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)

In [None]:
text = "doyouseethekittyseethedoggydoyoulikethekittylikethedoggy"
seg1 = "0000000000000001000000000000000000000000000100000000000"
anneal(text, seg1, 5000, 1.2)