# Processing Raw Text

In [2]:
import nltk, re, pprint
from nltk import word_tokenize

import re

## Accessing Text from the Web and from Disk

In [3]:
from urllib import request
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
response = request.urlopen(url)
raw = response.read().decode('utf8')

In [4]:
type(raw)

str

In [5]:
len(raw)

1176812

In [6]:
raw[:75]


'\ufeffThe Project Gutenberg eBook of Crime and Punishment, by Fyodor Dostoevsky\r'

In [7]:
tokens = word_tokenize(raw)
len(tokens)

257058

In [8]:
tokens[:75]

['\ufeffThe',
 'Project',
 'Gutenberg',
 'eBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by',
 'Fyodor',
 'Dostoevsky',
 'This',
 'eBook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'in',
 'the',
 'United',
 'States',
 'and',
 'most',
 'other',
 'parts',
 'of',
 'the',
 'world',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 '.',
 'You',
 'may',
 'copy',
 'it',
 ',',
 'give',
 'it',
 'away',
 'or',
 're-use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'Project',
 'Gutenberg',
 'License',
 'included',
 'with',
 'this',
 'eBook',
 'or',
 'online',
 'at',
 'www.gutenberg.org',
 '.',
 'If',
 'you',
 'are',
 'not',
 'located']

In [9]:
text = nltk.Text(tokens)
type(text)

nltk.text.Text

In [10]:
text.collocations()

Katerina Ivanovna; Pyotr Petrovitch; Pulcheria Alexandrovna; Avdotya
Romanovna; Rodion Romanovitch; Marfa Petrovna; Sofya Semyonovna; old
woman; Project Gutenberg-tm; Porfiry Petrovitch; Amalia Ivanovna;
great deal; young man; Nikodim Fomitch; Project Gutenberg; Ilya
Petrovitch; Andrey Semyonovitch; Hay Market; Dmitri Prokofitch; Good
heavens


In [11]:
raw.find("PART I")

5575

In [12]:
raw.rfind("End of Project Gutenberg's Crime")

-1

In [13]:
raw = raw[5575:-1]
raw.find("PART I")

0

## Dealing with HTML

In [14]:
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
html = request.urlopen(url).read().decode('utf8')
html[:60]

'<!doctype html public "-//W3C//DTD HTML 4.0 Transitional//EN'

In [15]:
from bs4 import BeautifulSoup
raw = BeautifulSoup(html, 'html.parser').get_text()
tokens = word_tokenize(raw)
tokens

['BBC',
 'NEWS',
 '|',
 'Health',
 '|',
 'Blondes',
 "'to",
 'die',
 'out',
 'in',
 '200',
 "years'",
 'CATEGORIES',
 'TV',
 'RADIO',
 'COMMUNICATE',
 'WHERE',
 'I',
 'LIVE',
 'INDEX',
 'SEARCH',
 'You',
 'are',
 'in',
 ':',
 'Health',
 'News',
 'Front',
 'Page',
 'World',
 'UK',
 'England',
 'N',
 'Ireland',
 'Scotland',
 'Wales',
 'Politics',
 'Business',
 'Entertainment',
 'Science/Nature',
 'Technology',
 'Health',
 'Medical',
 'notes',
 'Education',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Talking',
 'Point',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Country',
 'Profiles',
 'In',
 'Depth',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Programmes',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'SERVICES',
 'Daily',
 'E-mail',
 'News',
 'Ticker',
 'Mobile/PDAs',
 '--',
 '--',
 '--',
 '--',
 '--',
 '--',
 '-',
 'Text',
 'Only',
 'Feedback',
 'Help',
 'EDITIONS',
 'Change',
 'to',
 'World',
 'Friday',
 ',',
 '27',
 'September',
 ',',
 '2002',
 ',',
 '11:51',
 '

In [16]:
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')

Displaying 5 of 5 matches:
hey say too few people now carry the gene for blondes to last beyond the next 
blonde hair is caused by a recessive gene . In order for a child to have blond
 have blonde hair , it must have the gene on both sides of the family in the g
ere is a disadvantage of having that gene or by chance . They do n't disappear
des would disappear is if having the gene was a disadvantage and I do not thin


In [17]:
text.collocations()

blonde hair; Jonathan Rees; n't disappear; blondes would; blondes may


## Processing Search Engine Results
## Processing RSS Feeds

In [18]:
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']

'Language Log'

In [19]:
len(llog.entries)

13

In [20]:
post = llog.entries[2]
post.title

'Judo:  martial arts neologism or ancient philosophical term?'

In [21]:
content = post.content[0].value
content[:70]

'<p>The term "judo", which sport / martial art ("as a physical, mental,'

In [22]:
raw = BeautifulSoup(content, 'html.parser').get_text()
word_tokenize(raw)

['The',
 'term',
 '``',
 'judo',
 "''",
 ',',
 'which',
 'sport',
 '/',
 'martial',
 'art',
 '(',
 '``',
 'as',
 'a',
 'physical',
 ',',
 'mental',
 ',',
 'and',
 'moral',
 'pedagogy',
 "''",
 '[',
 'source',
 ']',
 ')',
 'was',
 'only',
 'created',
 'in',
 '1882',
 'by',
 'Jigoro',
 'Kano',
 '嘉納治五郎',
 '(',
 '1860-1938',
 ')',
 '.',
 'What',
 'I',
 'find',
 'amazing',
 'is',
 'that',
 'jūdō',
 '/',
 'MSM',
 'róudào',
 '柔道',
 '(',
 '``',
 'soft',
 '/',
 'flexible',
 '/',
 'gentle',
 '/',
 'supple',
 '/',
 'mild',
 '/',
 'yielding',
 'way',
 "''",
 ')',
 'comes',
 'right',
 'out',
 'of',
 'the',
 'Yìjīng',
 '易經',
 '(',
 'Book',
 '/',
 'Classic',
 'of',
 'Change',
 '[',
 's',
 ']',
 ')',
 '.',
 'Of',
 'course',
 ',',
 'traditional',
 'Japanese',
 'scholars',
 'have',
 'always',
 'been',
 'learned',
 'in',
 'the',
 'Chinese',
 'classics',
 ',',
 'so',
 'it',
 'should',
 "n't",
 'be',
 'too',
 'surprising',
 'that',
 'they',
 'would',
 'draw',
 'on',
 'the',
 'classics',
 'for',
 'terminolo

## Extracting Text from PDF, MSWord and other Binary Formats

In [23]:
a='On an exceptionally hot evening early in July'
len(a)

45

In [24]:
a=word_tokenize(a)
a

['On', 'an', 'exceptionally', 'hot', 'evening', 'early', 'in', 'July']

In [25]:
len(a)

8

In [26]:
couplet = "Shall I compare thee to a Summer's day?"\
"Thou are more lovely and more temperate:"\
"gfjhjhgjgghjgiugkgiugkgiugkguigkguigk"

In [27]:
couplet

"Shall I compare thee to a Summer's day?Thou are more lovely and more temperate:gfjhjhgjgghjgiugkgiugkgiugkguigkguigk"

In [28]:
couplet = (
"Rough winds do shake the darling buds of May,"
"And Summer's lease hath all too short a date:"
)

couplet

"Rough winds do shake the darling buds of May,And Summer's lease hath all too short a date:"

## The NLP Pipeline

## Regular Expressions for Detecting Word Patterns

In [29]:
with open("text_doc.txt") as f:   
    data = f.read()


In [30]:
data

'APU consumables pre-mission planning includes prediction, analysis, and post-flight performance evaluations for determining consumables requirements. APU \nhydrazine and injector cooling water-loading quantities are loaded per OMRSD requirements, which allow for loading uncertainty. Launch Commit Criteria monitor \nfor hydrazine leakage, ensure sufficient GN2 for gearbox repressurization and verify adequate hydraulic fluid quantity for flight. Minimum pre-deorbit APU fuel \nquantity requirements are defined in the Flight Rules.  Mission Control Center monitors consumables usage during the mission. Mission planning includes \nderivation of APU fuel reserves. Orbiter Fluid Budget allows for loading uncertainty and shows adequate reserves after worst-case consumption during Abort Once \nAround (AOA) landing. APU gas generator injector tube cooling water-loading quantity is provided for contingency operations only when a hot APU restart is \nrequired (sufficient time is not available for 

In [39]:
x=re.search('APU',data)

In [40]:
x.group()

'APU'

In [41]:
x.start()

0

In [43]:
x=re.match('APU',data)

In [44]:
x

<re.Match object; span=(0, 3), match='APU'>

In [32]:
wordlist = [w for w in nltk.word_tokenize(data) if w.islower()]
wordlist


['consumables',
 'pre-mission',
 'planning',
 'includes',
 'prediction',
 'analysis',
 'and',
 'post-flight',
 'performance',
 'evaluations',
 'for',
 'determining',
 'consumables',
 'requirements',
 'hydrazine',
 'and',
 'injector',
 'cooling',
 'water-loading',
 'quantities',
 'are',
 'loaded',
 'per',
 'requirements',
 'which',
 'allow',
 'for',
 'loading',
 'uncertainty',
 'monitor',
 'for',
 'hydrazine',
 'leakage',
 'ensure',
 'sufficient',
 'for',
 'gearbox',
 'repressurization',
 'and',
 'verify',
 'adequate',
 'hydraulic',
 'fluid',
 'quantity',
 'for',
 'flight',
 'pre-deorbit',
 'fuel',
 'quantity',
 'requirements',
 'are',
 'defined',
 'in',
 'the',
 'monitors',
 'consumables',
 'usage',
 'during',
 'the',
 'mission',
 'planning',
 'includes',
 'derivation',
 'of',
 'fuel',
 'reserves',
 'allows',
 'for',
 'loading',
 'uncertainty',
 'and',
 'shows',
 'adequate',
 'reserves',
 'after',
 'worst-case',
 'consumption',
 'during',
 'landing',
 'gas',
 'generator',
 'injector',


In [33]:
[w for w in wordlist if re.search('ing$',w)]

['planning',
 'determining',
 'cooling',
 'water-loading',
 'loading',
 'during',
 'planning',
 'loading',
 'during',
 'landing',
 'cooling',
 'water-loading']