# Processing Raw Text

In [34]:
import nltk, re, pprint
from nltk import word_tokenize

import re

## Accessing Text from the Web and from Disk

In [None]:
from urllib import request
# Download the plain-text file and decode the response bytes into a string.
url = "http://www.gutenberg.org/files/2554/2554-0.txt"
# Using the response as a context manager closes the HTTP connection as soon
# as the body has been read, instead of leaving it open until the object is
# garbage-collected.
with request.urlopen(url) as response:
    raw = response.read().decode('utf8')

In [None]:
# Inspect the type of the downloaded text.
type(raw)

In [None]:
# Number of characters in the downloaded text.
len(raw)

In [None]:
# Preview the first 75 characters of the text.
raw[:75]


In [None]:
# Split the raw text into tokens and report how many were produced.
tokens = word_tokenize(raw)
len(tokens)

In [None]:
# Preview the first 75 tokens.
tokens[:75]

In [None]:
# Wrap the token list in an nltk.Text object and confirm the resulting type.
text = nltk.Text(tokens)
type(text)

In [None]:
# Report the collocations NLTK finds in `text`.
text.collocations()

In [None]:
# Offset of the first "PART I" in the text; str.find returns -1 if it is absent.
raw.find("PART I")

In [None]:
# Offset of the last occurrence of this marker (presumably where the trailing
# Project Gutenberg notice begins); str.rfind returns -1 if it is absent.
raw.rfind("End of Project Gutenberg's Crime")

In [None]:
# Trim `raw` to the span between the "PART I" marker and the closing
# "End of Project Gutenberg's Crime" marker. The bounds are looked up rather
# than hard-coded, so they stay correct if the file changes and re-running
# this cell does not cut further into the text.
start = raw.find("PART I")
end = raw.rfind("End of Project Gutenberg's Crime")
# str.find/rfind return -1 when the marker is absent. Used directly as a slice
# bound, -1 would mean "the last character" rather than "not found", so fall
# back to the corresponding end of the string instead.
if start == -1:
    start = 0
if end == -1:
    end = len(raw)
raw = raw[start:end]
# Confirm where "PART I" sits in the trimmed text.
raw.find("PART I")

## Dealing with HTML

In [None]:
# Download the page as HTML text and preview its first 60 characters.
url = "http://news.bbc.co.uk/2/hi/health/2284783.stm"
# Using the response as a context manager closes the HTTP connection as soon
# as the body has been read.
with request.urlopen(url) as page_response:
    html = page_response.read().decode('utf8')
html[:60]

In [None]:
from bs4 import BeautifulSoup
# Parse the HTML with the 'html.parser' backend, pull out the document's
# text, and tokenize that text.
soup = BeautifulSoup(html, 'html.parser')
raw = soup.get_text()
tokens = word_tokenize(raw)
tokens

In [None]:
# Keep only tokens 110 through 389, wrap them in an nltk.Text, and display the
# concordance for the word 'gene'.
# NOTE(review): the slice bounds are hard-coded — presumably chosen by
# inspecting the token list for this particular page; confirm they still
# isolate the intended text.
tokens = tokens[110:390]
text = nltk.Text(tokens)
text.concordance('gene')

In [None]:
# Report the collocations NLTK finds in the trimmed `text`.
text.collocations()

## Processing Search Engine Results
## Processing RSS Feeds

In [None]:
# Parse the Atom feed at the URL below and show the feed's title.
import feedparser
llog = feedparser.parse("http://languagelog.ldc.upenn.edu/nll/?feed=atom")
llog['feed']['title']

In [None]:
# Number of entries in the parsed feed.
len(llog.entries)

In [None]:
# Pick the entry at index 2 (the third one) and show its title.
post = llog.entries[2]
post.title

In [None]:
# Take the value of the entry's first content element and preview its first
# 70 characters.
content = post.content[0].value
content[:70]

In [None]:
# Parse the entry's content with the 'html.parser' backend, pull out its
# text, and display the tokens of that text.
post_soup = BeautifulSoup(content, 'html.parser')
raw = post_soup.get_text()
word_tokenize(raw)

## Extracting Text from PDF, MSWord and other Binary Formats

In [None]:
# Sample sentence; len() on a string counts its characters.
a='On an exceptionally hot evening early in July'
len(a)

In [None]:
# Rebind `a` to the tokenized form of its previous value and display it.
# Note that `a` is reused: after this cell it holds the tokenizer's result
# rather than the original string.
a=word_tokenize(a)
a

In [None]:
# Length of `a` — a token count once `a` holds the tokenized sentence.
len(a)

In [None]:
# Build one long string from several pieces. The pieces are joined with an
# empty separator, so no space or newline is inserted between them.
couplet = "".join((
    "Shall I compare thee to a Summer's day?",
    "Thou are more lovely and more temperate:",
    "gfjhjhgjgghjgiugkgiugkgiugkguigkguigk",
))

In [None]:
# Display the resulting string.
couplet

In [None]:
# Assemble the couplet from its two lines. They are concatenated directly,
# so no space or newline separates them in the result.
first_line = "Rough winds do shake the darling buds of May,"
second_line = "And Summer's lease hath all too short a date:"
couplet = first_line + second_line

couplet

## The NLP Pipeline

## Regular Expressions for Detecting Word Patterns

In [32]:
# Read the whole document into memory as a single string.
# The encoding is given explicitly: without it, open() falls back to the
# platform's locale encoding, so the same file could decode differently (or
# fail to decode) on another machine.
# NOTE(review): assumes text_doc.txt is UTF-8 (plain ASCII decodes identically) — confirm.
with open("text_doc.txt", encoding="utf-8") as f:
    data = f.read()


In [33]:
# Display the text read from the file.
data

'APU consumables pre-mission planning includes prediction, analysis, and post-flight performance evaluations for determining consumables requirements. APU \nhydrazine and injector cooling water-loading quantities are loaded per OMRSD requirements, which allow for loading uncertainty. Launch Commit Criteria monitor \nfor hydrazine leakage, ensure sufficient GN2 for gearbox repressurization and verify adequate hydraulic fluid quantity for flight. Minimum pre-deorbit APU fuel \nquantity requirements are defined in the Flight Rules.  Mission Control Center monitors consumables usage during the mission. Mission planning includes \nderivation of APU fuel reserves. Orbiter Fluid Budget allows for loading uncertainty and shows adequate reserves after worst-case consumption during Abort Once \nAround (AOA) landing. APU gas generator injector tube cooling water-loading quantity is provided for contingency operations only when a hot APU restart is \nrequired (sufficient time is not available for 

In [39]:
# Tokenize the document and keep only the tokens for which str.islower() is
# true, i.e. tokens with at least one cased character and no uppercase ones.
all_tokens = nltk.word_tokenize(data)
wordlist = list(filter(str.islower, all_tokens))
wordlist


['consumables',
 'pre-mission',
 'planning',
 'includes',
 'prediction',
 'analysis',
 'and',
 'post-flight',
 'performance',
 'evaluations',
 'for',
 'determining',
 'consumables',
 'requirements',
 'hydrazine',
 'and',
 'injector',
 'cooling',
 'water-loading',
 'quantities',
 'are',
 'loaded',
 'per',
 'requirements',
 'which',
 'allow',
 'for',
 'loading',
 'uncertainty',
 'monitor',
 'for',
 'hydrazine',
 'leakage',
 'ensure',
 'sufficient',
 'for',
 'gearbox',
 'repressurization',
 'and',
 'verify',
 'adequate',
 'hydraulic',
 'fluid',
 'quantity',
 'for',
 'flight',
 'pre-deorbit',
 'fuel',
 'quantity',
 'requirements',
 'are',
 'defined',
 'in',
 'the',
 'monitors',
 'consumables',
 'usage',
 'during',
 'the',
 'mission',
 'planning',
 'includes',
 'derivation',
 'of',
 'fuel',
 'reserves',
 'allows',
 'for',
 'loading',
 'uncertainty',
 'and',
 'shows',
 'adequate',
 'reserves',
 'after',
 'worst-case',
 'consumption',
 'during',
 'landing',
 'gas',
 'generator',
 'injector',


In [40]:
# Select the words matching the pattern 'ing$' (words ending in "ing").
# The pattern is compiled once and its search method reused for every word.
ing_suffix = re.compile('ing$')
[w for w in wordlist if ing_suffix.search(w)]

['planning',
 'determining',
 'cooling',
 'water-loading',
 'loading',
 'during',
 'planning',
 'loading',
 'during',
 'landing',
 'cooling',
 'water-loading']