# Ch3. Working with Text Data

<div id="toc"></div>

## Unit13_Processing HTML Files

In [None]:
import csv
import json
import statistics
from urllib.request import urlopen

import nltk
from bs4 import BeautifulSoup
# Three ways of building a soup. Each call names the parser explicitly:
# without it BeautifulSoup picks whichever parser happens to be installed
# (and warns), so the parse tree may differ from one machine to another.
# Construct soup from a string
soup1 = BeautifulSoup( "<HTML><HEAD>«headers»</HEAD>«body»</HTML>" , "html.parser" )
# Construct soup from a local file; the with-block closes the file handle
with open( "myDoc.html" ) as local_doc:
    soup2 = BeautifulSoup(local_doc, "html.parser" )
# Construct soup from a web document; the with-block closes the connection
# Remember that urlopen() does not add "http://"!
with urlopen( "http://www.networksciencelab.com/" ) as web_doc:
    soup3 = BeautifulSoup(web_doc, "html.parser" )

In [None]:
# A minimal HTML document used to demonstrate text extraction
htmlString = '''
<HTML>
<HEAD><TITLE>My document</TITLE></HEAD>
<BODY>Main text.</BODY></HTML>
'''
# Name the parser explicitly so the result does not depend on which
# optional parsers are installed (and to avoid the parser-guessing warning)
soup = BeautifulSoup(htmlString, "html.parser" )
# Display all text of the document with the markup stripped
soup.get_text()

In [None]:
# Fetch the page and parse it while the connection is open; the with-block
# closes the connection as soon as the soup has been built
with urlopen( "http://www.networksciencelab.com/" ) as doc:
    soup = BeautifulSoup(doc, "html.parser" )
# Collect (anchor text, target URL) pairs for every <a> tag that actually
# carries an href attribute (anchors without one would raise KeyError)
links = [(link.string, link[ "href" ])
         for link in soup.find_all( "a" )
         if link.has_attr( "href" )]

## Unit14_Handling CSV Files

In [None]:
# Template: open a CSV file and attach a reader to it.
# newline='' leaves line-ending handling to the csv module.
with open( "somefile.csv" , newline='' ) as csv_file:
    # Comma-delimited fields, double quote as the quoting character
    reader = csv.reader(
        csv_file,
        delimiter=',' ,
        quotechar='"' ,
    )

http://catalog.data.gov/dataset?res_format=CSV

In [None]:
# Read the whole file into memory: `data` becomes a list of rows,
# each row being a list of the string fields on that line
with open( "demographics.csv" , newline='' ) as demographics_file:
    data = [row for row in csv.reader(demographics_file)]

In [None]:
# The first row holds the column names; find the position of the age column
header_row = data[0]
ageIndex = header_row.index( "Answer.Age" )

In [None]:
# Parse the age column of every data row (the header row is skipped).
# Rows too short to contain the column (csv.reader yields [] for a blank
# line) or whose value is not a plain integer are left out instead of
# crashing the whole cell.
ages = [int(row[ageIndex]) for row in data[1:]
        if len(row) > ageIndex and row[ageIndex].strip().isdecimal()]
# statistics.stdev() needs at least two data points, so guard before summarising
if len(ages) >= 2:
    print(statistics.mean(ages), statistics.stdev(ages))
else:
    print( "Not enough valid ages to compute statistics" )

## Unit15_Reading JSON Files

http://dev.twitter.com/overview/documentation  
http://developers.facebook.com  
http://developer.yahoo.com/weather/  

In [None]:
# Any JSON-serializable object will do; this concrete sample replaces the
# original placeholder so the cell runs as written. It is named `obj`
# rather than `object` so the builtin `object` is not shadowed for the rest
# of the kernel session.
obj = {"name": "Alice", "scores": [90, 85.5], "active": True, "notes": None}
# Save an object to a file
with open( "data.json" , "w" ) as out_json:
    json.dump(obj, out_json, indent=None, sort_keys=False)
# Load an object from a file
with open( "data.json" ) as in_json:
    object1 = json.load(in_json)
# Serialize an object to a string
json_string = json.dumps(object1)
# Parse a string as JSON
object2 = json.loads(json_string)

## Unit16_Processing Texts in Natural Languages

#### NLTK Corpora

In [None]:
# Short alias for the WordNet corpus reader, reused by the following cells
wn = nltk.corpus.wordnet
# Look up every synset registered for the word "cat" and display the list
cat_synsets = wn.synsets( "cat" )
cat_synsets

In [None]:
# A notebook cell only displays the value of its last expression, so each
# definition is printed explicitly to make both of them visible
print(wn.synset( "cat.n.01" ).definition())
print(wn.synset( "cat.n.02" ).definition())

In [None]:
# A notebook cell only displays the value of its last expression, so both
# relation lists are printed explicitly
cat_synset = wn.synset( "cat.n.01" )
print(cat_synset.hypernyms())
print(cat_synset.hyponyms())

In [None]:
# Fetch the two synsets to compare: first sense of "cat" and of "lynx"
x, y = (wn.synset(synset_name) for synset_name in ( "cat.n.01" , "lynx.n.01" ))
# Display the path-based similarity score between them
x.path_similarity(y)

In [None]:
# Score every (sense of 'cat', sense of 'dog') pair, keeping only pairs
# whose similarity is truthy, i.e. the synsets are related at all
scored_pairs = []
for cat_sense in wn.synsets( 'cat' ):
    for dog_sense in wn.synsets( 'dog' ):
        if cat_sense.path_similarity(dog_sense):
            scored_pairs.append(
                (cat_sense.path_similarity(dog_sense), cat_sense, dog_sense))
# Tuples compare element by element, so max() ranks by similarity first;
# dropping the leading score leaves the winning pair of synsets
best_pair = max(scored_pairs)[1:]
# Display the definitions of the two most similar senses
[simxy.definition() for simxy in best_pair]

In [None]:
# NOTE(review): myCorpus is not defined anywhere in this notebook —
# presumably an NLTK corpus reader created beforehand; confirm and define it
# before running this cell, otherwise it raises NameError.
# Four accessors are called in turn; the notebook displays only the value of
# the last expression.
myCorpus.fileids()
myCorpus.raw()
myCorpus.sents()
myCorpus.words()

#### Normalization

In [None]:
from nltk.tokenize import WordPunctTokenizer
# Sample string packed with punctuation and emoticons
text = "}Help! :))) :[ ..... :D{"
# Build the tokenizer, run it on the sample and display the tokens
word_punct = WordPunctTokenizer()
word_punct_tokens = word_punct.tokenize(text)
word_punct_tokens

In [None]:
# Tokenize the same sample with NLTK's default word tokenizer for comparison
default_tokens = nltk.word_tokenize(text)
default_tokens

In [None]:
# Stem a sample word with the Porter stemmer and display the result
pstemmer = nltk.stem.PorterStemmer()
porter_stem = pstemmer.stem( "wonderful" )
porter_stem

In [None]:
# Stem the same sample word with the Lancaster stemmer for comparison
lstemmer = nltk.stem.LancasterStemmer()
lancaster_stem = lstemmer.stem( "wonderful" )
lancaster_stem

In [None]:
# Lemmatize the same sample word with the WordNet lemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()
lemma = lemmatizer.lemmatize( "wonderful" )
lemma

In [None]:
# Tag each token with its part of speech and display the (token, tag) pairs;
# the expected result is an adjective and a noun
sample_tokens = [ "beautiful" , "world" ]
tagged_tokens = nltk.pos_tag(sample_tokens)
tagged_tokens

In [None]:
from bs4 import BeautifulSoup
from collections import Counter
from nltk.corpus import stopwords
from nltk import LancasterStemmer
# Pipeline: HTML file -> plain text -> tokens -> lowercase -> stop-word
# removal and stemming -> frequency table of the ten most common stems.
# Create a new stemmer (using the class imported above)
ls = LancasterStemmer()
# Read the file and cook a soup; the parser is named explicitly so the
# result does not depend on which optional parsers are installed
with open( "index.html" ) as infile:
    soup = BeautifulSoup(infile, "html.parser" )
# Extract and tokenize the text
words = nltk.word_tokenize(soup.text)
# Convert to lowercase
words = [w.lower() for w in words]
# Build the stop-word set once: membership tests on a set are fast, and the
# corpus list is no longer rebuilt for every token
english_stopwords = set(stopwords.words( "english" ))
# Eliminate stop words and stem the rest of the words.
# The loop runs over `words` (the tokens of this document), not over the
# unrelated sample string `text` defined in an earlier cell.
words = [ls.stem(w) for w in words
         if w not in english_stopwords and w.isalnum()]
# Tally the words
freqs = Counter(words)
print(freqs.most_common(10))

#### Other Text-Processing Procedures

## Your Turn

http://www.mediawiki.org/wiki/API:Main_page  
http://en.wikipedia.org/wiki/Category:Rock_music_groups_by_genre  
http://en.wikipedia.org/wiki/Jaccard_index  