In [1]:
import nltk
from nltk.corpus import shakespeare 
from nltk.corpus import XMLCorpusReader

from collections import defaultdict

Let's take a look at the file ids in the `shakespeare` corpus.

In [2]:
shakespeare.fileids()

['a_and_c.xml',
 'dream.xml',
 'hamlet.xml',
 'j_caesar.xml',
 'macbeth.xml',
 'merchant.xml',
 'othello.xml',
 'r_and_j.xml']

I tried to do this in class it was a mess. Turns out that reading XML corpora in NLTK is *complicated*. The loop below took me about 20 minutes to figure out. You're going to need to change the root directory for the correct location on your machine. 

In [3]:
root_location = "/users/chandler/nltk_data/corpora/shakespeare/"

for file_name in shakespeare.fileids() :
    play_name = file_name.replace(".xml","")
    play = XMLCorpusReader(root=root_location,fileids=file_name)
    
    print(f"{play_name} has {len(play.words())} tokens in it.")
    


a_and_c has 34192 tokens in it.
dream has 21538 tokens in it.
hamlet has 40379 tokens in it.
j_caesar has 26058 tokens in it.
macbeth has 22977 tokens in it.
merchant has 27263 tokens in it.
othello has 35092 tokens in it.
r_and_j has 33078 tokens in it.


---

And now space for you to work in your small groups. I'd like you to repeat the analysis from the Macbeth article. Normalize and tokenize your plays. Then count the number of tokens overall, the number of occurances of the word "the", and the fraction of words that are "the". 

If you like, feel free to `FreqDist`; that can make it much easier. Or you can do something like building a dictionary. Once you've done the "the" analysis, push it further by finding other words that are represented in Macbeth at much higher rates than the other plays.

In [22]:
root_location = "/users/chandler/nltk_data/corpora/shakespeare/"

play_data = defaultdict(list)

for file_name in shakespeare.fileids() :
    play_name = file_name.replace(".xml","")
    
    play = XMLCorpusReader(root=root_location,fileids=file_name)

    play_data[play_name].append(len([w for w in play.words() if w.lower() == "the"]))
    play_data[play_name].append(len([w for w in play.words() if w.isalpha()]))


In [17]:
root_location = "/users/chandler/nltk_data/corpora/shakespeare/"

play_data = defaultdict(dict)

for file_name in shakespeare.fileids() :
    play_name = file_name.replace(".xml","")
    
    play = XMLCorpusReader(root=root_location,fileids=file_name)

    play_data[play_name]['the'] = len([w for w in play.words() if w.lower() == "the"])
    play_data[play_name]['total_tokens'] = len([w for w in play.words() if w.isalpha()])
    play_data[play_name]['the_ratio'] = play_data[play_name]['the']/play_data[play_name]['total_tokens']


In [18]:
play_data

defaultdict(dict,
            {'a_and_c': {'the': 871,
              'total_tokens': 27619,
              'the_ratio': 0.03153626126941598},
             'dream': {'the': 563,
              'total_tokens': 17521,
              'the_ratio': 0.0321328691284744},
             'hamlet': {'the': 1148,
              'total_tokens': 32838,
              'the_ratio': 0.034959498142396},
             'j_caesar': {'the': 610,
              'total_tokens': 21216,
              'the_ratio': 0.028751885369532427},
             'macbeth': {'the': 733,
              'total_tokens': 18729,
              'the_ratio': 0.0391371669603289},
             'merchant': {'the': 839,
              'total_tokens': 22609,
              'the_ratio': 0.03710911583882524},
             'othello': {'the': 761,
              'total_tokens': 28464,
              'the_ratio': 0.026735525576166385},
             'r_and_j': {'the': 685,
              'total_tokens': 26583,
              'the_ratio': 0.025768348192453824}}

In [23]:
for key, value in play_data.items() :
    play_data[key].append(value[0]/value[1])


In [24]:
play_data

defaultdict(list,
            {'a_and_c': [871, 27619, 0.03153626126941598],
             'dream': [563, 17521, 0.0321328691284744],
             'hamlet': [1148, 32838, 0.034959498142396],
             'j_caesar': [610, 21216, 0.028751885369532427],
             'macbeth': [733, 18729, 0.0391371669603289],
             'merchant': [839, 22609, 0.03710911583882524],
             'othello': [761, 28464, 0.026735525576166385],
             'r_and_j': [685, 26583, 0.025768348192453824]})

---

As we discussed in class, "lexical diversity" is the ratio of tokens to types (unique tokens). Calculate this for each of the plays as well. 

In [25]:
for file_name in shakespeare.fileids() :
    play_name = file_name.replace(".xml","")
    
    play = XMLCorpusReader(root=root_location,fileids=file_name)
    
    tokens = [w.lower() for w in play.words() if w.isalpha()]
    
    play_data[play_name].append((len(set(tokens))/len(tokens))**(-1))



In [26]:
play_data

defaultdict(list,
            {'a_and_c': [871, 27619, 0.03153626126941598, 7.331829041677728],
             'dream': [563, 17521, 0.0321328691284744, 6.029249827942189],
             'hamlet': [1148, 32838, 0.034959498142396, 7.228263262161568],
             'j_caesar': [610, 21216, 0.028751885369532427, 7.593414459556191],
             'macbeth': [733, 18729, 0.0391371669603289, 5.845505617977528],
             'merchant': [839, 22609, 0.03710911583882524, 7.161545771301869],
             'othello': [761, 28464, 0.026735525576166385, 7.806911684037301],
             'r_and_j': [685, 26583, 0.025768348192453824, 7.473432667978633]})

---

Some Spelling Bee Solver work.

In [41]:
words = set()

big_file = open('big.txt').read()
word_file = open('words-1.txt').read()

words.update([w.lower() for w in big_file.split()])
words.update([w.lower() for w in word_file.split()])

In [51]:
words = {w for w in words if len(w) >= 4 and w.isalpha()}

In [13]:
len(words)

264268

In [43]:
optional_letters = "volpna"
required_letter = "i"
letters = optional_letters + required_letter
letter_set = set(letters)

In [71]:
letter_set = frozenset(letter_set)

In [72]:
word2set = {}

for word in words :
    word2set[word] = frozenset(word)

In [73]:
%%timeit

solutions = [w for w in words if len(word2set[w] - letter_set)==0]

#for word in words :
#    wset = set(word)
#    if all([ch in letter_set for ch in wset]) :
#        solutions.append(word)


75.9 ms ± 847 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [74]:
%%timeit

solutions = []

for word in words :
    
    if required_letter not in word :
        next
    
    good_word = True
    for ch in word :
        if ch not in letters :
            good_word = False
            
    if good_word :
        solutions.append(word)
        
# without removing main letter: 101

112 ms ± 2.05 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [75]:
a = "austin"

In [76]:
b = "autism"

In [83]:
set(a) & (set(b))

{'a', 'i', 's', 't', 'u'}

In [67]:
solutions

['palinal',
 'linpin',
 'palapalai',
 'poppa',
 'lapillo',
 'anonol',
 'opinional',
 'llano',
 'papain',
 'apiolin',
 'pail',
 'lollop',
 'pill',
 'anvil',
 'lion',
 'onion',
 'violin',
 'aionial',
 'paolo',
 'anapanapa',
 'linin',
 'lava',
 'olivilin',
 'loan',
 'anal',
 'palla',
 'pavonian',
 'nonnaval',
 'apollo',
 'aloin',
 'oval',
 'papillon',
 'inion',
 'livonian',
 'pall',
 'pianino',
 'poppin',
 'pipal',
 'olona',
 'novalia',
 'pinna',
 'apionol',
 'pianola',
 'palolo',
 'vinal',
 'anilao',
 'pavlovna',
 'ninon',
 'lain',
 'popal',
 'noon',
 'vanillon',
 'vanillal',
 'violon',
 'planilla',
 'pooli',
 'anon',
 'ilial',
 'palli',
 'napoo',
 'appall',
 'anana',
 'pool',
 'anion',
 'palpi',
 'anna',
 'pallial',
 'pillion',
 'ovinia',
 'vanillin',
 'ainoi',
 'nova',
 'annal',
 'palpal',
 'napal',
 'pallall',
 'violanin',
 'valval',
 'anolian',
 'villain',
 'pilon',
 'avian',
 'papilla',
 'anopia',
 'plan',
 'vain',
 'palila',
 'pappi',
 'linalol',
 'pollinia',
 'liana',
 'papion',
 

---

## Some Scraping

In [18]:
import requests  # To get the pages
from bs4 import BeautifulSoup # and to process them

In [19]:
from bs4.element import Comment

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True


In [31]:
r = requests.get("http://www.nytimes.com")


if r.status_code == 200 :
    soup = BeautifulSoup(r.text, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts) 


In [32]:
for t in visible_texts :
    if t.strip() :
        print(t.strip())

Continue reading the main story
Sections
SEARCH
Skip to content
Skip to site index
U.S.
International
Canada
Español
中文
Today’s Paper
World
U.S.
Politics
N.Y.
Business
Opinion
Tech
Science
Health
Sports
Arts
Books
Style
Food
Travel
Magazine
T Magazine
Real Estate
Video
World
U.S.
Politics
N.Y.
Business
Opinion
Tech
Science
Health
Sports
Arts
Books
Style
Food
Travel
Magazine
T Magazine
Real Estate
Video
Biden Struggles to Unite His Own Party Behind His Economic Agenda
President Biden and his aides mounted an all-out effort to salvage a pair of bills containing trillions in spending on infrastructure, education and more.
Mr. Biden canceled a scheduled trip to Chicago in order to continue talking with lawmakers during a critical week of deadlines in the House.
Doug Mills/The New York Times
Republicans are expected to back a bill to avert a shutdown after Democrats moved a debt limit increase into a separate bill.
Sinema Is at the Center of It All. Some Arizonans Wish She Weren’t.
Kyrsten 