## Selecting Unigram probabilities in a sentence

In [1]:
text = "After winning re-election by defeating Republican opponent Mitt Romney, Obama was sworn in for a second term in 2013. During this term, he promoted inclusion for LGBT Americans. His administration filed briefs that urged the Supreme Court to strike down same-sex marriage bans as unconstitutional (United States v. Windsor and Obergefell v. Hodges); same-sex marriage was legalized nationwide in 2015 after the Court ruled so in Obergefell. He advocated for gun control in response to the Sandy Hook Elementary School shooting, indicating support for a ban on assault weapons, and issued wide-ranging executive actions concerning global warming and immigration. In foreign policy, he ordered military intervention in Iraq in response to gains made by ISIL after the 2011 withdrawal from Iraq, continued the process of ending U.S. combat operations in Afghanistan in 2016, promoted discussions that led to the 2015 Paris Agreement on global climate change, initiated sanctions against Russia following the invasion in Ukraine and again after Russian interference in the 2016 United States elections, brokered a nuclear deal with Iran, and normalized U.S. relations with Cuba. Obama nominated three justices to the Supreme Court: Sonia Sotomayor and Elena Kagan were confirmed as justices, while Merrick Garland faced unprecedented partisan obstruction and was ultimately not confirmed. During his term in office, America's soft power and reputation abroad significantly improved"

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [5]:
docs = nlp(text)
probs = [(sum(tok.prob for tok in doc), doc) for doc in docs]
prob, doc = max(probs)

TypeError: 'spacy.tokens.token.Token' object is not iterable

In [None]:
probs

## Using NLTK

In [9]:
import nltk
nltk.download('brown')
from nltk.corpus import brown

[nltk_data] Downloading package brown to
[nltk_data]     /Users/suguthansekar/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


Source - http://www.katrinerk.com/courses/python-worksheets/language-models-in-python

In [57]:
freq_brown_1gram = nltk.FreqDist(brown.words())

len_brown = len(brown.words())


def unigram_prob(word):
    return freq_brown_1gram[ word] 


In [58]:
unigram_prob('the')

62713

In [92]:
text = '“If diseases carried by the new influx of people from the Steppe played a part in changing the demography of Europe, it wouldn’t be the last time this happened, of course. The diseases carried by the Europeans into the Americas played a significant role in decimating the original population of that continent. '

In [93]:

doc = nlp(text)

tokens = [(token.text,token.prob, unigram_prob(token.text)) for token in doc]


In [94]:
for i,j in enumerate(tokens):
    print(i,j)

0 ('“', -20.0, 0)
1 ('If', -20.0, 732)
2 ('diseases', -20.0, 16)
3 ('carried', -20.0, 124)
4 ('by', -20.0, 5103)
5 ('the', -20.0, 62713)
6 ('new', -20.0, 1060)
7 ('influx', -20.0, 4)
8 ('of', -20.0, 36080)
9 ('people', -20.0, 811)
10 ('from', -20.0, 4207)
11 ('the', -20.0, 62713)
12 ('Steppe', -20.0, 0)
13 ('played', -20.0, 103)
14 ('a', -20.0, 21881)
15 ('part', -20.0, 465)
16 ('in', -20.0, 19536)
17 ('changing', -20.0, 40)
18 ('the', -20.0, 62713)
19 ('demography', -20.0, 3)
20 ('of', -20.0, 36080)
21 ('Europe', -20.0, 118)
22 (',', -20.0, 58334)
23 ('it', -20.0, 6723)
24 ('would', -20.0, 2677)
25 ('n’t', -20.0, 0)
26 ('be', -20.0, 6344)
27 ('the', -20.0, 62713)
28 ('last', -20.0, 636)
29 ('time', -20.0, 1556)
30 ('this', -20.0, 3966)
31 ('happened', -20.0, 146)
32 (',', -20.0, 58334)
33 ('of', -20.0, 36080)
34 ('course', -20.0, 464)
35 ('.', -20.0, 49346)
36 ('The', -20.0, 7258)
37 ('diseases', -20.0, 16)
38 ('carried', -20.0, 124)
39 ('by', -20.0, 5103)
40 ('the', -20.0, 62713)
41 

### Using probabilities

In [95]:
for i, j in enumerate(tokens):
    if j[2] > 10 and j[2] <1000:
        print(j[0])

If
diseases
carried
people
played
part
changing
Europe
last
happened
course
diseases
carried
played
significant
role
original
population
continent


In [96]:
unigram_prob('Hawaii')

16

## Using Length of words

In [97]:
for i, j in enumerate(tokens):
    if len(j[0]) > 4:
        print(i,j)

2 ('diseases', -20.0, 16)
3 ('carried', -20.0, 124)
7 ('influx', -20.0, 4)
9 ('people', -20.0, 811)
12 ('Steppe', -20.0, 0)
13 ('played', -20.0, 103)
17 ('changing', -20.0, 40)
19 ('demography', -20.0, 3)
21 ('Europe', -20.0, 118)
24 ('would', -20.0, 2677)
31 ('happened', -20.0, 146)
34 ('course', -20.0, 464)
37 ('diseases', -20.0, 16)
38 ('carried', -20.0, 124)
41 ('Europeans', -20.0, 5)
44 ('Americas', -20.0, 1)
45 ('played', -20.0, 103)
47 ('significant', -20.0, 84)
50 ('decimating', -20.0, 0)
52 ('original', -20.0, 102)
53 ('population', -20.0, 136)
56 ('continent', -20.0, 11)


### Sorting

In [98]:
def getKey(item):
    return item[2]
l = [[2, 3], [6, 7], [3, 34], [24, 64], [1, 43]]
sorted(tokens, key=getKey)

[('“', -20.0, 0),
 ('Steppe', -20.0, 0),
 ('n’t', -20.0, 0),
 ('decimating', -20.0, 0),
 ('Americas', -20.0, 1),
 ('demography', -20.0, 3),
 ('influx', -20.0, 4),
 ('Europeans', -20.0, 5),
 ('continent', -20.0, 11),
 ('diseases', -20.0, 16),
 ('diseases', -20.0, 16),
 ('changing', -20.0, 40),
 ('significant', -20.0, 84),
 ('original', -20.0, 102),
 ('played', -20.0, 103),
 ('played', -20.0, 103),
 ('role', -20.0, 104),
 ('Europe', -20.0, 118),
 ('carried', -20.0, 124),
 ('carried', -20.0, 124),
 ('population', -20.0, 136),
 ('happened', -20.0, 146),
 ('course', -20.0, 464),
 ('part', -20.0, 465),
 ('last', -20.0, 636),
 ('If', -20.0, 732),
 ('people', -20.0, 811),
 ('new', -20.0, 1060),
 ('time', -20.0, 1556),
 ('into', -20.0, 1782),
 ('would', -20.0, 2677),
 ('this', -20.0, 3966),
 ('from', -20.0, 4207),
 ('by', -20.0, 5103),
 ('by', -20.0, 5103),
 ('be', -20.0, 6344),
 ('it', -20.0, 6723),
 ('The', -20.0, 7258),
 ('that', -20.0, 10237),
 ('in', -20.0, 19536),
 ('in', -20.0, 19536),
 