# Example 0: stemmers

In [191]:
import nltk as nltk
#PorterStemmer:
porter = nltk.PorterStemmer() 
porter.stem('Manufacturing')#not so good! We should have had "manufact".

'manufactur'

In [192]:
porter.stem('haved') #not so good!

'have'

In [193]:
#A problem:
porter.stem('relies')

'reli'

In [194]:
#Exceptions in grammar:
porter.stem('mice')
#bad performance! But this problem is much more related to lemmatization

'mice'

In [195]:
#Another example:
porter.stem('geese')

'gees'

In [196]:
porter = nltk.LancasterStemmer() 
porter.stem('manufacturing') #good!

'manufact'

In [197]:
porter.stem('haved') #good!

'hav'

In [198]:
#Exceptions in grammar:
porter.stem('mice')
#bad performance! But this problem is much more related to lemmatization

'mic'

In [199]:
#Another example:
porter.stem('geese')

'gees'

# Example 1: lemmatization

In [200]:
nltk.download('wordnet')
lemmatizer = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [201]:
#lemmatizing an adjective:
lemmatizer.lemmatize('stricter')# bad performance!

'stricter'

In [202]:
#But let's use Wordnet:
lemmatizer.lemmatize('stricter', pos = nltk.corpus.wordnet.ADJ)

'strict'

In [203]:
#lemmatizing a noun:
lemmatizer.lemmatize('mice')# good performance!

'mouse'

In [204]:
# lemmatize as adverb
lemmatizer.lemmatize('better', pos = nltk.corpus.wordnet.ADV) #good performance!

'well'

# Example 2: POS classification

In [205]:
import nltk 
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#From 'Gone with the wind'
txt = "Frankly, my dear, I don't give a damn!" 
nltk.pos_tag(nltk.word_tokenize(txt))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


[('Frankly', 'RB'),
 (',', ','),
 ('my', 'PRP$'),
 ('dear', 'JJ'),
 (',', ','),
 ('I', 'PRP'),
 ('do', 'VBP'),
 ("n't", 'RB'),
 ('give', 'VB'),
 ('a', 'DT'),
 ('damn', 'NN'),
 ('!', '.')]

In [206]:
# Example 17: creating a grammar and then chunking#Other languages: Russian
nltk.download('averaged_perceptron_tagger_ru')
nltk.pos_tag(nltk.word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')  
#"Ilia' was astonished and twice read the notice"

[nltk_data] Downloading package averaged_perceptron_tagger_ru to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_ru is already up-to-
[nltk_data]       date!


[('Илья', 'S'),
 ('оторопел', 'V'),
 ('и', 'CONJ'),
 ('дважды', 'ADV'),
 ('перечитал', 'V'),
 ('бумажку', 'S'),
 ('.', 'NONLEX')]

# Example 3: creating a grammar and then chunking

In [207]:
grammar1 = ('''NP: {<DT>?<JJ>*<NN>} ''')
grammar2 = ('''V: {<VB\w?>} ''')

In [208]:
import nltk 
from nltk import  RegexpParser
text = "This is a simple example of chuncking a sentence"
tagged = nltk.pos_tag(nltk.word_tokenize(text))
tree = nltk.RegexpParser(grammar1).parse(tagged)
for subtree in tree.subtrees():
    print(subtree)

(S
  This/DT
  is/VBZ
  (NP a/DT simple/JJ example/NN)
  of/IN
  chuncking/VBG
  (NP a/DT sentence/NN))
(NP a/DT simple/JJ example/NN)
(NP a/DT sentence/NN)


In [209]:
tree2 = nltk.RegexpParser(grammar2).parse(tagged)
for subtree in tree2.subtrees():
    print(subtree)

(S
  This/DT
  (V is/VBZ)
  a/DT
  simple/JJ
  example/NN
  of/IN
  (V chuncking/VBG)
  a/DT
  sentence/NN)
(V is/VBZ)
(V chuncking/VBG)


In [210]:
from nltk import  RegexpParser
# From "The Guardian", 11 gen 2021:
text = "With a government this bad in charge of the UK during Covid, how do we respond?" 
sentence = nltk.pos_tag(nltk.word_tokenize(text))
sentence

[('With', 'IN'),
 ('a', 'DT'),
 ('government', 'NN'),
 ('this', 'DT'),
 ('bad', 'JJ'),
 ('in', 'IN'),
 ('charge', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('UK', 'NNP'),
 ('during', 'IN'),
 ('Covid', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('do', 'VBP'),
 ('we', 'PRP'),
 ('respond', 'VB'),
 ('?', '.')]

In [211]:
import nltk
#sentence = [("the", "DT"),("book", "NN"),("has","VBZ"),("many","JJ"),("chapters","NNS")]
#chunker=nltk.RegexpParser(r'''
#NP:{<DT><NN.*><.*>*<NN.*>}
#}<VB.*>{
#''')
#chunker.parse(sentence)
#Output=chunker.parse(sentence)
#Output
#Output.draw()
#Recall to close the draw window to end execution of the cell

# Example 4: named entities

In [212]:
#import nltk
#text = "European authorities fined Google a record 5.1 billion dollars on Wednesday for abusing its power..."
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


TclError: ignored

Tree('S', [Tree('GPE', [('European', 'JJ')]), ('authorities', 'NNS'), ('fined', 'VBD'), Tree('PERSON', [('Google', 'NNP')]), ('a', 'DT'), ('record', 'NN'), ('5.1', 'CD'), ('billion', 'CD'), ('dollars', 'NNS'), ('on', 'IN'), ('Wednesday', 'NNP'), ('for', 'IN'), ('abusing', 'VBG'), ('its', 'PRP$'), ('power', 'NN'), ('...', ':')])

In [213]:
#import spacy
#nlp = spacy.load("en_core_web_sm") 
#doc = nlp(text)

In [214]:
# Example 4: named entitiesfor ent in doc.ents: 
#print(doc.text, doc.label_)

# Example 5: Regex and text data in Pandas

In [215]:
import pandas as pd
opinion = pd.read_csv('BikeMiSurvey_short2.csv', sep = ";")

In [216]:
opinion

Unnamed: 0,Italian,English
0,Quando ho avuto problemi con la restituzione d...,"When I had problems with returning the bike, t..."
1,più bici elettriche. spesso anche se presenti ...,more electric bikes. often even if present the...
2,fare più attenzione alle stazioni che molto sp...,pay more attention to stations that very often...
3,fondamentale inserire bici con seggiolino bimbi,essential to introduce bikes with child seat
4,Ieri ho preso la bicicletta alle 7:00 e l'ho l...,Yesterday I took my bike at 7:00 and left it a...
5,Lunedì sono andato dalla Stazione Centrale al ...,On Monday I went from the Central Station to t...


In [217]:
texts = pd.DataFrame(opinion['English'])
texts

Unnamed: 0,English
0,"When I had problems with returning the bike, t..."
1,more electric bikes. often even if present the...
2,pay more attention to stations that very often...
3,essential to introduce bikes with child seat
4,Yesterday I took my bike at 7:00 and left it a...
5,On Monday I went from the Central Station to t...


In [218]:
# find the number of characters for each string in texts['English']
texts['English'].str.len()

0    538
1     90
2    118
3     44
4    102
5    101
Name: English, dtype: int64

In [219]:
# find the number of tokens for each string in df['text']
texts['English'].str.split().str.len()

0    101
1     16
2     21
3      7
4     21
5     19
Name: English, dtype: int64

In [220]:
# find which entries contain the word 'bike'
texts['English'].str.contains('bike')

0     True
1     True
2    False
3     True
4     True
5     True
Name: English, dtype: bool

In [221]:
# find how many times a digit occurs in each string (found only number 2 in first row and the time numbers in sixth)
texts['English'].str.count(r'\d')

0    1
1    0
2    0
3    0
4    6
5    0
Name: English, dtype: int64

In [222]:
# find all occurences of the digits (only 2 in first row and the time numbers in fifth)
texts['English'].str.findall(r'\d')

0                   [2]
1                    []
2                    []
3                    []
4    [7, 0, 0, 7, 2, 9]
5                    []
Name: English, dtype: object

In [223]:
# group and find the hours and minutes
texts['English'].str.findall(r'(\d?\d):(\d\d)')

0                    []
1                    []
2                    []
3                    []
4    [(7, 00), (7, 29)]
5                    []
Name: English, dtype: object

In [224]:
# replace 'Yesterday' and 'Monday' with '???'
texts['English'].str.replace(r'\w+day\b', '???')

0    When I had problems with returning the bike, t...
1    more electric bikes. often even if present the...
2    pay more attention to stations that very often...
3         essential to introduce bikes with child seat
4    ??? I took my bike at 7:00 and left it at 7:29...
5    On ??? I went from the Central Station to the ...
Name: English, dtype: object

In [225]:
# replace 'Monday' with 'the first day of the week'
sixth_row = pd.DataFrame(texts['English'].str.replace(r'Monday', 'the first day of the week'))

In [226]:
sixth_row['English'].iloc[5]

'On the first day of the week I went from the Central Station to the Duomo and halfway the bike broke. They always break.'

In [227]:
# replace weekdays with 3 letter abbrevations (lambda represents an anonymous: If it is used with 
# in a df
#  each element of a series is fed into the lambda function)
# Be careful with cases like here where we have Yester-day and Mon-day
texts['English'].str.replace(r'(\w+day\b)', lambda x: x.groups()[0][:3])
#texts['English'].str.replace(r'(\w+nday\b)', lambda x: x.groups()[0][:3])

0    When I had problems with returning the bike, t...
1    more electric bikes. often even if present the...
2    pay more attention to stations that very often...
3         essential to introduce bikes with child seat
4    Yes I took my bike at 7:00 and left it at 7:29...
5    On Mon I went from the Central Station to the ...
Name: English, dtype: object

In [228]:
import pandas as pd
trips = pd.read_csv('BIKEMI_TRIPS.csv', sep = ";")
trips.head()

Unnamed: 0,BIKE_ID,BIKE_TIPE,USER_ID,CHECK_IN_TIME,CHECK_IN_STATION_ID,CHECK_IN_STATION_NAME,CHECK_IN_SLOT_ID,CHECK_OUT_TIME,CHECK_OUT_STATION_ID,CHECK_OUT_STATION_NAME,CHECK_OUT_SLOT_ID,DURATION,TOTAL_DISTANCE,AVOIDED_CO2,CONSUMED_CALORIES
0,20283,Bike,34956,01/01/19 06:52,187,Sarpi Albertini,7,01/01/19 07:05,25,Centrale 1,10,0:13:25,3408.92,0.6913,65.66
1,30131,Child Seat eBike,355600,01/01/19 07:01,302,De Angeli - Ripamonti,9,01/01/19 07:10,23,Regina Margherita,14,0:09:18,1813.18,0.3677,34.93
2,21911,Bike,236069,01/01/19 07:05,222,Durante - D'aviano,22,01/01/19 07:30,110,S. F. Romana,30,0:24:35,2025.83,0.4108,39.02
3,10863,eBike,348357,01/01/19 07:06,257,Valtellina - Aprica,13,01/01/19 08:12,262,Livigno - Monte San Genesio,15,1:06:22,887.54,0.18,17.1
4,2781,Bike,147224,01/01/19 07:07,154,Ascanio Sforza - Pavia,6,01/01/19 08:10,64,Diaz,36,1:02:46,2104.1,0.4267,40.53


In [229]:
texts = pd.DataFrame(trips['CHECK_IN_TIME'])
# group and find the hours and minutes
texts['CHECK_IN_TIME'].str.findall(r'(\d?\d):(\d\d)')

0     [(06, 52)]
1     [(07, 01)]
2     [(07, 05)]
3     [(07, 06)]
4     [(07, 07)]
5     [(07, 20)]
6     [(07, 22)]
7     [(07, 26)]
8     [(07, 32)]
9     [(07, 42)]
10    [(07, 43)]
11    [(07, 52)]
12    [(07, 59)]
Name: CHECK_IN_TIME, dtype: object

In [230]:
# create new columns from first match of extracted groups
only_hour = pd.DataFrame(texts['CHECK_IN_TIME'].str.extract(r'(\d?\d):(\d\d)'))
only_hour

Unnamed: 0,0,1
0,6,52
1,7,1
2,7,5
3,7,6
4,7,7
5,7,20
6,7,22
7,7,26
8,7,32
9,7,42


In [231]:
# extract the entire time, the hours, the minutes, and the period
#df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')
texts['CHECK_IN_TIME'].str.extractall(r'((\d?\d):(\d\d))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,06:52,6,52
1,0,07:01,7,1
2,0,07:05,7,5
3,0,07:06,7,6
4,0,07:07,7,7
5,0,07:20,7,20
6,0,07:22,7,22
7,0,07:26,7,26
8,0,07:32,7,32
9,0,07:42,7,42


In [232]:
trips['CHECK_IN_TIME'] = pd.to_datetime(trips['CHECK_IN_TIME'])


In [233]:
trips.dtypes

BIKE_ID                            int64
BIKE_TIPE                         object
USER_ID                            int64
CHECK_IN_TIME             datetime64[ns]
CHECK_IN_STATION_ID                int64
CHECK_IN_STATION_NAME             object
CHECK_IN_SLOT_ID                   int64
CHECK_OUT_TIME                    object
CHECK_OUT_STATION_ID               int64
CHECK_OUT_STATION_NAME            object
CHECK_OUT_SLOT_ID                  int64
DURATION                          object
TOTAL_DISTANCE                   float64
AVOIDED_CO2                      float64
CONSUMED_CALORIES                float64
dtype: object

In [234]:
trips['CHECK_IN_DATE_ONLY'] = [d.date() for d in trips['CHECK_IN_TIME']]
trips['CHECK_IN_TIME_ONLY'] = [d.time() for d in trips['CHECK_IN_TIME']]
trips

Unnamed: 0,BIKE_ID,BIKE_TIPE,USER_ID,CHECK_IN_TIME,CHECK_IN_STATION_ID,CHECK_IN_STATION_NAME,CHECK_IN_SLOT_ID,CHECK_OUT_TIME,CHECK_OUT_STATION_ID,CHECK_OUT_STATION_NAME,CHECK_OUT_SLOT_ID,DURATION,TOTAL_DISTANCE,AVOIDED_CO2,CONSUMED_CALORIES,CHECK_IN_DATE_ONLY,CHECK_IN_TIME_ONLY
0,20283,Bike,34956,2019-01-01 06:52:00,187,Sarpi Albertini,7,01/01/19 07:05,25,Centrale 1,10,0:13:25,3408.92,0.6913,65.66,2019-01-01,06:52:00
1,30131,Child Seat eBike,355600,2019-01-01 07:01:00,302,De Angeli - Ripamonti,9,01/01/19 07:10,23,Regina Margherita,14,0:09:18,1813.18,0.3677,34.93,2019-01-01,07:01:00
2,21911,Bike,236069,2019-01-01 07:05:00,222,Durante - D'aviano,22,01/01/19 07:30,110,S. F. Romana,30,0:24:35,2025.83,0.4108,39.02,2019-01-01,07:05:00
3,10863,eBike,348357,2019-01-01 07:06:00,257,Valtellina - Aprica,13,01/01/19 08:12,262,Livigno - Monte San Genesio,15,1:06:22,887.54,0.18,17.1,2019-01-01,07:06:00
4,2781,Bike,147224,2019-01-01 07:07:00,154,Ascanio Sforza - Pavia,6,01/01/19 08:10,64,Diaz,36,1:02:46,2104.1,0.4267,40.53,2019-01-01,07:07:00
5,20347,Bike,321606,2019-01-01 07:20:00,383,Angilberto - Comacchio,29,01/01/19 07:26,303,Benaco - Brembo,30,0:06:43,1018.38,0.2065,19.62,2019-01-01,07:20:00
6,2253,Bike,268945,2019-01-01 07:22:00,115,Caiazzo,25,01/01/19 07:29,213,Novelli - Carnaghi,22,0:07:04,2015.42,0.4087,38.82,2019-01-01,07:22:00
7,20972,Bike,258180,2019-01-01 07:26:00,3,Cadorna 1,5,01/01/19 07:36,69,S. Nazaro in Brolo,13,0:09:41,2161.06,0.4383,41.63,2019-01-01,07:26:00
8,11345,eBike,357772,2019-01-01 07:32:00,124,Porta Venezia 2,25,01/01/19 07:42,14,San Barnaba H Mangiagalli,1,0:09:48,1859.56,0.3771,35.82,2019-01-01,07:32:00
9,8179,Bike,35909,2019-01-01 07:42:00,180,Canova - Sangiorgio,27,01/01/19 07:54,69,S. Nazaro in Brolo,20,0:12:29,3290.21,0.6673,63.38,2019-01-01,07:42:00


# Example 6: bag of words

In [235]:
from sklearn.feature_extraction.text import CountVectorizer 


In [236]:
corpus = corpus = [
    'Donald Trump is expected to issue more than 100 presidential pardons.',
    'Trump is expected to end his time in office.',
    'US defense officials say they are worried about an insider attack.',
    'He would like to take the extraordinary step of issuing a pardon for himself']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['100', 'about', 'an', 'are', 'attack', 'defense', 'donald', 'end', 'expected', 'extraordinary', 'for', 'he', 'himself', 'his', 'in', 'insider', 'is', 'issue', 'issuing', 'like', 'more', 'of', 'office', 'officials', 'pardon', 'pardons', 'presidential', 'say', 'step', 'take', 'than', 'the', 'they', 'time', 'to', 'trump', 'us', 'worried', 'would']


In [237]:
print(X.toarray())

[[1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1
  0 0 0]
 [0 0 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1
  0 0 0]
 [0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0
  1 1 0]
 [0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0
  0 0 1]]


In [238]:
# This time we use 2-grams:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)
print(vectorizer2.get_feature_names())


['100 presidential', 'about an', 'an insider', 'are worried', 'defense officials', 'donald trump', 'end his', 'expected to', 'extraordinary step', 'for himself', 'he would', 'his time', 'in office', 'insider attack', 'is expected', 'issue more', 'issuing pardon', 'like to', 'more than', 'of issuing', 'officials say', 'pardon for', 'presidential pardons', 'say they', 'step of', 'take the', 'than 100', 'the extraordinary', 'they are', 'time in', 'to end', 'to issue', 'to take', 'trump is', 'us defense', 'worried about', 'would like']


In [239]:
print(X2.toarray())

[[1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 1 0 0
  0]
 [0 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0
  0]
 [0 1 1 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1
  0]
 [0 0 0 0 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0
  1]]


# Example 7: TF-IDF

In [240]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

['100', 'about', 'an', 'are', 'attack', 'defense', 'donald', 'end', 'expected', 'extraordinary', 'for', 'he', 'himself', 'his', 'in', 'insider', 'is', 'issue', 'issuing', 'like', 'more', 'of', 'office', 'officials', 'pardon', 'pardons', 'presidential', 'say', 'step', 'take', 'than', 'the', 'they', 'time', 'to', 'trump', 'us', 'worried', 'would']


In [241]:
print(X.shape)

(4, 39)


In [242]:
print(X.toarray())# Example 8: Practice with SpaCy.toarray())

[[0.32840433 0.         0.         0.         0.         0.
  0.32840433 0.         0.25891775 0.         0.         0.
  0.         0.         0.         0.         0.25891775 0.32840433
  0.         0.         0.32840433 0.         0.         0.
  0.         0.32840433 0.32840433 0.         0.         0.
  0.32840433 0.         0.         0.         0.20961623 0.25891775
  0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.37082366 0.29236164 0.         0.         0.
  0.         0.37082366 0.37082366 0.         0.29236164 0.
  0.         0.         0.         0.         0.37082366 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.37082366 0.23669194 0.29236164
  0.         0.         0.        ]
 [0.         0.30151134 0.30151134 0.30151134 0.30151134 0.30151134
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.30151134 0.        

# Example 8: Practice with SpaCy

# SpaCy

spaCy is an open-source software library for advanced natural language processing: https://spacy.io/

The following code is based on: https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e

In [243]:
import spacy
import textacy.extract
from urllib import request
from bs4 import BeautifulSoup

# Load the large English NLP model
nlp = spacy.load('en_core_web_sm')

## Extracting Facts from text

In [244]:
def print_facts(keyword, url):

    # fetch url
    response = request.urlopen(url)
    
    # read html in utf8
    html = response.read().decode('utf8')
    
    # strip html and get raw text
    raw = BeautifulSoup(html, 'html.parser').get_text()
    
    # you should do some pre-processing...
    text = raw.replace('\n',' ')
    
    # Parse the document with spaCy
    doc = nlp(text)

    # Extract semi-structured statements
    statements = textacy.extract.semistructured_statements(doc, keyword)

    # Print the results
    if keyword == 'Biden':
        print("Here are the things I know about Biden:\n")
        for statement in statements:
            subject, verb, fact = statement
            print(f" - {fact}")
    else:
        print("Here are the things I know about Trump:\n")
        for statement in statements:
            subject, verb, fact = statement
            print(f" - {fact}")
    return

In [245]:
# print facts about London fetching the wikipedia page
print_facts("Biden", "https://en.wikipedia.org/wiki/Joe_Biden")

Here are the things I know about Biden:

 - a longtime member of the Senate Foreign Relations Committee, and eventually became its chairman
 - a standout halfback and wide receiver on the high school football team;[11][14
 - a Washington lobbyist and investment adviser.[64
 - one of the Senate's strongest opponents of race-integration busing
 - a longtime member of the Senate Foreign Relations Committee
 - not an academic
 - the kind of fundamentally happy person who can be as generous toward others as he is to himself
 - his running mate.[165
 - always prepared to be the skunk at the family picnic to make sure we are as intellectually honest as possible
 - the only guy with real negotiating authority, and
 - still uncertain about running
 - a staunch supporter of the Affordable Care Act (ACA).[393][394
 - an inductee of the Delaware Volunteer Firemen's Association Hall of Fame.[441
 - eligible to receive classified intelligence briefings since his nomination in August.[365
 - the stut

In [185]:
# print facts about Trump fetching the wikipedia page
print_facts("Trump", "https://en.wikipedia.org/wiki/Donald_Trump")

Here are the things I know about Trump:

 - an American politician who served as the 45th president of the United States from 2017 to 2021
 - the healthiest individual ever elected to the presidency" in a letter released by the Trump campaign.[39
 - healthy overall
 - a millionaire by age 8
 - a guest about 24 times on the nationally syndicated Howard Stern Show.[172
 - the oldest person to take office as president at the time of his inauguration
 - a skeptic of multilateral trade agreements and favors bilateral trade agreements
 - slow to appoint second-tier officials in the executive branch, saying many of the positions are unnecessary
 - the "likely the largest driver" of COVID-19 misinformation in the first five months of 2020.[571
 - the subject of increasing Justice Department and congressional scrutiny, with investigations covering his election campaign, transition and inauguration
 - the first elected president not to be named most admired in his first year in office.[793
 - th

## What else can we do?

Imagine that you were building a website that let’s the user view information for every city in the world using the information we extracted in the last example. If you had a search feature on the website, it might be nice to __autocomplete__ common search queries like Google does. But to do this, we need a list of possible completions to suggest to the user. We can use NLP to quickly generate this data. Here’s one way to extract frequently-mentioned noun chunks from a document.

In [186]:
def autocomplete(keyword, url, min_freq):
    
    # fetch url
    response = request.urlopen(url)
    
    # read html in utf8
    html = response.read().decode('utf8')
    
    # strip html and get raw text
    raw = BeautifulSoup(html, 'html.parser').get_text()
    
    # you should do some pre-processing...
    text = raw.replace('\n',' ')
    
    # Parse the document with spaCy
    doc = nlp(text)

    # Extract noun chunks that appear
    noun_chunks = textacy.extract.noun_chunks(doc, min_freq = min_freq)

    # Convert noun chunks to lowercase strings
    noun_chunks = map(str, noun_chunks)
    noun_chunks = map(str.lower, noun_chunks)

    # Collect any nouns that are at least 2 words long
    res = []
    for noun_chunk in set(noun_chunks):
        if len(noun_chunk.split(" ")) > 1:
            res.append(noun_chunk)
        
    return res

In [187]:
# autocomplete Biden
autocomplete("Biden", "https://en.wikipedia.org/wiki/Joe_Biden", 7)

['second term',
 'usa today',
 'vice president',
 'donald trump',
 'washington post',
 'new york times',
 'white house',
 'los angeles times',
 'foreign policy',
 'abc news',
 'u.s. senate',
 'joe biden',
 'running mate',
 'foreign relations',
 'associated press',
 'fox news',
 'nbc news',
 'news journal',
 'united states',
 'wall street journal']

In [188]:
# autocomplete Trump
autocomplete("Trump", "https://en.wikipedia.org/wiki/Donald_Trump", 10)

['usa today',
 'supreme court',
 'russian government',
 'main article',
 'donald trump',
 'washington post',
 'cbs news',
 'new york times',
 'white house',
 'michael d.',
 'retrieved february',
 'president trump',
 'russian interference',
 '^ kranish',
 'david a.',
 'ap news',
 'abc news',
 'potentially dated statements',
 'trump administration',
 'new york',
 'nbc news',
 'north korea',
 'mueller report',
 'bbc news',
 'trump campaign',
 'united states',
 'wall street journal']

## Example 9: PCA in text mining

In [189]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

def vectorizing(data):
    vec = CountVectorizer()
    X = vec.fit_transform(data)
    df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names())
    return df

def find_principal_components(n, data):
    pca = PCA(n_components = n)
    principalComponents = pca.fit_transform(data)
    return pd.DataFrame(pca.components_, columns=data.columns)

text = ['Texas real estate agent Ryan Williams',
        'part mob Trump storm administration Capitol congress continue insist innocence',
        'even face charge breach Capitol guilt heart Ryan', 
        'tell today Pelosi show glad Ryan Williams there because witness history Trump administration', 
        'never get  chance do again Texas Capitol there mob', 
        'storm Pelosi laptop invade office congress Pelosi Trump Biden prison guilt prison breach steal laptop',
        'Trump Williams Biden Trump president elect president Trump']

df = vectorizing(text)

print(df) # 7 row x 44 columns

   administration  again  agent  because  ...  today  trump  williams  witness
0               0      0      1        0  ...      0      0         1        0
1               1      0      0        0  ...      0      1         0        0
2               0      0      0        0  ...      0      0         0        0
3               1      0      0        1  ...      1      1         1        1
4               0      1      0        0  ...      0      0         0        0
5               0      0      0        0  ...      0      1         0        0
6               0      0      0        0  ...      0      3         1        0

[7 rows x 44 columns]


In [190]:
principalDF = find_principal_components(2, df)

print(principalDF) # 2 rows x 44 columns


   administration     again     agent  ...     trump  williams   witness
0       -0.047409 -0.083647 -0.061637  ...  0.221641 -0.070288 -0.029229
1        0.051772 -0.084410 -0.022809  ...  0.629815  0.255983  0.062849

[2 rows x 44 columns]
