# Example 0: stemmers

In [None]:
import nltk

# Porter stemmer: build it once and reuse it in the cells below.
porter = nltk.PorterStemmer()
# Not so good: we would have expected "manufact" as the stem.
porter.stem('Manufacturing')

In [None]:
porter.stem('haved') #not so good!

In [None]:
#A problem:
porter.stem('relies')

In [None]:
#Exceptions in grammar:
porter.stem('mice')
#bad performance! But this problem is much more related to lemmatization

In [None]:
#Another example:
porter.stem('geese')

In [None]:
# NOTE: the name `porter` is rebound to a Lancaster stemmer here, so every
# `porter.stem(...)` call from this cell onwards uses Lancaster, not Porter.
porter = nltk.LancasterStemmer() 
porter.stem('manufacturing') #good!

In [None]:
porter.stem('haved') #good!

In [None]:
#Exceptions in grammar:
porter.stem('mice')
#bad performance! But this problem is much more related to lemmatization

In [None]:
#Another example:
porter.stem('geese')

# Example 1: lemmatization

In [None]:
# The lemmatizer looks words up in the WordNet corpus, so make sure the
# corpus is installed (the call is a no-op when it is already up to date).
# NOTE(review): some NLTK releases also ask for 'omw-1.4' -- confirm against
# the installed version.
nltk.download('wordnet')
lemmatizer = nltk.WordNetLemmatizer()

In [None]:
#lemmatizing an adjective:
lemmatizer.lemmatize('stricter')# bad performance!

In [None]:
# Same word, but now we pass the part of speech (adjective) explicitly:
lemmatizer.lemmatize('stricter', pos = nltk.corpus.wordnet.ADJ)

In [None]:
#lemmatizing a noun:
lemmatizer.lemmatize('mice')# good performance!

In [None]:
# lemmatize as adverb
lemmatizer.lemmatize('better', pos = nltk.corpus.wordnet.ADV) #good performance!

# Example 2: POS classification

In [None]:
import nltk

# Resources used below: the Punkt models for word_tokenize and the
# perceptron tagger for pos_tag. Downloads are skipped when up to date.
# NOTE(review): recent NLTK releases may request 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# From 'Gone with the Wind'
txt = "Frankly, my dear, I don't give a damn!"
nltk.pos_tag(nltk.word_tokenize(txt))

In [None]:
# Other languages: Russian
nltk.download('averaged_perceptron_tagger_ru')
nltk.pos_tag(nltk.word_tokenize("Илья оторопел и дважды перечитал бумажку."), lang='rus')  
# English: "Ilya was astonished and read the note twice."

# Example 3: creating a grammar and then chunking

In [None]:
# Chunk grammars for nltk.RegexpParser.
# NP: an optional determiner, any number of adjectives, then a singular noun.
grammar1 = 'NP: {<DT>?<JJ>*<NN>} '
# V: any verb tag (VB plus at most one more word character, e.g. VBD, VBZ).
# Raw string so that "\w" reaches the parser unchanged instead of being an
# invalid string escape.
grammar2 = r'V: {<VB\w?>} '

In [None]:
import nltk
from nltk import RegexpParser

# Tokenize and POS-tag the sentence, then chunk it with the NP grammar.
text = "This is a simple example of chuncking a sentence"
tagged = nltk.pos_tag(nltk.word_tokenize(text))
tree = RegexpParser(grammar1).parse(tagged)
# Print the full tree first, then every chunk found inside it.
print(*tree.subtrees(), sep="\n")

In [None]:
# Same tagged sentence, chunked with the verb grammar this time.
verb_parser = nltk.RegexpParser(grammar2)
tree2 = verb_parser.parse(tagged)
print(*tree2.subtrees(), sep="\n")

In [None]:
from nltk import  RegexpParser
# From "The Guardian", 11 Jan 2021:
text = "With a government this bad in charge of the UK during Covid, how do we respond?" 
# Tokenize and POS-tag the headline; `sentence` is reused by the chunker cell.
sentence = nltk.pos_tag(nltk.word_tokenize(text))
sentence

In [None]:
import nltk
# A hand-tagged alternative input:
#sentence = [("the", "DT"),("book", "NN"),("has","VBZ"),("many","JJ"),("chapters","NNS")]
# Two rules: first chunk NPs that start with a determiner + noun and end with
# a noun, then chink (remove) any verbs from those chunks.
chunker = nltk.RegexpParser(r'''
NP:{<DT><NN.*><.*>*<NN.*>}
}<VB.*>{
''')
# Parse once and keep the result; the tree is then shown in a separate window.
Output = chunker.parse(sentence)
Output.draw()
# Remember to close the draw window to end the execution of this cell.

# Example 4: named entities

In [None]:
import nltk

# Resources downloaded for the named-entity chunker used below.
for resource in ('maxent_ne_chunker', 'words'):
    nltk.download(resource)

text = "European authorities fined Google a record 5.1 billion dollars on Wednesday for abusing its power..."
# Tokenize -> POS-tag -> named-entity chunk.
nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))

In [None]:
#import spacy
#nlp = spacy.load("en_core_web_sm") 
#doc = nlp(text)

In [None]:
#for ent in doc.ents:
#    print(ent.text, ent.label_)

# Example 5: Regex and text data in Pandas

In [None]:
import pandas as pd
opinion = pd.read_csv('BikeMiSurvey_short2.csv', sep = ";")

In [None]:
opinion

In [None]:
texts = pd.DataFrame(opinion['English'])
texts

In [None]:
# find the number of characters for each string in texts['English']
texts['English'].str.len()

In [None]:
# find the number of whitespace-separated tokens for each string in texts['English']
texts['English'].str.split().str.len()

In [None]:
# find which entries contain the word 'bike'
texts['English'].str.contains('bike')

In [None]:
# find how many times a digit occurs in each string (found only number 2 in first row and the time numbers in sixth)
texts['English'].str.count(r'\d')

In [None]:
# find all occurrences of single digits in each string
# (expected: only the 2 in the first row and the time digits in a later row;
#  the notes disagree on whether that is the fifth or the sixth -- verify against the data)
texts['English'].str.findall(r'\d')

In [None]:
# group and find the hours and minutes
texts['English'].str.findall(r'(\d?\d):(\d\d)')

In [None]:
# replace 'Yesterday' and 'Monday' (any word ending in "day") with '???'
# regex=True must be explicit: since pandas 2.0 the default is a literal
# match, which would leave the text unchanged.
texts['English'].str.replace(r'\w+day\b', '???', regex=True)

In [None]:
# Literal substitution of 'Monday'; the result is kept as a one-column
# DataFrame (all rows, despite the variable name).
monday_replaced = texts['English'].str.replace('Monday', 'the first day of the week', regex=False)
sixth_row = monday_replaced.to_frame()

In [None]:
sixth_row['English'].iloc[5]

In [None]:
# Replace every word ending in "day" with its first three letters.
# The replacement is a callable: it receives the regex match object for each
# match and returns the text to substitute.
# Be careful: the pattern also hits "Yesterday", not only weekday names.
# regex=True is required -- pandas rejects a callable replacement when
# regex=False, which is the default since pandas 2.0.
texts['English'].str.replace(r'(\w+day\b)', lambda m: m.group(1)[:3], regex=True)
# Narrower alternative that only matches words ending in "nday":
#texts['English'].str.replace(r'(\w+nday\b)', lambda m: m.group(1)[:3], regex=True)

In [None]:
import pandas as pd
trips = pd.read_csv('BIKEMI_TRIPS.csv', sep = ";")
trips.head()

In [None]:
texts = pd.DataFrame(trips['CHECK_IN_TIME'])
# group and find the hours and minutes
texts['CHECK_IN_TIME'].str.findall(r'(\d?\d):(\d\d)')

In [None]:
# One column per capture group (hour, minute), taken from the first HH:MM
# match in each entry.
only_hour = texts['CHECK_IN_TIME'].str.extract(r'(\d?\d):(\d\d)', expand=True)
only_hour

In [None]:
# extract the entire time, the hours, the minutes, and the period
#df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')
texts['CHECK_IN_TIME'].str.extractall(r'((\d?\d):(\d\d))')

In [None]:
trips['CHECK_IN_TIME'] = pd.to_datetime(trips['CHECK_IN_TIME'])


In [None]:
trips.dtypes

In [None]:
# Split the timestamp into separate date and time-of-day columns.
# The vectorized .dt accessors replace the per-element Python loops and keep
# missing timestamps as missing values instead of failing on them.
trips['CHECK_IN_DATE_ONLY'] = trips['CHECK_IN_TIME'].dt.date
trips['CHECK_IN_TIME_ONLY'] = trips['CHECK_IN_TIME'].dt.time
trips

# Example 6: bag of words

In [None]:
from sklearn.feature_extraction.text import CountVectorizer 


In [None]:
# Four short documents used by the bag-of-words and TF-IDF examples.
corpus = [
    'Donald Trump is expected to issue more than 100 presidential pardons.',
    'Trump is expected to end his time in office.',
    'US defense officials say they are worried about an insider attack.',
    'He would like to take the extraordinary step of issuing a pardon for himself',
]
# Learn the vocabulary and build the document-term count matrix.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2.
print(vectorizer.get_feature_names_out())

In [None]:
print(X.toarray())

In [None]:
# This time we use 2-grams (pairs of consecutive words) as features:
vectorizer2 = CountVectorizer(analyzer='word', ngram_range=(2, 2))
X2 = vectorizer2.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2.
print(vectorizer2.get_feature_names_out())


In [None]:
print(X2.toarray())

# Example 7: TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE: `vectorizer` and `X` are rebound here; from now on they hold the
# TF-IDF model and matrix rather than the raw counts.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2.
print(vectorizer.get_feature_names_out())

In [None]:
print(X.shape)

In [None]:
print(X.toarray())  # dense view of the TF-IDF matrix

# Example 8: Practice with SpaCy

# SpaCy

spaCy is an open-source software library for advanced natural language processing: https://spacy.io/

The following code is based on: https://medium.com/@ageitgey/natural-language-processing-is-fun-9a0bff37854e

In [None]:
import spacy
import textacy.extract
from urllib import request
from bs4 import BeautifulSoup

# Load the small English NLP model ('en_core_web_sm')
nlp = spacy.load('en_core_web_sm')

## Extracting Facts from text

In [None]:
def print_facts(keyword, url):
    """Print the statements about ``keyword`` extracted from a web page.

    The page at ``url`` is downloaded, its HTML is stripped, the text is
    parsed with the module-level ``nlp`` pipeline, and one bullet is printed
    for every statement returned by
    ``textacy.extract.semistructured_statements``.

    Args:
        keyword: Entity to look for; it is also used in the printed header.
        url: Address of the HTML page to analyse.

    Returns:
        None. The results are written to standard output.
    """
    # Fetch the page and release the connection once the body has been read.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')

    # Strip the HTML and get the raw text.
    raw = BeautifulSoup(html, 'html.parser').get_text()

    # Minimal pre-processing: flatten line breaks into spaces.
    text = raw.replace('\n', ' ')

    # Parse the document with spaCy.
    doc = nlp(text)

    # Extract semi-structured statements.
    statements = textacy.extract.semistructured_statements(doc, keyword)

    # The header names the requested keyword instead of assuming "Trump" for
    # everything that is not "Biden".
    print(f"Here are the things I know about {keyword}:\n")
    for _subject, _verb, fact in statements:
        print(f" - {fact}")

In [None]:
# print facts about Biden fetching the wikipedia page
print_facts("Biden", "https://en.wikipedia.org/wiki/Joe_Biden")

In [None]:
# print facts about Trump fetching the wikipedia page
print_facts("Trump", "https://en.wikipedia.org/wiki/Donald_Trump")

## What else can we do?

Imagine that you were building a website that lets the user view information for every city in the world using the information we extracted in the last example. If you had a search feature on the website, it might be nice to __autocomplete__ common search queries like Google does. But to do this, we need a list of possible completions to suggest to the user. We can use NLP to quickly generate this data. Here’s one way to extract frequently-mentioned noun chunks from a document.

In [None]:
def autocomplete(keyword, url, min_freq):
    """Return the frequent multi-word noun chunks found on a web page.

    The page at ``url`` is downloaded, its HTML is stripped, and the text is
    parsed with the module-level ``nlp`` pipeline. Noun chunks are then
    extracted with ``textacy.extract.noun_chunks`` and lower-cased.

    Args:
        keyword: Not used by the computation; kept so existing calls work.
        url: Address of the HTML page to analyse.
        min_freq: Passed to ``textacy.extract.noun_chunks`` as ``min_freq``.

    Returns:
        A sorted list of distinct lower-cased noun chunks that contain more
        than one space-separated word.
    """
    # Fetch the page and release the connection once the body has been read.
    with request.urlopen(url) as response:
        html = response.read().decode('utf8')

    # Strip the HTML and get the raw text.
    raw = BeautifulSoup(html, 'html.parser').get_text()

    # Minimal pre-processing: flatten line breaks into spaces.
    text = raw.replace('\n', ' ')

    # Parse the document with spaCy.
    doc = nlp(text)

    # Extract the noun chunks, filtered by frequency.
    noun_chunks = textacy.extract.noun_chunks(doc, min_freq=min_freq)

    # Lower-case and de-duplicate the chunks.
    unique_chunks = {str(chunk).lower() for chunk in noun_chunks}

    # Keep chunks of at least two words; sorting makes the order stable
    # between runs (set iteration order is not).
    return sorted(chunk for chunk in unique_chunks if len(chunk.split(" ")) > 1)

In [None]:
# autocomplete Biden
autocomplete("Biden", "https://en.wikipedia.org/wiki/Joe_Biden", 7)

In [None]:
# autocomplete Trump
autocomplete("Trump", "https://en.wikipedia.org/wiki/Donald_Trump", 10)

## Example 9: PCA in text mining

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA

def vectorizing(data):
    """Build a document-term count table from an iterable of texts.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        A DataFrame with one row per document and one column per vocabulary
        term learned by ``CountVectorizer``; cells hold term counts.
    """
    vec = CountVectorizer()
    counts = vec.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; fall back to it
    # only on releases that predate get_feature_names_out().
    if hasattr(vec, 'get_feature_names_out'):
        terms = vec.get_feature_names_out()
    else:
        terms = vec.get_feature_names()
    return pd.DataFrame(counts.toarray(), columns=terms)

def find_principal_components(n, data):
    """Fit a PCA on ``data`` and return the component loadings.

    Args:
        n: Number of principal components, passed to ``PCA(n_components=n)``.
        data: DataFrame to decompose; its column labels are reused as the
            column labels of the result.

    Returns:
        A DataFrame built from ``pca.components_``: one row per component
        and one column per column of ``data``.
    """
    pca = PCA(n_components=n)
    # Only the fitted components are needed, so the projected data that
    # fit_transform would also compute is not requested.
    pca.fit(data)
    return pd.DataFrame(pca.components_, columns=data.columns)

text = ['Texas real estate agent Ryan Williams',
        'part mob Trump storm administration Capitol congress continue insist innocence',
        'even face charge breach Capitol guilt heart Ryan', 
        'tell today Pelosi show glad Ryan Williams there because witness history Trump administration', 
        'never get  chance do again Texas Capitol there mob', 
        'storm Pelosi laptop invade office congress Pelosi Trump Biden prison guilt prison breach steal laptop',
        'Trump Williams Biden Trump president elect president Trump']

df = vectorizing(text)

print(df) # 7 row x 44 columns

In [None]:
principalDF = find_principal_components(2, df)

print(principalDF) # 2 rows x 44 columns
