In [1]:
import nltk
import os
import numpy as np

In [2]:
# Example: Using the first sentences of Barack Obama's inauguration speech.

obama_speech = """I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors.

I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition. Forty-four Americans have now taken the presidential oath. The words have been spoken during rising tides of prosperity and the still waters of peace. Yet, every so often the oath is taken amidst gathering clouds and raging storms. 

At these moments, America has carried on not simply because of the skill or vision of those in high office, but because We the People have remained faithful to the ideals of our forbearers, and true to our founding documents."""

In [3]:
# better representation of the text

for word in obama_speech:
    print(word, sep='', end='')

I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors.

I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition. Forty-four Americans have now taken the presidential oath. The words have been spoken during rising tides of prosperity and the still waters of peace. Yet, every so often the oath is taken amidst gathering clouds and raging storms. 

At these moments, America has carried on not simply because of the skill or vision of those in high office, but because We the People have remained faithful to the ideals of our forbearers, and true to our founding documents.

In [4]:
# What data type is the text?

typ = type(obama_speech)
print("The speech is of type:\t{}".format(typ))

The speech is of type:	<class 'str'>


In [5]:
# Tokenize the text and return every token

from nltk.tokenize import word_tokenize

tokens = word_tokenize(obama_speech)
print(tokens)
print()
# representation as numpy-array: tokens = np.array(tokens)

number_of_tokens = len(tokens)
print("Number of single tokens in Obama's speech:\t" + str(number_of_tokens))
print("Whole Number of characters in Obama's speech:\t" + str(len(obama_speech)))


['I', 'stand', 'here', 'today', 'humbled', 'by', 'the', 'task', 'before', 'us', ',', 'grateful', 'for', 'the', 'trust', 'you', 'have', 'bestowed', ',', 'mindful', 'of', 'the', 'sacrifices', 'borne', 'by', 'our', 'ancestors', '.', 'I', 'thank', 'President', 'Bush', 'for', 'his', 'service', 'to', 'our', 'nation', ',', 'as', 'well', 'as', 'the', 'generosity', 'and', 'cooperation', 'he', 'has', 'shown', 'throughout', 'this', 'transition', '.', 'Forty-four', 'Americans', 'have', 'now', 'taken', 'the', 'presidential', 'oath', '.', 'The', 'words', 'have', 'been', 'spoken', 'during', 'rising', 'tides', 'of', 'prosperity', 'and', 'the', 'still', 'waters', 'of', 'peace', '.', 'Yet', ',', 'every', 'so', 'often', 'the', 'oath', 'is', 'taken', 'amidst', 'gathering', 'clouds', 'and', 'raging', 'storms', '.', 'At', 'these', 'moments', ',', 'America', 'has', 'carried', 'on', 'not', 'simply', 'because', 'of', 'the', 'skill', 'or', 'vision', 'of', 'those', 'in', 'high', 'office', ',', 'but', 'because', 

In [6]:
# Dealing with frequencies

from nltk.probability import FreqDist

freq = FreqDist()                     # create object

for word in tokens:
    freq[word.lower()] += 1

print(freq)                          # prints details of the FreqDist object

freq                                 # prints FreqDist dictionary with tokens (keys) und their frequency (value)


print("Frequency of token 'america':\t" + str(freq['america']))


top5_tokens = freq.most_common(5)
print("The top 5 are:\t", str(top5_tokens))



<FreqDist with 94 samples and 139 outcomes>
Frequency of token 'america':	1
The top 5 are:	 [('the', 11), (',', 7), ('of', 6), ('.', 6), ('have', 4)]


In [7]:
# Dealing with blanklines

from nltk.tokenize import blankline_tokenize      

blank = blankline_tokenize(obama_speech)

print("Number of blanklines:\t" + str(len(blank)))

Number of blanklines:	3


In [8]:
blank[0]

'I stand here today humbled by the task before us, grateful for the trust you have bestowed, mindful of the sacrifices borne by our ancestors.'

In [9]:
blank[1]

'I thank President Bush for his service to our nation, as well as the generosity and cooperation he has shown throughout this transition. Forty-four Americans have now taken the presidential oath. The words have been spoken during rising tides of prosperity and the still waters of peace. Yet, every so often the oath is taken amidst gathering clouds and raging storms.'

In [10]:
# Dealing with bigrams, trigrams and ngrams

from nltk.util import bigrams, trigrams, ngrams

last_sentence = "God bless you and God bless the United States of America."

tokens = nltk.word_tokenize(last_sentence)


# bigram: Tokens of two consecutive written words
bigrams = list(nltk.bigrams(tokens))
print("Bigrams:")
print(bigrams)
print()


# trigram: Tokens of three consecutive written words
print("Trigrams:")
trigrams = list(nltk.trigrams(tokens))
print(trigrams)
print()


# ngram: Tokens of any number of consecutive written words
print("5grams:")
ngrams = list(nltk.ngrams(tokens, 5))
print(ngrams)


Bigrams:
[('God', 'bless'), ('bless', 'you'), ('you', 'and'), ('and', 'God'), ('God', 'bless'), ('bless', 'the'), ('the', 'United'), ('United', 'States'), ('States', 'of'), ('of', 'America'), ('America', '.')]

Trigrams:
[('God', 'bless', 'you'), ('bless', 'you', 'and'), ('you', 'and', 'God'), ('and', 'God', 'bless'), ('God', 'bless', 'the'), ('bless', 'the', 'United'), ('the', 'United', 'States'), ('United', 'States', 'of'), ('States', 'of', 'America'), ('of', 'America', '.')]

5grams:
[('God', 'bless', 'you', 'and', 'God'), ('bless', 'you', 'and', 'God', 'bless'), ('you', 'and', 'God', 'bless', 'the'), ('and', 'God', 'bless', 'the', 'United'), ('God', 'bless', 'the', 'United', 'States'), ('bless', 'the', 'United', 'States', 'of'), ('the', 'United', 'States', 'of', 'America'), ('United', 'States', 'of', 'America', '.')]


In [11]:
# Using TextBlob for generating N-Grams

#!pip install -U textblob

from textblob import TextBlob

nltk.download('punkt')

unigram = TextBlob(last_sentence).ngrams(1)                 # unigram with n = 1
unigram


[nltk_data] Error loading punkt: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


[WordList(['God']),
 WordList(['bless']),
 WordList(['you']),
 WordList(['and']),
 WordList(['God']),
 WordList(['bless']),
 WordList(['the']),
 WordList(['United']),
 WordList(['States']),
 WordList(['of']),
 WordList(['America'])]

In [12]:
bigram = TextBlob(last_sentence).ngrams(2)                  # bigram with n = 2
bigram


[WordList(['God', 'bless']),
 WordList(['bless', 'you']),
 WordList(['you', 'and']),
 WordList(['and', 'God']),
 WordList(['God', 'bless']),
 WordList(['bless', 'the']),
 WordList(['the', 'United']),
 WordList(['United', 'States']),
 WordList(['States', 'of']),
 WordList(['of', 'America'])]

In [13]:
# Stemming: Normalize words into its base form

from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = ["give", "giving", "given", "gave"]
for w in words:
    print(w + ": " + ps.stem(w))


give: give
giving: give
given: given
gave: gave


In [14]:
from nltk.stem import LancasterStemmer

lst = LancasterStemmer()

for w in words:
    print(w + ": " + lst.stem(w))


give: giv
giving: giv
given: giv
gave: gav


In [15]:
from nltk.stem import SnowballStemmer

sbs = SnowballStemmer('english')

for w in words:
    print(w + ": " + sbs.stem(w))


give: give
giving: give
given: given
gave: gave


In [16]:
# POS: Parts of Speech

nltk.download('averaged_perceptron_tagger')

for t in tokens:                 # remember: tokens = nltk.word_tokenize(last_sentence)
    print(nltk.pos_tag([t]))


[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [Errno 11001] getaddrinfo failed>


[('God', 'NNP')]
[('bless', 'NN')]
[('you', 'PRP')]
[('and', 'CC')]
[('God', 'NNP')]
[('bless', 'NN')]
[('the', 'DT')]
[('United', 'NNP')]
[('States', 'NNS')]
[('of', 'IN')]
[('America', 'NNP')]
[('.', '.')]


In [17]:
# NER: Named Entity Recognition

nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk import ne_chunk

string = "The US President stays in the White House."

tokens = word_tokenize(string)
tags = nltk.pos_tag(tokens)
ner = ne_chunk(tags)

print(ner)


[nltk_data] Error loading maxent_ne_chunker: <urlopen error [Errno
[nltk_data]     11001] getaddrinfo failed>
[nltk_data] Error loading words: <urlopen error [Errno 11001]
[nltk_data]     getaddrinfo failed>


(S
  The/DT
  (ORGANIZATION US/NNP)
  President/NNP
  stays/VBZ
  in/IN
  the/DT
  (FACILITY White/NNP House/NNP)
  ./.)


In [18]:
# CountVectorizer: Convert a collection of text documents to a matrix of token counts.

from sklearn.feature_extraction.text import CountVectorizer

sent_1 = "Our nation is at war, against a far-reaching network of violence and hatred."
sent_2 = "Our economy is badly weakened, a consequence of greed and irresponsibility on the part of some."
sent_3 = "God bless you and God bless the United States of America."
 
msg = [sent_1, sent_2, sent_3]

cv = CountVectorizer()
cv.fit(msg)                             # tokenizing
vector = cv.transform(msg)              # encoding

print(cv.vocabulary_)
print()
print(cv.get_feature_names())
print()
print(vector.toarray())

{'our': 18, 'nation': 14, 'is': 13, 'at': 3, 'war': 26, 'against': 0, 'far': 8, 'reaching': 20, 'network': 15, 'of': 16, 'violence': 25, 'and': 2, 'hatred': 11, 'economy': 7, 'badly': 4, 'weakened': 27, 'consequence': 6, 'greed': 10, 'irresponsibility': 12, 'on': 17, 'the': 23, 'part': 19, 'some': 21, 'god': 9, 'bless': 5, 'you': 28, 'united': 24, 'states': 22, 'america': 1}

['against', 'america', 'and', 'at', 'badly', 'bless', 'consequence', 'economy', 'far', 'god', 'greed', 'hatred', 'irresponsibility', 'is', 'nation', 'network', 'of', 'on', 'our', 'part', 'reaching', 'some', 'states', 'the', 'united', 'violence', 'war', 'weakened', 'you']

[[1 0 1 1 0 0 0 0 1 0 0 1 0 1 1 1 1 0 1 0 1 0 0 0 0 1 1 0 0]
 [0 0 1 0 1 0 1 1 0 0 1 0 1 1 0 0 2 1 1 1 0 1 0 1 0 0 0 1 0]
 [0 1 1 0 0 2 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 0 0 1]]


In [19]:
# HashingVectorizer: Convert a collection of text documents to a matrix of token occurrences.

from sklearn.feature_extraction.text import HashingVectorizer

msg = [sent_1, sent_2, sent_3]

hv = HashingVectorizer(n_features = 13)     # vector size
hashVec = hv.transform(msg)

print(hashVec.shape)                        # dimensions
print()
print(hashVec.toarray())

(3, 13)

[[ 0.         -0.2        -0.2         0.2         0.2         0.8
   0.         -0.2        -0.2         0.         -0.2         0.2
  -0.2       ]
 [ 0.62554324  0.         -0.20851441 -0.20851441  0.20851441  0.20851441
   0.20851441  0.          0.         -0.20851441  0.          0.41702883
  -0.41702883]
 [-0.25819889 -0.51639778  0.         -0.51639778  0.          0.51639778
   0.          0.          0.          0.          0.25819889  0.25819889
   0.        ]]


In [20]:
# TD-IDF (Term frequency-Inverse Document Frequency): Convert a collection of raw documents to a matrix of TF-IDF features.

from sklearn.feature_extraction.text import TfidfVectorizer

msg = [sent_1, sent_2, sent_3]

tdidf = TfidfVectorizer()
tdidf.fit(msg)

print(tdidf.vocabulary_)
print()
print(tdidf.idf_)                           # idf=log(N/n)

{'our': 18, 'nation': 14, 'is': 13, 'at': 3, 'war': 26, 'against': 0, 'far': 8, 'reaching': 20, 'network': 15, 'of': 16, 'violence': 25, 'and': 2, 'hatred': 11, 'economy': 7, 'badly': 4, 'weakened': 27, 'consequence': 6, 'greed': 10, 'irresponsibility': 12, 'on': 17, 'the': 23, 'part': 19, 'some': 21, 'god': 9, 'bless': 5, 'you': 28, 'united': 24, 'states': 22, 'america': 1}

[1.69314718 1.69314718 1.         1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.28768207 1.69314718 1.69314718 1.         1.69314718
 1.28768207 1.69314718 1.69314718 1.69314718 1.69314718 1.28768207
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718]


In [None]:
# Tokenizing text data with Apache Spark

from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF


spark = SparkSession.builder \
            .master("local") \
            .appName("Tokenize Text Data") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()


# Create Spark DataFrame
data = [(1, "The people who are crazy enough"),(2, "to think they can change the world"), (3, "are the one who do.")]
qdf = spark.createDataFrame(data, ["id", "quote"])
qdf.show()
qdf.take(1)


# Using Tokenizer
qdf_token = Tokenizer(inputCol="quote", outputCol="words")
qdf_token_df = qdf_token.transform(qdf)
qdf_token_df.show()


# Using HashingTF (= hashing term frequency)
hashingTF = HashingTF(inputCol="words", outputCol="htf_features", numFeatures=20)
htf_df = hashingTF.transform(qdf_token_df)
htf_df.show()
htf_df.take(1)


# Using IDF (= inverse document frequency transformation)
idf = IDF(inputCol="htf_features", outputCol="idf_features")
idf_model = idf.fit(htf_df)
idf_df = idf_model.transform(htf_df)
idf_df.show()
idf_df.take(1)


In [None]:
# Reading in a pdf file

!pip install PyPDF2

import PyPDF2
from PyPDF2 import PdfFileReader
import re


pdf = open(r"your_pdf_file.pdf","rb")               # PDF file object

pdf_reader = PyPDF2.PdfFileReader(pdf)              # PDF reader object

print(pdf_reader.numPages)

page = pdf_reader.getPage(1)                        # page object

print(page.extractText())                           # extract the text

len(re.findall(r"the", pdf))                        # Count number of times "the" is appeared in the file

pdf.close()