# Natural Language Processing
---
---

## Requirements
---
### <font color="red"><center>A restart might be necessary after the installation.</center></font>

In [1]:
"""Stopwords"""
# nltk.download("<MODULE_NAME>") e.g. "stopwords", "gutenberg" or "wordnet"
# src: https://pypi.org/project/nltk/
"""Number to Word Translalator"""
# pip install num2words
# src: https://pypi.org/project/words2num/
"""NLP Model for Lemmatization and Named Entity Recognition (NER)"""
# pip install -U spacy
# python -m spacy download en_core_web_sm
# NOTE: If pip is not available to you, please refer to https://spacy.io/usage and select an installation option suiting to your environment :)
"""Cython - needs to be installed to make use of the 'worker' parameter in Word2Vec model (parallelization)"""
# pip install Cython
# src: https://pypi.org/project/Cython/
"""Valence Aware Dictionary and sEntiment Reasoner (VADER)"""
# pip install vaderSentiment
# src: https://pypi.org/project/vaderSentiment/

'Valence Aware Dictionary and sEntiment Reasoner (VADER)'

## Imports and loading the data
---

In [2]:
import numpy as np
import random
import re
import os
import spacy
import pandas as pd

from nltk.corpus import stopwords, gutenberg
from nltk import ngrams
from num2words import num2words
from gensim.models import Word2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from IPython.display import display

# src: https://wortschatz.uni-leipzig.de/en/download/English#eng_news_2020 (accessed: 14.12.21)
with open("eng_news_2020_10K-sentences.txt", "r", encoding = "utf-8") as f:
    corpus_raw = f.readlines()

## Preprocessing
---


In [3]:
# removing the enumeration infront of each doc
corpus_without_enum = [doc.split("\t")[1] for doc in corpus_raw]
# converting every word to lower case
corpus_lowercase = [doc.lower() for doc in corpus_without_enum]
# insert whitespace between numbers and words
corpus_words_and_numbers_separated = [re.sub("([a-zA-Z]*)(\d+)([a-zA-Z]*)", r"\1 \2 \3", doc) for doc in corpus_lowercase]
# convert numbers into their word equivalent while maintaining their position
corpus_num2words = []
for doc in corpus_words_and_numbers_separated:
    numbers_in_doc = re.findall("\d+", doc)  
    # returns false if list is empty
    if numbers_in_doc:
        for number in numbers_in_doc:
            doc = doc.replace(number, num2words(number, lang="en"))
        corpus_num2words.append(doc)    
    else:
        corpus_num2words.append(doc)   
# removing words shorter than 4 characters
corpus_longer_words = [re.sub(r"\b\w{,3}\b", "", doc) for doc in corpus_num2words]
# removing punctuation
corpus_no_punctuation = [re.sub(r"(\W+)", " ", doc) for doc in corpus_longer_words]

## Lemmatization
---

In [4]:
# Lemmatization, loading the small standard Model with all components 
nlp = spacy.load("en_core_web_sm")
# Loop is necessary due to spaCys nlp() method does not accept iterables
lemmatized_corpus = []
for doc in corpus_no_punctuation:
    loaded_doc = nlp(doc)
    lemmatized_corpus.append(" ".join([token.lemma_ for token in loaded_doc])) 

## Part-of-Speech-Tagging (POS-Tagging)
---

In [5]:
# Select a random document from corpus
random_doc = random.choice(lemmatized_corpus)
loaded_doc = nlp(random_doc.strip())

print("CHOSEN DOCUMENT:", loaded_doc, "\n")
# Use spaCy POS-Tagging
for token in loaded_doc:
    print(
        f"TEXT: {token.text:15}",
        f"POS: {token.pos_:10}", 
        f"DETAILS: {spacy.explain(token.tag_):5}"
        )

print("\nVISUALIZATION:")
# specify options for visualization        
options = {
    "compact": True, 
    "bg": "#041C32",
    "color": "#ECB365", 
    "font": "Source Sans Pro"
    }
# display POS and relationship of words within the sentence    
spacy.displacy.render(loaded_doc, style="dep", options=options)        

CHOSEN DOCUMENT: actress political activist longoria praise host ability first night democratic national convention last night 

TEXT: actress         POS: VERB       DETAILS: verb, base form
TEXT: political       POS: ADJ        DETAILS: adjective (English), other noun-modifier (Chinese)
TEXT: activist        POS: ADJ        DETAILS: adjective (English), other noun-modifier (Chinese)
TEXT: longoria        POS: PROPN      DETAILS: noun, proper singular
TEXT: praise          POS: NOUN       DETAILS: noun, singular or mass
TEXT: host            POS: NOUN       DETAILS: noun, singular or mass
TEXT: ability         POS: NOUN       DETAILS: noun, singular or mass
TEXT: first           POS: ADJ        DETAILS: adjective (English), other noun-modifier (Chinese)
TEXT: night           POS: PROPN      DETAILS: noun, proper singular
TEXT: democratic      POS: PROPN      DETAILS: noun, proper singular
TEXT: national        POS: PROPN      DETAILS: noun, proper singular
TEXT: convention      POS: N

## Named Entitiy Recognition (NER)
---

In [6]:
# Due to not every doc in the corpus contains entities, a sample is provided manually
loaded_doc = nlp("As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic insect.")

print("CHOSEN DOCUMENT:", loaded_doc, "\n")
# Use spaCy POS-Tagging
for token in loaded_doc.ents:
    print(
        f"TEXT: {token.text:15}",
        f"LABEL: {token.label_:10}", 
        f"DETAILS: {spacy.explain(token.label_):19}"
        )

# display POS and relationship of words within the sentence    
spacy.displacy.render(loaded_doc, style="ent", options=options)

CHOSEN DOCUMENT: As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic insect. 

TEXT: Gregor Samsa    LABEL: PERSON     DETAILS: People, including fictional
TEXT: one morning     LABEL: TIME       DETAILS: Times smaller than a day


## Sentiment Analysis (naive)
---

In [7]:
# Using the force of VADER for a simple sentiment analysis
# NOTE: VADERs results are based on the words, NOT the context they appear in
analyzer = SentimentIntensityAnalyzer()
# Select a random document from corpus
random_doc = random.choice(lemmatized_corpus).strip()
print("CHOSEN DOCUMENT:", random_doc.strip(),"\n")
print("SCORES: ", analyzer.polarity_scores(random_doc))
# NOTE: a detailed explaination regarding the scores and their computation can be found here: https://github.com/cjhutto/vaderSentiment (accessed: 19.12.21)

CHOSEN DOCUMENT: never look like be enough 

SCORES:  {'neg': 0.345, 'neu': 0.655, 'pos': 0.0, 'compound': -0.2755}


## Tokenization
---

In [8]:
# NOTE: python lists maintain order, context is still intact
# NOTE: for a more sophisticated split (e.g. "N.Y."), please refer to token.text of the object the nlp() method returns 
# split() will result in a nested structure of lists inside a list -> corpus = [["word", "word", "anotherword"], ["word"], [...]]
corpus_tokenized = [doc.split() for doc in lemmatized_corpus]

## Removing stopwords
---

In [9]:
stop_words = stopwords.words("english")
corpus_cleaned = [[token for token in doc if token not in stop_words] for doc in corpus_tokenized]

## Display some info 
---

In [10]:
# building a vocabulary
unique_words = set()
[[unique_words.add(word) for word in doc] for doc in corpus_cleaned]
print("Number of unique words:   {:10}".format(len(unique_words)))
# counting number of words in total
word_count_total = sum([len(sentence) for sentence in corpus_cleaned])
print("Number of words in total: {:10}".format(word_count_total))
print("Lexical Diversity: {:17.2f} %".format(len(unique_words)/word_count_total), "\n")

# creating a dictionary, mapping a word with its 
# 1) frequency of occurence within the entire corpus
# 2) number of documents the term appears in (document frequency) 
# NOTE: the dict.fromkeys() method cannot be used here, due to it would assign the same list object as value for every key
freq_dict = {key: [0, set()] for key in unique_words}
for idx, doc in enumerate(corpus_cleaned):
    #print(idx, doc)
    for word in doc:
        # updating the term frequency
        freq_dict[word][0] += 1
        # length of this set is equal to the document frequency
        freq_dict[word][1].add(idx)
        
print("Most commonly used words across all documents:\n")
most_common_words = sorted(freq_dict.items(), key=lambda item: item[1][0], reverse=True)
for element in most_common_words[:10]:
    print("{0:15} : {1}".format(element[0], element[1][0]))

Number of unique words:        16486
Number of words in total:     106480
Lexical Diversity:              0.15 % 

Most commonly used words across all documents:

say             : 1214
hundred         : 692
thousand        : 622
twenty          : 589
year            : 527
three           : 489
five            : 472
also            : 467
make            : 464
people          : 448


## TF-IDF
---

In [11]:
random_doc = random.choice(corpus_cleaned)
print("CHOSEN DOCUMENT: ", random_doc, "\n")
# np.unique() returns a set-like copy of the chosen document, this way the original list can be referenced when using the count() method 
for word in np.unique(random_doc):
    print("{0:15} :      {1}".format(
        word, ( 
        # TF * IDF = (number of occurrences of word / total number of words in document) * log (number of documents in corpus / number of documents that contain the word+1)
        ( random_doc.count(word) / len(random_doc) ) * 
        np.log( len(corpus_cleaned) / (len(freq_dict[word][1])+1) )
        )))

CHOSEN DOCUMENT:  ['pride', 'wayne', 'eight', 'fifty', 'fail', 'reduce', 'speed'] 

eight           :      0.5401421061459631
fail            :      0.7999174941902797
fifty           :      0.5715506027335373
pride           :      0.9732064427396874
reduce          :      0.8038316333600104
speed           :      0.8678351651495761
wayne           :      1.001873684948566


## Cosine Similarity
---

In [12]:
# randomly choose two documents from corpus and calculate their cosine similarity
doc1, doc2 = random.sample(corpus_cleaned, 2)
print("DOCUMENTS:\n")
print(doc1)
print(doc2,"\n")

# creating a vocabulary of the two documents, which will be used to vectorize the two documents 
vocabulary = set(doc1).union(set(doc2))
v1, v2 = [], []
for word in vocabulary:
    if word in doc1: v1.append(1)
    else: v1.append(0)
    if word in doc2: v2.append(1)
    else: v2.append(0)

print("VECTOR REPRESENTATION:\n")
print(v1)
print(v2,"\n")

# cosine similarity = dot-product(v1,v2) / (absolute value(v1) * absolute value(v2)) = (v1.v2) / (||v1||.||v2||)
dot_prod = 0
for idx in range(len(vocabulary)):
        dot_prod += v1[idx] * v2[idx]
cosine_similarity = dot_prod / float((sum(v1) * sum(v2)) ** 0.5)
print("COSINE SIMILARITY:", cosine_similarity)

DOCUMENTS:

['laugh', 'different', 'certainly', 'kitchen', 'splendido', 'across', 'industry']
['hear', 'lion', 'safari', 'park', 'noise', 'night', 'terrify'] 

VECTOR REPRESENTATION:

[1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0]
[0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1] 

COSINE SIMILARITY: 0.0


## MinHashing (or approximating the Jaccard Similarity)
---

In [13]:
""" for more details see: http://infolab.stanford.edu/~ullman/mmds/ch3.pdf """

# for demonstration purpose two very similar documents are choosen
doc1 = "the quick brown fox jumped over the lazy dog"
doc2 = "the quick brown dog jumped over the lazy fox"

# helper method to create shingles of given size (basically n-grams with words as morphems) 
# INPUT: ["word1 word2 word3 word4" ...], OUTPUT: ["word1 word2 word3", "word2 word3 word4", ...]
def get_shingles(document, size = 3):
    return ngrams(document.split(), size)

# method to calculate a signature matrix from given list of hashfunctions and document vectors
def get_signature_matrix(document_vectors: list, permutated_lists: list) -> np.array:
    # defining a yet uninitialized signature matrix (S) of dimension S(number of hash functions × number of documents)
    # using positive infinity to represent the uninitialized state 
    signature_matrix = np.empty((len(permutated_lists), len(document_vectors)))
    signature_matrix[:] = np.inf

    for row, permutated_list in enumerate(permutated_lists):
        # using permutated order of shingle-IDs to update signature matrix
        for shingle_id in permutated_list:
            # if shingle is present in given vector, initialize/update signature matrix at index corresponding to given hash-function (col) and document (row)
            for col, doc_vec in enumerate(document_vectors):
                if doc_vec[shingle_id] == 1:
                    if signature_matrix[row][col] > shingle_id:
                        signature_matrix[row][col] = shingle_id
    return signature_matrix

# use this method to calculate the approximate similarity between the documents specified by their index in the given signature matrix
def approximate_similarity(signature_matrix: np.array, idx1: int, idx2: int) -> float:
    # count of matching values / total number of hash-functions used for permutation (number of rows in signature matrix)
    return len(np.where(signature_matrix[idx1] == signature_matrix[idx2])[0]) / signature_matrix.shape[0]

# divide sentences into list shingles of given size
doc1_shingles = list(get_shingles(doc1))
doc2_shingles = list(get_shingles(doc2))
# create a set of shingles
set_of_shingles = set(doc1_shingles + doc2_shingles)
# assign a unique ID to every shingle
shingle_dict = {}
for idx, shingle in enumerate(set_of_shingles):
    shingle_dict[idx] = shingle

# use the dictionary to represent documents as vectors (see src, p.81), 
# in combination the dictionary and vectors can be interpreted as the "document matrix"
v1, v2 = [], []
for shingle in shingle_dict.values():
    if shingle in doc1_shingles: v1.append(1)
    else: v1.append(0)
    if shingle in doc2_shingles: v2.append(1)
    else: v2.append(0)

# using two hash-functions to permutate the order of shingle-IDs, 
# NOTE: make sure the modulus is a prime number greater or equal to the number of shingles within the set of shingles to avoid hash collisions
# NOTE: the more hash-functions are used, the more accurate the approximation of the Jaccard Similarity will become 
permutated_list_1 = [(id + 1) % 11 for id in shingle_dict.keys()]
permutated_list_2 = [(3*id + 1) % 11 for id in shingle_dict.keys()]

print("DOCUMENTS:\n")
print(doc1, "\n")
print(doc2, "\n")
print("APPROXIMATED SIMILARITY: ~", 
    approximate_similarity(
        get_signature_matrix(
            # list of document vectors
            [v1, v2], 
            # list of permuted shingle-IDs 
            [permutated_list_1, permutated_list_2]),
            # indexes of documents in signature matrix that should be compared with respect to their similarity
            0, 1
        ) * 100, "%"
    )

DOCUMENTS:

the quick brown fox jumped over the lazy dog 

the quick brown dog jumped over the lazy fox 

APPROXIMATED SIMILARITY: ~ 100.0 %


## Word Embedding
---

In [14]:
# due to the nature of the previously used corpus, for the following example a more coherent dataset will be used: "Moby Dick" by Herman Melville
md = gutenberg.sents('melville-moby_dick.txt')
# dataset is already tokenized, just need to ge rid off punctuation, convert strings to lower case and remove short words and stopwords
md_puctuation_removed = [[re.sub("\W+", "", word.lower()) for word in doc] for doc in md]
md_long_words = [[word for word in doc if len(word) > 1] for doc in md_puctuation_removed]
md_cleaned = [[word for word in sentence if word not in stop_words] for sentence in md_long_words]

model = Word2Vec(
    # use cleaned corpus for training the model
    sentences = md_cleaned, 
    # size of the layers within the neural network
    vector_size = 300, 
    # number of words that should be considered when modelling the context
    window = 5, 
    # consider only words with an occurrence greater or equal to min_count
    min_count = 3, 
    # number of threads accessible to the model during training
    workers = 4
    )

# find words that appeared in similar contexts 
term = "captain"
print(f"Words that appear in context similar to \'{term}\':\n")   
similars = model.wv.most_similar(term, topn = 10)
for word in similars:
    print(word[0])

Words that appear in context similar to 'captain':

ahab
said
cried
hands
starbuck
know
hand
men
old
well


## Latent Semantic Analysis (LSA) aka Latent Semantic Indexing (LSI)
---

In [21]:
# Source for this part is the LSA-Series of Databricks Academy video series: youtube.com/watch?v=hB51kkus-Rc (accessed: 12.02.22)
"""
Used for topic modelling or comparing document similarity based on topics -> similar words appear in similar contexts

Latent = hidden

1) Retrieve Data
2) Build Document-Term Matrix
3) Perform Singular Value Decomposition (SVD) on Document-Term Matrix, representing it as the product of two matricies
4) Examine the generated topic-encoded data
"""

# 1) naive corpus, consisting of four documents
body = [
    "Mary had a little lamb",
    "whose fleece was white as snow",
    "and everywhere that Mary went",
    "the lamb was sure to go",
    ]

# 2) building the Document-Term Matrix, CountVectorizers default configuration also takes care of transforming words to lowercase before tokenization
# NOTE: for larger data sets the "min_df" parameter can be used to regulate in how many documents a term must occur to be taken into consideration
vectorizer  = CountVectorizer(stop_words = "english")
bag_of_words = vectorizer.fit_transform(body)

# display intermediate results
features = vectorizer.get_feature_names_out() 
print("Corpus Dictionary (Index, Token)")
print(list(zip(range(len(features)), features)), os.linesep)
print("Count Vector representation of the Body (rows = document, columns = count of occurrence of Token within given document)")
# .todense() transforms results into matrix-like representation
print(bag_of_words.todense(), os.linesep)

# 3) initialize Singular Value Decomposition Model, passing amount of topics the vectors should be reduced to (similar to PCA)
svd = TruncatedSVD(n_components = 2)
# topic encoded data
lsa = svd.fit_transform(bag_of_words)

# 4) display the modelled numeric relationship between the give documents
topic_encoded_df = pd.DataFrame(lsa, columns = ["topic_1", "topic_2"])
topic_encoded_df["body"] = body
print("topic modelling")
display(topic_encoded_df[["body", "topic_1", "topic_2"]])

# display numerically modelled relationship of each token with given number of topics -> latent features
encoding_matrix = pd.DataFrame( svd.components_,
                                index = ["topic_1", "topic_2"],
                                columns = features).T
print("Componenwise")                                
display(encoding_matrix)

Corpus Dictionary (Index, Token)
[(0, 'fleece'), (1, 'lamb'), (2, 'little'), (3, 'mary'), (4, 'snow'), (5, 'sure'), (6, 'went'), (7, 'white')] 

Count Vector representation of the Body (rows = document, columns = count of occurrence of Token within given document)
[[0 1 1 1 0 0 0 0]
 [1 0 0 0 1 0 0 1]
 [0 0 0 1 0 0 1 0]
 [0 1 0 0 0 1 0 0]] 

topic modelling


Unnamed: 0,body,topic_1,topic_2
0,Mary had a little lamb,1.632993,4.496036e-18
1,whose fleece was white as snow,3.931749e-16,1.732051
2,and everywhere that Mary went,0.8164966,1.320577e-15
3,the lamb was sure to go,0.8164966,-1.548072e-15


Componenwise


Unnamed: 0,topic_1,topic_2
fleece,1.940353e-16,0.5773503
lamb,0.6123724,-7.187998e-16
little,0.4082483,5.092452e-17
mary,0.6123724,6.723713e-16
snow,9.956981e-17,0.5773503
sure,0.2041241,-8.292722e-16
went,0.2041241,6.482057e-16
white,9.956981e-17,0.5773503


In [24]:
"""
further analysis; find what words account for the greatest amount of variance within the data, 
NOTE: positive/negative values can be of equal importance, hence the absolute values are taken into consideration, 
the greater the distance between certain topic values the better it can be distinguished with regards to their topic-wise similarity
"""
encoding_matrix["abs_topic_1"] = np.abs(encoding_matrix["topic_1"])
encoding_matrix["abs_topic_2"] = np.abs(encoding_matrix["topic_2"])
print("Sorted for Topic 1")
display(encoding_matrix.sort_values("abs_topic_1", ascending = False))
print("Sorted for Topic 2")
display(encoding_matrix.sort_values("abs_topic_2", ascending = False))

Sorted for Topic 1


Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
lamb,0.6123724,-7.187998e-16,0.6123724,7.187998e-16
mary,0.6123724,6.723713e-16,0.6123724,6.723713e-16
little,0.4082483,5.092452e-17,0.4082483,5.092452e-17
sure,0.2041241,-8.292722e-16,0.2041241,8.292722e-16
went,0.2041241,6.482057e-16,0.2041241,6.482057e-16
fleece,1.940353e-16,0.5773503,1.940353e-16,0.5773503
snow,9.956981e-17,0.5773503,9.956981e-17,0.5773503
white,9.956981e-17,0.5773503,9.956981e-17,0.5773503


Sorted for Topic 2


Unnamed: 0,topic_1,topic_2,abs_topic_1,abs_topic_2
fleece,1.940353e-16,0.5773503,1.940353e-16,0.5773503
snow,9.956981e-17,0.5773503,9.956981e-17,0.5773503
white,9.956981e-17,0.5773503,9.956981e-17,0.5773503
sure,0.2041241,-8.292722e-16,0.2041241,8.292722e-16
lamb,0.6123724,-7.187998e-16,0.6123724,7.187998e-16
mary,0.6123724,6.723713e-16,0.6123724,6.723713e-16
went,0.2041241,6.482057e-16,0.2041241,6.482057e-16
little,0.4082483,5.092452e-17,0.4082483,5.092452e-17
