# Natural Language Processing
---
---

## Requirements
---

In [1]:
"""Stopwords"""
# nltk.download("<MODULE_NAME>") e.g. "stopwords" or "wordnet"
# src: https://pypi.org/project/nltk/
"""Number to Word Translalator"""
# pip install num2words
# src: https://pypi.org/project/words2num/
"""NLP Model for Lemmatization and Named Entity Recognition (NER)"""
# pip install -U spacy
# python -m spacy download en_core_web_sm
# NOTE: If pip is not available to you, please refer to https://spacy.io/usage and select an installation option suiting to your environment :)
"""Cython - needs to be installed to make use of the 'worker' parameter in Word2Vec model (parallelization)"""
# pip install Cython
# src: https://pypi.org/project/Cython/
"""Valence Aware Dictionary and sEntiment Reasoner (VADER)"""
# pip install vaderSentiment
# src: https://pypi.org/project/vaderSentiment/

'NLP Model for Lemmatization and Named Entity Recognition (NER)'

## Imports and loading the data
---

In [1]:
import numpy as np
import random
import re
import spacy

from nltk.corpus import stopwords, gutenberg
from num2words import num2words
from gensim.models import Word2Vec
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# src: https://wortschatz.uni-leipzig.de/en/download/English#eng_news_2020 (accessed: 14.12.21)
with open("eng_news_2020_10K-sentences.txt", "r", encoding = "utf-8") as f:
    corpus_raw = f.readlines()
f.close()

## Preprocessing
---


In [2]:
# removing the enumeration infront of each doc
corpus_without_enum = [doc.split("\t")[1] for doc in corpus_raw]
# converting every word to lower case
corpus_lowercase = [doc.lower() for doc in corpus_without_enum]
# insert whitespace between numbers and words
corpus_words_and_numbers_separated = [re.sub("([a-zA-Z]*)(\d+)([a-zA-Z]*)", r"\1 \2 \3", doc) for doc in corpus_lowercase]
# convert numbers into their word equivalent while maintaining their position
corpus_num2words = []
for doc in corpus_words_and_numbers_separated:
    numbers_in_doc = re.findall("\d+", doc)  
    # returns false if list is empty
    if numbers_in_doc:
        for number in numbers_in_doc:
            doc = doc.replace(number, num2words(number, lang="en"))
        corpus_num2words.append(doc)    
    else:
        corpus_num2words.append(doc)   
# removing words shorter than 4 characters
corpus_longer_words = [re.sub(r"\b\w{,3}\b", "", doc) for doc in corpus_num2words]
# removing punctuation
corpus_no_punctuation = [re.sub(r"(\W+)", " ", doc) for doc in corpus_longer_words]

## Lemmatization
---

In [3]:
# Lemmatization, loading the small standard Model with all components 
nlp = spacy.load("en_core_web_sm")
# Loop is necessary due to spaCys nlp() method does not accept iterables
lemmatized_corpus = []
for doc in corpus_no_punctuation:
    loaded_doc = nlp(doc)
    lemmatized_corpus.append(" ".join([token.lemma_ for token in loaded_doc])) 

## Part-of-Speech-Tagging (POS-Tagging)
---

In [4]:
# Select a random document from corpus
random_doc = random.choice(lemmatized_corpus)
loaded_doc = nlp(random_doc.strip())

print("CHOSEN DOCUMENT:", loaded_doc, "\n")
# Use spaCy POS-Tagging
for token in loaded_doc:
    print(
        f"TEXT: {token.text:15}",
        f"POS: {token.pos_:10}", 
        f"DETAILS: {spacy.explain(token.tag_):5}"
        )

print("\nVISUALIZATION:")
# specify options for visualization        
options = {
    "compact": True, 
    "bg": "#041C32",
    "color": "#ECB365", 
    "font": "Source Sans Pro"
    }
# display POS and relationship of words within the sentence    
spacy.displacy.render(loaded_doc, style="dep", options=options)        

CHOSEN DOCUMENT: late nature announcement thursday make only hour before measure come into effect leave many confused 

TEXT: late            POS: ADJ        DETAILS: adjective (English), other noun-modifier (Chinese)
TEXT: nature          POS: NOUN       DETAILS: noun, singular or mass
TEXT: announcement    POS: NOUN       DETAILS: noun, singular or mass
TEXT: thursday        POS: PROPN      DETAILS: noun, proper singular
TEXT: make            POS: VERB       DETAILS: verb, non-3rd person singular present
TEXT: only            POS: ADJ        DETAILS: adjective (English), other noun-modifier (Chinese)
TEXT: hour            POS: NOUN       DETAILS: noun, singular or mass
TEXT: before          POS: SCONJ      DETAILS: conjunction, subordinating or preposition
TEXT: measure         POS: NOUN       DETAILS: noun, singular or mass
TEXT: come            POS: VERB       DETAILS: verb, non-3rd person singular present
TEXT: into            POS: ADP        DETAILS: conjunction, subordinating or

## Named Entitiy Recognition (NER)
---

In [5]:
# Due to not every doc in the corpus contains entities, a sample is provided manually
loaded_doc = nlp("As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic insect.")

print("CHOSEN DOCUMENT:", loaded_doc, "\n")
# Use spaCy POS-Tagging
for token in loaded_doc.ents:
    print(
        f"TEXT: {token.text:15}",
        f"LABEL: {token.label_:10}", 
        f"DETAILS: {spacy.explain(token.label_):19}"
        )

# display POS and relationship of words within the sentence    
spacy.displacy.render(loaded_doc, style="ent", options=options)

CHOSEN DOCUMENT: As Gregor Samsa awoke one morning from uneasy dreams he found himself transformed in his bed into a gigantic insect. 

TEXT: Gregor Samsa    LABEL: PERSON     DETAILS: People, including fictional
TEXT: one morning     LABEL: TIME       DETAILS: Times smaller than a day


## Sentiment Analysis (naive)
---

In [12]:
# Using the force of VADER for a simple sentiment analysis
# NOTE: VADERs results are based on the words, NOT the context they appear in
analyzer = SentimentIntensityAnalyzer()
# Select a random document from corpus
random_doc = random.choice(lemmatized_corpus).strip()
print("CHOSEN DOCUMENT:", random_doc.strip(),"\n")
print("SCORES: ", analyzer.polarity_scores(random_doc))
# NOTE: a detailed explaination regarding the scores and their computation can be found here: https://github.com/cjhutto/vaderSentiment (accessed: 19.12.21)

CHOSEN DOCUMENT: participant encourage call ahead confirm open hour 

SCORES:  {'neg': 0.0, 'neu': 0.645, 'pos': 0.355, 'compound': 0.5106}


## Tokenization
---

In [119]:
# NOTE: python lists maintain order, context is still intact
# NOTE: for a more sophisticated split (e.g. "N.Y."), please refer to token.text of the object the nlp() method returns 
# split() will result in a nested structure of lists inside a list -> corpus = [["word", "word", "anotherword"], ["word"], [...]]
corpus_tokenized = [doc.split() for doc in lemmatized_corpus]

## Removing stopwords
---

In [120]:
stop_words = stopwords.words("english")
corpus_cleaned = [[token for token in doc if token not in stop_words] for doc in corpus_tokenized]

## Display some info 
---

In [123]:
# building a vocabulary
unique_words = set()
[[unique_words.add(word) for word in doc] for doc in corpus_cleaned]
print("Number of unique words:   {:10}".format(len(unique_words)))
# counting number of words in total
word_count_total = sum([len(sentence) for sentence in corpus_cleaned])
print("Number of words in total: {:10}".format(word_count_total))
print("Lexical Diversity: {:17.2f} %".format(len(unique_words)/word_count_total), "\n")

# creating a dictionary, mapping a word with its 
# 1) frequency of occurence within the entire corpus
# 2) number of documents the term appears in (document frequency) 
# NOTE: the dict.fromkeys() method cannot be used here, due to it would assign the same list object as value for every key
freq_dict = {key: [0, set()] for key in unique_words}
for idx, doc in enumerate(corpus_cleaned):
    #print(idx, doc)
    for word in doc:
        # updating the term frequency
        freq_dict[word][0] += 1
        # length of this set is equal to the document frequency
        freq_dict[word][1].add(idx)
        
print("Most commonly used words across all documents:\n")
most_common_words = sorted(freq_dict.items(), key=lambda item: item[1][0], reverse=True)
for element in most_common_words[:10]:
    print("{0:15} : {1}".format(element[0], element[1][0]))

Number of unique words:        16486
Number of words in total:     106480
Lexical Diversity:              0.15 % 

Most commonly used words across all documents:

say             : 1214
hundred         : 692
thousand        : 622
twenty          : 589
year            : 527
three           : 489
five            : 472
also            : 467
make            : 464
people          : 448


## TF-IDF
---

In [176]:
random_doc = random.choice(corpus_cleaned)
print("CHOSEN DOCUMENT: ",random_doc,"\n")
# np.unique() returns a set-like copy of the chosen document, this way the original list can be referenced when using the count() method 
for word in np.unique(random_doc):
    print("{0:15} :      {1}".format(
        word, ( 
        # TF * IDF = (number of occurrences of word / total number of words in document) * log (number of documents in corpus / number of documents that contain the word+1)
        ( random_doc.count(word) / len(random_doc) ) * 
        np.log( len(corpus_cleaned) / (len(freq_dict[word][1])+1) )
        )))

Chosen document:  ['square', 'make', 'loan', 'aggressively', 'grow', 'loan', 'business', 'take', 'banking', 'license'] 

aggressively    :      0.8111728083308073
banking         :      0.7600902459542083
business        :      0.4358310108056566
grow            :      0.48928522584398726
license         :      0.6645391014514647
loan            :      1.2238595837235735
make            :      0.31033174842339284
square          :      0.6907755278982137
take            :      0.3272804166893757


## Cosine Similarity
---

In [201]:
# randomly choose two documents from corpus and calculate their cosine similarity
doc1, doc2 = random.sample(corpus_cleaned, 2)
print("DOCUMENTS:\n")
print(doc1)
print(doc2,"\n")

# creating a vocabulary of the two documents, which will be used to vectorize the two documents 
vocabulary = set(doc1).union(set(doc2))
v1, v2 = [], []
for word in vocabulary:
    if word in doc1: v1.append(1)
    else: v1.append(0)
    if word in doc2: v2.append(1)
    else: v2.append(0)

print("VECTOR REPRESENTATION:\n")
print(v1)
print(v2,"\n")

# cosine similarity = dot-product(v1,v2) / (absolute value(v1) * absolute value(v2)) = (v1.v2) / (||v1||.||v2||)
dot_prod = 0
for idx in range(len(vocabulary)):
        dot_prod += v1[idx]*v2[idx]
cosine_similarity = dot_prod / float((sum(v1)*sum(v2))**0.5)
print("COSINE SIMILARITY:", cosine_similarity)

DOCUMENTS:

['italy', 'way', 'mess', 'leave', 'euro', 'force', 'northern', 'bloc', 'uncle']
['malta', 'international', 'airport', 'remain', 'ninety', 'level', 'across', 'zero', 'share'] 

VECTOR REPRESENTATION:

[0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1]
[1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0] 

COSINE SIMILARITY: 0.0


## Word Embedding
---

In [175]:
# due to the nature of the previously used corpus, for the following example a more coherent dataset will be used: "Moby Dick" by Herman Melville
md = gutenberg.sents('melville-moby_dick.txt')
# dataset is already tokenized, just need to ge rid off punctuation, convert strings to lower case and remove short words and stopwords
md_puctuation_removed = [[re.sub("\W+", "", word.lower()) for word in doc] for doc in md]
md_long_words = [[word for word in doc if len(word) > 1] for doc in md_puctuation_removed]
md_cleaned = [[word for word in sentence if word not in stop_words] for sentence in md_long_words]

model = Word2Vec(
    # use cleaned corpus for training the model
    sentences = md_cleaned, 
    # size of the layers within the neural network
    vector_size = 300, 
    # number of words that should be considered when modelling the context
    window = 5, 
    # consider only words with an occurrence greater or equal to min_count
    min_count = 3, 
    # number of threads accessible to the model during training
    workers = 4
    )

# find words that appeared in similar contexts 
term = "captain"
print(f"Words that appear in context similar to \'{term}\':\n")   
similars = model.wv.most_similar(term, topn = 10)
for word in similars:
    print(word[0])

Words that appear in context similar to 'captain':

ahab
said
cried
starbuck
old
stubb
well
know
thought
tell
