# Welcome to Natural Language Processing!

# First, let's look at the Bag of Words-Model.

# We want to import some Libraries from sklearn and nltk that help us with this task.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import PunktSentenceTokenizer
import re
import pandas as pd
import gensim

# Next we want to use a simple string as our test-content.

In [2]:
content = """
    "It's exciting to watch flying fish after a hard day's work. I don't know why some fish prefer flying and other fish would rather swim. It seems like the fish just woke up one day and decided, 'hey, today is the day to fly away.'"
"""

# Now we're making a function that tokenizes and cleans this data for us so we can get rid of some noise:

In [3]:
def tokenize_and_clean_data(content):
    sentences = nltk.sent_tokenize(content)
    lemmatizer = WordNetLemmatizer()

    corpus = []

    for i in range(len(sentences)):
        sent = re.sub('[^a-zA-Z]', ' ', sentences[i])
        sent = sent.lower()
        sent= sent.split()
        sent = [lemmatizer.lemmatize(word) for word in sent if not word in set(stopwords.words('english'))]
        sent = ' '.join(sent)   
        corpus.append(sent)
    return corpus

tokenized_data = tokenize_and_clean_data(content)

# Next up we're using this cleaned up token and create our Bag of Words Model.
# To be able to do that, we need to initialize the tokenizer and run it with the token we just created. (To simplify things, I initialized all needed Vectorizers)

In [4]:
cv = CountVectorizer()
tfidf = TfidfVectorizer(norm=None)
tokenizer = PunktSentenceTokenizer()

# Im converting the tokenized sentences into a list of words to be able to make a dataframe out of it
tokenized_data_words = " ".join(tokenized_data).split(" ")
test_data_list = list()
for word in tokenized_data_words:
    if not word in test_data_list:
        test_data_list.append(word)

# Here we create the Bag_of_Words model
bow_model = cv.fit_transform(tokenized_data).toarray()
bow_model_df = pd.DataFrame(bow_model, columns=test_data_list)
bow_model_df.head()

Unnamed: 0,exciting,watch,flying,fish,hard,day,work,know,prefer,would,...,swim,seems,like,woke,one,decided,hey,today,fly,away
0,0,1,0,1,1,0,1,1,0,0,...,0,0,0,0,0,0,1,0,1,0
1,0,0,0,0,2,0,1,0,0,1,...,0,1,1,0,1,0,0,0,0,1
2,1,2,1,0,1,1,0,0,1,0,...,1,0,0,1,0,1,0,1,0,0


# Now let's look at the bag of words in a sorted DataFrame and see which words were used the most.

In [5]:
def create_sorted_df(tokenized_data):
    tokenized_list = " ".join(tokenized_data).split(" ")
    bow_dict = dict()
    for word in tokenized_list:
        if word not in bow_dict:
            bow_dict[word] = 1
        else:
            bow_dict[word] += 1
    sorted_df = pd.DataFrame([dict(sorted(bow_dict.items(), key=lambda item: item[1], reverse=True))])
    return sorted_df

df_sorted = create_sorted_df(tokenized_data)
df_sorted.head()

Unnamed: 0,fish,day,flying,exciting,watch,hard,work,know,prefer,would,...,swim,seems,like,woke,one,decided,hey,today,fly,away
0,4,3,2,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


# Next up let's look at the tfidf model:

In [6]:
df_bow_model_tfidf = pd.DataFrame(tfidf.fit_transform(tokenized_data).toarray(), columns=test_data_list)
df_bow_model_tfidf.head()

Unnamed: 0,exciting,watch,flying,fish,hard,day,work,know,prefer,would,...,swim,seems,like,woke,one,decided,hey,today,fly,away
0,0.0,1.287682,0.0,1.693147,1.0,0.0,1.287682,1.693147,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.693147,0.0,1.693147,0.0
1,0.0,0.0,0.0,0.0,2.0,0.0,1.287682,0.0,0.0,1.693147,...,0.0,1.693147,1.693147,0.0,1.693147,0.0,0.0,0.0,0.0,1.693147
2,1.693147,2.575364,1.693147,0.0,1.0,1.693147,0.0,0.0,1.693147,0.0,...,1.693147,0.0,0.0,1.693147,0.0,1.693147,0.0,1.693147,0.0,0.0


# We can see here very clearly the similarities between these two models.

# Furthermore, the most important words to a text, or also called keywords, are the ones with the highest values and show the importance of those words in the sentence.

# One problem we have though is, that these kind of BoW-Models do not take word context into account.

# Let's try getting word context by looking at a very famous book: "The Adventures of Sherlock Holmes" by Arthur Conan Doyle.

# Due to the great ressource "www.gutenberg.org", where a very big amount of text can be used for these kind of things, I will import the text in here and look at which words are most similar to sherlock or other important figures in this novel.

# Let's get started!

In [10]:
# First we use our old trusty open-function to import the data into a string-variable:
with open("SherlockHolmes.txt", 'r+', encoding='utf-8') as file:
    file_text = file.read()

# Next we cut to the good part of the story and make sure we leave everything but the actual book content out:
corpus = file_text[file_text.find("To Sherlock Holmes she is always the woman."):file_text.find("END OF THE PROJECT GUTENBERG EBOOK") - 4].replace("\n", " ")

# Now we need to define two functions that preprocess the data for us:
def process_corpus(corpus):
    sentence_tokenized_corpus = tokenizer.tokenize(corpus)
    returned_list = list()
    helper_list = list()
    for sentence in sentence_tokenized_corpus:
        word_tokenized_sentence = [word.lower().strip('.').strip('?').strip('!') for word in sentence.replace(",","").replace("-"," ").replace(":","").split()]
        helper_list.append(word_tokenized_sentence)
        returned_list.append(helper_list)
    return returned_list

# This creates a list of lists, where the sentences are the inside lists. This is needed to use the gensim Model Function Word2Vec
def create_list_of_lists(corpus):
    all_sentences = list()
    for input in corpus:
        for sentence in input:
            all_sentences.append(sentence)
    return all_sentences

# now we tokenize this book:
word_tokenized_book = process_corpus(corpus)

# and make the list of lists to be used as a trained model:
all_sentences = create_list_of_lists(word_tokenized_book)

# Finally we are ready to create our model with this book.
# Since I don't want to wait forever, I'll only use the first 10000 words of this book.
print(len(all_sentences))
corpus_model = gensim.models.Word2Vec(all_sentences[:10000], vector_size=96, window=5, min_count=1, workers=3, sg=1)

# Let's see which words are most similar to sherlock:
similar_extraction = corpus_model.wv.most_similar("hudson", topn=20)
print(similar_extraction)


26081449
[('abroad', 0.9899264574050903), ('waylaid', 0.9898195266723633), ('mania', 0.9890733361244202), ('constitution', 0.9887520670890808), ('blasted', 0.9863550066947937), ('recently', 0.9863343834877014), ('admirers', 0.9858965873718262), ('finder', 0.9855631589889526), ('destroyed', 0.985058069229126), ('wronged', 0.9850126504898071), ('eton', 0.984745979309082), ('loading', 0.9845803380012512), ('ascertained', 0.9842536449432373), ('ragged', 0.984205961227417), ('suspended', 0.9840380549430847), ('definitely', 0.9837009310722351), ('cure', 0.9831808805465698), ('advertisement—how', 0.9829784631729126), ('tiniest', 0.9828833937644958), ('“five', 0.9828272461891174)]
