# Setup

In [1]:
import re
import requests
from bs4 import BeautifulSoup as bs

import numpy as np
import pandas as pd

import spacy
from spacy import displacy 

N gram module for word similarity

In [2]:
import ngram

N gram sub-packages from the Natural Language Tool Kit

In [3]:
from nltk import bigrams, trigrams
from collections import Counter, defaultdict

In [4]:
from sympy import Matrix

# Download data

In [5]:
response_northern = requests.get("http://www.gutenberg.org/files/18947/18947-h/18947-h.htm")

In [6]:
html_text_northern = response_northern.text

In [7]:
m = re.findall(r"<p[^>]*>", html_text_northern)
n = re.findall(r"</p>", html_text_northern)
(len(m), len(n))

(2266, 2261)

In [8]:
response_greek = requests.get("http://www.gutenberg.org/files/32326/32326-h/32326-h.htm")

In [9]:
html_text_greek = response_greek.text

# Preprocessing

## Parse HTML with BeautifulSoup

In [10]:
soup_greek = bs(html_text_greek, 'html.parser')

In [11]:
match = re.search(r"^([^\n]*\n){20}", soup_greek.prettify())
print(match.group(0))

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <title>
   The Project Gutenberg eBook of Tales of Troy and Greece, by Andrew Lang
  </title>
  <style type="text/css">
   body {
    margin-left: 10%;
    margin-right: 10%;
}

    h1,h2,h3,h4,h5,h6 {
    text-align: center; /* all headings centered */
    clear: both;
}

p {



## Extract stories

In [12]:
body_northern = soup_greek.find("body")
story_started = False
story_ended = False

epos_name = ""
chapter_name = ""
chapter_body = []
epos_count = 0
chapter_count = 0

def create_record(epos_name, chapter_name, chapter_body):
    record = {}
    record["epos_name"] = epos_name
    record["chapter_name"] = chapter_name
    record["body"] = "\n".join(chapter_body)
    return record

record_list = []
for e in body_northern:
    
    # Detect story start
    if e.name == "h2":
        # print(e)
        a1 = e.find("a")
        if a1 is not None:
            if a1.get("id") == "ULYSSES_THE_SACKER_OF_CITIES":
                story_started = True
    # Detect story end
    elif e.name == "p":
        a2 = e.find("a")
        if a2 is not None:
            if a2.get("id") == "Page_288":
                story_ended = True
        
    
    if story_started is False:
        continue
        
    # Extract story
    # Extract the name of epos
    if e.name == "h2":
        a = e.find("a")
        title = a.get("id")
        if re.match(r"[IXVacb]+", title) is None:
            
            if epos_count > 0:
                record_list.append(create_record(epos_name, chapter_name, chapter_body))
                chapter_body = []
            
            epos_name = title
            chapter_count = 0
            epos_count = epos_count + 1
                

    if e.name == "p":
        class_list = e.get("class")
        if class_list is not None:
            if "center" in class_list:
                
                if chapter_count > 0:
                    record_list.append(create_record(epos_name, chapter_name, chapter_body))
                    chapter_body = []
                    
                chapter_name = e.string
                chapter_count = chapter_count + 1
                
        else:
            if e.string is not None:
                # remove page numbers
                span = e.find("span")
                
                if span is not None:
                    span.decompose()
                
                chapter_body.append(e.string)
        
    
    if story_ended:
        record_list.append(create_record(epos_name, chapter_name, chapter_body))
        chapter_body = []
        break
    
df = pd.DataFrame(record_list) 
df

Unnamed: 0,epos_name,chapter_name,body
0,ULYSSES_THE_SACKER_OF_CITIES,THE BOYHOOD AND PARENTS OF ULYSSES,"Long ago, in a little island called Ithaca, on..."
1,ULYSSES_THE_SACKER_OF_CITIES,HOW PEOPLE LIVED IN THE TIME OF ULYSSES,"There were bath rooms with polished baths, whe..."
2,ULYSSES_THE_SACKER_OF_CITIES,THE WOOING OF HELEN OF THE FAIR HANDS,Tyndarus first made all the princes take an oa...
3,ULYSSES_THE_SACKER_OF_CITIES,THE STEALING OF HELEN,"This happy time did not last long, and Telemac..."
4,ULYSSES_THE_SACKER_OF_CITIES,TROJAN VICTORIES,"Then Agamemnon hastened here and there, biddin..."
5,ULYSSES_THE_SACKER_OF_CITIES,BATTLE AT THE SHIPS,The armies rushed on each other and hewed each...
6,ULYSSES_THE_SACKER_OF_CITIES,THE SLAYING AND AVENGING OF PATROCLUS,"At this moment, when torches were blazing roun..."
7,ULYSSES_THE_SACKER_OF_CITIES,"THE CRUELTY OF ACHILLES, AND THE RANSOMING OF\...",When Achilles was asleep that night the ghost ...
8,ULYSSES_THE_SACKER_OF_CITIES,HOW ULYSSES STOLE THE LUCK OF TROY,"After Hector was buried, the siege went on slo..."
9,ULYSSES_THE_SACKER_OF_CITIES,THE BATTLES WITH THE AMAZONS AND MEMNON—THE\nD...,"The Amazons were a race of warlike maids, who ..."


Extract all body text for later use

In [13]:
corpus = df["body"].str.cat(sep=' ')

In [14]:
text = df[df["chapter_name"] == "THE STEALING OF HELEN"]["body"].iloc[0]

In [15]:
match = re.match(r"^([^\n]*\n){10}", text)
print(match.group(0))

This happy time did not last long, and Telemachus was
still a baby, when war arose, so great and mighty and
marvellous as had never been known in the world. Far
across the sea that lies on the east of Greece, there dwelt
the rich King Priam. His town was called Troy, or Ilios, and
it stood on a hill near the seashore, where are the straits of
Hellespont, between Europe and Asia; it was a great city
surrounded by strong walls, and its ruins are still standing.
The kings could make merchants who passed through the
straits pay toll to them, and they had allies in Thrace, a



# Analyzis

## Tokoenization

In [16]:
tokens_1 = re.findall(r"[\w]+", corpus)
tokens_2 = re.findall(r"[a-zA-Z]+", corpus)
tokens_1[0:10]

['Long', 'ago', 'in', 'a', 'little', 'island', 'called', 'Ithaca', 'on', 'the']

In [17]:
(len(tokens_1), len(tokens_2))

(47584, 47614)

In [18]:
tokens_2_lower = [t.lower() for t in tokens_2]

In [19]:
tokens_df = pd.DataFrame(tokens_2_lower) 
tokens_df.columns = ["token"]
tokens_df

Unnamed: 0,token
0,long
1,ago
2,in
3,a
4,little
...,...
47609,he
47610,had
47611,killed
47612,king


In [20]:
vocab_df = pd.DataFrame() 
vocab_df["count"] = tokens_df["token"].value_counts()
vocab_df["token"] = vocab_df.index
vocab_df.reset_index(level = 0, inplace = True)
del(vocab_df["index"])
vocab_df

Unnamed: 0,count,token
0,3525,the
1,2769,and
2,1536,of
3,1182,to
4,1004,he
...,...,...
4076,1,bless
4077,1,spite
4078,1,instance
4079,1,hence


In [21]:
vocab_df["len"] = vocab_df["token"].apply(lambda x: len(x))

In [22]:
vocab_df.sort_values(by="count")

Unnamed: 0,count,token,len
4080,1,clanked,7
2938,1,lorded,6
2939,1,worse,5
2940,1,worthy,6
2941,1,behave,6
...,...,...,...
4,1004,he,2
3,1182,to,2
2,1536,of,2
1,2769,and,3


In [23]:
vocab_df.sort_values(by="len")

Unnamed: 0,count,token,len
280,24,n,1
32,209,i,1
1423,3,o,1
5,851,a,1
67,92,s,1
...,...,...,...
1577,3,breastplates,12
1853,2,clytaemnestra,13
2369,2,dishonourable,13
2635,1,forgetfulness,13


# N Gram for word completion

In [24]:
G = ngram.NGram(tokens_2_lower)

In [25]:
similar_words = G.search("hel")[0:20]
similar_words

[('heel', 0.5714285714285714),
 ('help', 0.375),
 ('held', 0.375),
 ('heal', 0.375),
 ('helm', 0.375),
 ('helen', 0.3333333333333333),
 ('helle', 0.3333333333333333),
 ('helped', 0.3),
 ('helmet', 0.3),
 ('he', 0.2857142857142857),
 ('helmets', 0.2727272727272727),
 ('helenus', 0.2727272727272727),
 ('helpers', 0.2727272727272727),
 ('her', 0.25),
 ('helmsman', 0.25),
 ('helpless', 0.25),
 ('hellward', 0.25),
 ('hear', 0.2222222222222222),
 ('hall', 0.2222222222222222),
 ('hill', 0.2222222222222222)]

In [26]:
prediction_canidates = filter(lambda x: x[0].startswith("hel"), similar_words)
for i, w in enumerate(prediction_canidates):
    print(w)
    if i > 30:
        break

('help', 0.375)
('held', 0.375)
('helm', 0.375)
('helen', 0.3333333333333333)
('helle', 0.3333333333333333)
('helped', 0.3)
('helmet', 0.3)
('helmets', 0.2727272727272727)
('helenus', 0.2727272727272727)
('helpers', 0.2727272727272727)
('helmsman', 0.25)
('helpless', 0.25)
('hellward', 0.25)


# Sentence tokenization

In [27]:
sentences = re.findall(r"[^?.!]{2,}[?.!]", text)
(sentences[0], sentences[-1])

('This happy time did not last long, and Telemachus was\nstill a baby, when war arose, so great and mighty and\nmarvellous as had never been known in the world.',
 ' But they knew not where he was, and the\nGreeks claimed the victory, and thought that, as Paris had\nthe worst of the fight, Helen would be restored to them,\nand they would all sail home.')

In [28]:
sent_df = pd.DataFrame(sentences) 
sent_df.columns = ["sentence"]
sent_df

Unnamed: 0,sentence
0,"This happy time did not last long, and Telemac..."
1,Far\nacross the sea that lies on the east of ...
2,"His town was called Troy, or Ilios, and\nit s..."
3,\nThe kings could make merchants who passed th...
4,Priam had many beautiful things; he\nhad a vi...
...,...
71,It was just like the\ntapestry at Bayeux on w...
72,"Helen\nwas very fond of embroidering, like po..."
73,Probably\nthe work kept both Helen and Mary f...
74,\nMeanwhile Menelaus was seeking for Paris eve...


# N gram next word prediction

In [29]:
model = defaultdict(lambda: defaultdict(lambda: 0))

In [30]:
tokens_1 = re.findall(r"[\w]+", corpus)
for i, sentence in enumerate(sentences):
    if i < 1:
        print(sentence, end="\n\n")
    for w1, w2, w3 in trigrams(re.findall(r"[\w]+", sentence), pad_right=True, pad_left=True):
        # print the trigram process for the first sentence
        if i < 1:
            print(w1, w2, w3)
        model[(w1, w2)][w3] += 1

This happy time did not last long, and Telemachus was
still a baby, when war arose, so great and mighty and
marvellous as had never been known in the world.

None None This
None This happy
This happy time
happy time did
time did not
did not last
not last long
last long and
long and Telemachus
and Telemachus was
Telemachus was still
was still a
still a baby
a baby when
baby when war
when war arose
war arose so
arose so great
so great and
great and mighty
and mighty and
mighty and marvellous
and marvellous as
marvellous as had
as had never
had never been
never been known
been known in
known in the
in the world
the world None
world None None


In [31]:
record_list_trigram = []

for i, key_1 in enumerate(model):
    bigram = key_1
    next_words = model[bigram]
    for key_2 in next_words:
        third_word = key_2
        count = next_words[third_word]
        
        records_trigram = {}
        records_trigram["first_word"] = bigram[0]
        records_trigram["second_word"] = bigram[1]
        records_trigram["third_word"] = third_word
        records_trigram["count"] = count
        record_list_trigram.append(records_trigram)
        
        
df_trigrams = pd.DataFrame(record_list_trigram) 
df_trigrams

Unnamed: 0,first_word,second_word,third_word,count
0,,,This,2
1,,,Far,1
2,,,His,2
3,,,The,10
4,,,Priam,1
...,...,...,...,...
2137,they,would,all,1
2138,would,all,sail,1
2139,all,sail,home,1
2140,sail,home,,1


In [32]:
df_trigrams[df_trigrams["first_word"] == "he"]

Unnamed: 0,first_word,second_word,third_word,count
271,he,had,a,2
272,he,had,the,1
273,he,had,to,1
274,he,had,not,1
275,he,had,taken,1
390,he,left,the,1
427,he,wore,a,1
443,he,was,taken,1
444,he,was,too,1
445,he,was,young,1


In [33]:
for w1_w2 in model:
    total_count = float(sum(model[w1_w2].values()))
    for w3 in model[w1_w2]:
        model[w1_w2][w3] /= total_count

In [34]:
model["he", "had"]

defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
            {'a': 0.3333333333333333,
             'the': 0.16666666666666666,
             'to': 0.16666666666666666,
             'not': 0.16666666666666666,
             'taken': 0.16666666666666666})

## Spacy

In [35]:
nlp_model =  spacy.load("en_core_web_lg")

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [None]:
nlp_doc = nlp_model(corpus)

### Spacy document structure

In [None]:
print("SENTENCE:\n")
for s in nlp_doc.sents:
    print(s)
    print("\nTOKENS:\n")
    for token in s:
        print(token)
    break

In [None]:
ind_1 = 0
ind_10 = 0
for i, s in enumerate(nlp_doc.sents):
    if i == 1:
        ind_1 = s.end
    elif i > 10:
        ind_10 = s.end
        break
(ind_1, ind_10)

In [None]:
first_sent = nlp_doc[0:ind_1]
head_sent = nlp_doc[0:ind_10]

first_sent

### Sentence level syntactical dependecy parsing

In [None]:
displacy_image = displacy.render(first_sent, jupyter = True, style = "dep")

### Named Entitiy Recognition (NER)

Entity labels:
    
- **GPE**: Geopolitical enitity
- **LOC**: Non-GPE entity
- **ORG**: Organization

In [None]:
displacy_image = displacy.render(head_sent, jupyter = True, style = "ent")

### Similarity search

In [None]:
record_list_spacy = []

for i, s in enumerate(nlp_doc.sents):
    for token in s:
        if token.has_vector:
            record_spacy = {}
            record_spacy["sentence"] = i
            record_spacy["token"] = token
            record_spacy["token_text"] = token.text
            
            record_list_spacy.append(record_spacy)
        
df_spacy = pd.DataFrame(record_list_spacy)
df_spacy

In [None]:
zeus_tokens = df_spacy[df_spacy["token"].apply(lambda t: t.text == "Zeus")]
zeus_tokens

In [None]:
token_zeus = zeus_tokens.iloc[1]["token"]
(token_zeus, type(token_zeus))

In [None]:
df_tokens = df_spacy.drop_duplicates("token_text")
df_tokens.set_index("token_text", inplace = True)
df_tokens

In [None]:
df_tokens["token"]["Zeus"]

In [None]:
token_zeus.similarity(token_zeus)

In [None]:
df_tokens["similarity_zeus"] = df_tokens["token"].apply(lambda t: t.similarity(token_zeus))
df_tokens

In [None]:
pd.set_option('display.max_rows', None)
df_tokens\
    .sort_values(by = "similarity_zeus", ascending = False) \
    .tail(10)

In [None]:
pd.set_option('display.max_rows', None)
df_tokens\
    .sort_values(by = "similarity_zeus", ascending = False) \
    .head(20)

In [None]:
pd.set_option('display.max_rows', 10)

Gold standard for gods, godesses and titans

In [None]:
df_gold_standard = pd.DataFrame(["Zeus", "Hera", "Poseidon", "Hades", "Aphrodite", "Demeter", "Dionysus", "Arthemis", "Prometheus", "Apollo", "Persephone"])
df_gold_standard.columns = ["token_text"]
df_decoy = pd.DataFrame(["Greek", "Cyclops", "Perseus", "Gods", "gods", "Medusa", "Orion", "godesses", "Godesses"])
df_decoy.columns = ["token_text"]

df_decoy

In [None]:
df_tokens["gold_standard"] = df_tokens.index.isin(df_gold_standard["token_text"]).astype(int)
df_tokens["decoy"] = df_tokens.index.isin(df_decoy["token_text"]).astype(int)
df_tokens 

In [None]:
df_tokens.loc["Hera", :]

In [None]:
df_tokens.loc["Greek", :]

In [None]:
gold_standard_arr = df_tokens[
    df_tokens["gold_standard"] == 1
]["token"]\
    .apply(lambda t: t.vector) \
    .to_numpy()
gold_standard_arr = np.concatenate(gold_standard_arr)

decoy_arr = df_tokens[
    df_tokens["decoy"] == 1
]["token"]\
    .apply(lambda t: t.vector) \
    .to_numpy()
decoy_arr = np.concatenate(decoy_arr)

(gold_standard_arr.shape, decoy_arr.shape)

In [None]:
gold_standard_arr = gold_standard_arr.reshape((300, -1))
gold_standard_arr.shape

In [None]:
decoy_arr = decoy_arr.reshape((300, -1))
decoy_arr.shape

In [None]:
gold_standard_mean_arr =  np.mean(gold_standard_arr, axis = 1)
decoy_mean_arr =  np.mean(decoy_arr, axis = 1)