# Using Fasttext word embeddings to search Danish Wikipedia
This notebook goes step-by-step through the following:
1. Load the Fasttext model for the Danish language
2. Write a function to compute the vectorized representation of a text in Danish
3. Compute vectorized representations for all the abstracts in the Danish Wikipedia
4. Write a function to compute the cosine distance between the vectorized representations
5. Test everything on some news headlines

**Global setup**

In [None]:
# Run the project's global setup script inside this notebook's namespace.
# NOTE(review): exec() executes the file's contents as code with full access to
# this kernel -- only use it with a trusted ../global_setup.py.
try:
    with open("../global_setup.py") as setupfile:
        exec(setupfile.read())
except FileNotFoundError:
    # The relative path did not resolve. Presumably the setup script changed the
    # working directory on an earlier run of this cell -- TODO confirm.
    print('Setup already completed')

In [None]:
from gensim.models.fasttext import FastText
import gensim.models.keyedvectors as word2vec
from gensim.test.utils import common_texts
from scipy.spatial.distance import cdist
from pathlib import Path
import re
import numpy as np

## Load the Fasttext model for Danish language
1. In the root folder of the project, create a directory "data" and a subdirectory "fasttext".
2. Download the pretrained Danish fasttext model from here:
   https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.da.zip
3. Place the files **wiki.da.bin** and **wiki.da.vec** in the folder you have created in (1).

In [None]:
# Location of the pretrained Danish Fasttext model in Facebook's native binary
# format, relative to the current working directory.
bin_path = Path("data", "fasttext", "wiki.da.bin")
if not bin_path.is_file():
    # Fail early with an actionable message instead of an opaque loader error.
    raise FileNotFoundError(
        "Fasttext model not found at '{}'. Download wiki.da.zip and place "
        "wiki.da.bin and wiki.da.vec in that folder.".format(bin_path)
    )

try:
    # Newer gensim releases provide a dedicated loader for Facebook .bin files;
    # the legacy classmethod below is deprecated and no longer exists in gensim 4.
    from gensim.models.fasttext import load_facebook_model
    model = load_facebook_model(str(bin_path))
except ImportError:
    # Older gensim: only the legacy classmethod is available.
    model = FastText.load_fasttext_format(str(bin_path))

# Sanity check (uncomment to try): print(model.wv.most_similar('æble'))

Next steps:
1. Compute vectorized representations of all Danish Wikipedia abstracts
2. Compute vectorized representations of news titles
3. Find the Wikipedia article with minimal cosine distance from a given news title

## 1. Working with Danish wikipedia abstracts
Here we compute the vectorized representations of the Danish Wikipedia abstracts using the pretrained Fasttext model that we just loaded.

### First step: load the Danish wikipedia

In [None]:
# Project-local accessor for the Danish Wikipedia. Later cells read
# `wikipedia.documents` and each document's `.title` and `.abstract`.
from src.text.document_retrieval.wikipedia import Wikipedia
wikipedia = Wikipedia(
    language="Danish",
    # NOTE(review): presumably disables a remote/cache location for the dump --
    # confirm against the Wikipedia class.
    cache_directory_url=False
)

### Second step: write a function to calculate the vectorized representation of any text

In [None]:
def sumVectorRepresentation(text, verbose = False):
    """Return the sum of the Fasttext vectors of all words in ``text``.

    Every character that is not an ASCII letter, a digit, a Danish letter
    (å, ø, æ) or a space is removed; the rest is lower-cased and split on
    whitespace. Each resulting word is looked up in the global ``model`` and
    the word vectors are added up.

    Parameters
    ----------
    text : str
        The text to vectorize.
    verbose : bool, optional
        When True, print the token list and every word whose lookup raised
        ``KeyError``.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as a single word vector. It is all zeros
        when no word could be looked up.
    """
    cleaned = re.sub('[^a-zA-Z0-9åÅøØæÆ ]+', '', text, flags=re.UNICODE)
    tokens = cleaned.lower().strip().split()

    # Start from a zero vector shaped like any word vector of the model.
    total = np.zeros(model.wv["a"].shape)
    if verbose:
        print("len: {}, words: {}".format(len(tokens), tokens))

    for position, token in enumerate(tokens):
        try:
            total = total + model.wv[token]
        except KeyError as lookup_error:
            # Words the model cannot represent contribute nothing to the sum.
            if verbose:
                print("i: {}, e: {}".format(position, lookup_error))
    return total
    
#sumVectorRepresentation("Han sagde")

### Third step: compute vectorized representations for all Danish wikipedia abstracts
* Abstracts that are empty or contain no alphanumeric characters are skipped.

In [None]:
# Vectorize the abstract and title of every usable Wikipedia document.
#
# Results:
#   wikipedia.documents_clean   -- documents whose abstract has usable characters
#   wikipedia_abstract_vectors  -- one summed word vector per kept abstract
#   wikipedia_title_vectors     -- one summed word vector per kept title
# All three lists share the same indexing, so position k in a vector list
# refers to wikipedia.documents_clean[k].

# Upper bound on the number of documents to vectorize; 0 means "no limit".
i_max = 0
i = 0          # documents vectorized so far
n_removed = 0  # documents skipped because their abstract had no usable characters
# Matches every run of characters that is NOT a letter, digit or Danish letter.
pattern1 = re.compile('[^a-zA-Z0-9åÅøØæÆ]+', re.UNICODE)

documents_clean = []
wikipedia_abstract_vectors = []
wikipedia_title_vectors = []

for document in wikipedia.documents:
    # Stop once exactly i_max documents have been vectorized.
    if i_max > 0 and i >= i_max:
        break
    # Skip abstracts that are empty once all non-alphanumeric characters are stripped.
    if len(pattern1.sub('', document.abstract)) == 0:
        n_removed = n_removed + 1
        continue
    # Appending kept documents (instead of deleting from a copy) is linear in
    # the number of documents and needs no index arithmetic.
    documents_clean.append(document)
    wikipedia_abstract_vectors.append(sumVectorRepresentation(document.abstract))
    wikipedia_title_vectors.append(sumVectorRepresentation(document.title))
    i = i + 1

wikipedia.documents_clean = documents_clean

# Later cells index documents_clean with positions found in the vector lists.
assert len(wikipedia.documents_clean) == len(wikipedia_abstract_vectors) == len(wikipedia_title_vectors)
        

## 2. Calculate the cosine distance between the embeddings

In [None]:
def cdist_func(A, B):
    """Compute pairwise cosine distances between the rows of ``A`` and ``B``.

    Parameters
    ----------
    A, B : array-like
        Two collections of vectors with the same dimensionality.

    Returns
    -------
    tuple
        ``(nearest, distances)`` where ``distances`` is the full distance
        matrix with one row per vector in ``A`` and one column per vector in
        ``B``, and ``nearest`` holds, for every column, the row index of the
        smallest distance.
    """
    distance_matrix = cdist(A, B, metric='cosine')
    nearest_rows = distance_matrix.argmin(axis=0)
    return nearest_rows, distance_matrix

## 3. Test the results on some news headlines

In [None]:
# Number of closest Wikipedia articles to show for a headline
n_wiki_matches = 3
# Example news headline. Other headlines to try:
#   Hawaii: Flyv med helikopter hen over Kauai — en af verdens smukkeste øer
#   Salmonella fundet i kalkunbryst solgt i Aldi-butikker
example_title = "Tyske myndigheder undersøger 95.000 biler af mærket Opel"

# Vectorized representation of the headline
example_title_vector = sumVectorRepresentation(example_title)

# Cosine distance from every abstract to the headline, shape (n_abstracts, 1)
cdist_result = cdist_func(wikipedia_abstract_vectors, [example_title_vector])
cdist_list = cdist_result[1]

# Positions of the closest abstracts, nearest first. A stable argsort yields
# distinct positions even when distances are tied, sorts NaN distances (from
# all-zero vectors) last, and the slice copes with fewer than n_wiki_matches
# documents.
closest_indices = np.argsort(cdist_list[:, 0], kind='stable')[:n_wiki_matches]

# Print the headline followed by the matches with their abstracts
print("Example headline: {}\r\n".format(example_title))
for rank, doc_index in enumerate(closest_indices):
    document = wikipedia.documents_clean[doc_index]
    print("{} Wikipedia article {}: \r\n Abstract: {}\r\n".format(rank,
                                                       document,
                                                       document.abstract))

In [None]:
%%capture
from ipywidgets.widgets import Accordion, HTML
from notebooks.exercises.src.text.news_sentiment_1 import RSSDashboard
from html import escape

# Build an accordion with one section per news headline; each section lists
# the n_wiki_matches Wikipedia articles whose abstracts are closest to it.
RSSdb = RSSDashboard()
# NOTE(review): presumably this call is what populates RSSdb.data_titles --
# confirm against RSSDashboard. It is a private method of that class.
s = RSSdb._do_sentiment_analysis(selected_value = 0)

list_labels = []
for headline in RSSdb.data_titles:
    # Cosine distance from every abstract to this headline, shape (n_abstracts, 1)
    cdist_result = cdist_func(wikipedia_abstract_vectors, [sumVectorRepresentation(headline)])
    cdist_list = cdist_result[1]
    # Nearest first; stable argsort gives distinct positions for tied distances.
    closest_indices = np.argsort(cdist_list[:, 0], kind='stable')[:n_wiki_matches]

    items = []
    for doc_index in closest_indices:
        document = wikipedia.documents_clean[doc_index]
        # Titles and abstracts are external text: escape them so characters
        # such as "<" or "&" cannot break or inject markup.
        items.append("<li>{}: <p> {}</p></li>".format(escape(str(document.title)),
                                                      escape(str(document.abstract))))
    # Well-formed list: every <li> is closed and the list ends with </ol>.
    list_labels.append(HTML(value = "<ol>" + "".join(items) + "</ol>"))

accordion = Accordion(children = list_labels)

for position, headline in enumerate(RSSdb.data_titles):
    accordion.set_title(position, "{}. {}".format(position + 1, headline))

In [None]:
# Render the accordion built in the previous cell (one titled section per news headline).
display(accordion)

In [None]:
from polyglot.text import Text
# Presumably one-time shell commands that fetch the Danish embeddings and
# named-entity models polyglot needs -- run them outside the notebook.
#polyglot download embeddings2.da
#polyglot download ner2.da

# Named-entity extraction on a single example headline.
blob = "Finanstilsynet afviser Danske Banks kronprins som direktør"
text = Text(blob, hint_language_code='da')
# Last expression of the cell, so the notebook renders the extracted entities.
text.entities

In [None]:
# Print every news headline followed by the entities polyglot extracts from it.
for headline in RSSdb.data_titles:
    text = Text(headline, hint_language_code='da')
    print("{} \r\n {} \r\n".format(headline, text.entities))

In [None]:
#polyglot download pos2.da
# Print a two-column "word / POS tag" table for every news headline.
table_header = "{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30
for headline in RSSdb.data_titles:
    text = Text(headline, hint_language_code='da')
    print("\r\n\r\n {} \r\n".format(headline))
    print(table_header)
    for word, tag in text.pos_tags:
        print(u"{:<16}{:>2}".format(word, tag))

In [None]:
# The module itself must be imported as well: the last line reads
# treetagger.__file__, which raised NameError when only the class was imported.
import treetagger
from treetagger import TreeTagger
# NOTE(review): 'C:/TreeTagger' is a machine-specific install location; adjust
# it to wherever TreeTagger is installed locally.
tt = TreeTagger(path_to_treetagger='C:/TreeTagger', language='danish')
tagged_example = tt.tag('Dette er en sætning.')
installed_languages = tt.get_installed_lang()
# A cell only renders its last expression, so show all three results as one tuple.
tagged_example, installed_languages, treetagger.__file__

In [None]:
# Turn an example headline into search terms: its non-stop-word unigrams
# followed by the bigrams of adjacent remaining words.
s = "Søren Hansen om den »nøgne sandhed«: Erdogan går på diplomatiske listefødder for at forbedre sit forhold til USA"
# Drop everything except ASCII letters, digits, Danish letters and spaces.
pattern = re.compile('[^a-zA-Z0-9åÅøØæÆ ]+', re.UNICODE)
s = pattern.sub('', s)

stop_words = ["den", "det", "en", "et", "om", "for", "til", "at", "på", "som", "jeg", "mig", "mine", "min", "mit", "du", "dig", "din", "dit", "dine", "han", "ham", "hun", "hende", "de", "dem", "vi", "os", "sin", "sit", "sine", "sig"]

# Unigrams: lower-cased tokens with the stop words filtered out.
tokens = s.lower().strip().split()
words_copy = [token for token in tokens if token not in stop_words]
n_removed = len(tokens) - len(words_copy)

# Bigrams: each remaining word joined with its successor, appended after the unigrams.
bigrams = [" ".join(pair) for pair in zip(words_copy, words_copy[1:])]
words_copy.extend(bigrams)

words = words_copy

In [None]:
# For every n-gram of the headline, find the Wikipedia title(s) that minimise
# the average of two cosine distances: n-gram <-> title and n-gram <-> headline.

# The headline vector is identical for every n-gram, so compute it once.
headline_vector = sumVectorRepresentation(s)

# One entry per n-gram: (array of best-matching title positions, array([distance])).
r = []
for ngram in words:
    ngram_vector = [sumVectorRepresentation(ngram)]
    # Distance from the n-gram to every Wikipedia title, shape (n_titles, 1)
    title_distances = cdist_func(wikipedia_title_vectors, ngram_vector)[1]
    # Distance from the n-gram to the whole headline, shape (1, 1); broadcasts below
    headline_distance = cdist_func([headline_vector], ngram_vector)[1]
    cdist_list = (title_distances + headline_distance) / 2

    # NaN distances come from all-zero vectors; ignore them when looking for
    # the minimum and skip n-grams that have no usable distance at all.
    finite_distances = cdist_list[np.isfinite(cdist_list)]
    if finite_distances.size == 0:
        continue
    x = np.where(cdist_list == finite_distances.min())[0]
    r.append((x, cdist_list[x][0]))

# Several n-grams can match the same page, and one n-gram can tie on several
# pages. Flatten to one entry per page and keep the smallest distance seen.
best_by_page = {}
for page_indices, distance in r:
    for page_index in page_indices:
        page_index = int(page_index)
        if page_index not in best_by_page or distance[0] < best_by_page[page_index][0]:
            best_by_page[page_index] = distance

# Same entry layout as `r`, but with exactly one page position per entry.
r_copy = [(np.array([page_index]), distance) for page_index, distance in best_by_page.items()]

In [None]:
# Unwrap every (array([page]), array([distance])) entry of r_copy into a plain
# (page, distance) tuple and order the tuples by ascending cosine distance.
r = sorted(
    ((page_indices[0], distance[0]) for page_indices, distance in r_copy),
    key=lambda match: match[1],
)

In [None]:
# Print each matched Wikipedia page, closest first: distance, title and abstract.
for page_index, distance in r:
    document = wikipedia.documents_clean[page_index]
    print("{:.2f} {}: \r\n Abstract: {}\r\n".format(distance,
                                                       document.title,
                                                       document.abstract))