In [3]:
try:
    with open("../global_setup.py") as setupfile:
        exec(setupfile.read())
except FileNotFoundError:
    print('Setup already completed')

In [18]:
from gensim.models.fasttext import FastText
import gensim.models.keyedvectors as word2vec
from gensim.test.utils import common_texts
from pathlib import Path
bin_path = Path("data", "fasttext", "wiki.da.bin")
model = FastText.load_fasttext_format(str(bin_path))

print(model.most_similar('orange'))

[('oranges', 0.8172897696495056), ('orangegult', 0.8097963333129883), ('rødorange', 0.7998033761978149), ('oranger', 0.7939025163650513), ('orangegule', 0.7664146423339844), ('orangerød', 0.7639530897140503), ('orangerødt', 0.7569674253463745), ('orang', 0.7512965202331543), ('orangerøde', 0.7378300428390503), ('gulorange', 0.7288374900817871)]


In [16]:
print(bin_path)

data/fasttext/wiki.da.bin


Next steps:
1) Compute vectorized representations of all danish wikipedia abstracts
2) Compute vectorized representations of news titles
3) Find wikipedia article with minimal cosine distance from a given news title

## 1. Working with Danish wikipedia abstracts

In [19]:
from src.text.document_retrieval.wikipedia import Wikipedia
wikipedia = Wikipedia(
    language="Danish",
    cache_directory_url=False
)

Loading parsed documents.
Loading preprocessed documents.
Wikipedia loaded.


In [20]:
wikipedia_vocabulary = wikipedia.vocabulary
display(wikipedia_vocabulary)

{'arkæologi': 21784,
 'er': 92267,
 'studiet': 340178,
 'af': 7900,
 'tidligere': 357225,
 'tiders': 357170,
 'menneskelige': 227900,
 'aktivitet': 11370,
 'primært': 279737,
 'gennem': 123947,
 'menneskets': 227996,
 'materielle': 223908,
 'levn': 207131,
 'langt': 202852,
 'det': 74466,
 'meste': 228767,
 'al': 11570,
 'menneskelig': 227899,
 'foregik': 108793,
 'før': 119726,
 'vi': 385731,
 'lærte': 216174,
 'at': 24200,
 'skrive': 321525,
 'så': 347228,
 'den': 73492,
 'vigtigste': 386784,
 'metode': 229484,
 'til': 357721,
 'studere': 340047,
 'ældre': 403432,
 'menneskeskabte': 227980,
 'samfund': 304810,
 'asien': 23367,
 'verdens': 384235,
 'største': 341206,
 'kontinent': 188826,
 'med': 225418,
 'et': 94094,
 'areal': 21028,
 'på': 284582,
 'cirka': 62400,
 '44': 3261,
 '58': 3667,
 'millioner': 231942,
 'km²': 184069,
 'aalborg': 5007,
 'universitet': 377188,
 'esbjerg': 93384,
 'kendt': 178847,
 'som': 327620,
 'aue': 25127,
 'og': 256026,
 'aaue': 5198,
 'men': 227602,
 '

In [22]:
print(wikipedia.documents[0].abstract)

Arkæologi er studiet af tidligere tiders menneskelige aktivitet, primært gennem studiet af menneskets materielle levn. Langt det meste af al menneskelig aktivitet foregik, før vi lærte at skrive, så arkæologi er den vigtigste metode til at studere ældre menneskeskabte samfund.


In [33]:
model.wv.vocab["andrei".lower()]

<gensim.models.keyedvectors.Vocab at 0x11d8ca588>

In [69]:
print(type(model.wv["a"]))
#print(model.wv["b"])
#print(model.wv["a b"])

<class 'numpy.ndarray'>


In [161]:
import re
import numpy as np
def sumVectorRepresentation(text):
    pattern = re.compile('[^a-zA-Z0-9åÅøØæÆ ]+', re.UNICODE)
    text = pattern.sub('', text)
    words = text.lower().split()
    text_vector = np.zeros(model.wv["a"].shape)
    for i in range(len(words)):
        try:
            text_vector = text_vector + model.wv[words[i]]
        except KeyError as e:
            continue
    return text_vector
    
#sumVectorRepresentation("Han sagde")

In [163]:
i = 0
i_max = 0
wikipedia_abstract_vectors = []
for n in range(len(wikipedia.documents)):
    wikipedia_abstract_vectors.append(sumVectorRepresentation(wikipedia.documents[n].abstract))
    i = i + 1
    if i_max > 0 and i > i_max:
        break

In [166]:
from scipy.spatial.distance import cdist

def cdist_func(A, B):
    dists = cdist(A, B, 'cosine')
    return np.argmin(dists, axis=1), np.min(dists, axis=1)

example_title = "Anmeldelse: Stærkt og aldeles udsat står Morten Hee Andersen alene på scenen over for en omverden, der emmer af bøsseskræk"
example_title_vector = sumVectorRepresentation(example_title)

cdist_func(wikipedia_abstract_vectors, [example_title_vector])

(array([0, 0, 0, ..., 0, 0, 0]),
 array([0.25099826, 0.41781022, 0.28794112, ..., 0.21076537, 0.22810348,
        0.15573341]))

In [130]:
v = 0
for i in range(6):
    print("i: {}, v:{}".format(i, v))
    if i == 3:
        try:
            v = v + "3"
        except Exception as e:
            print(e)
        finally:
            print("error v:{}".format(v))
    else:
        v = v + 1
        
v

i: 0, v:0
i: 1, v:1
i: 2, v:2
i: 3, v:3
unsupported operand type(s) for +: 'int' and 'str'
error v:3
i: 4, v:3
i: 5, v:4


5

In [157]:
v2 = sumVectorRepresentation(wikipedia.documents[6].abstract)

110
111


In [158]:
vv = v2 - v1
vv

array([ -0.25081357,  -0.37374297,  -8.70032255,  -1.39718067,
        -6.69347515,   1.32755473,  -4.8495525 ,   1.42868779,
         6.51091161,  -0.9197196 ,  -1.68284701,  -2.89145496,
        -0.60771697,   6.40681166,   4.68465476,   7.9193832 ,
         3.31766193,  10.51652775,   2.84766019,   4.25437706,
         7.94595906,   6.7485671 ,  -2.45475044,  -6.87458656,
         1.43761542,   0.76734577,  -6.180908  ,   2.46961494,
        -4.66331288,  -7.73874751,  -9.63316556,  11.40678389,
        -7.78773492,  -2.93635203,  -6.57619451,   5.61561657,
        14.3631576 , -12.0336968 ,   3.50136954, -13.29616358,
         6.9068464 ,   9.5348591 ,  -4.95678569,   3.9912668 ,
         0.10586943,  -9.25968176,   6.14396067,   0.37695618,
        -8.64906464,  -8.1037889 ,  -9.83267982,  -3.41835872,
         0.48193329,  -1.68631451,   3.12382008,  -6.20116489,
        10.87010924,  -7.73049943,  -1.98311402,   1.27330616,
        -4.29380171,  11.50710221,  14.16192581,   4.48