In [None]:
# Run the shared project setup script inside this kernel's global namespace.
# NOTE(review): the script path is relative to the current working directory; presumably
# global_setup.py changes the working directory, so on a re-run the path no longer resolves and
# the FileNotFoundError branch is taken — confirm against global_setup.py.
try:
    with open("../global_setup.py") as setupfile:
        exec(setupfile.read())
except FileNotFoundError:
    # Any FileNotFoundError (including one raised by the executed script itself) is reported as
    # "already completed"; starting the kernel from an unexpected directory looks the same.
    print('Setup already completed')

In [None]:
import html
import pprint
import re
from pathlib import Path

import numpy as np
from gensim.models.fasttext import FastText
from scipy.spatial.distance import cdist
# ESA relatedness package
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from src.text.document_retrieval.wikipedia import Wikipedia # Generic Wikipedia class
# Shared Danish Wikipedia corpus; the classes below read its `documents` list and call its
# `search` method.
# NOTE(review): the effect of cache_directory_url=False is not visible in this notebook —
# confirm in the Wikipedia class.
wikipedia = Wikipedia(
    language="Danish",
    cache_directory_url=False
)

In [None]:
class RsspediaInit:
    """Precompute the expensive, shareable state used for Wikipedia retrieval.

    Construction does three slow things:

    1. whitespace-normalises every article text and fits a tf-idf model on the cleaned texts
       (``_transformer`` / ``_Y``),
    2. loads the Danish FastText binary from ``data/fasttext/wiki.da.bin``,
    3. embeds the abstract and the title of every article whose abstract contains at least one
       alphanumeric character.

    Side effect: ``wikipedia.documents_clean`` is set on the passed-in object. It holds the
    articles that were embedded, in the same order as ``wikipedia_abstract_vectors`` and
    ``wikipedia_title_vectors`` (index k in one refers to index k in the others).

    Args:
        wikipedia: corpus object exposing ``documents``, each with ``text``, ``abstract`` and
            ``title`` attributes.
        embedding_composition: ``"sum"`` or ``"average"``; forwarded as ``type`` to
            ``sumVectorRepresentation`` for the article embeddings.
    """

    # Words dropped by getCleanWordsList (compared after lower-casing).
    _STOP_WORDS = frozenset([
        "den", "det", "denne", "dette", "en", "et", "om", "for", "til", "at", "af", "på", "som", "og", "er",
    ])
    # Everything except ASCII letters/digits, the Danish letters and the space character.
    _NON_WORD_CHARS = re.compile('[^a-zA-Z0-9åÅøØæÆ ]+', re.UNICODE)

    # The annotation is quoted so the class can be defined without the Wikipedia import in scope.
    def __init__(self, wikipedia: "Wikipedia", embedding_composition = "sum"):
        self.embedding_composition = embedding_composition
        self.search_results = []
        self.content = self.wikipedia_results = None
        # Initialize wikipedia
        self.wikipedia = wikipedia

        # Collapse runs of line breaks, carriage returns and spaces into a single space.
        whitespace_runs = re.compile('[\n\r ]+', re.UNICODE)
        self.texts = [whitespace_runs.sub(' ', document.text) for document in self.wikipedia.documents]
        self.texts_clean = [self.getCleanWordsList(text, return_string = True) for text in self.texts]

        # Calculate tf-idf representation for wiki texts (takes time)
        self._transformer = TfidfVectorizer(stop_words = None, norm = "l2", use_idf = True, sublinear_tf = False)
        self._Y = self._transformer.fit_transform(self.texts_clean)

        # Fasttext: initialize the model
        bin_path = Path("data", "fasttext", "wiki.da.bin")
        # FastText.load_fasttext_format was deprecated and later removed in newer gensim releases;
        # prefer its replacement when the installed gensim provides it.
        try:
            from gensim.models.fasttext import load_facebook_model
        except ImportError:
            self.model = FastText.load_fasttext_format(str(bin_path))
        else:
            self.model = load_facebook_model(str(bin_path))

        # Fasttext: compute vectorized representations for all wikipedia articles (takes time).
        # Articles whose abstract has no alphanumeric character are skipped, so the three lists
        # below stay index-aligned with each other.
        alphanumeric_only = re.compile('[^a-zA-Z0-9åÅøØæÆ]+', re.UNICODE)
        self.wikipedia.documents_clean = []
        self.wikipedia_abstract_vectors = []
        self.wikipedia_title_vectors = []
        for document in self.wikipedia.documents:
            if len(alphanumeric_only.sub('', document.abstract)) == 0:
                continue
            self.wikipedia.documents_clean.append(document)
            self.wikipedia_abstract_vectors.append(
                self.sumVectorRepresentation(text = document.abstract, type = self.embedding_composition))
            self.wikipedia_title_vectors.append(
                self.sumVectorRepresentation(text = document.title, type = self.embedding_composition))

    def getCleanWordsList(self, text, return_string = False):
        """Lower-case ``text``, strip non-word characters and drop stop words.

        Disallowed characters are deleted rather than replaced by a space, so ``"a-b"`` becomes
        the single word ``"ab"``.

        Args:
            text: raw input string.
            return_string: when true, return the words joined by single spaces.

        Returns:
            A list of words, or a space-joined string when ``return_string`` is true.
        """
        words = self._NON_WORD_CHARS.sub('', text).lower().strip().split()
        kept_words = [word for word in words if word not in self._STOP_WORDS]
        if return_string:
            return ' '.join(kept_words)
        return kept_words

    def sumVectorRepresentation(self, text, verbose = False, type = "sum"):
        """Embed ``text`` by combining the FastText vectors of its cleaned words.

        Args:
            text: raw input string; it is cleaned with ``getCleanWordsList`` first.
            verbose: print the word list and any word the model cannot embed.
            type: ``"average"`` for the mean of the word vectors; any other value gives the sum.

        Returns:
            A numpy vector with the model's embedding shape. It is all zeros when no word could
            be embedded (empty text, only stop words, or only unknown words).
        """
        words = self.getCleanWordsList(text)
        # The vector of an arbitrary token is used only to obtain the embedding shape.
        text_vector = np.zeros(self.model.wv["a"].shape)
        if verbose:
            print("len: {}, words: {}".format(len(words), words))
        n_embedded = 0
        for i, word in enumerate(words):
            try:
                word_vector = self.model.wv[word]
            except KeyError as e:
                if verbose:
                    print("i: {}, e: {}".format(i, e))
                continue
            text_vector = text_vector + word_vector
            n_embedded += 1
        # Average over the words that actually contributed, not over the words that were skipped.
        if type == "average" and n_embedded > 0:
            text_vector = text_vector / n_embedded
        return text_vector


In [None]:
# Build the shared state once: the constructor fits tf-idf, loads FastText and embeds every
# article, so this is the slow cell.
# Alternative composition, kept for reference:
#rsspediainitAVG = RsspediaInit(wikipedia = wikipedia, embedding_composition = "average")
rsspediainit = RsspediaInit(wikipedia = wikipedia, embedding_composition = "sum")

In [None]:
class Rsspedia:
    """Search front end over a Wikipedia corpus, reusing state precomputed by RsspediaInit.

    Four strategies are available through ``search_wiki``:

    * ``"okapibm25"``       – delegates to ``wikipedia.search``.
    * ``"esa_relatedness"`` – dot product between tf-idf vectors of the query and the articles.
    * ``"fasttext_a"``      – cosine distance between the query and the abstract embeddings.
    * ``"fasttext_b"``      – cosine distance between query n-grams and the title embeddings.

    The FastText strategies read ``wikipedia.documents_clean``, which RsspediaInit sets on the
    corpus object, index-aligned with the embedding lists.

    Args:
        wikipedia: corpus object exposing ``documents``, ``documents_clean`` and ``search``.
        rsspediainit: precomputed state (tf-idf model, FastText model, article embeddings).
    """

    # Results fetched per requested match when near-duplicate removal is enabled.
    _OVERFETCH_FACTOR = 3
    # Candidate articles taken per n-gram by the "fasttext_b" strategy.
    _TOP_PER_NGRAM = 5
    # Results whose titles are closer than this cosine distance count as near-duplicates.
    _SIMILAR_TITLE_DISTANCE = 0.10

    # Annotations are quoted so the class can be defined without the project types in scope.
    def __init__(self, wikipedia: "Wikipedia", rsspediainit: "RsspediaInit"):
        self.rsspediainit = rsspediainit
        self.embedding_composition = rsspediainit.embedding_composition
        self.search_results = []
        self.content = self.wikipedia_results = None
        # Initialize wikipedia
        self.wikipedia = wikipedia

        self.texts = rsspediainit.texts

        # tf-idf model and article matrix (already fitted by rsspediainit)
        self._transformer = rsspediainit._transformer
        self._Y = rsspediainit._Y

        # Fasttext model (already loaded by rsspediainit)
        self.model = rsspediainit.model

        # Article embeddings, index-aligned with wikipedia.documents_clean
        self.wikipedia_abstract_vectors = rsspediainit.wikipedia_abstract_vectors
        self.wikipedia_title_vectors = rsspediainit.wikipedia_title_vectors

    def get_ngrams(self, text):
        """Return the cleaned words of ``text`` followed by all adjacent word pairs."""
        words = self.rsspediainit.getCleanWordsList(text)
        bigrams = [first + " " + second for first, second in zip(words, words[1:])]
        return words + bigrams

    def cdist_func(self, A, B):
        """Return ``(argmin over rows of A, full distance matrix)`` of cosine distances A vs B."""
        dists = cdist(A, B, 'cosine')
        return np.argmin(dists, axis=0), dists

    def display_beautifully(self, titles, texts, urls):
        """Render results as an HTML ordered list of linked titles with their texts.

        Titles, texts and URLs come from external article data, so they are HTML-escaped before
        being placed in the markup.
        """
        formatted_result_list = ["<ol>"]
        for title, text, url in zip(titles, texts, urls):
            formatted_result_list.append("\n".join([
                "<li>",
                f"<p><a href=\"{html.escape(str(url), quote=True)}\">{html.escape(str(title))}</a></p>",
                f"<p>{html.escape(str(text))}</p>",
                "</li>"
            ]))
        formatted_result_list.append("</ol>")
        return "\n".join(formatted_result_list)

    def search_wiki(self, search_texts, n_matches = 3, search_type = "okapibm25", remove_similar = False, verbose = False, p = 0.5):
        """Search the corpus for each text in ``search_texts`` and concatenate the results.

        Args:
            search_texts: iterable of query strings; a falsy value yields four empty lists.
            n_matches: number of results per query text. With ``remove_similar`` it is also the
                total number of results returned after pruning.
            search_type: ``"okapibm25"``, ``"esa_relatedness"``, ``"fasttext_a"`` or
                ``"fasttext_b"``; any other value yields no results.
            remove_similar: over-fetch, drop results whose title embedding is within
                ``_SIMILAR_TITLE_DISTANCE`` of an earlier kept result, then truncate.
            verbose: print the per-n-gram candidates of ``"fasttext_b"``.
            p: weight of the title distance in ``"fasttext_b"``; ``1 - p`` weights the distance
                between the n-gram and the whole query.

        Returns:
            ``(titles, texts, urls, scores)`` – four lists of equal length. Scores are whatever
            ``wikipedia.search`` returns for ``"okapibm25"``, tf-idf dot products for
            ``"esa_relatedness"`` (higher is closer) and cosine distances for the FastText
            strategies (lower is closer).
        """
        results = []  # rows of (title, abstract, url, score)

        if search_texts:
            # Over-fetch only when pruning will shrink the list again.
            n_fetch = n_matches * self._OVERFETCH_FACTOR if remove_similar else n_matches
            # Remove unnecessary symbols from the search text
            pattern = re.compile('[^a-zA-Z0-9åÅøØæÆ ]+', re.UNICODE)

            for search_text in search_texts:
                text = pattern.sub('', search_text)
                if search_type == "okapibm25":
                    ranked = self._rank_okapibm25(text, n_fetch)
                elif search_type == "esa_relatedness":
                    ranked = self._rank_esa(text, n_fetch)
                elif search_type == "fasttext_a":
                    ranked = self._rank_fasttext_abstracts(text, n_fetch)
                elif search_type == "fasttext_b":
                    ranked = self._rank_fasttext_ngrams(text, n_fetch, p, verbose)
                else:
                    ranked = []
                results.extend(
                    (document.title, document.abstract, document.url, score)
                    for document, score in ranked
                )

            if remove_similar:
                results = self._drop_similar_titles(results)[:n_matches]

        titles = [row[0] for row in results]
        texts = [row[1] for row in results]
        urls = [row[2] for row in results]
        scores = [row[3] for row in results]
        return titles, texts, urls, scores

    def _embed(self, text):
        """Embed ``text`` with the shared FastText model and the configured composition."""
        return self.rsspediainit.sumVectorRepresentation(text = text, type = self.embedding_composition)

    def _rank_okapibm25(self, text, n_fetch):
        """Return up to ``n_fetch`` ``(document, score)`` pairs from ``wikipedia.search``."""
        wikipedia_results, _search_terms = self.wikipedia.search(query = text, k_1 = 1.2, b = 0.75)
        return [
            (self.wikipedia.documents[index], score)
            for index, score in wikipedia_results[:n_fetch].items()
        ]

    def _rank_esa(self, text, n_fetch):
        """Return the ``n_fetch`` articles with the highest tf-idf dot product with ``text``."""
        query = self._transformer.transform([text])
        similarities = (self._Y @ query.T).toarray().ravel()
        order = np.argsort(-similarities, kind="stable")[:n_fetch]
        return [(self.wikipedia.documents[index], float(similarities[index])) for index in order]

    def _rank_fasttext_abstracts(self, text, n_fetch):
        """Return the ``n_fetch`` articles whose abstract embedding is closest to ``text``."""
        distances = self.cdist_func(self.wikipedia_abstract_vectors, [self._embed(text)])[1][:, 0]
        # Zero vectors give NaN cosine distances; they cannot be ranked, so leave them out.
        valid = np.flatnonzero(~np.isnan(distances))
        order = valid[np.argsort(distances[valid], kind="stable")][:n_fetch]
        return [(self.wikipedia.documents_clean[index], float(distances[index])) for index in order]

    def _rank_fasttext_ngrams(self, text, n_fetch, p, verbose):
        """Rank articles by the best weighted distance between any query n-gram and their title."""
        text_vector = self._embed(text)
        best_distance = {}  # article index -> smallest combined distance seen so far
        for ngram in self.get_ngrams(text):
            ngram_vector = self._embed(ngram)
            title_distances = self.cdist_func(self.wikipedia_title_vectors, [ngram_vector])[1][:, 0]
            context_distance = self.cdist_func([text_vector], [ngram_vector])[1][0, 0]
            combined = title_distances * p + context_distance * (1 - p)
            for index in np.argsort(combined, kind="stable")[:self._TOP_PER_NGRAM]:
                distance = float(combined[index])
                if np.isnan(distance):
                    continue
                index = int(index)
                if verbose:
                    print("{} {} {} {}".format(index, self.wikipedia.documents_clean[index].title, distance, ngram))
                # Different n-grams can match the same page; keep its best distance only.
                if distance < best_distance.get(index, np.inf):
                    best_distance[index] = distance
        ranked = sorted(best_distance.items(), key=lambda item: item[1])[:n_fetch]
        return [(self.wikipedia.documents_clean[index], distance) for index, distance in ranked]

    def _drop_similar_titles(self, results):
        """Keep each result only if its title is not too close to an earlier kept result."""
        kept_rows = []
        kept_vectors = []
        for row in results:
            title_vector = self._embed(row[0])
            if kept_vectors:
                distances = self.cdist_func(kept_vectors, [title_vector])[1]
                # NaN distances compare as False, so unembeddable titles are kept.
                if np.any(distances < self._SIMILAR_TITLE_DISTANCE):
                    continue
            kept_rows.append(row)
            kept_vectors.append(title_vector)
        return kept_rows


# Wrap the precomputed state in the search front end. The RsspediaInit instance created earlier
# in this notebook is named `rsspediainit`; `rsspediainitSUM` is never defined.
#rsspedia = Rsspedia(wikipedia, rsspediainitAVG)
rsspedia = Rsspedia(wikipedia, rsspediainit)

In [None]:
# Alternative headlines to try:
#test_headline = "FN's verdensmål sættes i centrum på Folkemødet"
#test_headline = "Google fyrer 13 chefer og 35 medarbejdere for sexchikane"
#test_headline = "Søren Hansen om den »nøgne sandhed«: Erdogan går på diplomatiske listefødder for at forbedre sit forhold til USA"
test_headline = "Søren Hansen"

# Run the same headline through every retrieval strategy and show the matched titles.
for search_type in ("okapibm25", "esa_relatedness", "fasttext_a", "fasttext_b"):
    titles, texts, urls, scores = rsspedia.search_wiki(
        search_texts=[test_headline],
        n_matches=5,
        search_type=search_type,
    )
    pprint.pprint(titles)

In [None]:
# Same headline with the n-gram strategy, weighting the title distance at p = 0.4.
titles, texts, urls, scores = rsspedia.search_wiki(
    search_texts=[test_headline],
    n_matches=5,
    search_type="fasttext_b",
    p=0.4,
)
pprint.pprint(titles)

In [None]:
#test_headline = "Søren Hansen om den »nøgne sandhed«: Erdogan går på diplomatiske listefødder for at forbedre sit forhold til USA"

# Sweep the title/context weight for each headline and list the ranked matches with their scores.
p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

test_headlines = ["Justin Bieber elsker burger",
                  "Søren Hansen sover godt"]

for weight in p:
    for headline in test_headlines:
        titles, texts, urls, scores = rsspedia.search_wiki(
            search_texts=[headline],
            n_matches=15,
            search_type="fasttext_b",
            p=weight,
            verbose=False,
        )
        print("*** ({}) {} ***".format(weight, headline))
        for rank, (score, title) in enumerate(zip(scores, titles), start=1):
            print("({}) {:.5f} {}".format(rank, score, title))
        print("\n")


In [None]:
# Cosine distance between one stored title embedding and the embedding of "sørEN".
# The embedding helper is defined on RsspediaInit, so it is reached through rsspedia.rsspediainit.
# NOTE(review): 44919 is a hard-coded index into the title vectors — confirm which article it is.
rsspedia.cdist_func([rsspedia.wikipedia_title_vectors[44919]], [rsspedia.rsspediainit.sumVectorRepresentation("sørEN")])[1]