In [None]:
# Run the shared project setup script, located one directory above the current
# working directory.
try:
    with open("../global_setup.py") as setupfile:
        # exec() without explicit namespaces runs the script's source in this
        # notebook's global namespace.
        exec(setupfile.read())
except FileNotFoundError:
    # NOTE(review): presumably the setup script changes the working directory,
    # so the relative path no longer resolves on a re-run — confirm against
    # global_setup.py. This handler also catches a FileNotFoundError raised
    # from inside the executed script itself.
    print('Setup already completed')

In [None]:
import html
import pprint
import re

import numpy as np
from gensim.models.fasttext import FastText
from scipy.spatial.distance import cdist
# ESA relatedness package
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
from src.text.document_retrieval.wikipedia import Wikipedia # Generic Wikipedia class
# Shared Wikipedia handle for the Danish-language corpus; every later cell reads from it.
# NOTE(review): the meaning of cache_directory_url=False is defined by the Wikipedia class — confirm there.
wikipedia = Wikipedia(language="Danish", cache_directory_url=False)

In [None]:
class Rsspedia:
    """Match short texts (e.g. news headlines) against Wikipedia articles.

    ``search_wiki`` supports three strategies:

    * ``"okapibm25"``       - delegates to ``Wikipedia.search``.
    * ``"esa_relatedness"`` - similarity between tf-idf vectors of the query
      and of every article text.
    * ``"fasttext_a"``      - cosine distance between the summed fastText word
      vectors of the query and of every non-empty article abstract.

    Attributes set by the constructor:
        wikipedia: the wrapped ``Wikipedia`` object. It additionally receives a
            ``documents_clean`` attribute holding the documents whose abstract
            contains at least one letter or digit.
        model: fastText model used for the vector representations, or ``None``.
        texts: article texts with whitespace runs collapsed to single spaces.
        wikipedia_abstract_vectors / wikipedia_title_vectors: summed word
            vectors, index-aligned with ``wikipedia.documents_clean`` (empty
            lists when no model is available).
    """

    # Runs of line breaks / carriage returns / spaces in article texts.
    _WHITESPACE = re.compile('[\n\r ]+', re.UNICODE)
    # Everything that is not a letter (incl. Danish) or digit; used to detect empty abstracts.
    _NON_ALNUM = re.compile('[^a-zA-Z0-9åÅøØæÆ]+', re.UNICODE)
    # Same, but keeps spaces so the cleaned text can still be split into words.
    _NON_ALNUM_OR_SPACE = re.compile('[^a-zA-Z0-9åÅøØæÆ ]+', re.UNICODE)
    _SEARCH_TYPES = ("okapibm25", "esa_relatedness", "fasttext_a")

    def __init__(self, wikipedia: "Wikipedia", model=None):
        """Index the documents of ``wikipedia`` for all search strategies.

        Args:
            wikipedia: object exposing ``documents`` (indexable by position,
                each item having ``text``, ``abstract``, ``title`` and ``url``)
                and a ``search`` method.
            model: optional fastText model exposing ``wv``. When omitted, a
                notebook-level variable named ``model`` is used if one exists
                (this is where the code used to look); otherwise the fastText
                vectors are not built and only the other two strategies work.
        """
        self.search_results = []
        self.content = self.wikipedia_results = None
        self.wikipedia = wikipedia
        self.model = model if model is not None else globals().get("model")

        # Positional access only: the concrete container type is not visible here.
        documents = [wikipedia.documents[i] for i in range(len(wikipedia.documents))]

        # Remove all the line breaks and carriage returns from wiki texts
        self.texts = [self._WHITESPACE.sub(' ', document.text) for document in documents]

        # Calculate tf-idf representation for wiki texts (takes time)
        self._transformer = TfidfVectorizer(stop_words=None, norm="l2", use_idf=True, sublinear_tf=False)
        self._Y = self._transformer.fit_transform(self.texts)

        # Keep only documents whose abstract has real content (a missing abstract counts as empty).
        self.wikipedia.documents_clean = [
            document for document in documents
            if self._NON_ALNUM.sub('', document.abstract or '')
        ]

        # Fasttext: vectorized representation of every kept article (takes time).
        # Errors propagate instead of aborting the loop half-way, so the vector
        # lists can never get out of step with documents_clean.
        self.wikipedia_abstract_vectors = []
        self.wikipedia_title_vectors = []
        if self.model is not None:
            for document in self.wikipedia.documents_clean:
                self.wikipedia_abstract_vectors.append(self.sumVectorRepresentation(document.abstract))
                self.wikipedia_title_vectors.append(self.sumVectorRepresentation(document.title))

    def loadTexts(self, texts):
        """Replace ``self.texts``. The tf-idf matrix fitted in the constructor is not refitted."""
        self.texts = texts

    def sumVectorRepresentation(self, text, verbose = False):
        """Return the sum of the fastText vectors of the words in ``text``.

        The text is stripped of everything except letters, digits and spaces,
        lower-cased and split on whitespace. Words for which the model raises
        ``KeyError`` are skipped. ``None`` is treated as an empty text.

        Args:
            text: the text to vectorize.
            verbose: when true, print the word list and every skipped word.

        Returns:
            A 1-D numpy array of length ``model.wv.vector_size`` (all zeros
            when no word could be looked up).

        Raises:
            ValueError: if no fastText model is available.
        """
        if self.model is None:
            raise ValueError("No fastText model available; pass `model=` when constructing Rsspedia.")
        word_vectors = self.model.wv
        words = self._NON_ALNUM_OR_SPACE.sub('', text or '').lower().strip().split()
        # Size comes from the model itself rather than from looking up a probe word.
        text_vector = np.zeros(word_vectors.vector_size)
        if verbose:
            print("len: {}, words: {}".format(len(words), words))
        for i, word in enumerate(words):
            try:
                text_vector = text_vector + word_vectors[word]
            except KeyError as e:
                if verbose:
                    print("i: {}, e: {}".format(i, e))
        return text_vector

    @staticmethod
    def cdist_func(A, B):
        """Cosine distances between the rows of ``A`` and the rows of ``B``.

        Returns:
            ``(nearest, dists)`` where ``dists`` has shape ``(len(A), len(B))``
            and ``nearest[j]`` is the row index in ``A`` closest to ``B[j]``.
        """
        dists = cdist(A, B, 'cosine')
        return np.argmin(dists, axis=0), dists

    def display_beautifully(self, titles, texts, urls):
        """Render parallel lists of results as an HTML ordered list.

        Titles, texts and urls are HTML-escaped so that characters such as
        ``&``, ``<`` or quotes in article data cannot break the markup.

        Returns:
            The HTML as a single string.
        """
        formatted_result_list = ["<ol>"]
        for i, title in enumerate(titles):
            safe_url = html.escape(str(urls[i]))
            safe_title = html.escape(str(title))
            safe_text = html.escape(str(texts[i]))
            formatted_result_list.append("\n".join([
                "<li>",
                f"<p><a href=\"{safe_url}\">{safe_title}</a></p>",
                f"<p>{safe_text}</p>",
                "</li>"
            ]))
        formatted_result_list.append("</ol>")
        return "\n".join(formatted_result_list)

    def search_wiki(self, search_texts, n_matches = 3, search_type = "okapibm25"):
        """Find the best matching Wikipedia articles for each search text.

        Args:
            search_texts: list of query strings (a single string is accepted
                and treated as a one-element list). ``None`` or an empty list
                yields empty results.
            n_matches: number of articles to return per query.
            search_type: ``"okapibm25"``, ``"esa_relatedness"`` or ``"fasttext_a"``.

        Returns:
            ``(titles, texts, urls)`` - three parallel lists holding, for every
            query in order, the title, abstract and url of its matches.

        Raises:
            ValueError: for an unknown ``search_type``, or for ``"fasttext_a"``
                when no abstract vectors were built.
        """
        if search_type not in self._SEARCH_TYPES:
            raise ValueError("Unknown search_type {!r}; expected one of {}".format(search_type, self._SEARCH_TYPES))

        titles = []
        texts = []
        urls = []
        if not search_texts:
            return titles, texts, urls
        if isinstance(search_texts, str):
            search_texts = [search_texts]

        for text in search_texts:
            # Remove unnecessary symbols from the search text
            text = self._NON_ALNUM_OR_SPACE.sub('', text)

            if search_type == "okapibm25":
                # NOTE(review): the result is sliced and iterated via .items(), so it is
                # presumably a pandas Series of index -> score, best first — confirm in Wikipedia.search.
                wikipedia_results, _search_terms = self.wikipedia.search(query = text, k_1 = 1.2, b = 0.75)
                matches = [self.wikipedia.documents[index]
                           for index, _score in wikipedia_results[:n_matches].items()]
                pprint.pprint(wikipedia_results)
            elif search_type == "esa_relatedness":
                y = self._transformer.transform([text])
                # One similarity score per article: tf-idf matrix times the query vector.
                scores = (self._Y @ y.T).toarray()[:, 0]
                ranking = np.argsort(-scores, kind="stable")[:n_matches]
                matches = [self.wikipedia.documents[index] for index in ranking]
            else:  # "fasttext_a"
                if len(self.wikipedia_abstract_vectors) == 0:
                    raise ValueError("fasttext_a search needs abstract vectors; construct Rsspedia with a fastText model.")
                text_vector = self.sumVectorRepresentation(text)
                _nearest, dists = self.cdist_func(self.wikipedia_abstract_vectors, [text_vector])
                # Smallest cosine distance first; NaN distances sort last.
                ranking = np.argsort(dists[:, 0], kind="stable")[:n_matches]
                # The vectors are aligned with documents_clean, not with documents.
                matches = [self.wikipedia.documents_clean[index] for index in ranking]

            # Accumulate, so results of earlier queries are kept for every strategy.
            for document in matches:
                titles.append(document.title)
                texts.append(document.abstract)
                urls.append(document.url)

        return titles, texts, urls

# Build the searcher on the Danish Wikipedia object created earlier. Per the
# constructor's own comments, computing the tf-idf and fastText representations
# takes time.
rsspedia = Rsspedia(wikipedia)

In [None]:
# Run the same sample headline through both retrieval strategies and show the
# matched article titles for each.
test_headline = "FN's verdensmål sættes i centrum på Folkemødet"

for search_type in ("okapibm25", "esa_relatedness"):
    titles, texts, urls = rsspedia.search_wiki(
        search_texts=[test_headline],
        search_type=search_type,
    )
    pprint.pprint(titles)

In [None]:
# Article texts with every run of line breaks, carriage returns and spaces
# collapsed into a single space.
pattern = re.compile('[\n\r ]+', re.UNICODE)
texts = [
    pattern.sub(' ', wikipedia.documents[index].text)
    for index in range(len(wikipedia.documents))
]

In [None]:
# Spot-check one normalised article text (index 6 is presumably an arbitrary sample).
# When `texts` comes from the cell that builds it with `pattern`, it is already
# normalised, so re-applying the substitution leaves the string unchanged.
pattern.sub(' ', texts[6])