In [9]:
import requests
import json
import pandas as pd
import re
import spacy
import time

In [68]:
class DBSpotlight:
    """Minimal client for the REST API of a DBpedia Spotlight server.

    Every endpoint method sends a GET request carrying the ``text`` and
    ``confidence`` query parameters, asks for JSON through the ``Accept``
    header and returns the raw response body as a string. The HTTP status
    code is not inspected, so an error page is returned as-is and it is up
    to the caller to deal with bodies that are not valid JSON.
    """

    def __init__(self, url, port, timeout=None):
        """Store the base URL of the service.

        Args:
            url: Scheme and host, e.g. ``"http://localhost"``.
            port: Port of the service, as a string or an integer.
            timeout: Optional timeout in seconds handed to ``requests.get``.
                ``None`` (the default) waits indefinitely.
        """
        # str() lets callers pass the port as an int as well as a string.
        self.url = url + ":" + str(port) + "/rest/"
        self.timeout = timeout

    def _get(self, endpoint, text, confidence):
        """Send one GET request to ``endpoint`` and return the response body."""
        r = requests.get(
            self.url + endpoint,
            params={
                "text": text,
                "confidence": confidence,
            },
            headers={
                "Accept": "application/json",
            },
            timeout=self.timeout,
        )
        return r.text

    def annotate(self, text, confidence):
        """Return the body of the ``annotate`` endpoint for ``text``."""
        return self._get("annotate", text, confidence)

    def candidate(self, text, confidence):
        """Return the body of the ``candidates`` endpoint for ``text``."""
        # NOTE(review): the Spotlight REST API names this endpoint
        # "candidates" (plural); the former singular path is assumed to have
        # been a typo - confirm against the deployed server.
        return self._get("candidates", text, confidence)

    def spot(self, text, confidence):
        """Return the body of the ``spot`` endpoint for ``text``."""
        return self._get("spot", text, confidence)

In [69]:
# Connect to the locally running Spotlight service and annotate one Danish
# sentence as a quick check that the service answers.
dbspotlight = DBSpotlight(url="http://localhost", port="2222")
r = dbspotlight.annotate(
    text='Lausanne er en by i den fransktalende del af Schweiz ved bredden af Genevesøen.',
    confidence=0.5,
)

In [71]:
# Load the dump (one JSON object per line) into a DataFrame.
df = pd.read_json("DanishWikiData.ndjson", lines=True)

# A page counts as an article unless the part of its title before the first
# ":" is exactly "Kategori".
df["isArticle"] = df["title"].map(
    lambda title: title.partition(":")[0] != "Kategori"
)

In [102]:

# Preview of the sentence extraction on a single article.
nlp = spacy.load("da_core_news_sm")

# Take the first article by position. The previous label-based ``[0]`` lookup
# only worked when the row labelled 0 happened to be an article.
pagecontent = df.loc[df["isArticle"], "pagecontent"].iloc[0]

# Remove parenthesised asides (optionally preceded by a space) and
# "== Heading ==" style markup.
pagecontent = re.sub(r"( )\([^)]*\)|\([^)]*\)|\={2,}[^=]*\={2,}", "", pagecontent)

doc = nlp(pagecontent)

# Keep sentences with more than three space-separated chunks. The text is
# stripped before counting, exactly as the bulk annotation loop does, so the
# preview shows the same selection. Newline runs become a single space.
sentences = [
    re.sub(r"(\n)+", " ", sent.text).strip(" ")
    for sent in doc.sents
    if len(sent.text.strip().split(" ")) > 3
]
sentences

['Belgien, officielt Kongeriget Belgien, er en suveræn stat i Vesteuropa.',
 'Det grænser op til Frankrig, Nederlandene, Tyskland, Luxembourg og Nordsøen.',
 'Belgien er et lille og tætbefolket land.',
 'Det har et samlet areal på 30.528 kvadratkilometer og et indbyggertal på 11.521.238.',
 'Belgien ligger på grænsen mellem det germanske og romanske Europa og er således hjemsted for to lingvistiske hovedgrupper: De nederlandsktalende flamlændere, som udgør omkring 59 % af befolkningen, og de fransktalende vallonere, som udgør cirka 41 %.',
 'Der findes desuden en lille gruppe tysktalende i den østlige del af landet.',
 'De moderne stater Belgien, Nederlandene og Luxembourg er historisk set blevet kaldt for Nederlandene.',
 'Foruden de nuværende Benelux-lande, omfattede dette område oprindeligt også mindre dele af det nordlige Frankrig og vestlige Tyskland.',
 'Regionen blev på latin kaldt Belgica, som var afledt af navnet på den romerske provins Gallia Belgica.',
 'Fra slutningen af mi

In [None]:
# Text of every page flagged as an article.
df.loc[df["isArticle"], "pagecontent"]

In [103]:
# Sentence-split every article and annotate each sentence with Spotlight at
# confidence 0.5, collecting the raw response bodies in db_spotlight_json_05.

# Compiled once instead of on every article / every retry.
MARKUP_PATTERN = re.compile(r"( )\([^)]*\)|\([^)]*\)|\={2,}[^=]*\={2,}")
NEWLINE_RUN_PATTERN = re.compile(r"(\n)+")

# Upper bound on consecutive failed attempts for one sentence. None keeps the
# original behaviour of retrying forever; set an integer to re-raise the last
# error instead of hanging when the service is permanently down.
MAX_RETRIES_PER_SENTENCE = None

db_spotlight_json_05 = []
counter = 0
nlp = spacy.load("da_core_news_sm")
restartCounter = 0
for pagecontent in df.loc[df["isArticle"], "pagecontent"]:

    pagecontent_filtered = MARKUP_PATTERN.sub("", pagecontent)

    doc = nlp(pagecontent_filtered)

    for sent in doc.sents:
        # Skip fragments of three or fewer space-separated chunks.
        if len(sent.text.strip().split(" ")) <= 3:
            continue

        # Clean the sentence once, outside the retry loop.
        sentence_text = NEWLINE_RUN_PATTERN.sub(" ", sent.text).strip(" ")

        attempts = 0
        while True:
            try:
                response = dbspotlight.annotate(sentence_text, 0.5)
            # OSError already covers the built-in ConnectionError; the
            # exceptions raised by requests derive from it as well.
            except OSError:
                attempts += 1
                restartCounter += 1
                if (MAX_RETRIES_PER_SENTENCE is not None
                        and attempts > MAX_RETRIES_PER_SENTENCE):
                    raise
                print("retrying x " + str(restartCounter), end="\n\r")
                time.sleep(1)
            else:
                db_spotlight_json_05.append(response)
                break
    counter += 1
    print(counter, end="\r")

retrying x 1
retrying x 2
retrying x 3
retrying x 4
retrying x 5
retrying x 6
retrying x 7
retrying x 8
retrying x 9
retrying x 10
retrying x 11
retrying x 12
retrying x 13
retrying x 14
retrying x 15
retrying x 16
retrying x 17
retrying x 18
retrying x 19
retrying x 20
retrying x 21
retrying x 22
retrying x 23
retrying x 24
retrying x 25
retrying x 26
retrying x 27
retrying x 28
retrying x 29
retrying x 30
retrying x 31
retrying x 32
retrying x 33
retrying x 34
retrying x 35
retrying x 36
retrying x 37
retrying x 38
279638

In [105]:
# Persist the raw responses, one per line.
with open("db_spotlight_05.json", "w", encoding="utf-8") as file:
    for sent in db_spotlight_json_05:
        # Flatten line breaks so that every response occupies exactly one
        # line. In valid JSON a raw line break can only be insignificant
        # whitespace, so the parsed content is unchanged.
        file.write(sent.replace("\r", " ").replace("\n", " ") + "\n")

In [4]:
# The file holds one response per line rather than a single JSON document, so
# json.load() cannot parse it. Read it into a list of raw JSON strings; each
# one is decoded individually later on.
with open("db_spotlight_05.json", "r", encoding="utf-8") as file:
    db_spotlight_05 = [line.rstrip("\n") for line in file if line.strip()]

In [107]:
# Number of raw responses collected by the annotation loop.
len(db_spotlight_json_05)

4027712

In [37]:


def findTokenOffsets(sentence):
    """Map the start offset of every space-separated chunk to that chunk.

    The sentence is split on single spaces only, so punctuation stays
    attached to the neighbouring word and consecutive spaces produce empty
    chunks. Offsets are character positions within ``sentence``.

    Args:
        sentence: The text to split.

    Returns:
        A dict ``{start_offset: chunk}`` in left-to-right order.
    """
    chunkByOffset = {}
    position = 0
    for chunk in sentence.split(" "):
        chunkByOffset[position] = chunk
        # The next chunk starts after this one plus the separating space.
        position += len(chunk) + 1
    return chunkByOffset

def createTarget(dbEntry):
    """Build the target string for one decoded Spotlight response.

    Walks over the start offsets of the space-separated chunks of
    ``dbEntry["@text"]``. For each offset, every entry of
    ``dbEntry["Resources"]`` is visited in order: an entry whose ``@offset``
    equals the chunk offset contributes its ``@surfaceForm``; any other entry
    contributes a single ``*`` marker, unless the string is still empty or
    already ends with ``*``.

    Args:
        dbEntry: Decoded response with the keys ``"@text"`` and
            ``"Resources"``.

    Returns:
        The surface forms and ``*`` markers joined by single spaces.

    Raises:
        KeyError: If ``"@text"``, ``"Resources"`` or a visited resource key
            is missing.
    """
    chunkStarts = list(findTokenOffsets(dbEntry["@text"]))

    target = ""
    for chunkStart in chunkStarts:
        for resource in dbEntry["Resources"]:
            if int(resource["@offset"]) != chunkStart:
                # Not an entity start: add a marker, but never a leading one
                # and never two in a row.
                if target != "" and target.split(" ")[-1] != "*":
                    target = target + " *"
                continue

            # Entity starts here: append its surface form.
            separator = " " if target != "" else ""
            target = target + separator + resource["@surfaceForm"]

    return target

def createSource(nlp, dbEntry):
    """Build the source string for one decoded Spotlight response.

    Runs ``nlp`` on ``dbEntry["@text"]`` and writes every token as its text
    followed by its ``pos_`` value.

    Args:
        nlp: Callable that turns a string into an iterable of tokens exposing
            ``text`` and ``pos_``.
        dbEntry: Decoded response with the key ``"@text"``.

    Returns:
        ``"text POS text POS ..."`` with single spaces in between; an empty
        string when there are no tokens.
    """
    tokens = nlp(dbEntry["@text"])
    return " ".join(
        piece
        for token in tokens
        for piece in (token.text, token.pos_)
    )


In [43]:
# Turn every stored response into a {"source", "target"} training pair.
nlp = spacy.load("da_core_news_sm")
annotatedData = []
counter = 0
errorCounter = 0  # responses that could not be decoded as JSON
for entry in db_spotlight_05:
    counter += 1
    print(counter, end="\r")

    try:
        dbEntry = json.loads(entry)

    # Failed dbpedia spotlight request
    except json.JSONDecodeError:
        errorCounter += 1
        continue

    try:
        # Build the target first: it raises KeyError for entries without
        # "Resources", which spares the far more expensive spaCy pass in
        # createSource for every entry that is dropped anyway.
        target = createTarget(dbEntry)
        source = createSource(nlp, dbEntry)
    # No targets
    except KeyError:
        continue

    annotatedData.append({
        "source": source,
        "target": target,
    })



Done compiling, concatinating


TypeError: cannot concatenate object of type '<class 'dict'>'; only Series and DataFrame objs are valid

In [44]:
dfAnnotatedData = pd.DataFrame(annotatedData)

# Drop the last two characters of every target whose final space-separated
# piece is the "*" marker; all other targets are kept unchanged.
cleanedTargets = [
    rawTarget[:-2] if rawTarget.split(" ")[-1] == "*" else rawTarget
    for rawTarget in dfAnnotatedData["target"]
]

dfAnnotatedData["cleaned_target"] = cleanedTargets

In [60]:

# Shuffle the pairs and split them into a training and a validation part.
TRAIN_FRACTION = 0.8  # share of the rows that goes into the training set
SPLIT_SEED = 42       # fixed seed so that re-running yields the same split

# Index of the first validation row in the shuffled frame.
validationSplit = int(len(dfAnnotatedData) * TRAIN_FRACTION)

dfAnnotatedDataShuffled = dfAnnotatedData.sample(frac=1, random_state=SPLIT_SEED)

train_set = dfAnnotatedDataShuffled.iloc[:validationSplit]
validation_set = dfAnnotatedDataShuffled.iloc[validationSplit:]

In [64]:
# Write the training pairs as two line-aligned files: line i of the source
# file belongs to line i of the target file.
with open("src_train_dbpedia_spotlight05_da.txt", "w", encoding="utf-8") as srcfile, \
        open("tgt_train_dbpedia_spotlight05_da.txt", "w", encoding="utf-8") as tgtfile:

    # Columns are addressed by label; positional row[0] / row[2] access on a
    # label-indexed Series is deprecated in recent pandas versions.
    for src_text, tgt_text in zip(train_set["source"], train_set["cleaned_target"]):
        # A line break inside a field would shift every following line and
        # break the alignment, so line breaks are flattened to spaces.
        srcfile.write(src_text.replace("\r", " ").replace("\n", " ") + "\n")
        tgtfile.write(tgt_text.replace("\r", " ").replace("\n", " ") + "\n")



In [62]:
# Write the validation pairs as two line-aligned files: line i of the source
# file belongs to line i of the target file.
with open("src_valid_dbpedia_spotlight05_da.txt", "w", encoding="utf-8") as srcfile, \
        open("tgt_valid_dbpedia_spotlight05_da.txt", "w", encoding="utf-8") as tgtfile:

    # Columns are addressed by label; positional row[0] / row[2] access on a
    # label-indexed Series is deprecated in recent pandas versions.
    for src_text, tgt_text in zip(validation_set["source"], validation_set["cleaned_target"]):
        # A line break inside a field would shift every following line and
        # break the alignment, so line breaks are flattened to spaces.
        srcfile.write(src_text.replace("\r", " ").replace("\n", " ") + "\n")
        tgtfile.write(tgt_text.replace("\r", " ").replace("\n", " ") + "\n")



Unnamed: 0,source,target,cleaned_target
84913,Instruktion NOUN : PUNCT Richard PROPN Fleisch...,Richard * Martin Balsam * Joseph Cotten * Pear...,Richard * Martin Balsam * Joseph Cotten * Pear...
60271,Cavallis PROPN anden DET opera NOUN var AUX Gl...,opera * Dafne,opera * Dafne
1418862,Han PRON er AUX langt ADV ude ADV i ADP famili...,Otto Leisner *,Otto Leisner
57107,Sassanidernes NOUN drøm NOUN om ADP at PART ge...,Jerusalem * Damaskus *,Jerusalem * Damaskus
129948,Det PRON var AUX da ADV også ADV hans DET berø...,Karl VI * Wien * historiograf * 1718,Karl VI * Wien * historiograf * 1718
...,...,...,...
1590844,Navnet NOUN blev AUX brugt VERB af ADP de DET ...,gudinde *,gudinde
2758997,\n SPACE I ADP geografisk ADJ og CCONJ politis...,islam,islam
668265,Der ADV var VERB dog ADV ingen DET sure NOUN m...,miner *,miner
2574333,Blandt ADP Gestapofolkene NOUN var AUX Ib PROP...,Ib Birkedal Hansen *,Ib Birkedal Hansen
