# Properties

In [1]:
# CSV file with the candidate sentence pairs that this notebook reads.
input_path = "./candidates/output/candidates_similarity.with_categories.20000.csv"
# Destination of the CAS XMI file that this notebook writes.
output_path = "./similarity_categories_11-6-2021.xmi"

# Install

## Install Package to write CAS XMI files

See https://github.com/dkpro/dkpro-cassis

In [1]:
#pip install dkpro-cassis

Note: you may need to restart the kernel to use updated packages.


## Install pandas for better CSV/TSV support

In [2]:
#pip install pandas

Note: you may need to restart the kernel to use updated packages.


# Script

## Import CSV Input File

In [3]:
import pandas as pd
# Load the candidate pairs; the input file is comma-separated.
segments = pd.read_csv(input_path,sep=',')

In [4]:
# Display the loaded pairs to check the columns (a_* and b_* per sentence).
segments

Unnamed: 0,a_doc_id,a_start,a_end,a_text,a_url,a_title,a_categories,b_doc_id,b_start,b_end,b_text,b_url,b_title,b_categories
0,148545,236,254,"Microsoft encouraged users to upgrade to IE8, ...",https://en.wikinews.org/wiki?curid=148545,"France, Germany officials warn against using I...","Microsoft,Science and technology,France,German...",88561,209,246,Netscape's early success and virtual monopoly ...,https://en.wikinews.org/wiki?curid=88561,Netscape navigating the World Wide Web no more,"Science and technology,Mozilla,Microsoft,Inter..."
1,320196,1583,1637,"President Obama for instance, received more mo...",https://en.wikinews.org/wiki?curid=320196,Wikinews interviews former Salt Lake City mayo...,"Interview,Utah,Washington, D.C.,Mitt Romney,Hu...",113891,804,859,Obama said that he and McCain disagreed fundam...,https://en.wikinews.org/wiki?curid=113891,McCain and Obama face off in U.S. presidential...,"United States,Politics and conflicts,Mississip..."
2,130602,325,366,"In defense of the actions, Rod McInnis, the so...",https://en.wikinews.org/wiki?curid=130602,Government shuts off water to California farms...,"North America,United States,California,Environ...",57234,0,108,Today the Interior Department's U.S. Fish and ...,https://en.wikinews.org/wiki?curid=57234,"Polar bears reviewed to become ""threatened"" sp...","Science and technology,United States,Environme..."
3,201658,101,132,In response to the outbreak the FDA deployed a...,https://en.wikinews.org/wiki?curid=201658,US food regulators recall 380 million eggs aft...,"United States,Colorado,Minnesota,California,No...",121431,991,1030,Both the DA and DOH officials have decided to ...,https://en.wikinews.org/wiki?curid=121431,Philippines is culling 6500 hogs after Reston ...,"Asia,Philippines,Biology,Disease,Infectious di..."
4,79609,556,606,"* 'Red Sox 5, Devil Rays 4': B.J. Upton hit a...",https://en.wikinews.org/wiki?curid=79609,"Major League Baseball roundup: September 12, 2007","Sports,Baseball,Major League Baseball,United S...",79467,117,146,"* 'Twins 5, White Sox 2': Jason Kubel drove i...",https://en.wikinews.org/wiki?curid=79467,"Major League Baseball roundup: September 9, 2007","Sports,Baseball,Major League Baseball,United S..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1790,74116,146,183,Finding water on this planet implies that othe...,https://en.wikinews.org/wiki?curid=74116,NASA says water found on exoplanet,"Science and technology,Space,NASA,European Spa...",209716,0,44,Astronomers at the University of California at...,https://en.wikinews.org/wiki?curid=209716,New planet found in 'Habitable Zone',"California,Astronomy,United States,Science and..."
1791,130717,55,76,"The second bomb killed three young boys, in th...",https://en.wikinews.org/wiki?curid=130717,Two roadside bombs kill at least 14 in Afghani...,"Asia,Afghanistan,Afghan Taliban,Politics and c...",208980,14,38,Sgt. Kazimierz Kasprzak was killed in the prov...,https://en.wikinews.org/wiki?curid=208980,Polish soldier killed in eastern Afghanistan,"Afghanistan,Poland,Politics and conflicts,Afgh..."
1792,107064,1744,1770,NASA Associate Administrator for Space Operati...,https://en.wikinews.org/wiki?curid=107064,Space Shuttle Discovery launches on mission ST...,"Space,Florida,Japan,United States,Internationa...",44449,43,71,The expected 13-day mission has been described...,https://en.wikinews.org/wiki?curid=44449,NASA to launch Discovery despite crack in insu...,"International Space Station,Space Shuttle Disc..."
1793,42868,116,135,The Argentines had scored first when an in-swi...,https://en.wikinews.org/wiki?curid=42868,Argentina start with 2-1 win in Group C,"Argentina,Côte d'Ivoire,Football (soccer),FIFA...",2846899,493,521,Quaresma was booked in the 64th minute for a b...,https://en.wikinews.org/wiki?curid=2846899,"FIFA World Cup 2018 day 12, 13, 14, 15: Iran, ...","Saudi Arabia,Egypt,Uruguay,Luis Suárez,Denis C..."


## Load Typesystem

In [5]:
from cassis import *
import numpy as np

# Load the type system from `typesystem.xml`, resolved relative to the current
# working directory. The file is opened in binary mode and closed by the
# `with` block once loading is done.
with open('typesystem.xml', 'rb') as f:
    typesystem = load_typesystem(f)

## Create CAS & Sofa

In [6]:
# Creating the CAS automatically creates the view `_InitialView`.
cas = Cas(typesystem=typesystem)

# Annotation types looked up by name in the loaded type system:
# one for a single sentence, one for a directed relation between two sentences.
Sentence = typesystem.get_type('webanno.custom.Sentence')
SentenceRelation = typesystem.get_type('webanno.custom.SentenceRelation')

# Document text (sofa) that is built up by appending every new sentence;
# it is assigned to the CAS only after all rows have been processed.
sofa = ""
# Ids (see createId) of the sentences that already have an annotation,
# used to avoid adding the same sentence twice.
alreadyIncludedSentences = []

def createId(doc_id, sent_start, sent_end):
    """Build the string id of a sentence from its document id and offsets.

    Each argument is converted with `str()`; the result has the form
    `doc_<doc_id>_start_<sent_start>_end_<sent_end>`.
    """
    parts = ["doc_", str(doc_id), "_start_", str(sent_start), "_end_", str(sent_end)]
    return "".join(parts)

# get Sentence Annotation by id
def searchForSentenceAnnotation(id):
    """Return the first Sentence annotation in `cas` whose `sent_id` equals `id`.

    Reads the module-level `cas` and iterates over its
    'webanno.custom.Sentence' annotations. Returns None when none matches.
    """
    matching = (
        sentence
        for sentence in cas.select('webanno.custom.Sentence')
        if sentence.sent_id == id
    )
    return next(matching, None)

# Relation annotations collected while processing the rows; they are added to
# the CAS by the caller after all rows have been handled.
relations = []
# Sentence annotations created by createAnnotation_pairInput, keyed by the id
# from createId, so a pair can be resolved without rescanning the whole CAS.
sentenceAnnotationsById = {}

def createAnnotation_pairInput(x):
    """Register both sentences of one candidate pair and queue their relations.

    `x` is one row of the input table and is indexed with the keys
    `a_doc_id`, `a_start`, `a_end`, `a_text`, `a_url`, `a_title` and the
    matching `b_*` keys.

    Side effects on module-level state:
    - a sentence not seen before has its text appended to `sofa`, its id
      appended to `alreadyIncludedSentences`, and a `Sentence` annotation
      covering the appended text added to `cas`;
    - two `SentenceRelation` objects (one per direction, label "unset") are
      appended to `relations`.

    Returns None.
    """
    def createAnnotation(doc_id, sent_start, sent_end, text, url, title):
        """Return the Sentence annotation for this sentence, creating it on first use."""
        global sofa
        internalID = createId(doc_id, sent_start, sent_end)
        if internalID in alreadyIncludedSentences:
            annotation = sentenceAnnotationsById.get(internalID)
            if annotation is None:
                # The id was registered by code other than this function, so it
                # is not in the cache: fall back to scanning the CAS.
                annotation = searchForSentenceAnnotation(internalID)
            return annotation
        # The new sentence occupies [b, e) in the document text. The end is
        # derived from the lengths, so no temporary copy of `sofa` is built.
        b = len(sofa)
        e = b + len(text)
        sofa += text
        alreadyIncludedSentences.append(internalID)
        annotation = Sentence(begin=b, end=e, doc_id=doc_id, sent_id=internalID, sent_start=sent_start, sent_end=sent_end, url=url, title=title)
        cas.add_annotation(annotation)
        sentenceAnnotationsById[internalID] = annotation
        return annotation

    # Resolve (or create) the annotations of both sentences of the pair.
    sent1 = createAnnotation(x["a_doc_id"], x["a_start"], x["a_end"], x["a_text"], x["a_url"], x["a_title"])
    sent2 = createAnnotation(x["b_doc_id"], x["b_start"], x["b_end"], x["b_text"], x["b_url"], x["b_title"])
    # One relation per direction; each is anchored on the span of its Dependent.
    relations.append(SentenceRelation(begin=sent1.begin, end=sent1.end, Dependent=sent1, Governor=sent2, label="unset"))
    relations.append(SentenceRelation(begin=sent2.begin, end=sent2.end, Dependent=sent2, Governor=sent1, label="unset"))
    
# Process every row of `segments` (axis=1 passes one row per call). The result
# of `apply` is discarded; the call is made only for its side effects on
# `sofa`, `cas` and `relations`.
segments.apply(createAnnotation_pairInput, axis=1)

# Hand the accumulated text to the CAS as its plain-text document.
cas.sofa_string = sofa
cas.sofa_mime = "text/plain"

# Add the collected relation annotations to the CAS.
for relation in relations:
    cas.add_annotation(relation)
    


## Save

In [7]:
# Write the CAS to `output_path` as XMI.
cas.to_xmi(output_path)