# Install

## Install Package to write CAS XMI files
See https://github.com/dkpro/dkpro-cassis

In [1]:
#pip install numpy dkpro-cassis "scikit-learn==0.23.1" datasets transformers[torch] ipywidgets

In [2]:
import torch
import numpy as np
import random
import pandas as pd
from IPython.display import display, HTML

# Import Project Data

## Load CAS

In [3]:
from cassis import *

# Load the type system first; it is passed to the XMI loader below.
with open('./data/TypeSystem.xml', 'rb') as typesystem_file:
    typesystem = load_typesystem(typesystem_file)

# One entry per loaded XMI file.
cas = []
# Test data
with open('./data/test/test_data.xmi', 'rb') as xmi_file:
    cas.append(load_cas_from_xmi(xmi_file, typesystem=typesystem))


## Get Labels

In [4]:
# Without Translation (not used)
# The position of a label in this list serves as its integer id.
labels = [
    "none",
    "attribution",
    "causal",
    "conditional",
    "contrast",
    "description",
    "equivalence",
    "fulfillment",
    "identity",
    "purpose",
    "summary",
    "temporal",
]

## Get Annotations

### Set Annotation Preferences
- Set whether to include the news article headings
- If news headings are included, define the separator (heading1 + separator + sentence1)
- Set whether to include the timestamp of the article
- If the timestamp is used, define its separator (sentence1 + separator + timestamp1)

In [5]:
# Prepend the article title to each sentence.
annotation_with_news_title = True
# Text placed between the title and the sentence.
annotation_title_separator = ". "
# Append the article timestamp after each sentence.
annotation_with_timestamp = True
# Text placed between the sentence and the timestamp.
annotation_timestamp_separator = " "

### Get Additional Document Metadata (timestamp)

In [6]:
# The document metadata table is only loaded when timestamps are appended.
if annotation_with_timestamp:
    # Index the table by doc_id so a document's row can be looked up by its id.
    doc_df = pd.read_csv("./malte-candidates/meta-output.docs.tsv", sep="\t").set_index("doc_id")
    display(doc_df)

Unnamed: 0_level_0,url,title,categories,timestamp
doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
736,https://en.wikinews.org/wiki?curid=736,President of China lunches with Brazilian Pres...,"Politics and conflicts,South America,Asia,Braz...","November 12, 2004"
741,https://en.wikinews.org/wiki?curid=741,Palestinians to elect new president on January 9,"Palestine,Elections,Mahmoud Abbas,Yasser Arafa...","November 14, 2004"
743,https://en.wikinews.org/wiki?curid=743,Brazilian delegation returns from Arafat funeral,"Palestine,Brazil,Politics and conflicts,Middle...","November 13, 2004"
764,https://en.wikinews.org/wiki?curid=764,Hearing begins over David Hookes death,"Australia,Cricket,Crime and law,Oceania","November 15, 2004"
797,https://en.wikinews.org/wiki?curid=797,Brazilian soccer player's mother has been kidn...,"South America,Brazil,Football (soccer),Crime a...","November 15, 2004"
...,...,...,...,...
2909791,https://en.wikinews.org/wiki?curid=2909791,"Mohsen Fakhrizadeh, leader of Iranian nuclear ...","Iran,Asia,Middle East,Nuclear technology,Obitu...",2020-11-30T21:59:34Z
2909805,https://en.wikinews.org/wiki?curid=2909805,Former US national security advisor Michael Fl...,"United States,North America,Crime and law,Poli...",2020-11-28T22:49:40Z
2909818,https://en.wikinews.org/wiki?curid=2909818,"Wikinews interviews Sandra Jephcott, Sustainab...","Australia,Elections,Climate change,COVID-19,Qu...",2020-11-27T15:33:59Z
2909884,https://en.wikinews.org/wiki?curid=2909884,"Wikinews interviews Craig Farquharson, Liberal...","Australia,Elections,Queensland,Democracy,Polit...","November 27, 2020"


In [7]:
#doc_df.loc[741]

In [8]:
def get_timestamp_from_doc(doc_id):
    """Return the timestamp stored for a document in the global ``doc_df``.

    Args:
        doc_id: Document id; anything ``int()`` accepts (e.g. ``741`` or ``"741"``).

    Returns:
        The value of the ``timestamp`` column for that document, or ``""`` when
        the id cannot be converted, is not in ``doc_df``, or has no timestamp
        (NaN). A message is printed in each of the ``""`` cases.
    """
    try:
        item = doc_df.loc[int(doc_id)]
    except (TypeError, ValueError, KeyError) as e:
        print(f"Cannot find Doc #{doc_id}: {e}")
        return ""
    timestamp = item.timestamp
    # A missing cell is NaN (a float); callers concatenate the result to text,
    # so fall back to the same empty string used for unknown documents.
    if pd.api.types.is_scalar(timestamp) and pd.isna(timestamp):
        print(f"No timestamp for Doc #{doc_id}")
        return ""
    return timestamp

### Read Annotations

In [9]:
def read_annotations(tmp_cas, controlPairList=None):
    """Collect the labelled sentence pairs of one CAS as parallel lists.

    Every ``webanno.custom.SentenceRelation`` covered by a
    ``webanno.custom.Sentence`` is visited. Relations labelled ``"unset"`` and
    pairs whose governor/dependent sentence ids were already collected are
    skipped. Depending on the module-level annotation settings, the article
    title is prepended and the article timestamp is appended to each sentence.

    Args:
        tmp_cas: CAS object providing ``select`` and ``select_covered``.
        controlPairList: Optional list of pair ids that were already collected.
            A passed list is extended in place with the new pair ids; when
            omitted, a fresh list is used for this call.

    Returns:
        Tuple ``(origin, target, label, controlPairList)``: the governor texts,
        the dependent texts, the label indices into ``labels``, and the control
        list including the pairs added by this call.

    Raises:
        ValueError: If a relation carries a label that is not in ``labels``.
    """
    # A fresh list per call: a mutable default would be shared between calls
    # and silently drop every pair already seen by an earlier call.
    if controlPairList is None:
        controlPairList = []
    # Set mirror of the control list for constant-time duplicate checks.
    seen_pairs = set(controlPairList)
    origin = []
    target = []
    label = []
    for sentence in tmp_cas.select('webanno.custom.Sentence'):
        for token in tmp_cas.select_covered('webanno.custom.SentenceRelation', sentence):
            # Only use annotated data
            if token.label == "unset":
                continue
            governor = token.Governor
            dependent = token.Dependent
            # Check for redundant pairs
            uid = f"g{governor.sent_id}_d_{dependent.sent_id}"
            if uid in seen_pairs:
                continue
            origin_string = ""
            target_string = ""
            label.append(labels.index(token.label))
            # Also add meta data: title
            if annotation_with_news_title:
                origin_string += governor.title + annotation_title_separator
                target_string += dependent.title + annotation_title_separator
            # Add Sentences
            origin_string += governor.get_covered_text()
            target_string += dependent.get_covered_text()
            # Also add meta data: timestamp
            if annotation_with_timestamp:
                origin_string += annotation_timestamp_separator + get_timestamp_from_doc(governor.doc_id)
                target_string += annotation_timestamp_separator + get_timestamp_from_doc(dependent.doc_id)
            # Add String to list
            origin.append(origin_string)
            target.append(target_string)
            controlPairList.append(uid)
            seen_pairs.add(uid)
    return origin, target, label, controlPairList


## Combine CAS Systems

In [10]:
def combined_cas_read(cas_list):
    """Read the annotations of several CAS objects into combined lists.

    Args:
        cas_list: Iterable of CAS objects accepted by ``read_annotations``.

    Returns:
        Tuple ``(origin, target, label)`` with the results of all CAS objects
        concatenated in input order. A sentence pair that occurs in more than
        one CAS is only kept the first time it is seen.
    """
    origin = []
    target = []
    label = []
    # control list of pairs to not add redundant pairs
    controlPairList = []
    for single_cas in cas_list:
        # read_annotations records new pairs in the list it is given and
        # returns that list. Rebinding instead of `+=` avoids appending the
        # list to itself, which doubled it after every CAS.
        origin_tmp, target_tmp, label_tmp, controlPairList = read_annotations(single_cas, controlPairList)
        origin += origin_tmp
        target += target_tmp
        label += label_tmp
    return origin, target, label

# Read every loaded CAS into parallel origin/target/label lists.
origin, target, label = combined_cas_read(cas)

## Print Examples

In [11]:
#train_encodings

In [12]:
def show_random_elements(origin_list, target_list, label_list, num_examples=10):
    """Display a table of randomly chosen, distinct examples.

    Args:
        origin_list: Origin texts.
        target_list: Target texts, parallel to ``origin_list``.
        label_list: Label indices into the global ``labels``, parallel to
            ``origin_list``.
        num_examples: Number of distinct rows to show.

    Raises:
        AssertionError: If ``num_examples`` exceeds the number of elements.
        ValueError: If ``num_examples`` is negative.
    """
    assert num_examples <= len(origin_list), "Can't pick more elements than there are in the dataset."
    # random.sample draws distinct indices directly, replacing a retry loop.
    picks = random.sample(range(len(origin_list)), num_examples)
    data = [[n, origin_list[n], labels[label_list[n]], target_list[n]] for n in picks]
    df = pd.DataFrame(data, columns=['index', 'Origin', 'Label', 'Target'])
    display(HTML(df.to_html()))

In [13]:
# Preview random pairs using the function's default sample size.
show_random_elements(origin, target, label)

Unnamed: 0,index,Origin,Label,Target
0,234,"2007 Rugby World Cup: Georgia claim first World Cup victory. Merab Kvirikashvili opened the scoring with two penalties while Akvsenti Giorgadze scored the first try just before half time bringing the score to 13-0. September 24, 2007",temporal,"2007 Rugby World Cup: New Zealand 18 - 20 France. New Zealand led 13-3 at half time, after Dan Carter kicked two penalties and Luke McAlister scored a try. October 6, 2007"
1,188,Sri Lankan government withdraws truce with Tamil rebels. There have be many clashes between the government and the rebels since mid-2006. January 2 2008,none,"Colombo land mine explosion kills seven. In the past few days, there have been fierce hostilities between government forces and Tamil Tigers in the north and the east of the country. August 14, 2006"
2,89,"John Reed on Orwell, God, self-destruction and the future of writing. 'DS: What about this theory that the drop in advertising for the print media, even though it's going to go through a period of growing pains, in the end might actually be better for the literary press like the Brooklyn Rail, because people who have keys to funding will see more of a need to do grants, which will allow them to be more experimental as opposed to worrying about, Are we going to offend our advertisers? December 3, 2002",none,"British government advises against non-essential travel to Kenya. The travel warning recommends that those already in Kenya stay indoors and exercise extreme caution and seek advice locally either from their tour operator or the local authorities if they want to travel. January 3, 2008"
3,184,"San Diego ends Colts' bid for perfect season. The Colts attempted a comeback but Manning was intercepted in the Indianapolis endzone, bringing to an end the game and the Colts' 13-game winning streak. December 18, 2005",none,"Oregon running back LeGarrette Blount's college football career ends with a punch. The volatile Blount, coming off a 1,000+ yard season in which he broke a school record for rushing touchdowns, had struggled in several 2008 games, including the encounter with the Broncos. September 4, 2009"
4,28,"Japanese adults rank high in literacy and numeracy in OECD survey. Yesterday, the Organisation for Economic Co-operation and Development (OECD), a 34 member international economic organization founded in 1961 that traces its origins back to the World War II-era , published results on the state of adult skills in 24 countries, most of them European. 2013-11-17T02:39:22Z",none,"OECD releases report on New Zealand's environmental performance. The Organisation for Economic Co-operation and Development (OECD) has released a report on the environmental performance of the New Zealand Government. April 5, 2007"
5,186,"Steelers, Cardinals win championship games to advance to Super Bowl XLIII. Early in the game the Steelers defense dominated the field, forcing him to only complete 3 out of 14 pass attempts and end the half with a lowly 9.8 quarterback rating and his team losing 13 to 7. January 19, 2009",none,"San Diego ends Colts' bid for perfect season. The Colts attempted a comeback but Manning was intercepted in the Indianapolis endzone, bringing to an end the game and the Colts' 13-game winning streak. December 18, 2005"
6,99,"Bloomberg, Warren end US presidential campaigns following Super Tuesday. In the aftermath of Super Tuesday, yesterday and on Wednesday, two candidates, Senator Elizabeth Warren of Massachusetts and Michael Bloomberg, formerly the mayor of New York City, ended their campaigns. 2020-04-16T17:30:34Z",none,"Activist ejected from Virginia senator’s re-election rally. A mid-term re-election campaign rally held Tuesday morning by Virginia Senator George Allen at the Charlottesville Omni Hotel turned ugly after a University of Virginia law student, Daily Kos contributor and blogger, Mike Stark, was man-handled by Allen staffers in the hotel lobby after he asked the Senator about his 1970's divorce and arrest record. October 31, 2006"
7,210,"US Senate says no to pullout of US troops from Iraq. Independent Democrat senator from Connecticut, who caucuses with Democrats, voted with Republicans against the motion, as he has done with all Iraq war legislation this year. July 18, 2007",none,"Tea Party-endorsed Christine O'Donnell wins Delaware Senate primary election. Mike Castle in a Republican primary election for U.S. Senator from Delaware, voting precinct|precincts counted, O'Donnell, who has received the endorsement of the Tea Party movement, led Representative Castle by 30561 votes to 27021, 53.1% to 46.9%. September 14, 2010"
8,125,"Kimi Räikkönen wins 2009 Belgian Grand Prix. Kimi qualified 6th, but jumped to second after a multiple crash in the first corner involving Jenson Button (Brawn) and Lewis Hamilton (McLaren) collided with the rookies Romain Grosjean and Jaime Alguersuari and all four drivers were out of the race. 30 Aug 2009",none,"Formula 1: Barrichello and Alonso top timesheets at European GP Friday Practice. Rubens Barrichello was out to prove that his speed in the morning session was not a one-off, and posted the third fastest time overall (1:40.209), however his team mate Jenson Button was quicker this time out (1:40.178). August 21, 2009"
9,0,"Mugabe is 'prepared to fight' if Zimbabwe elects opposition. Zimbabwean President Robert Mugabe, who is slated to face opposition leader Morgan Tsvangirai in a run-off election on June 27, said today that the opposition would never govern Zimbabwe as long as he is alive, vowing to go to war if the Movement for Democratic Change (MDC) wins the election. June 14, 2008",equivalence,"Mugabe claims 'sweeping victory' of Zimbawe election. A file photograph of MugabeRobert Mugabe, the Zimbabwean president who has recently been widely criticized by world leaders, has claimed the current results show that he will have a 'sweeping victory' in the unopposed presidential run-off elections. June 29, 2008"


# Write Data to Disk

In [14]:
# Output directory; keep the trailing slash because the file name is appended directly.
export_path = "data/test/"

In [15]:
def create_df(origin, target, label):
    """Build a DataFrame with one row per (origin, target, label) triple.

    The inputs are zipped, so rows stop at the shortest input.
    """
    column_names = ['origin', 'target', 'label']
    rows = list(zip(origin, target, label))
    return pd.DataFrame(rows, columns=column_names)

# Assemble the sentence pairs into a table and write it as CSV without the row index.
df = create_df(origin, target, label)
df.to_csv(f'{export_path}data.csv', index=False)