# Install

## Install Package to write CAS XMI files
See https://github.com/dkpro/dkpro-cassis

In [1]:
pip install numpy dkpro-cassis "scikit-learn==0.23.1" datasets transformers[torch] ipywidgets matplotlib



Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display, HTML

# Import Project Data

## Load CAS

In [None]:
from cassis import *

# Parse the shared type system once; every XMI export below is loaded against it.
with open('./data/TypeSystem.xml', 'rb') as f:
    typesystem = load_typesystem(f)

# XMI exports in load order. combined_cas_read() assigns pairs to the
# Random / NSP / Similarity statistics by list position, so keep this order.
xmi_paths = [
    './data/random.xmi',                        # Random Dataset
    './data/random_categories.xmi',             # Random Categories Dataset
    './data/nsp.xmi',                           # NSP Dataset
    './data/nsp_categories.xmi',                # NSP Categories Dataset
    './data/nsp_categories_3_6.xmi',            # NSP Categories Dataset 03.06.
    './data/similarity.xmi',                    # Similarity Dataset
    './data/similarity_categories.xmi',         # Similarity Categories Dataset
    './data/similarity_categories_3_6.xmi',     # Similarity Categories Dataset 03.06.
    './data/similarity_categories_10_6.xmi',    # Similarity Categories Dataset 10.06.
    './data/similarity_categories_11_6.xmi',    # Similarity Categories Dataset 11.06.
]

# One loaded CAS per export, in the same order as xmi_paths.
cas = []
for xmi_path in xmi_paths:
    with open(xmi_path, 'rb') as f:
        cas.append(load_cas_from_xmi(f, typesystem=typesystem))


## Get Labels

In [None]:
# Relation label names, without translation.
# The position of a name in this list is the integer class id stored for every
# pair (read_annotations() uses labels.index(...)), so the order must stay stable.
# NOTE(review): the original remark "(not used)" presumably refers to a translated
# label set that is not used — confirm.
labels = ["none", "attribution", "causal", "conditional", "contrast", "description", "equivalence", "fulfillment", "identity", "purpose", "summary", "temporal"]

## Get Annotations

### Set annotation Preference
- Set whether to include news article headings or not
- If news headings are included, define separator (heading1 + separator + sentence1)
- Set whether to include timestamp of article
- If timestamp is used, define separator

In [None]:
# When True, read_annotations() puts the covered text plus the separator below
# in front of the `title` feature of each governor/dependent.
annotation_with_news_title = True
# Inserted between the covered text and the `title` feature.
annotation_title_separator = ". "
# When True, the document timestamp (looked up by doc_id) is appended to each string.
annotation_with_timestamp = True
# Inserted in front of the appended timestamp.
annotation_timestamp_separator = " "

### Get Additional Doc Meta data (timestamp)

In [None]:
# The document metadata table is only needed when timestamps are appended to the pairs.
if annotation_with_timestamp:
    # Tab-separated file, re-indexed by doc_id so rows can be fetched with .loc[doc_id].
    doc_df = pd.read_csv("./malte-candidates/meta-output.docs.tsv", sep="\t").set_index("doc_id")
    display(doc_df)

In [None]:
#doc_df.loc[741]

In [None]:
def get_timestamp_from_doc(doc_id):
    """Return the ``timestamp`` value of the document with the given id.

    The id is converted with ``int()`` and looked up in the index of the
    module-level ``doc_df``. If the id cannot be converted or is not present,
    a message is printed and an empty string is returned instead.
    """
    try:
        return doc_df.loc[int(doc_id)].timestamp
    except (TypeError, ValueError, KeyError) as e:
        # Best effort: a missing document must not abort reading the annotations.
        print(f"Cannot find Doc #{doc_id}: {e}")
        return ""

### Read Annotations

In [None]:
# Number of unique annotated pairs; overwritten after the CAS files are combined.
total_number_pairs = 0
# One slot per label.
# NOTE(review): not referenced anywhere else in the visible notebook — confirm it is still needed.
label_count = [0] * len(labels)
# Pairs per candidate-matching strategy, in the order: Random, NSP, Similarity
pair_matching_count = [0, 0, 0]

def read_annotations(tmp_cas, controlPairList = None):
    """Collect the annotated sentence pairs of one CAS as parallel lists.

    Every 'webanno.custom.SentenceRelation' covered by a
    'webanno.custom.Sentence' is visited. Relations labelled "unset" and
    relations whose governor/dependent ``sent_id`` pair is already listed in
    ``controlPairList`` are skipped.

    Args:
        tmp_cas: CAS object providing ``select`` and ``select_covered``.
        controlPairList: List of pair ids ("g<sent_id>_d_<sent_id>") that have
            already been emitted. It is extended in place with the new ids.
            When omitted, a fresh list is used for this call.

    Returns:
        Tuple ``(origin, target, label, controlPairList)``: governor strings,
        dependent strings, label indices into the global ``labels`` list and
        the (extended) control list.

    Raises:
        ValueError: If a relation carries a label that is not in ``labels``.
    """
    # A mutable default list would be shared between calls and silently
    # suppress every pair on the second call, so create it per call.
    if controlPairList is None:
        controlPairList = []
    # Set mirror of the control list for constant-time duplicate checks.
    seen_pairs = set(controlPairList)
    origin = []
    target = []
    label = []
    for sentence in tmp_cas.select('webanno.custom.Sentence'):
        for relation in tmp_cas.select_covered('webanno.custom.SentenceRelation', sentence):
            # Only use annotated data
            if relation.label == "unset":
                continue
            governor = relation.Governor
            dependent = relation.Dependent
            # Skip pairs that were already emitted (possibly by another CAS)
            uid = f"g{governor.sent_id}_d_{dependent.sent_id}"
            if uid in seen_pairs:
                continue
            label_index = labels.index(relation.label)
            origin_string = ""
            target_string = ""
            # NOTE(review): the covered text is the part toggled by this flag, while
            # the `title` feature is always included. The notebook text describes the
            # flag as "include news headings" — confirm this order is intended.
            if annotation_with_news_title:
                origin_string += governor.get_covered_text() + annotation_title_separator
                target_string += dependent.get_covered_text() + annotation_title_separator
            origin_string += governor.title
            target_string += dependent.title
            # Optionally append the document timestamp
            if annotation_with_timestamp:
                origin_string += annotation_timestamp_separator + get_timestamp_from_doc(governor.doc_id)
                target_string += annotation_timestamp_separator + get_timestamp_from_doc(dependent.doc_id)
            # Only record the pair once all of its parts were built successfully
            origin.append(origin_string)
            target.append(target_string)
            label.append(label_index)
            controlPairList.append(uid)
            seen_pairs.add(uid)
    return origin, target, label, controlPairList


## Combine CAS Systems

In [None]:
def combined_cas_read(cas_list):
    """Read all CAS objects and merge their unique annotated pairs.

    The per-strategy totals in the global ``pair_matching_count`` are reset
    and refilled. Attribution is positional: the first two CAS count as
    Random, the next three as NSP and all remaining ones as Similarity.

    Args:
        cas_list: CAS objects in the order described above.

    Returns:
        Tuple ``(origin, target, label)`` of parallel lists covering all CAS.
    """
    # Start from zero so that re-running the cell does not add to stale counts.
    pair_matching_count[:] = [0, 0, 0]
    origin = []
    target = []
    label = []
    # Pair ids emitted so far; shared across all CAS to drop redundant pairs.
    controlPairList = []
    for position, single_cas in enumerate(cas_list, start=1):
        # Rebind instead of `+=`: read_annotations() hands back the list it was
        # given, so extending the list with the result would double it per CAS.
        origin_tmp, target_tmp, label_tmp, controlPairList = read_annotations(single_cas, controlPairList)
        # Count Statistics
        if position <= 2:
            strategy = 0
        elif position <= 5:
            strategy = 1
        else:
            strategy = 2
        pair_matching_count[strategy] += len(label_tmp)
        origin += origin_tmp
        target += target_tmp
        label += label_tmp
    return origin, target, label

# Read every loaded CAS and merge the de-duplicated pairs into three parallel lists.
origin, target, label = combined_cas_read(cas)
# One label per pair, so the label count is the number of pairs.
total_number_pairs = len(label)


## Split / K-Fold

### Set Random Seeds for reproducibility

In [None]:
def set_seed(seed_number: int):
    """Seed every random number generator the notebook relies on.

    Stores the value in the module-level ``seed`` (later passed to the k-fold
    split as ``random_state``) and seeds Python's ``random``, NumPy and
    PyTorch (CPU and all CUDA devices).

    Args:
        seed_number: Seed value applied to all generators.
    """
    global seed
    seed = seed_number
    # show_random_elements() draws from the stdlib generator, so seed it as well.
    random.seed(seed_number)
    np.random.seed(seed_number)
    torch.manual_seed(seed_number)
    torch.cuda.manual_seed_all(seed_number)

set_seed(122)

### k-Fold Split with same distribution

In [None]:
from sklearn.model_selection import StratifiedKFold, train_test_split

# o: origin, t: target, l: label list
def k_fold_train_test_split(o, t, l, random_state=None, n_splits=4):
    """Split the pairs into stratified folds with the same label distribution.

    Args:
        o: Origin strings.
        t: Target strings (parallel to ``o``).
        l: Integer labels (parallel to ``o``); used for stratification.
        random_state: Seed for the shuffling done by ``StratifiedKFold``.
        n_splits: Number of folds (defaults to the previously fixed 4).

    Returns:
        Tuple ``(k_fold_origin, k_fold_target, k_fold_labels)``. Each is a list
        with one ``[train, test]`` entry per fold, where train and test are
        plain Python lists.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # The split is driven by the labels only; X is a placeholder of matching length.
    empty_X = np.zeros(len(l))
    # Arrays allow selecting a whole fold with one index array.
    origin_array = np.array(o)
    target_array = np.array(t)
    label_array = np.array(l)
    # Store all k-folds
    k_fold_origin = []
    k_fold_target = []
    k_fold_labels = []
    for train_index, test_index in skf.split(empty_X, l):
        k_fold_origin.append([origin_array[train_index].tolist(), origin_array[test_index].tolist()])
        k_fold_target.append([target_array[train_index].tolist(), target_array[test_index].tolist()])
        k_fold_labels.append([label_array[train_index].tolist(), label_array[test_index].tolist()])
    return k_fold_origin, k_fold_target, k_fold_labels

#train_origin, test_origin, train_target, test_target, train_labels, test_labels = train_test_split(origin, target, label,random_state=seed, stratify=label)

#tmp_train_origin, val_origin, tmp_train_target, val_target, tmp_train_labels, val_labels = train_test_split(origin, target, label, random_state=seed)
#train_origin, test_origin, train_target, test_target, train_labels, test_labels = train_test_split(tmp_train_origin, tmp_train_target, tmp_train_labels,random_state=seed)
# `seed` is the module-level value stored by set_seed(); it fixes the shuffling of the folds.
k_fold_origin, k_fold_target, k_fold_labels = k_fold_train_test_split(origin, target, label,random_state=seed)

## Print Examples

In [None]:
#train_encodings

In [None]:
def show_random_elements(origin_list, target_list, label_list, num_examples=10):
    """Render a table of randomly chosen, distinct pairs.

    Draws ``num_examples`` different positions with ``random.randint`` and
    displays their index, origin string, label name (looked up in the global
    ``labels``) and target string as an HTML table.
    """
    assert num_examples <= len(origin_list), "Can't pick more elements than there are in the dataset."
    last_position = len(origin_list) - 1
    chosen = []
    # Rejection sampling: keep drawing until enough distinct positions are found.
    while len(chosen) < num_examples:
        candidate = random.randint(0, last_position)
        if candidate not in chosen:
            chosen.append(candidate)
    rows = [
        [position, origin_list[position], labels[label_list[position]], target_list[position]]
        for position in chosen
    ]
    table = pd.DataFrame(rows, columns=['index', 'Origin', 'Label', 'Target'])
    display(HTML(table.to_html()))

In [None]:
# show_random_elements(train_origin, train_target, train_labels, train_encodings)
# Output adjusted to folds: sample from the training part ([0]) of the first fold ([0])
show_random_elements(k_fold_origin[0][0], k_fold_target[0][0], k_fold_labels[0][0])

## Print Statistics

In [None]:
# Running tally per label id; get_label_statistics() adds to it on every call.
distribution_count = [0] * len(labels)
def get_label_statistics(label_list):
    """Count how often each label id occurs and format the result.

    Each occurrence is also added to the module-level ``distribution_count``.

    Returns:
        One entry per label in ``labels``: a string "<count> (<percent>%)"
        with the percentage rounded to two decimals, or the plain zero counts
        when ``label_list`` is empty.
    """
    per_label = [0] * len(labels)
    n_seen = 0
    # Absolute counts
    for label_id in label_list:
        n_seen += 1
        per_label[label_id] += 1
        distribution_count[label_id] += 1
    # Without any element there is nothing to express as a percentage.
    if n_seen == 0:
        return per_label
    return [f"{hits} ({round(hits * 100 / n_seen, 2)}%)" for hits in per_label]



### Number of Pairs & Matchings

In [None]:
# Keep the grand total in the fourth slot. Slice assignment instead of append()
# keeps the list at four entries when this cell is executed more than once.
pair_matching_count[3:] = [total_number_pairs]
df_matching = pd.DataFrame([pair_matching_count], columns=["Random", "NSP", "Similarity", "Total"])
display(df_matching)

In [None]:
# Share of pairs per matching strategy; the trailing "Total" entry is left out.
strategy_counts = np.array(pair_matching_count[:-1])
strategy_names = ["Random", "NSP", "Similarity"]
plt.pie(strategy_counts, labels=strategy_names)
plt.savefig('data/export/pairs_matching.png', dpi=300)
plt.show()

### Label Distribution

In [None]:
# One row with "<count> (<percent>%)" per label. The call also fills
# distribution_count, which the pie chart below reads.
df_distribution = pd.DataFrame([get_label_statistics(label)], columns=labels)
display(df_distribution)


In [None]:
# Pull the wedge of the first label ("none") out of the pie.
myexplode = [0] * len(labels)
myexplode[0] = 0.2
patches, texts = plt.pie(np.array(distribution_count), explode = myexplode)
# Order the legend by descending count. The sorted result gets its own names:
# rebinding `labels` here would replace the global label list with a reordered
# tuple and break the label <-> id mapping for every cell that runs afterwards.
legend_entries = sorted(zip(patches, labels, distribution_count), key=lambda entry: entry[2], reverse=True)
legend_patches = [entry[0] for entry in legend_entries]
legend_labels = [entry[1] for entry in legend_entries]
plt.legend(legend_patches, legend_labels, loc='center left', bbox_to_anchor=(1., 0.5),fontsize=8)
plt.savefig('data/export/pairs_distribution.png', dpi=300)
plt.show()

# Write Folds to disk

In [None]:
# Prefix for the exported CSV files. export_dfs() appends the file name by plain
# string formatting, so the trailing slash is required for a directory.
export_path = "data/export-umgekehrt/"

In [None]:
def create_df(origin, target, label):
    """Build a DataFrame with the columns 'origin', 'target' and 'label'.

    The three sequences are combined row-wise with ``zip``, so the frame has
    as many rows as the shortest of them.
    """
    rows = list(zip(origin, target, label))
    column_names = ['origin', 'target', 'label']
    return pd.DataFrame(rows, columns=column_names)

def export_dfs(k_fold_origin, k_fold_target, k_fold_labels):
    """Write every fold as ``train.<i>.csv`` and ``test.<i>.csv``.

    File names are appended to the module-level ``export_path``. The directory
    part of that prefix is created if it does not exist yet.

    Args:
        k_fold_origin: Per fold a ``[train, test]`` pair of origin strings.
        k_fold_target: Per fold a ``[train, test]`` pair of target strings.
        k_fold_labels: Per fold a ``[train, test]`` pair of labels.
    """
    import os  # local import: only needed here to prepare the output directory

    # to_csv() does not create missing directories, so do it up front.
    export_dir = os.path.dirname(export_path)
    if export_dir:
        os.makedirs(export_dir, exist_ok=True)
    for i in range(len(k_fold_origin)):
        # Index 0 holds the training part of a fold, index 1 the test part.
        train_df = create_df(k_fold_origin[i][0], k_fold_target[i][0], k_fold_labels[i][0])
        test_df = create_df(k_fold_origin[i][1], k_fold_target[i][1], k_fold_labels[i][1])
        train_df.to_csv(f'{export_path}train.{i}.csv', index=False)
        test_df.to_csv(f'{export_path}test.{i}.csv', index=False)

In [None]:
# Write train.<i>.csv / test.<i>.csv for every fold, using export_path as prefix.
export_dfs(k_fold_origin, k_fold_target, k_fold_labels)