# Data preparation

In this notebook, the data is prepared for the classification experiments. The resulting datasets are stored in pickle files named **train_final.pkl**, **eval_final.pkl** and **test_final.pkl**.

In [81]:
import json
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.utils import shuffle
import numpy as np
from datasets import Dataset
import pickle

## Load the annotations

In [74]:
# read the annotated sentences (one row per sentence) from the CSV export
ANNOTATIONS_CSV = "processed-annotations-11-06.csv"
all_anno = pd.read_csv(ANNOTATIONS_CSV, encoding="utf-8")
print(all_anno.shape)
# disabled variant: prefix each sentence with the name of the section it is found in
# all_anno["text"] = all_anno.apply(lambda x: f"section: {x.section}, text: {x.text}", axis = 1)
all_anno.head()

(14792, 11)


Unnamed: 0,id,doc_id,paper_title,paper_structure,year,text,section,label,rw,error,Comments
0,59852,3435904,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,Most Semantic Role Labeling (SRL) approaches a...,abstract,context-AIC,False,False,
1,59853,3435905,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,"In this paper, we propose a Multi-Task Active ...",abstract,contribution-AIC,False,False,
2,59854,3435906,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,We evaluate our approach on Indonesian convers...,abstract,contribution-AIC,False,False,
3,59855,3435907,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,Our experiments show that multi-task active le...,abstract,result,False,False,
4,59856,3435908,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,"According to our results, active learning is m...",abstract,result,False,False,


## Load the train/eval/test split assignment
For all the experiments, we use the same partition of the data into train/eval/test splits, with a balanced distribution of the claim categories across the splits. Each sentence id is assigned to a split according to its category.

In [75]:
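# One-off migration, kept for reference only (it has already been applied, do not re-run):
# it rewrote id_values_per_label.json so that the file also stores a flat id -> split map under "id2split".
# with open("id_values_per_label.json", "r") as f: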
#     d = json.load(f)

# print(d['context-AIC'].keys())

# id2split = {}
# for label in d.keys():
#     for split in d[label].keys():
#         ids = d[label][split]
#         for id in ids:
#             id2split[id] = split

# with open("id_values_per_label.json", "w") as f:
#     new_d = {"id_values_per_label": d,
#              "id2split": id2split,
#             }
#     json.dump(new_d, f)

In [76]:
# names of the annotation labels
with open("labels.json", "r") as labels_file:
    LABELS = json.load(labels_file)

# pre-computed assignment of every sentence id to a split
with open("id_values_per_label.json", "r") as splits_file:
    d = json.load(splits_file)
id2split = d["id2split"]


def _split_of(sentence_id):
    """Look up the split of a sentence; JSON object keys are strings, hence the str()."""
    return id2split[str(sentence_id)]


all_anno["split"] = all_anno["id"].apply(_split_of)

# labels encoded as 1 hot vectors
def from_integer_list_to_one_hot_vector(n_list, vector_size):
    """Encode a list of label indices as a multi-hot vector.

    Args:
        n_list: indices of the active labels.
        vector_size: length of the returned vector.

    Returns:
        A list of `vector_size` integers holding 1 at every position that
        appears in `n_list` and 0 elsewhere. Indices outside
        `range(vector_size)` are silently ignored.
    """
    vector = []
    for position in range(vector_size):
        vector.append(int(position in n_list))
    return vector

# integer id of a label = its position in LABELS
label2id = dict(zip(LABELS, range(len(LABELS))))


def _label_string_to_ids(label_string):
    """Turn a '#'-separated label string into the list of the matching integer ids."""
    return [label2id[name] for name in label_string.split("#")]


all_anno["label_as_int"] = all_anno["label"].apply(_label_string_to_ids)
# the extra positional argument is forwarded as `vector_size`
all_anno["label_as_one_hot"] = all_anno["label_as_int"].apply(
    from_integer_list_to_one_hot_vector, args=(len(LABELS),)
)

# one DataFrame per split, rows kept in their original order
anno_train, anno_eval, anno_test = (
    all_anno[all_anno["split"] == split_name] for split_name in ("train", "eval", "test")
)
split_frames = (anno_train, anno_eval, anno_test)

print(anno_train.shape, anno_eval.shape, anno_test.shape)

# detailed composition of the splits: [train, eval, test] counts of the sentences
# whose label is exactly the given (single) label
for label in LABELS:
    counts = [len(frame[frame["label"] == label]) for frame in split_frames]
    print(label, counts)

# and multi labels: sentences whose label string combines several labels with "#"
counts = [len(frame[frame["label"].str.contains("#")]) for frame in split_frames]
print("multi", counts)

(11830, 14) (1478, 14) (1484, 14)
context-AIC [1704, 213, 214]
contribution-AIC [1343, 168, 168]
result [2433, 304, 305]
impact [52, 7, 7]
directions [292, 36, 37]
limitation [218, 27, 28]
outline-AIC [210, 26, 27]
nc [5102, 638, 638]
multi [476, 59, 60]


In [77]:
# preview the columns added above: split, label_as_int and label_as_one_hot
all_anno.head()

Unnamed: 0,id,doc_id,paper_title,paper_structure,year,text,section,label,rw,error,Comments,split,label_as_int,label_as_one_hot
0,59852,3435904,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,Most Semantic Role Labeling (SRL) approaches a...,abstract,context-AIC,False,False,,train,[0],"[1, 0, 0, 0, 0, 0, 0, 0]"
1,59853,3435905,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,"In this paper, we propose a Multi-Task Active ...",abstract,contribution-AIC,False,False,,train,[1],"[0, 1, 0, 0, 0, 0, 0, 0]"
2,59854,3435906,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,We evaluate our approach on Indonesian convers...,abstract,contribution-AIC,False,False,,train,[1],"[0, 1, 0, 0, 0, 0, 0, 0]"
3,59855,3435907,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,Our experiments show that multi-task active le...,abstract,result,False,False,,train,[2],"[0, 0, 1, 0, 0, 0, 0, 0]"
4,59856,3435908,Multi-Task Active Learning for Neural Semantic...,0. abstract\n1. Introduction\n2. Related Work\...,2018,"According to our results, active learning is m...",abstract,result,False,False,,eval,[2],"[0, 0, 1, 0, 0, 0, 0, 0]"


## Prepare data

In [78]:
def _column_per_split(column):
    """Return the values of `column` for the train, eval and test frames, as three lists."""
    return tuple(list(frame[column].values) for frame in (anno_train, anno_eval, anno_test))


# text of the sentences
text_train, text_eval, text_test = _column_per_split("text")

# section names
sec_train, sec_eval, sec_test = _column_per_split("section")

# labels encoded as a list of integers
li_train, li_eval, li_test = _column_per_split("label_as_int")

# labels encoded as 1 hot vectors
lv_train, lv_eval, lv_test = _column_per_split("label_as_one_hot")

In [79]:
# store the ids of surrounding sentences (2 sentences to the left, 1 to the right)
# A neighbour only counts as context when it belongs to the same paper and the same
# section as the current sentence; otherwise the placeholder id -1 is stored.
context_offsets = (-2, -1, 1)
context_ids = {split: {offset: [] for offset in context_offsets} for split in ["train", "eval", "test"]}

# Read the needed columns once and address the rows by position, so that the
# neighbour lookup does not rely on the DataFrame index labels being exactly 0..n-1
# (and no per-row Series has to be built, as iterrows() would do).
paper_titles = all_anno["paper_title"].to_numpy()
section_names = all_anno["section"].to_numpy()
sentence_ids = all_anno["id"].to_numpy()
sentence_splits = all_anno["split"].to_numpy()
n_sentences = len(sentence_ids)

for position in range(n_sentences):
    for offset in context_offsets:
        neighbour = position + offset
        # the bounds check comes first, so the arrays are never indexed out of range
        is_context = (
            0 <= neighbour < n_sentences
            and paper_titles[position] == paper_titles[neighbour]
            and section_names[position] == section_names[neighbour]
        )
        # values are appended in row order, so each list lines up with the rows
        # of all_anno that belong to that split
        context_ids[sentence_splits[position]][offset].append(
            sentence_ids[neighbour] if is_context else -1
        )

In [80]:
def _dataset_for_split(texts, sections, int_labels, one_hot_labels, neighbour_ids):
    """Build the Dataset of one split and shuffle it with a fixed seed.

    `neighbour_ids` maps the offsets -2, -1 and 1 to the lists of context sentence ids.
    """
    columns = {
        "text": texts,
        "section": sections,
        "li": int_labels,
        "label": one_hot_labels,
        "-2": neighbour_ids[-2],
        "-1": neighbour_ids[-1],
        "+1": neighbour_ids[1],
    }
    return Dataset.from_dict(columns).shuffle(seed=42)


# create the (shuffled) Dataset objects
train_ds = _dataset_for_split(text_train, sec_train, li_train, lv_train, context_ids["train"])
eval_ds = _dataset_for_split(text_eval, sec_eval, li_eval, lv_eval, context_ids["eval"])
test_ds = _dataset_for_split(text_test, sec_test, li_test, lv_test, context_ids["test"])

## Save datasets

In [82]:
# write each split to its own pickle file
for file_name, dataset in (
    ("train_final.pkl", train_ds),
    ("eval_final.pkl", eval_ds),
    ("test_final.pkl", test_ds),
):
    with open(file_name, "wb") as out_file:
        pickle.dump(dataset, out_file)