In [1]:
import json 
# Load the synthetic samples. Later cells index `total_data` by category name
# and append to the value stored under each category.
path = "../synthetic_sampling/synthetic_samples.json"
with open(path, mode="r", encoding="utf-8") as synthetic_file:
    total_data = json.load(synthetic_file)

In [2]:
# Load the real (human-annotated) samples. The file is read explicitly as UTF-8,
# matching how the synthetic samples are read, so decoding does not depend on
# the platform's default locale encoding.
with open("../synthetic_sampling/for_synth_sampling.json", 'r', encoding="utf-8") as f:
    real_smpls = json.load(f)

# `real_smpls` maps a paper key to a list of samples; flatten it into a single
# list of records that use the same field names as the synthetic samples
# ("span" / "document" instead of "text" / "full_text").
total_real = [
    {
        "span": sample['text'],
        "document": sample['full_text'],
        "start": sample['start'],
        "end": sample['end'],
        "label": sample['label'],
    }
    for paper_samples in real_smpls.values()
    for sample in paper_samples
]

In [3]:
# Append every real sample to the list of its category in `total_data`.
# NOTE: this mutates `total_data` in place, so re-running this cell without
# reloading `total_data` appends the real samples a second time.
categories = ['Unsupported claim', 'Format', 'Coherence', 'Lacks synthesis']

for sample in total_real:
    sample_label = sample['label']
    # Samples whose label is not one of the four known categories are skipped.
    if sample_label in categories:
        total_data[sample_label].append(sample)


In [4]:
# Flatten the per-category lists into one list of records, normalising the
# two labels that contain a space to their underscore form.
categories = ['Unsupported claim', 'Format', 'Coherence', 'Lacks synthesis']
data = total_data

# Categories missing from this table keep their name as the label.
label_overrides = {
    'Unsupported claim': 'Unsupported_claim',
    'Lacks synthesis': 'Lacks_synthesis',
}

flattened = []
for category in categories:
    label = label_overrides.get(category, category)
    # The "label" key is written last so it overrides any label already on the record.
    flattened.extend({**group, "label": label} for group in data[category])

In [6]:
import random
import json
from typing import List, Dict, Any, Tuple
from collections import defaultdict

Record = Dict[str, Any]


def split_by_document(
    records: List[Record],
    train_ratio: float = 0.8,
    dev_ratio: float = 0.1,
    eval_ratio: float = 0.1,
    seed: int = 42,
) -> Tuple[List[Record], List[Record], List[Record]]:
    """Split records into train/dev/eval so that no document spans two splits.

    Records are grouped by their ``"document"`` value, the distinct documents
    are shuffled with a ``random.Random(seed)`` generator, and the shuffled
    document list is cut into three consecutive slices.

    Args:
        records: Records, each carrying a hashable ``"document"`` value.
        train_ratio: Fraction of documents for the train split (truncated).
        dev_ratio: Fraction of documents for the dev split (truncated).
        eval_ratio: Only checked by the sum-to-one assertion; the eval split
            receives every document left after the train and dev slices.
        seed: Seed for the document shuffle.

    Returns:
        ``(train, dev, eval)`` lists of records. Within a split, records
        follow the shuffled document order and, per document, input order.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within 1e-9).
        KeyError: If a record has no ``"document"`` key.
    """
    ratio_sum = train_ratio + dev_ratio + eval_ratio
    assert abs(ratio_sum - 1.0) < 1e-9

    # Plain dicts keep insertion order, so documents are listed in order of
    # first appearance before shuffling.
    grouped: Dict[Any, List[Record]] = {}
    for record in records:
        grouped.setdefault(record["document"], []).append(record)

    shuffled_docs = list(grouped)
    random.Random(seed).shuffle(shuffled_docs)

    total_docs = len(shuffled_docs)
    train_end = int(total_docs * train_ratio)
    dev_end = train_end + int(total_docs * dev_ratio)

    def gather(doc_slice):
        # Concatenate the records of every document in the slice.
        return [record for doc in doc_slice for record in grouped[doc]]

    return (
        gather(shuffled_docs[:train_end]),
        gather(shuffled_docs[train_end:dev_end]),
        gather(shuffled_docs[dev_end:]),
    )


# Split the flattened records with the default ratios (0.8 / 0.1 / 0.1) and seed (42).
# Records that share the same "document" value always end up in the same split.
train_data, dev_data, eval_data = split_by_document(flattened)

def save_split(split: dict, path: str):
    """Write ``split`` to ``path`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes. An existing file is overwritten.
    """
    with open(path, mode="w", encoding="utf-8") as out_file:
        json.dump(obj=split, fp=out_file, indent=2, ensure_ascii=False)


In [7]:
def group_by_category(data_by_file):
    """Bucket records by their ``'label'`` value.

    Returns a dict with exactly the four known labels as keys (each present
    even when no record carries it) mapping to the records with that label,
    in input order. A record with any other label raises ``KeyError``.
    """
    known_labels = ("Unsupported_claim", "Format", "Coherence", "Lacks_synthesis")
    buckets = {name: [] for name in known_labels}

    for item in data_by_file:
        buckets[item['label']].append(item)

    return buckets

# Re-bucket each split by label.
# NOTE: each name is rebound from a list of records to a dict keyed by label,
# so this cell cannot be re-run on its own; re-run the split cell first.
train_data = group_by_category(train_data)
dev_data = group_by_category(dev_data)
eval_data = group_by_category(eval_data)

In [8]:
# Guideline text for the 'Format' category. process_data() inserts it verbatim
# under "Definition:" in every prompt built for that category.
# NOTE(review): the placeholder author is spelled "Vatswani" in examples i) and ii)
# but "Vastwani" in example iii) - confirm whether that is intentional.
format_def = """Issues in citation formatting such as a missing bracket and using the wrong style of citing.
    a) Due to preprocessing errors of the source dataset, some words contain hyphens that do not require it, and some are missing hyphens where it is required. Please ignore these types of formatting issues.
    b) Highlight the word/citation in which the formatting issue occurs in and not only the issue within the word/citation.
    c) Formatting issues appear as either citations or parts of a citation.
    Examples of formatting issues include:
        i) Narrative citation missing year: “Vatswani et al.” -> should be “Vatswani et al. (2020)”
        ii) Wrong citation style: “In (Vatswani et al., 2019)” -> should be “in Vatswani et al. (2019)”
        iii) Wrong use of footnotes: "Vastwani et al. 1" -> should include the year or be reformatted as a proper footnote."""

# Guideline text for the 'Unsupported_claim' category. process_data() inserts it
# verbatim under "Definition:" in every prompt built for that category.
unsupp_def = """claim about prior work or statistics w/o citation or evidence. 
    a) The author should cite at every first mention of a study, paper, shared task, competition or dataset.
    b) Specific information to a niche topic, despite sounding like it should be known in that topic of study, should be cited.
    c) If a claim is made and is obvious to be a natural deduction from previous statements through common sense (i.e not requiring expert knowledge), then this claim does not fall under ‘Unsupported claim’. For example:
        i) “However, creating a large and suitable set of questions for supporting narrative comprehension is both time-consuming and cognitively demanding.” -> it is obvious that creating a dataset is time consuming and mentally demanding.
    d) Any mention of “recent works” should be backed up with citations to the works.
    e) Unsupported claim issues appear as segments, phrases, sub-sentences or full sentences.
    Examples of unsupported claims include:
        i) Missing citations for mentions of 'recent works': “and there are many recent works that explore this topic”,
        ii) Mention of a previous work and claim without citation: “..., while in a previous study, the authors claim …”,
        iii) Mentioning of a specific setup of a task without citation to the work: ".. BERT was used in an AES task trained on essays .." """

# Guideline text for the 'Lacks_synthesis' category. process_data() inserts it
# verbatim under "Definition:" in every prompt built for that category.
lacksynth_def = """occurs when either:
    a) The author describes or cites papers without connecting them to their own work/argument 
    b) Or only follows up the summary of previous works with their own contribution without explicitly highlighting the gap their work intends to research.
    c) It does not articulate the author's perspective or motivation.
    d) A lack of argument/opinion in the first paragraph is permissible as it serves to be the foundation of the author's argument 
    e) Lacks synthesis issues appear either as single sentences or multiple sentences.
    Examples of lack of synthesis include:
        i) No elaboration of own contribution/argument:"Following early neural approaches to question answering, many subsequent studies adopt a pipeline architecture consisting of retrieval and comprehension components. The retrieval component focuses on identifying relevant documents or passages from a large corpus, while the comprehension component extracts an answer span from the retrieved text. Initial models relied on recurrent neural networks with attention mechanisms to encode questions and contexts (Seo et al., 2017; Wang et al., 2017)."
        ii)  No explanation of the cited works and relation to their own work: “Recently, several studies have explored the use of prompting techniques with pre-trained language models to influence model outputs or access latent knowledge (Brown et al., 2020; Gao et al., 2021; Liu et al., 2021; Wei et al., 2022).” """

# Guideline text for the 'Coherence' category. process_data() inserts it
# verbatim under "Definition:" in every prompt built for that category.
# NOTE(review): example iii) is a single sentence, although rule c) says coherence
# issues appear only as multiple sentences, and its text is identical to example ii)
# of lacksynth_def - it looks like a copy-paste leftover; confirm before training.
coherence_def = """connection between cited works is abrupt, lacking relation to each other. It is unclear how one mentioned work is relevant to a prior mentioned work. 
    a) Sentences are not transitioned from one to another.
    b) The relationship between sentences describing papers is implied but not explicitly stated.
    c) Coherence issues appear only as multiple sentences.
    Examples of coherence issues include:
        i) Relation between mentioned works is not explicit: “Smith (2020) identified a relationship between personal belief systems and ethical decision-making frameworks. Moral foundation theory proposes several core dimensions of moral reasoning, including harm, fairness, and authority (Jones, 2015). Audience adaptation has been explored in computational argumentation. Lee et al. (2019) applied moral categories to argument generation tasks. Human annotators often disagree when labeling moral dimensions in text (Nguyen et al., 2018).”
        ii) Lack of transitions between sentences: “Recent studies have explored various techniques for enhancing model performance. Smith et al. (2020) introduced a novel architecture that significantly improves accuracy on benchmark datasets. Additionally, Johnson and Lee (2019) proposed a data augmentation method that increases training data diversity.” 
        iii) No explanation of the cited works and relation to their own work: “Recently, several studies have explored the use of prompting techniques with pre-trained language models to influence model outputs or access latent knowledge (Brown et al., 2020; Gao et al., 2021; Liu et al., 2021; Wei et al., 2022).” """

In [9]:
def process_data(data):
    """Turn label-bucketed records into prompt/completion training pairs.

    Args:
        data: Mapping from label to a list of records; each record must have
            a ``"span"`` string and a ``"document"`` value.

    Returns:
        A dict keyed by label. Each value is a list with one
        ``{"prompt": ..., "completion": ...}`` dict per input record, where
        the prompt embeds the label's guideline text and the record's
        document, and the completion is the stripped span wrapped in
        ``<span>`` tags. Labels without a guideline text are left out.
    """
    definitions_by_category = {
        'Unsupported_claim': unsupp_def,
        'Lacks_synthesis': lacksynth_def,
        'Coherence': coherence_def,
        'Format': format_def,
    }

    processed_data = {}

    for category, samples in data.items():
        if category not in definitions_by_category:
            continue
        guideline = definitions_by_category[category]

        # Register the category even when it has no samples.
        examples = processed_data.setdefault(category, [])

        for record in samples:
            target_span = record["span"].strip()

            prompt = f"""You are an expert annotator.
Task:
Return ONLY the spans from the document that match the definition below.
Each span MUST be wrapped exactly like this:

<span>your span here</span>

Do not output anything else.

Definition:
{guideline}

Document:
{record['document']}
"""

            examples.append({
                "prompt": prompt,
                "completion": f"<span>{target_span}</span>",
            })

    return processed_data

# Convert each label-bucketed split into prompt/completion pairs.
# NOTE: each name is rebound to the processed output, whose records no longer have
# a "span" key, so this cell cannot be re-run on its own; re-run the earlier cells first.
train_data = process_data(train_data)
dev_data = process_data(dev_data)
eval_data = process_data(eval_data)

In [10]:
import os 

# Maps each label to the folder / file-name stem its data is written under.
categories = {'Unsupported_claim': 'unsupported_claim', 'Lacks_synthesis': "lacks_synthesis", 'Coherence': "coherence", 'Format': "format"}

# File-name infix -> processed split. Dict order keeps the original write order:
# all training files first, then dev, then eval.
splits_by_infix = {"training": train_data, "dev": dev_data, "eval": eval_data}

for split_infix, split_data in splits_by_infix.items():
    for category, folder in categories.items():
        # open(..., 'w') does not create missing directories, so make sure the
        # per-category output folder exists before writing into it.
        os.makedirs(folder, exist_ok=True)
        filename = f"{folder}/{folder}_{split_infix}_data.json"
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(split_data[category], f, indent=4)