Collecting the gold label data:

In [32]:
from openai import OpenAI
from dotenv import load_dotenv
import os 

# Credentials are read from a local `keys.env` file, which must exist and
# define API_KEY_2; nothing is hard-coded in the notebook.
key_path = 'keys.env'
load_dotenv(dotenv_path=key_path)

# Shared OpenAI client used by ask_gpt().
api_key = os.environ.get("API_KEY_2")
client = OpenAI(api_key=api_key)

def ask_gpt(prompt):
    """Send ``prompt`` to the chat model and return the text of its first reply.

    Args:
        prompt: Full user prompt (task instructions plus the passage).

    Returns:
        The content of the first choice as a string. An empty string is
        returned when the API reports no text content, so callers can test
        the result for emptiness without special-casing ``None``.
    """
    completion = client.chat.completions.create(
        model="gpt-5",
        messages=[
            # The persona is an instruction to the model, so it is sent as the
            # system turn. Sent with role "assistant" it would be read as
            # something the model itself said earlier in the conversation.
            {"role": "system", "content": "You are an expert in academic writing and citation analysis."},
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    # `content` is nullable in the API schema; normalise to "".
    return completion.choices[0].message.content or ""

### GPT span classifications
- filter dataset to remove 'References' 
- create prompt for zero and few shot prompting
- collect GPT responses

checklist prompting

Prompts:

In [33]:
# Definition of the "Format" issue category, interpolated verbatim into the GPT prompt.
# Everything from "Examples of" onward is the few-shot part: the zero-shot prompt
# is built by cutting the definition at that marker, so the marker text must stay intact.
format_def = f"""Issues in citation formatting such as a missing bracket and using the wrong style of citing.
    a) Due to preprocessing errors of the source dataset, some words contain hyphens that do not require it, and some are missing hyphens where it is required. Please ignore these types of formatting issues.
    b) Highlight the word/citation in which the formatting issue occurs in and not only the issue within the word/citation.
    c) Formatting issues appear as either citations or parts of a citation.
    Examples of formatting issues include:
        i) Narrative citation missing year: “Vatswani et al.” -> should be “Vatswani et al. (2020)”
        ii) Wrong citation style: “In (Vatswani et al., 2019)” -> should be “in Vatswani et al. (2019)”
        iii) Wrong use of footnotes: "Vastwani et al. 1" -> should include the year or be reformatted as a proper footnote."""

# Definition of the "Unsupported Claim" issue category, interpolated verbatim into the GPT prompt.
# The zero-shot prompt keeps only the text before "Examples of"; the "For example:" under
# rule c) precedes that marker and therefore remains in both prompt variants.
unsupp_def = f"""claim about prior work or statistics w/o citation or evidence. 
    a) The author should cite at every first mention of a study, paper, shared task, competition or dataset.
    b) Specific information to a niche topic, despite sounding like it should be known in that topic of study, should be cited.
    c) If a claim is made and is obvious to be a natural deduction from previous statements through common sense (i.e not requiring expert knowledge), then this claim does not fall under ‘Unsupported claim’. For example:
        i) “However, creating a large and suitable set of questions for supporting narrative comprehension is both time-consuming and cognitively demanding.” -> it is obvious that creating a dataset is time consuming and mentally demanding.
    d) Any mention of “recent works” should be backed up with citations to the works.
    e) Unsupported claim issues appear as segments, phrases, sub-sentences or full sentences.
    Examples of unsupported claims include:
        i) Missing citations for mentions of 'recent works': “and there are many recent works that explore this topic”,
        ii) Mention of a previous work and claim without citation: “..., while in a previous study, the authors claim …”,
        iii) Mentioning of a specific setup of a task without citation to the work: ".. BERT was used in an AES task trained on essays .." """

# Definition of the "Lacks synthesis" issue category, interpolated verbatim into the GPT prompt.
# The zero-shot prompt keeps only the text before "Examples of".
lacksynth_def = f"""occurs when either:
    a) The author describes or cites papers without connecting them to their own work/argument 
    b) Or only follows up the summary of previous works with their own contribution without explicitly highlighting the gap their work intends to research.
    c) It does not articulate the author's perspective or motivation.
    d) A lack of argument/opinion in the first paragraph is permissible as it serves to be the foundation of the author's argument 
    e) Lacks synthesis issues appear either as single sentences or multiple sentences.
    Examples of lack of synthesis include:
        i) No elaboration of own contribution/argument:"Following early neural approaches to question answering, many subsequent studies adopt a pipeline architecture consisting of retrieval and comprehension components. The retrieval component focuses on identifying relevant documents or passages from a large corpus, while the comprehension component extracts an answer span from the retrieved text. Initial models relied on recurrent neural networks with attention mechanisms to encode questions and contexts (Seo et al., 2017; Wang et al., 2017)."
        ii)  No explanation of the cited works and relation to their own work: “Recently, several studies have explored the use of prompting techniques with pre-trained language models to influence model outputs or access latent knowledge (Brown et al., 2020; Gao et al., 2021; Liu et al., 2021; Wei et al., 2022).” """

# Definition of the "Coherence" issue category, interpolated verbatim into the GPT prompt.
# The zero-shot prompt keeps only the text before "Examples of".
# NOTE(review): example iii) is word-for-word the same as example ii) of the
# "Lacks synthesis" definition — confirm it is meant to illustrate both categories.
coherence_def = f"""connection between cited works is abrupt, lacking relation to each other. It is unclear how one mentioned work is relevant to a prior mentioned work. 
    a) Sentences are not transitioned from one to another.
    b) The relationship between sentences describing papers is implied but not explicitly stated.
    c) Coherence issues appear only as multiple sentences.
    Examples of coherence issues include:
        i) Relation between mentioned works is not explicit: “Smith (2020) identified a relationship between personal belief systems and ethical decision-making frameworks. Moral foundation theory proposes several core dimensions of moral reasoning, including harm, fairness, and authority (Jones, 2015). Audience adaptation has been explored in computational argumentation. Lee et al. (2019) applied moral categories to argument generation tasks. Human annotators often disagree when labeling moral dimensions in text (Nguyen et al., 2018).”
        ii) Lack of transitions between sentences: “Recent studies have explored various techniques for enhancing model performance. Smith et al. (2020) introduced a novel architecture that significantly improves accuracy on benchmark datasets. Additionally, Johnson and Lee (2019) proposed a data augmentation method that increases training data diversity.” 
        iii) No explanation of the cited works and relation to their own work: “Recently, several studies have explored the use of prompting techniques with pre-trained language models to influence model outputs or access latent knowledge (Brown et al., 2020; Gao et al., 2021; Liu et al., 2021; Wei et al., 2022).” """

In [34]:
import json 
import re

def complete_prompt(text, full_text):
    """Append the response-format instructions and the passage to a task prompt.

    Args:
        text: Task description (category name and definition).
        full_text: Passage in which the model must identify spans.

    Returns:
        ``text`` followed by a blank line, the required JSON shape, the
        verbatim-quote instruction, and finally the passage itself.
    """
    trailer_lines = [
        "",
        "",
        "Respond ONLY in this json format:",
        '{  "span_text": ... ,',
        '    "gpt_label": ... ,',
        '    "reason": ... }',
        "Please have 'span_text' ONLY contain text. The text must be verbatim from the passage of text provided.",
        "",
        "Here is the passage of text you must identify spans in:",
        f"{full_text}",
    ]
    return text + "\n".join(trailer_lines)

def parse_list(response):
    """Extract the span predictions from a raw model reply.

    Markdown code fences are stripped, then JSON is decoded starting at the
    first ``[`` or ``{``. The prompt shows a single-object format, so a reply
    with several spans may arrive either as one list or as several objects
    written one after another; both are accepted. Consecutive top-level
    values (separated by whitespace and/or commas) are gathered into a single
    list instead of keeping only the first one.

    Args:
        response: Raw text returned by the model.

    Returns:
        A list of dicts, one per predicted span.

    Raises:
        ValueError: If no ``[``/``{`` is present, if the first JSON value is
            malformed (``json.JSONDecodeError`` is a ``ValueError``), or if
            the first value contains non-dict items.
    """
    s = re.sub(r"```(?:json)?\s*", "", response).replace("```", "").strip()

    start_candidates = [i for i in (s.find("["), s.find("{")) if i != -1]
    if not start_candidates:
        raise ValueError("No JSON start token found ('[' or '{').")

    decoder = json.JSONDecoder()
    pos = min(start_candidates)

    # The first value is mandatory and validated strictly.
    obj, pos = decoder.raw_decode(s, pos)

    if isinstance(obj, dict):
        obj = [obj]

    if not isinstance(obj, list):
        raise ValueError(f"Expected a JSON list, got {type(obj).__name__}")

    if not all(isinstance(item, dict) for item in obj):
        bad_types = {type(item).__name__ for item in obj if not isinstance(item, dict)}
        raise ValueError(f"Expected list of dicts, but found non-dict items: {bad_types}")

    # Further values are best-effort: stop quietly at the first thing that is
    # not another object / list of objects, so trailing commentary in the
    # reply is tolerated.
    while True:
        while pos < len(s) and (s[pos].isspace() or s[pos] == ","):
            pos += 1
        if pos >= len(s) or s[pos] not in "[{":
            break
        try:
            extra, pos = decoder.raw_decode(s, pos)
        except json.JSONDecodeError:
            break
        if isinstance(extra, dict):
            extra = [extra]
        if not all(isinstance(item, dict) for item in extra):
            break
        obj.extend(extra)

    return obj

In [35]:
def extract_start_end_char(text, full_text):
    """Locate ``text`` inside ``full_text`` and return its character offsets.

    An exact match is tried first. If that fails, periods are ignored on both
    sides (so a span that dropped or added a "." still matches), and the hit
    is mapped back to positions in the *original* ``full_text``. The returned
    offsets can therefore be used directly as ``full_text[start:end]``.

    Args:
        text: Span text to look for.
        full_text: Passage the span was taken from.

    Returns:
        ``(start, end)`` as 0-based offsets with ``end`` exclusive, or
        ``(-1, -1)`` when the span is empty or cannot be found.
    """
    if not text:
        return -1, -1

    exact = full_text.find(text)
    if exact != -1:
        return exact, exact + len(text)

    text_norm = text.replace('.', '')
    if not text_norm:
        return -1, -1

    # kept_positions[k] is the index in full_text of the k-th non-period
    # character, i.e. of character k in the period-stripped copy.
    kept_positions = [idx for idx, ch in enumerate(full_text) if ch != '.']
    full_norm = full_text.replace('.', '')

    hit = full_norm.find(text_norm)
    if hit == -1:
        return -1, -1

    start = kept_positions[hit]
    end = kept_positions[hit + len(text_norm) - 1] + 1
    return start, end

GPT response collection:

In [37]:
from tqdm import tqdm 
from pathlib import Path

prompt_strategy = ['zero', 'fewshot']
zeroshot = []
fewshot = []
folder = '../data/to_annotate'
categories_def = {"Unsupported Claim": unsupp_def, "Format": format_def, "Coherence": coherence_def, "Lacks synthesis": lacksynth_def}

# For every paper, query the model once per (strategy, category) pair and
# record the parsed spans together with their character offsets.
for i in tqdm(range(100,101), desc='Collecting GPT responses: '):
    path = Path(f"{folder}/paper_{i}.txt")
    if not path.exists():
        continue

    with open(path, 'r') as f:
        full_text = f.read()
    # Drop the bibliography.
    # NOTE(review): this cuts at the first occurrence of the substring
    # 'References' anywhere in the text, not only at a section heading — confirm
    # that is acceptable for the source files.
    full_text = full_text.split('References')[0]

    for strategy in prompt_strategy:
        temp = []
        for category, definition in categories_def.items():
            # Few-shot prompts carry the whole definition; zero-shot prompts
            # stop before the "Examples of ..." part.
            if strategy == 'fewshot':
                shown_definition = definition
            else:
                shown_definition = definition.split("Examples of")[0]

            prompt = f"""
You are given a passage of text that comes from an academic paper on the field of Natural Language Processing. You must highlight the spans that exhibit any of these issues:
**{category}**: {shown_definition}
"""
            prompt = complete_prompt(prompt, full_text)
            response = ask_gpt(prompt)
            if not response:  # covers both "" and None
                continue

            # One malformed reply must not throw away a long, paid run:
            # report it and move on to the next category.
            try:
                json_response = parse_list(response)
            except ValueError as err:
                tqdm.write(f"paper_{i}.txt [{strategy}/{category}]: skipping unparseable response ({err})")
                continue

            for resp in json_response:
                span_text = resp.get('span_text')
                if isinstance(span_text, str):
                    start, end = extract_start_end_char(span_text, full_text)
                else:
                    # No usable span text: mark as "not located".
                    start = end = -1
                resp['start'] = start
                resp['end'] = end

            temp.append({"filename": f"paper_{i}.txt", "full_text": full_text, 'gpt_response': json_response})

        if strategy == 'fewshot':
            fewshot.append(temp)
        else:
            zeroshot.append(temp)

    # Periodic checkpoint of everything collected so far.
    if i%10 == 0:
        with open(f"../experiments/fewshot_responses_{i}.json", "w") as f:
            json.dump(fewshot, f, indent=4)

        with open(f"../experiments/zero_responses_{i}.json", "w") as f:
            json.dump(zeroshot, f, indent=4)

Collecting GPT responses: 100%|██████████| 1/1 [07:09<00:00, 429.31s/it]


In [None]:
import json
from collections import defaultdict
from typing import Any, Dict, List

def _flatten(x):
    """Yield dict items from arbitrarily nested lists."""
    if isinstance(x, list):
        for y in x:
            yield from _flatten(y)
    else:
        yield x

def merge_and_group_by_paper(data: Any, *, dedupe: bool = True) -> Dict[str, Dict[str, Any]]:
    """Merge prediction records into one entry per paper.

    Args:
        data: A list of record dicts, or lists of such records nested to any
            depth. A record is identified by its ``"filename"`` (or ``"file"``)
            key; its spans are read from ``"gpt_response"`` and its passage
            from ``"full_text"`` (or ``"text"``). Anything that is not a dict,
            or has no filename, is skipped.
        dedupe: When true, spans that share label, start, end and span text
            are kept only once (first occurrence wins).

    Returns:
        ``{filename: {"filename", "full_text", "all_spans", "spans_by_label"}}``
        where ``spans_by_label`` is a plain ``dict`` mapping each
        ``gpt_label`` (``"UNKNOWN"`` when absent) to its list of spans,
        regardless of ``dedupe``.
    """

    def _as_int_or_none(value):
        # Offsets are compared as ints so that e.g. 3 and "3" deduplicate.
        return int(value) if value is not None else None

    def _unique(spans):
        seen = set()
        kept = []
        for s in spans:
            key = (
                s.get("gpt_label"),
                _as_int_or_none(s.get("start")),
                _as_int_or_none(s.get("end")),
                s.get("span_text"),
            )
            if key in seen:
                continue
            seen.add(key)
            kept.append(s)
        return kept

    merged: Dict[str, Dict[str, Any]] = {}

    for rec in _flatten(data):
        if not isinstance(rec, dict):
            continue  # skip unexpected items safely

        # handle both 'filename' and 'file' keys
        fn = rec.get("filename") or rec.get("file")
        if fn is None:
            continue

        full_text = rec.get("full_text", "") or rec.get("text", "")

        if fn not in merged:
            merged[fn] = {
                "filename": fn,
                "full_text": full_text,
                "all_spans": [],
                "spans_by_label": defaultdict(list),
            }
        elif not merged[fn]["full_text"] and full_text:
            # An earlier record for this paper carried no text; take this one.
            merged[fn]["full_text"] = full_text

        for span in rec.get("gpt_response", []) or []:
            if not isinstance(span, dict):
                continue  # same policy as for malformed records
            merged[fn]["all_spans"].append(span)
            label = span.get("gpt_label", "UNKNOWN")
            merged[fn]["spans_by_label"][label].append(span)

    for obj in merged.values():
        if dedupe:
            obj["all_spans"] = _unique(obj["all_spans"])
            obj["spans_by_label"] = {
                lab: _unique(spans) for lab, spans in obj["spans_by_label"].items()
            }
        else:
            # Always hand back a plain dict: a defaultdict would silently
            # create empty labels on lookup.
            obj["spans_by_label"] = dict(obj["spans_by_label"])

    return merged

# group all predictions based on paper
# Only raw (list-shaped) collections are merged. Once merged, the value is a
# dict keyed by filename; feeding that back into merge_and_group_by_paper would
# find no "filename" key and return {}, so re-running this cell must be a no-op.
if isinstance(fewshot, list):
    fewshot = merge_and_group_by_paper(fewshot)
if isinstance(zeroshot, list):
    zeroshot = merge_and_group_by_paper(zeroshot)

dict_keys(['Unsupported Claim', 'Format', 'Coherence', 'Lacks synthesis'])


Some spans don't have a start and end index because GPT over-corrects grammar/spelling mistakes in the quoted text, so we post-process the predictions to recover them:

In [6]:
# Run this block if you're resuming work after restarting your kernel; otherwise continue to the next block to save the responses after collection is done
import json 

# Locations of the previously saved, per-paper prediction files.
fewshot_path = "../experiments/final/fewshot_preds.json"
zeroshot_path = "../experiments/final/zeroshot_preds.json"

# Restore both prediction sets into the names the later cells expect.
with open(fewshot_path, 'r') as fewshot_file:
    fewshot = json.loads(fewshot_file.read())

with open(zeroshot_path, 'r') as zeroshot_file:
    zeroshot = json.loads(zeroshot_file.read())

In [None]:
import json
import re

MISSING_SEMI_SPACE_RE = re.compile(r";\S")  # a ';' directly followed by a non-whitespace character


def _flags_missing_space(span):
    """Return True when a span's reason or text points at a missing-space complaint."""
    reason = (span.get("reason") or "").lower()
    span_text = span.get("span_text") or ""
    # "missing spaces" always contains "missing space", so one substring test covers both.
    return "missing space" in reason or bool(MISSING_SEMI_SPACE_RE.search(span_text))


def remove_missing_spaces_format_spans(
    in_path: str,
    out_path: str,
):
    """Drop "Format" spans that only complain about missing spaces.

    Reads the per-paper predictions from ``in_path``, removes every span
    labelled "Format" whose reason mentions a missing space or whose text has
    a semicolon not followed by whitespace, and writes the result to
    ``out_path``. Both ``all_spans`` and ``spans_by_label["Format"]`` are
    filtered.

    Args:
        in_path: JSON file holding a mapping of paper key to paper object.
        out_path: Destination JSON file (UTF-8, non-ASCII kept as-is).

    Returns:
        Number of spans removed from ``all_spans`` (also printed).
    """
    with open(in_path, "r", encoding="utf-8") as src:
        data = json.load(src)

    removed = 0

    for paper_obj in data.values():
        # ---- all_spans: only "Format" spans are candidates for removal ----
        before = paper_obj.get("all_spans", [])
        after = [
            s for s in before
            if not (s.get("gpt_label") == "Format" and _flags_missing_space(s))
        ]
        removed += len(before) - len(after)
        paper_obj["all_spans"] = after

        # ---- mirror the same filter in the per-label view ----
        by_label = paper_obj.get("spans_by_label")
        if isinstance(by_label, dict) and "Format" in by_label:
            by_label["Format"] = [
                s for s in by_label["Format"] if not _flags_missing_space(s)
            ]

    with open(out_path, "w", encoding="utf-8") as dst:
        json.dump(data, dst, indent=2, ensure_ascii=False)

    print(f"Removed {removed} Format spans for missing semicolon spaces.")
    return removed


In [11]:
# Filter the saved zero-shot predictions and write the cleaned copy.
# NOTE(review): the output goes to ../experiments/, not back to
# ../experiments/final/ where the input lives, and no matching call for
# fewshot_preds.json is visible in this notebook — confirm both are intended.
remove_missing_spaces_format_spans(
    in_path="../experiments/final/zeroshot_preds.json",
    out_path="../experiments/zeroshot_preds.json",
)

Removed 173 Format spans for missing semicolon spaces.


173

In [None]:
from difflib import SequenceMatcher
import unicodedata
from tqdm import tqdm

def _normalize_with_map(s: str):
    """
    Normalize in a way that often matches annotation tools:
    - Unicode normalize (NFC)
    - Convert CRLF/CR to LF
    - Replace NBSP with space
    - Remove common zero-width chars
    Returns: (normalized_string, norm_index -> original_index map)
    """
    s = unicodedata.normalize("NFC", s)

    norm_chars = []
    norm_to_orig = []

    i = 0
    while i < len(s):
        ch = s[i]

        # normalize newlines
        if ch == "\r":
            # treat \r\n or \r as a single \n
            if i + 1 < len(s) and s[i + 1] == "\n":
                # map the normalized '\n' to the start of the pair
                norm_chars.append("\n")
                norm_to_orig.append(i)
                i += 2
                continue
            else:
                norm_chars.append("\n")
                norm_to_orig.append(i)
                i += 1
                continue

        # normalize NBSP
        if ch == "\u00A0":
            ch = " "

        # drop zero-width characters (common culprits)
        if ch in ("\u200b", "\u200c", "\u200d", "\ufeff"):
            i += 1
            continue

        norm_chars.append(ch)
        norm_to_orig.append(i)
        i += 1

    return "".join(norm_chars), norm_to_orig


def fuzzy_span_start_end_mapped(
    span_text: str,
    full_text: str,
    threshold: float = 0.80,
    window_slack: int = 20,
    output_1_based_inclusive: bool = False,
):
    """Find the region of ``full_text`` that best matches ``span_text``.

    Both strings are normalised with ``_normalize_with_map``. If the
    normalised span occurs verbatim, its first occurrence is used. Otherwise
    every window whose length is within ``window_slack`` characters of the
    span's length is scored with ``difflib.SequenceMatcher`` and the first
    window with the highest ratio wins.

    Args:
        span_text: Text to locate (possibly lightly altered by the model).
        full_text: Passage to search.
        threshold: Minimum similarity ratio (0..1) for a match to be accepted.
        window_slack: How many characters shorter/longer than the span a
            candidate window may be.
        output_1_based_inclusive: If true, return 1-based offsets with an
            inclusive end instead of 0-based offsets with an exclusive end.

    Returns:
        ``(start, end)`` mapped through the index map returned by
        ``_normalize_with_map(full_text)``, or ``(-1, -1)`` when either input
        is empty (also after normalisation) or no window reaches ``threshold``.
    """
    if not span_text or not full_text:
        return -1, -1

    norm_full, norm_map = _normalize_with_map(full_text)
    norm_span, _ = _normalize_with_map(span_text)

    # Normalisation can strip everything (e.g. only zero-width characters).
    if not norm_span or not norm_full:
        return -1, -1

    target_len = len(norm_span)
    best_score = 0.0
    best_start = -1
    best_end = -1  # exclusive in normalized coordinates

    exact_at = norm_full.find(norm_span)
    if exact_at != -1:
        # A verbatim hit has ratio 1.0 and nothing can beat it, so the
        # expensive window scan is unnecessary.
        best_score = 1.0
        best_start = exact_at
        best_end = exact_at + target_len
    else:
        min_len = max(1, target_len - window_slack)
        max_len = min(len(norm_full), target_len + window_slack)

        # autojunk must be off: with the default heuristic, any character that
        # makes up more than 1% of a 200+ character window is treated as junk,
        # which distorts the ratio for long (multi-sentence) spans.
        matcher = SequenceMatcher(None, norm_span, "", autojunk=False)

        for win_len in range(min_len, max_len + 1):
            for i in range(0, len(norm_full) - win_len + 1):
                matcher.set_seq2(norm_full[i:i + win_len])
                # Both quick ratios are upper bounds on ratio(); a window that
                # cannot beat the current best is skipped without changing
                # which window wins.
                if matcher.real_quick_ratio() <= best_score:
                    continue
                if matcher.quick_ratio() <= best_score:
                    continue
                score = matcher.ratio()
                if score > best_score:
                    best_score = score
                    best_start = i
                    best_end = i + win_len

    if best_score < threshold or best_start < 0:
        return -1, -1

    # Map normalized [best_start, best_end) back through the index map.
    # start maps directly; end maps via last char + 1 (to keep exclusive end).
    orig_start = norm_map[best_start]
    orig_end = norm_map[best_end - 1] + 1

    if output_1_based_inclusive:
        # [orig_start, orig_end) 0-based exclusive == [orig_start + 1, orig_end] 1-based inclusive
        return orig_start + 1, orig_end

    return orig_start, orig_end


# Recompute start/end for the first span of each record, few-shot first and
# zero-shot second, using 1-based inclusive offsets.
# NOTE(review): only gpt_response[0] is updated per record, and the records are
# expected to carry 'gpt_response'/'full_text' keys (the shape produced during
# collection, before grouping by paper) — confirm this matches the data held in
# `fewshot`/`zeroshot` when this cell runs.
for records, progress_label in (
    (fewshot, 'Fuzzy matching few shot samples: '),
    (zeroshot, 'Fuzzy matching zero shot samples: '),
):
    for resp in tqdm(records, desc=progress_label):
        first_span = resp['gpt_response'][0]
        span = first_span['span_text']
        full_text = resp['full_text']
        s, e = fuzzy_span_start_end_mapped(span, full_text, output_1_based_inclusive=True)
        first_span['start'], first_span['end'] = s, e


Save to folder:

In [None]:
import json 

# Write the few-shot predictions to the final results folder.
folder = "../experiments/final"
filename = "fewshot_preds.json"

with open("/".join((folder, filename)), 'w') as out_file:
    json.dump(fewshot, out_file, indent=4)

In [None]:
import json 

# Write the zero-shot predictions to the final results folder.
folder = "../experiments/final"
filename = "zeroshot_preds.json"

with open("/".join((folder, filename)), 'w') as out_file:
    json.dump(zeroshot, out_file, indent=4)