In [4]:
import json 
import os 
from tqdm import tqdm 

nlpeer_path = "../nlpeer_pdf/ARR-22/data"
complete_reviews = []

# Every sub-folder of the data directory is read as one submission; its name is
# used as the record id. Only the FIRST entry of v1/reviews.json is kept.
for folders in tqdm(os.listdir(nlpeer_path)):
    # Skip meta.json and any other stray non-directory entry (e.g. .DS_Store),
    # which would otherwise make the open() below fail.
    if folders == 'meta.json' or not os.path.isdir(os.path.join(nlpeer_path, folders)):
        continue

    v1_folder = os.path.join(nlpeer_path, folders, 'v1', 'reviews.json')

    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(v1_folder, 'r', encoding='utf-8') as f:
        review_data = json.load(f)

    if len(review_data) != 0:
        review_data = review_data[0]
        report = review_data['report']
        complete_reviews.append({
            'id': folders,
            'review': review_data['reviewer'],
            'summary': report['paper_summary'],
            'weaknesses': report['summary_of_weaknesses'],
            'suggestions': report['comments,_suggestions_and_typos'],
        })

100%|██████████| 477/477 [00:12<00:00, 39.33it/s]


In [None]:
from openai import OpenAI
from dotenv import load_dotenv

# The API key is read from a local dotenv file (keys.env) that must define
# API_KEY_IRAA; the key itself is never written into the notebook.
key_path = 'keys.env'
load_dotenv(dotenv_path=key_path)
api_key = os.environ.get("API_KEY_IRAA")
client = OpenAI(api_key=api_key)

def ask_gpt(prompt):
    """Send ``prompt`` as a single user turn to gpt-4o and return the reply text.

    Uses the module-level ``client``. Returns the ``content`` of the first
    choice's message as returned by the API.
    """
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            # Behavioural instructions belong in a "system" message; with the
            # "assistant" role this text is presented as a previous model turn.
            {"role": "system", "content": "You are a helpful assistant."},
            {
                "role": "user",
                "content": prompt
            }
        ]
    )

    return completion.choices[0].message.content

In [7]:
from tqdm import tqdm 
import re 

# Labels the prompt allows; parsed answers are mapped back onto these spellings.
VALID_CLASSIFICATIONS = ("Introduction", "Related Works", "Neither")


def parse_classification(response):
    """Extract the label from the LAST ``classification: ...`` line of a reply.

    The prompt asks for the label at the end of the response, so the last
    occurrence wins. Only the rest of that line is used, and surrounding
    quotes/braces/markdown/punctuation are stripped, so "Neither'}" or
    "**Neither**." become "Neither".

    Returns one of VALID_CLASSIFICATIONS when the cleaned answer matches one
    (case-insensitively), the cleaned text when it does not, and None when the
    reply is empty or contains no usable classification.
    """
    if not response:
        return None
    found = re.findall(r"classification:\s*([^\r\n]*)", response, flags=re.IGNORECASE)
    if not found:
        return None
    cleaned = found[-1].strip(" \t'\"`*{}[]().,;:")
    if not cleaned:
        return None
    for label in VALID_CLASSIFICATIONS:
        if cleaned.casefold() == label.casefold():
            return label
    return cleaned


for reviews in tqdm(complete_reviews):
    weaknesses = reviews['weaknesses']
    suggestions = reviews['suggestions']

    prompt = f""" 
    Provided to you is a review comments on the weaknesses and suggestions for the paper. Please classify it as either relevant to the introduction section, related works section, or neither. 
    Valid classifications are either: Introduction, Related Works, Neither.

    Please have your output strictly at the end of your response and in this format:
    {'classification: ...'}

    Weaknesses: 
    {weaknesses}
    Suggestions:
    {suggestions} """

    response = ask_gpt(prompt)
    # Stored in place on the review dict; None marks a reply that could not be parsed.
    reviews['gpt_classification'] = parse_classification(response)

  0%|          | 0/364 [00:00<?, ?it/s]

100%|██████████| 364/364 [09:54<00:00,  1.63s/it]


In [8]:
# Persist this run's classifications (run 3) as pretty-printed JSON.
with open("../data/gpt_classifications_reviews_3.json", 'w') as out_file:
    out_file.write(json.dumps(complete_reviews, indent=4))

In [1]:
import json
from collections import Counter
from pathlib import Path

def load_ids_from_json(path):
    """Load ``{'id', 'gpt_classification'}`` pairs from a JSON list of dicts.

    Entries that are not dicts or have no 'id' key are skipped. A missing
    'gpt_classification' yields None instead of raising KeyError, so one
    incomplete record does not abort the whole load.
    """
    path = Path(path)
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    return [
        {'id': item["id"], 'gpt_classification': item.get('gpt_classification')}
        for item in data
        if isinstance(item, dict) and "id" in item
    ]


def most_common_ids_across_files(file1, file2, file3, top_n=None):
    """Count how often each id occurs across three classification files.

    Returns a list of ``{"id", "count", "gpt_classification"}`` dicts ordered
    by descending count (all ids, or only the ``top_n`` most frequent). The
    classification reported for an id is the one from its FIRST occurrence,
    reading the files in the order file1, file2, file3.
    """
    id_counts = Counter()
    first_seen_class = {}
    for source_path in (file1, file2, file3):
        for entry in load_ids_from_json(source_path):
            entry_id = entry["id"]
            id_counts[entry_id] += 1
            # setdefault keeps the earliest classification for this id
            first_seen_class.setdefault(entry_id, entry["gpt_classification"])

    # most_common(None) returns every element, same as most_common()
    ranked = id_counts.most_common(top_n)

    result = []
    for entry_id, occurrences in ranked:
        result.append({
            "id": entry_id,
            "count": occurrences,
            "gpt_classification": first_seen_class.get(entry_id),
        })
    return result


# The three saved classification runs that are aggregated by id.
file1, file2, file3 = (
    "../data/gpt_classifications_reviews_1.json",
    "../data/gpt_classifications_reviews_2.json",
    "../data/gpt_classifications_reviews_3.json",
)

common_ids = most_common_ids_across_files(file1, file2, file3)
print(len(common_ids))

364


In [2]:
def _is_neither_label(label):
    """Return True when ``label`` means 'Neither', ignoring case and stray quotes/punctuation.

    Only the first line is inspected, so raw captures such as "Neither'}" or
    "Neither.\\n" are still recognised.
    """
    if not isinstance(label, str):
        return False
    lines = label.strip().splitlines()
    first_line = lines[0] if lines else ""
    return first_line.strip(" \t'\"`*{}[]().,;:").casefold() == "neither"


# Keep only reviews that were classified and whose class is not "Neither".
# Missing/empty classifications (unparsed replies) are dropped as well: a bare
# `!= "Neither"` comparison would treat None as relevant.
rw_intro = [
    review for review in common_ids
    if review['gpt_classification'] and not _is_neither_label(review['gpt_classification'])
]

In [7]:
import shutil
from pathlib import Path

def copy_file_to_folder(src_file, dst_folder, filename):
    """Copy ``src_file`` into ``dst_folder`` as ``<filename>.tei``.

    The destination folder (including missing parents) is created on demand.
    ``shutil.copy2`` is used so file metadata is preserved.
    """
    target_dir = Path(dst_folder)
    target_dir.mkdir(parents=True, exist_ok=True)

    destination = target_dir.joinpath("{}.tei".format(filename))
    shutil.copy2(Path(src_file), destination)

# Copy the v1 TEI of every selected review into the annotation folder,
# naming each copy pdf_<id>.tei.
for review in tqdm(rw_intro):
    folder = review['id']
    copy_file_to_folder(
        f"../nlpeer_pdf/ARR-22/data/{folder}/v1/paper.tei",
        "../data/annotation/tei_files",
        f"pdf_{folder}",
    )

100%|██████████| 117/117 [00:13<00:00,  8.91it/s]


In [23]:
from lxml import etree 
from lxml.etree import _Element 
import re 
from typing import List, Tuple, Optional, Dict

# Normalized section names that is_special_target() treats as one "special" group.
SPECIAL_EQUIVALENTS = {"related work", "related works", "background", "past", "current"}

def normalize_target(s: str) -> str:
    """Normalize a section name for matching: trim, drop leading numbering, lowercase.

    Leading numbering such as "2 ", "2. ", "2.1 " or "2.1. " is removed; the
    optional trailing dot is now handled. None or an empty string yields "".
    """
    s = (s or "").strip()
    # digits, optional dotted sub-numbers, optional trailing dot, then whitespace
    s = re.sub(r"^\d+(?:\.\d+)*\.?\s+", "", s)
    return s.lower()

def find_top_level_heading_blocks_using_title(blocks: List[dict]) -> List[Tuple[Optional[int], str, int]]:
    """
    Collect top-level heading candidates from the "head" blocks.

    Each result is a tuple (section_number_or_None, heading_text, block_index):
    - section_number_or_None: the integer value of the element's "n" attribute
      when it is a plain integer, otherwise None.
    - heading_text: the stripped block text; this is what callers match against.
    - block_index: position of the block inside ``blocks``.

    The "n" attribute is only used to tell top-level headings from dotted
    subsections ("3.1"), which are left out of the result.
    """
    top_level: List[Tuple[Optional[int], str, int]] = []

    for position, block in enumerate(blocks):
        if block["type"] != "head":
            continue

        title = block["text"].strip()
        node = block.get("element")

        # Read and trim @n; a missing or blank value counts as "no number".
        raw_n = node.get("n") if node is not None else None
        number_label = (raw_n.strip() or None) if raw_n else None

        if number_label and re.fullmatch(r"\d+", number_label):
            # plain integer -> numbered top-level heading
            top_level.append((int(number_label), title, position))
        elif number_label and re.fullmatch(r"\d+\.\d+(\.\d+)*", number_label):
            # dotted number -> subsection, must not become a slicing boundary
            continue
        else:
            # no usable number: keep as unnumbered candidate, matched on its text
            top_level.append((None, title, position))

    return top_level


def fallback_find_headings_in_paragraphs_using_title(blocks: List[dict]) -> List[Tuple[Optional[int], str, int]]:
    """
    Heading detection of last resort: look for headings inside "p" blocks.

    A paragraph counts as a heading when it matches TOP_LEVEL_HEADING_RE
    (group 1 = section number, group 2 = title), or when it has one to six
    words and equals its own ``str.title()`` form. Results have the same
    (number_or_None, title, block_index) shape as the <head>-based finder.

    NOTE(review): TOP_LEVEL_HEADING_RE is not defined in the visible part of
    this notebook - confirm it exists before this fallback is reached,
    otherwise the first paragraph block raises NameError.
    """
    found = []
    for position, block in enumerate(blocks):
        if block["type"] != "p":
            continue

        paragraph = block["text"].strip()
        numbered = TOP_LEVEL_HEADING_RE.match(paragraph)
        if numbered:
            cleaned_title = numbered.group(2).strip().rstrip(" .:;,-")
            found.append((int(numbered.group(1)), cleaned_title, position))
            continue

        # heuristic for unnumbered headings: short and written in Title Case
        word_count = len(paragraph.split())
        if 1 <= word_count <= 6 and paragraph == paragraph.title():
            found.append((None, paragraph.rstrip(" .:;,-"), position))
    return found

def find_by_id(items, target_id):
    """Return the 'gpt_classification' of the first item whose 'id' equals ``target_id``, else None."""
    hit = next((entry for entry in items if entry.get("id") == target_id), None)
    if hit is None:
        return None
    return hit['gpt_classification']

def get_text(el):
    """Return all text inside ``el`` (descendants included), stripped; "" when ``el`` is None."""
    return "" if el is None else "".join(el.itertext()).strip()

def is_special_target(norm: str) -> bool:
    """Tell whether the normalized section name is a member of SPECIAL_EQUIVALENTS."""
    in_special_group = norm in SPECIAL_EQUIVALENTS
    return in_special_group

def local_name(el: _Element) -> str:
    """Return the element's tag name without its namespace part."""
    qualified = etree.QName(el)
    return qualified.localname

def build_ordered_blocks_from_tei(tei_path: str) -> Tuple[List[dict], str]:
    """
    Parse a TEI file and list its heading/paragraph blocks in document order.

    Each block is {"type": "head"|"p", "text": str, "element": Element}:
    <head> elements become "head" blocks, <p> and <ab> elements become "p"
    blocks, and elements without text are dropped. The walk starts at
    text/body when present, otherwise at the document root.

    Returns (blocks, full_text_preview); the preview is all block texts joined
    by " \\n\\n " and is meant for fallback scanning.

    NOTE(review): every <head> under the body is collected regardless of its
    parent, so a <head> nested in e.g. a <figure> would also be listed -
    confirm this is intended.
    """
    tree = etree.parse(tei_path)
    root = tree.getroot()

    body = root.find(".//tei:text/tei:body", namespaces=TEI_NS)
    source = body if body is not None else root

    blocks: List[dict] = []

    for el in source.iter():
        # iter() also yields comments and processing instructions; their tag
        # is not a string and etree.QName() raises on them, so skip them.
        if not isinstance(el.tag, str):
            continue

        name = local_name(el)
        if name == "head":
            block_type = "head"
        elif name in ("p", "ab"):
            block_type = "p"
        else:
            continue

        text = get_text(el)
        if text:
            blocks.append({"type": block_type, "text": text, "element": el})

    # Headings and paragraphs contribute to the preview in the same way.
    full_text_preview = " \n\n ".join(b["text"] for b in blocks)
    return blocks, full_text_preview

def matches_candidate(candidate_title: str, target_norm: str) -> bool:
    """
    Decide whether a heading title matches the normalized target section name.

    An empty target or an empty candidate never matches. Targets from the
    special group are delegated to candidate_matches_special(); any other
    target matches when either lowercase string contains the other.
    """
    if not (target_norm and candidate_title):
        return False

    lowered = candidate_title.lower()
    if is_special_target(target_norm):
        return candidate_matches_special(lowered)

    # plain substring test in both directions
    return target_norm in lowered or lowered in target_norm

def candidate_matches_special(candidate_title: str) -> bool:
    """Return True when the lowercased title contains any of the special-group phrases."""
    lowered = (candidate_title or "").lower()
    special_tokens = ("related work", "related works", "background", "past", "current")
    return any(token in lowered for token in special_tokens)

# Matches caption labels such as "Figure 3:" or "Table 2:" (case-insensitive).
FIG_TABLE_RE = re.compile(r'\b(?:figure|table)\s*\d+\s*:', flags=re.IGNORECASE)

def debug_blocks(blocks: List[dict], start: int, end: int):
    """Print one preview line (index, type, first 120 characters) for each block in [start, end)."""
    print(f"DEBUG: slicing blocks [{start} .. {end-1}] (inclusive)")
    for i in range(start, end):
        current = blocks[i]
        t = current['type']
        # show newlines as a literal "\n" so every block stays on one line
        flattened = current['text'].strip().replace("\n", "\\n")
        print(f"  idx={i:3} type={t:4} text_preview={flattened[:120]!r}")

# Figure captions only (e.g. "Figure 3:"); compiled once rather than on every call.
_FIGURE_LABEL_RE = re.compile(r'\bfigure\s*\d+\s*:', flags=re.IGNORECASE)


def _empty_section_result() -> Dict[str, Optional[object]]:
    """Build the result returned when nothing can be extracted (fresh lists on every call)."""
    return {
        "matched_heading": None, "content": None,
        "start_block_idx": None, "end_block_idx": None,
        "included_blocks": [], "excluded_blocks": [],
        "has_figure": False
    }


def extract_section_in_tei(
    tei_path: str,
    section: str,
    *,
    debug: bool = False
) -> Dict[str, Optional[object]]:
    """
    Extract the text of one section from a TEI file.

    The section is located by matching ``section`` against heading TEXT (not
    the @n attribute). The slice runs from the matched top-level heading up to
    (not including) the next top-level heading, or to the end of the document.
    Paragraphs carrying a "Figure N:" / "Table N:" label are left out of the
    returned content; ``has_figure`` reports whether a "Figure N:" label
    occurred anywhere in the slice, excluded or not.

    Args:
        tei_path: path of the TEI file to parse.
        section: section name to look for; leading numbering is ignored for
            text matching but used as a fallback to match a heading's number.
        debug: when True, print the discovered headings, the slice and the
            inclusion/exclusion decisions.

    Returns a dict:
      {
        "matched_heading": str | None,
        "content": str | None,
        "start_block_idx": int | None,
        "end_block_idx": int | None,
        "included_blocks": List[int],
        "excluded_blocks": List[int],
        "has_figure": bool
      }
    All-empty values are returned when ``section`` is None/blank, when the
    document has no headings, or when no heading matches.
    """
    # defensive checks
    if section is None:
        return _empty_section_result()
    target_norm = normalize_target(section)
    if not target_norm:
        return _empty_section_result()

    # build blocks
    blocks, _ = build_ordered_blocks_from_tei(tei_path)

    # find headings by head-text (prefer head elements; fallback to paragraphs)
    headings = find_top_level_heading_blocks_using_title(blocks)
    if not headings:
        headings = fallback_find_headings_in_paragraphs_using_title(blocks)

    if debug:
        print("DEBUG: discovered headings (num_or_None, title, block_idx):")
        for h in headings:
            print("  ", h)

    if not headings:
        return _empty_section_result()

    # first heading whose text matches the target (also covers the special group)
    matched_idx = None
    matched_title = None
    for hi, (num, title, block_idx) in enumerate(headings):
        if matches_candidate(title, target_norm):
            matched_idx = hi
            matched_title = title
            break

    # fallback: match the numeric prefix of the requested section against heading numbers
    if matched_idx is None:
        mnum = re.match(r'^\s*(\d+)\s+', section)
        if mnum:
            wanted = int(mnum.group(1))
            for hi, (num, title, block_idx) in enumerate(headings):
                if num == wanted:
                    matched_idx = hi
                    matched_title = title
                    break

    if matched_idx is None:
        if debug:
            print("DEBUG: no matching heading found for target:", repr(target_norm))
        return _empty_section_result()

    # slice from the matched heading's block to just before the next heading's block
    start_block_idx = headings[matched_idx][2]
    if matched_idx + 1 < len(headings):
        end_block_idx = headings[matched_idx + 1][2]
    else:
        end_block_idx = len(blocks)

    if debug:
        print(f"DEBUG: matched heading: {repr(matched_title)} at headings index {matched_idx}, start_block_idx={start_block_idx}, end_block_idx={end_block_idx}")
        debug_blocks(blocks, start_block_idx, end_block_idx)

    included_blocks: List[int] = []
    excluded_blocks: List[int] = []
    pieces: List[str] = []
    has_figure = False

    for bi in range(start_block_idx, end_block_idx):
        b = blocks[bi]
        text = b["text"] or ""

        # flag figures before any exclusion so excluded captions still count
        if _FIGURE_LABEL_RE.search(text):
            has_figure = True

        # only paragraphs can be dropped as figure/table captions; headings are always kept
        if b["type"] == "p" and FIG_TABLE_RE.search(text):
            excluded_blocks.append(bi)
            if debug:
                print(f"DEBUG: excluding block idx={bi} (figure/table paragraph): {b['text'][:60]!r}")
            continue

        included_blocks.append(bi)
        pieces.append(text.strip())

    content = "\n\n".join(pieces).strip() or None

    if debug:
        print("DEBUG: included_blocks:", included_blocks)
        print("DEBUG: excluded_blocks:", excluded_blocks)
        print("DEBUG: content length:", 0 if content is None else len(content))
        print("DEBUG: has_figure:", has_figure)
        if content:
            print("DEBUG: content preview:", repr(content[:400]))

    return {
        "matched_heading": matched_title,
        "content": content,
        "start_block_idx": start_block_idx,
        "end_block_idx": end_block_idx,
        "included_blocks": included_blocks,
        "excluded_blocks": excluded_blocks,
        "has_figure": has_figure,
    }


In [19]:
from lxml import etree
from typing import List, Dict, Optional, Union
import re

# Prefix map binding "tei" to the TEI namespace; passed as `namespaces=` to the find()/xpath() calls.
TEI_NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# -------------------------
# Extraction of biblStructs in tei files
# 
# -------------------------
def _get_text(el: Optional[etree._Element]) -> str:
    """Concatenate every text node under ``el`` and strip the result; "" for None."""
    if el is None:
        return ""
    fragments = el.itertext()
    return "".join(fragments).strip()

def _bibl_year(b) -> str:
    """Return the first 4-digit year of the first imprint/date (@when first, then its text), or ""."""
    date_nodes = b.xpath(".//tei:imprint/tei:date", namespaces=TEI_NS)
    if not date_nodes:
        return ""
    d = date_nodes[0]
    m = re.search(r"(\d{4})", d.get("when") or "")
    if m is None:
        m = re.search(r"(\d{4})", _get_text(d))
    return m.group(1) if m else ""


def _bibl_pages(b) -> str:
    """Return "from--to" from the first page biblScope, else that element's text, or ""."""
    page_nodes = b.xpath(".//tei:imprint/tei:biblScope[@unit='page']", namespaces=TEI_NS)
    if not page_nodes:
        return ""
    el = page_nodes[0]
    frm = el.get("from")
    to = el.get("to")
    if frm and to:
        return f"{frm}--{to}"
    return _get_text(el)


def _bibl_idnos(b) -> Dict:
    """Collect <idno> values keyed by @type; untyped values are sorted into DOI / arXiv / "other"."""
    idnos = {}
    for idn in b.xpath(".//tei:idno", namespaces=TEI_NS):
        typ = (idn.get("type") or "").strip()
        val = _get_text(idn)
        if typ:
            idnos[typ] = val
        elif re.search(r"10\.\d{4,9}/", val):
            idnos["DOI"] = val
        elif re.search(r"arxiv", val, flags=re.I) or re.search(r"\d{4}\.\d{4,}", val):
            idnos["arXiv"] = val
        else:
            idnos.setdefault("other", []).append(val)
    return idnos


def _bibl_title(b) -> str:
    """Return the title, trying analytic[@level='a'], analytic, monogr, then any title, in that order."""
    preference = (
        ".//tei:analytic/tei:title[@level='a']",
        ".//tei:analytic/tei:title",
        ".//tei:monogr/tei:title",
        ".//tei:title",
    )
    for query in preference:
        nodes = b.xpath(query, namespaces=TEI_NS)
        if nodes:
            return _get_text(nodes[0])
    return ""


def _bibl_authors(b) -> List[Dict]:
    """Return one {"surname", "forenames", "raw"} dict per <author>, preferring its <persName>."""
    authors = []
    for au in b.xpath(".//tei:author", namespaces=TEI_NS):
        pers = au.find(".//{http://www.tei-c.org/ns/1.0}persName")
        if pers is not None:
            surname_el = pers.find(".//{http://www.tei-c.org/ns/1.0}surname")
            forename_els = pers.findall(".//{http://www.tei-c.org/ns/1.0}forename")
            surname = _get_text(surname_el) if surname_el is not None else ""
            forenames = [_get_text(fn) for fn in forename_els if _get_text(fn)]
            authors.append({"surname": surname, "forenames": forenames, "raw": _get_text(pers)})
            continue

        # no persName: split the plain author text, last token = surname
        at = _get_text(au)
        if at:
            parts = at.split()
            if len(parts) >= 2:
                surname, forenames = parts[-1], parts[:-1]
            else:
                surname, forenames = at, []
            authors.append({"surname": surname, "forenames": forenames, "raw": at})
    return authors


def _bibl_venue(b) -> str:
    """Return the journal-level monogr title, else the first monogr title, else the first title found."""
    journal = b.xpath(".//tei:monogr/tei:title[@level='j']", namespaces=TEI_NS)
    if journal:
        return _get_text(journal[0])
    monogr = b.xpath(".//tei:monogr/tei:title", namespaces=TEI_NS)
    if monogr:
        return _get_text(monogr[0])
    # NOTE(review): without a monogr title this falls back to the first title
    # of the entry, which may be the same node used for "title" - confirm.
    gen = b.xpath(".//tei:title", namespaces=TEI_NS)
    return _get_text(gen[0]) if gen else ""


def extract_bibl_structs(tei_path: str) -> List[Dict]:
    """
    Parse the TEI file at ``tei_path`` and return one dict per retained <biblStruct>.

    Filtering heuristic:
      - keep entries that have a monogr <title> element (journal/book-like), OR
      - keep analytic-title-only entries when they also carry a year, pages,
        or a DOI/arXiv identifier;
      - skip everything else (e.g. entries with only an MD5 idno).

    Each dict has the keys xml_id, raw_xml, title, authors, venue, publisher,
    year, pages and idnos. The title follows the documented preference
    (analytic level 'a', analytic, monogr, any); it is looked up step by step
    because a single XPath union returns nodes in document order instead.
    """
    tree = etree.parse(tei_path)
    root = tree.getroot()

    out = []
    for b in root.xpath(".//tei:biblStruct", namespaces=TEI_NS):
        analytic_title_nodes = b.xpath(".//tei:analytic/tei:title", namespaces=TEI_NS)
        monogr_title_nodes = b.xpath(".//tei:monogr/tei:title", namespaces=TEI_NS)

        year = _bibl_year(b)
        pages = _bibl_pages(b)
        idnos = _bibl_idnos(b)

        has_useful_id = bool(idnos.get("DOI") or idnos.get("arXiv"))
        has_year_or_pages = bool(year or pages)

        # monogr entries are kept as-is; analytic-only entries need supporting metadata
        include = bool(monogr_title_nodes) or (
            bool(analytic_title_nodes) and (has_year_or_pages or has_useful_id)
        )
        if not include:
            continue

        pub = b.xpath(".//tei:imprint/tei:publisher", namespaces=TEI_NS)

        out.append({
            "xml_id": b.get("{http://www.w3.org/XML/1998/namespace}id") or b.get("xml:id") or b.get("id"),
            "raw_xml": etree.tostring(b, encoding="unicode", with_tail=False),
            "title": _bibl_title(b),
            "authors": _bibl_authors(b),  # list of {"surname":..., "forenames":[...], "raw":...}
            "venue": _bibl_venue(b),
            "publisher": _get_text(pub[0]) if pub else "",
            "year": year,
            "pages": pages,
            "idnos": idnos
        })

    return out

# -------------------------
# ACL formatting utilities
# -------------------------
def _format_author_acl(author: Dict) -> str:
    """
    Turn {"surname": "...", "forenames": [...]} into "Surname, F." (use first forename initial).
    If multiple forenames, use initials for each (e.g., "John Paul" -> "J.P.").
    If surname missing, use raw.
    """
    surname = author.get("surname") or ""
    forenames = author.get("forenames") or []
    raw = author.get("raw") or ""
    if not surname and not forenames:
        return raw or ""
    # build initials from forenames (use first forename only or all? we use all for initials)
    initials = ""
    if forenames:
        initials = " ".join(
            "".join([part[0] + "." for part in fn.split()]) for fn in forenames
        )
        # collapse spaces to single space
        initials = re.sub(r"\s+", " ", initials).strip()
    if surname and initials:
        return f"{surname}, {initials}"
    if surname:
        return surname
    return raw

def _join_authors_acl(authors: List[Dict]) -> str:
    """
    Join formatted authors in ACL style:
      - 1 author: A.
      - 2 authors: A. and B.
      - >2: A., B., and C.
    """
    formatted = [_format_author_acl(a) for a in authors if _format_author_acl(a)]
    if not formatted:
        return ""
    if len(formatted) == 1:
        return formatted[0]
    if len(formatted) == 2:
        return f"{formatted[0]} and {formatted[1]}"
    # 3+ authors
    return ", ".join(formatted[:-1]) + ", and " + formatted[-1]

def format_reference_acl(ref: Dict) -> str:
    """
    Turn one reference dict (as produced by extract_bibl_structs) into a
    single ACL-like citation string.

    Parts are emitted in the order authors, "year.", "title.", "In venue.",
    "pp. pages", identifier; missing parts are skipped and the rest is joined
    with single spaces. The identifier is the DOI if present, then an arXiv
    id, then the first entry of the "other" list.
    """
    author_text = _join_authors_acl(ref.get("authors", []))
    year = ref.get("year") or ""
    title = ref.get("title") or ""
    venue = ref.get("venue") or ""
    pages = ref.get("pages") or ""

    identifier = None
    identifiers = ref.get("idnos", {})
    if identifiers:
        identifier = identifiers.get("DOI") or identifiers.get("doi") or identifiers.get("arXiv")
        if not identifier:
            extras = identifiers.get("other")
            if isinstance(extras, list) and extras:
                identifier = extras[0]

    segments = [
        author_text,
        year + "." if year else "",
        title.rstrip(".") + "." if title else "",
        "In " + venue.rstrip(".") + "." if venue else "",
        "pp. " + pages if pages else "",
        identifier if identifier else "",
    ]
    return " ".join(segment for segment in segments if segment).strip()

In [24]:
from tqdm import tqdm 
import os 
total_text = []
pdf_folder = '../data/annotation/pdfs'
failed_to_process = []

# Expected file name layout: pdf_<id>.tei
TEI_PREFIX = "pdf_"
TEI_SUFFIX = ".tei"

# NOTE(review): the copy step in this notebook writes TEI files to
# ../data/annotation/tei_files, while this loop reads ../data/annotation/pdfs -
# confirm the files are expected in this folder.
for file in tqdm(os.listdir(pdf_folder)):
    # Only real pdf_<id>.tei files; a substring test would also accept names
    # that merely contain ".tei".
    if not (file.startswith(TEI_PREFIX) and file.endswith(TEI_SUFFIX)):
        continue

    # Derive the id from the name itself instead of assuming a fixed id length.
    file_id = file[len(TEI_PREFIX):-len(TEI_SUFFIX)]
    tei_path = os.path.join(pdf_folder, file)

    # The stored classification doubles as the name of the section to extract.
    section = find_by_id(rw_intro, file_id)
    tei_section_text = extract_section_in_tei(tei_path, section, debug=False)

    refs = extract_bibl_structs(tei_path)
    references = "\n".join(format_reference_acl(r) for r in refs)

    # A paper is usable only with a matched section AND a non-empty reference list.
    if tei_section_text["matched_heading"] is None or not references:
        failed_to_process.append(file_id)
        continue

    total_text.append({
        'id': file,
        'text': tei_section_text['content'],
        "has_figure": tei_section_text['has_figure'],
        'references': references
    })

print(f"Failed to process files: {failed_to_process}")

  0%|          | 0/234 [00:00<?, ?it/s]

100%|██████████| 234/234 [00:04<00:00, 51.88it/s]

Failed to process files: ['00aff1ff48640448a87f47a0985a0ac40c6aa18fbbd3909bdfd521eb22d67f209005b6ca9bf68bcde0d387509c0026cbed0dcd9acdd779959b4b943f677dcc4c', '0f4cb8033e92de1eecebb650474462752b609f5a11cdf226e790163ec2a020aeb4d08d7d750bc75fd8d7c0be415881528a542492eeff9c184f1fd6b090129258', '1f63cc5901bef369c232cedfdf722fdf15b03dbc2d1570dfdcb2d635e82e96e3a23aee4c4ea0f71b12d5eeff81d92af1bc50ed0dc14a11a7822247a3869cfabc', '25b37714233d1ceb645b7dbc062d3c1cae7cdd0a23dfc01d1ad6cf964be2701f56cd692d11316907ba1b92e91058a4834314321dcc6a09aa069082c8b775400f', '30e4f79e74393e179f891c8a9cec30925ff3a4aec46c79a8988d5a96e69b2aa0ecda673f3dd4d25c67a8cf306debe95f729ec54145d9a6bf25e07c12a3f86ac9', '3682a41985bc4769c5dc8fb3ca6049f7cdac04eca025ade78ad24ad1d7cb3408cc49c533fd653f236881888aedb59880277da3e30dedb69510df272588e8d612', '3bf4a49a78cbf2ee21666876aa50d4b69ebedfbd82098262e29b1d09e1c2bbad83a30d50d122733c8216a39edb9e5762f090416ee3e1d1a9fe91695ad9ee5b4b', '6666ff9aa9f0e507dde28de61fe9d051463456cab0a8a7422




In [25]:
import json 

# Write the assembled section/reference records to disk as pretty-printed JSON.
with open("../data/complete_dataset.json", 'w') as out_file:
    out_file.write(json.dumps(total_text, indent=4))

In [31]:
import os
from pathlib import Path

# 1-based numbers of the written files whose item is flagged with has_figure;
# filled by save_to_txt().
files_ref = []

def save_to_txt(dict_list, folder_path, base_filename="item"):
    """Write every item to ``<folder_path>/<base_filename>_<n>.txt`` (n starts at 1).

    Each file contains the item's 'text', a " References: " separator and the
    item's 'references'. Files are numbered by position, not by item id. The
    number of every item with a truthy 'has_figure' is appended to the
    module-level ``files_ref`` list; items without that key count as having
    no figure.
    """
    print("Number of items in dict_list:", len(dict_list))

    folder = Path(folder_path)
    folder.mkdir(parents=True, exist_ok=True)
    print("Saving files into folder:", folder.resolve())

    for i, item in enumerate(dict_list, start=1):
        text = item.get("text", "")
        ref = item.get("references", "")

        file_path = folder / f"{base_filename}_{i}.txt"

        if item.get("has_figure"):
            files_ref.append(i)

        with open(file_path, "w", encoding="utf-8") as f:
            f.write(str(text) + "\n\n References: \n" + str(ref))

# Export every dataset entry as paper_<n>.txt and report which ones contain figures.
save_to_txt(
    total_text,
    "../data/to_annotate",
    "paper",
)
print(f"These files have figures in them: {files_ref}")

Number of items in dict_list: 100
Saving files into folder: /mnt/c/Users/imana/Desktop/Masters/Masters_Thesis/codebase/data/to_annotate
These files have figures in them: [3, 35, 50, 55, 56, 73, 74, 81, 86, 89]
