In [17]:
import os
import json
import re
from pathlib import Path
from typing import Dict, Any, List, Iterable
from tqdm.auto import tqdm

In [18]:
# Mount Google Drive at /content/drive; the data paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
# Numeric bracket citations: "[1]" or a comma-separated list such as "[1, 2, 3]".
# Group 1 captures the text between the brackets.
REF_PATTERN = re.compile(r"\[(\d+(?:\s*,\s*\d+)*)\]")
# Parenthesised text (no nested parentheses) that contains four consecutive digits;
# group 1 is the content between the parentheses, e.g. "Smith, 2020; Lee, 2021".
PAREN_CITATION_PATTERN = re.compile(r"\(([^()]*\d{4}[^()]*)\)")
# Within one citation: group 1 is the lazily matched text before a comma, group 2 is
# four digits plus an optional lowercase letter, e.g. "2013" or "2013a".
AUTHOR_YEAR_PATTERN = re.compile(r"(.+?),\s*(\d{4}[a-z]?)")

In [20]:
def extract_arxiv_info(paper_id: str, link_map: Dict[str, str]):
    """
    Look up a paper's URL and derive a publication year from the ID embedded in it.

    The lookup key is ``f"{paper_id}.pdf"`` (paper IDs look like
    "Artificial Intelligence_1"). When the URL contains a segment such as
    ".../2510.02276", the first two digits of the four-digit prefix are read as a
    two-digit year and prefixed with "20" ("2510" -> 2025).

    Returns:
        A ``(url, year)`` tuple. ``url`` is whatever ``link_map`` holds for the key
        (None when the key is absent); ``year`` is None when there is no URL or the
        URL has no such segment.
    """
    url = link_map.get(f"{paper_id}.pdf")
    if not url:
        return url, None

    id_match = re.search(r"/(\d{4})\.\d+", url)
    if id_match is None:
        return url, None

    # Only the "YY" half of the "YYMM" prefix is needed for the year.
    two_digit_year = id_match.group(1)[:2]
    try:
        return url, int("20" + two_digit_year)
    except ValueError:
        return url, None

In [21]:
def extract_numeric_reference_ids(text: str) -> List[int]:
  """
  Collect the numeric reference IDs cited in bracket form, e.g. "[1,2,3]".

  Example: "MAS [1,2,3] ... [4, 5]" -> [1, 2, 3, 4, 5]

  Returns the distinct IDs in ascending order.
  """
  unique_ids = {
      int(token.strip())
      for bracket_body in REF_PATTERN.findall(text)
      for token in bracket_body.split(",")
      if token.strip().isdigit()
  }
  return sorted(unique_ids)

In [22]:
def extract_author_year_citations(text: str) -> List[Dict[str, Any]]:
  """
  Extract author-year citations such as
  "(Rosenberg & Van Hout, 2013; Stracina et al., 2022)".

  Every parenthesised group matched by PAREN_CITATION_PATTERN is split on ";" and
  each non-empty piece is searched with AUTHOR_YEAR_PATTERN; pieces without a match
  are dropped.

  Example output:
  [
      {"raw": "Rosenberg & Van Hout, 2013", "authors": "Rosenberg & Van Hout", "year": "2013"},
      {"raw": "Stracina et al., 2022", "authors": "Stracina et al.", "year": "2022"},
      ...
  ]
  """
  found: List[Dict[str, Any]] = []

  for group_text in PAREN_CITATION_PATTERN.findall(text):
    for piece in (raw_piece.strip() for raw_piece in group_text.split(";")):
      if not piece:
        continue

      hit = AUTHOR_YEAR_PATTERN.search(piece)
      if hit is None:
        continue

      author_part, year_part = hit.groups()
      found.append({
          "raw": piece,
          "authors": author_part.strip(),
          "year": year_part.strip(),
      })

  return found

In [23]:
def iter_sections(
    section: Dict[str, Any],
    section_path: List[str]
) -> Iterable[Dict[str, Any]]:

  """
  Walk a section and all of its nested subsections, yielding one record per paragraph.

  Args:
    section: {"title": str, "paragraphs": [str], "subsections": [...]}. Any of the
      three keys may be missing or explicitly null; both are treated as empty.
    section_path: Titles of the ancestor sections (empty list for a top-level section).

  Yields:
    {"section_path": [...titles down to this section...],
     "para_index": index of the paragraph within its own (sub)section,
     "text": the paragraph exactly as stored}
    A section's own paragraphs are yielded before those of its subsections.
  """

  # `.get(key, default)` only applies the default when the key is absent, so an
  # explicit JSON null would reach `.strip()` / `enumerate()` and raise.
  title = (section.get("title") or "").strip()
  current_path = section_path + [title]

  for i, para in enumerate(section.get("paragraphs") or []):
    yield {
        "section_path": current_path,
        "para_index": i,
        "text": para
    }

  for subsection in section.get("subsections") or []:
    yield from iter_sections(subsection, current_path)

In [24]:
def resolve_inline_reference(
        authors: str,
        year: str,
        reference_map: Dict[int, str]
):

    """
    Resolve an inline (authors, year) citation to an entry of ``reference_map``.

    Matching is case-insensitive and done in two passes:

    1. Exact pass: the whole ``authors`` string and ``year`` must both occur in the
       reference text.
    2. Surname pass (only when pass 1 finds nothing): a leading "e.g.," / "i.e.," /
       "cf." and any "et al." are removed, the rest is split on "&", ",", ";" and
       the word "and", and every remaining name plus ``year`` must occur in the
       reference text. This lets "Stracina et al." or "Rosenberg & Van Hout"
       match a bibliography entry that spells out initials and co-authors.

    Args:
        authors: Author part of the citation, e.g. "Stracina et al.".
        year: Year part of the citation, e.g. "2022" or "2013a".
        reference_map: Reference ID -> full reference text.

    Returns:
        {"id": reference_id, "text": reference_text} for the first matching entry
        in ``reference_map`` iteration order, or None when nothing matches or when
        ``authors`` / ``year`` is blank.
    """

    authors_l = authors.lower()
    year_l = year.lower()

    # An empty needle is a substring of every text, which would "match" any reference.
    if not authors_l.strip() or not year_l.strip():
        return None

    # Lower-case each reference once; both passes reuse the result.
    lowered = [(rid, full_text, full_text.lower()) for rid, full_text in reference_map.items()]

    # Pass 1: the citation's author string appears verbatim in the reference.
    for rid, full_text, ft_l in lowered:
        if authors_l in ft_l and year_l in ft_l:
            return {"id": rid, "text": full_text}

    # Pass 2: compare individual author names instead of the whole string.
    cleaned = re.sub(r"^\s*(?:e\.g\.|i\.e\.|cf\.)[\s,]*", "", authors_l)
    cleaned = re.sub(r"\bet\s+al\.?", " ", cleaned)
    names = [name.strip() for name in re.split(r"&|,|;|\band\b", cleaned)]
    names = [name for name in names if name]
    if not names:
        return None

    for rid, full_text, ft_l in lowered:
        if year_l in ft_l and all(name in ft_l for name in names):
            return {"id": rid, "text": full_text}

    return None


In [25]:
def _resolve_chunk_references(text: str, reference_map: Dict[int, str]):
  """
  Extract the citations found in ``text`` and resolve them against ``reference_map``.

  Returns a tuple ``(numeric_ref_ids, author_year_cits, resolved_references)`` where
  ``resolved_references`` lists the numerically cited references first, followed by
  the author-year matches that are not already present.
  """
  numeric_ref_ids = extract_numeric_reference_ids(text)
  author_year_cits = extract_author_year_citations(text)

  resolved_numeric = [
      {"id": rid, "text": reference_map[rid]}
      for rid in numeric_ref_ids
      if rid in reference_map
  ]

  resolved_inline: List[Dict[str, Any]] = []
  for cit in author_year_cits:
    ref_match = resolve_inline_reference(cit["authors"], cit["year"], reference_map)
    if ref_match and ref_match not in resolved_numeric and ref_match not in resolved_inline:
      resolved_inline.append(ref_match)

  return numeric_ref_ids, author_year_cits, resolved_numeric + resolved_inline


def process_single_paper(json_path: Path, link_map: Dict[str, str]) -> List[Dict[str, Any]]:
  """
  Open one paper JSON file and turn the abstract and every paragraph into a chunk.

  For each chunk:
  - numeric citations such as [1,2,3] are stored in ``reference_ids``;
  - author-year citations such as (Rosenberg & Van Hout, 2013; Stracina et al., 2022)
    are stored in ``inline_citations``;
  - the bibliography entries those citations resolve to are stored in ``references``
    (only the references used in that paragraph).

  Chunk IDs are ``{paper_id}_abstract_p0`` for the abstract and
  ``{paper_id}__sec{S}_p{N}`` for body paragraphs, where ``S`` is the index of the
  top-level section and ``N`` is the paragraph's position in the flattened traversal
  of that section. ``para_index`` restarts at 0 in every subsection, so using it for
  ``N`` gave different paragraphs of one top-level section the same ID. Paragraphs
  directly under a top-level section keep the ID they had before.

  Args:
    json_path: Path of the paper JSON file; its stem is used as ``paper_id``.
    link_map: Mapping passed to ``extract_arxiv_info`` to obtain the URL and a
      fallback year.

  Returns:
    The chunk dictionaries in document order. Blank paragraphs are skipped.
  """

  with json_path.open("r", encoding="utf-8") as f:
    paper = json.load(f)

  # `or ""` also covers an explicit null title, mirroring the abstract handling below.
  title = (paper.get("title") or "").strip()
  authors = paper.get("authors", [])

  paper_id = json_path.stem

  pdf_url, extracted_year = extract_arxiv_info(paper_id, link_map)
  year = paper.get("year")
  if year is None:
    year = extracted_year

  venue = paper.get("venue")

  reference_map: Dict[int, str] = {}
  for ref in paper.get("references") or []:
    try:
      rid = ref.get("id")
      rtext = ref.get("text")
      if rid is not None and rtext is not None:
        reference_map[int(rid)] = rtext
    except (AttributeError, TypeError, ValueError, OverflowError):
      # Best effort: skip entries that are not dicts or whose id is not an integer.
      continue

  def make_chunk(chunk_id, section_title, section_path, para_index, text):
    # Single place that defines the chunk schema for abstract and body paragraphs.
    numeric_ref_ids, author_year_cits, resolved_references = _resolve_chunk_references(
        text, reference_map
    )
    return {
        "chunk_id": chunk_id,
        "paper_id": paper_id,
        "title": title,
        "section_title": section_title,
        "section_path": section_path,
        "para_index": para_index,
        "text": text,
        "reference_ids": numeric_ref_ids,
        "inline_citations": author_year_cits,
        "year": year,
        "venue": venue,
        "url": pdf_url,
        "authors": authors,
        "references": resolved_references,
    }

  chunks: List[Dict[str, Any]] = []

  abstract_text = (paper.get("abstract") or "").strip()
  if abstract_text:
    chunks.append(
        make_chunk(f"{paper_id}_abstract_p0", "Abstract", ["Abstract"], 0, abstract_text)
    )

  for sec_idx, sec in enumerate(paper.get("sections") or []):
    # flat_idx counts every paragraph yielded for this top-level section (blank ones
    # included), so it is unique within the section and equals para_index for the
    # section's own paragraphs, which are yielded first.
    for flat_idx, para_info in enumerate(iter_sections(sec, [])):
      text = (para_info["text"] or "").strip()
      if not text:
        continue

      section_path = para_info["section_path"]
      section_title = section_path[-1] if section_path else ""

      chunks.append(
          make_chunk(
              f"{paper_id}__sec{sec_idx}_p{flat_idx}",
              section_title,
              section_path,
              para_info["para_index"],
              text,
          )
      )

  return chunks

In [26]:
def build_all_chunks(data_dir: str, links_path: str, output_path: str) -> None:
  """
  Convert every ``*.json`` paper directly under ``data_dir`` into chunks and write
  them all to one JSONL file at ``output_path`` (one chunk per line).

  The parent directory of ``output_path`` is created if needed and the link mapping
  is loaded from ``links_path``. Papers are processed in sorted filename order; a
  paper that raises is reported with a printed message and the run continues.
  """

  source_dir = Path(data_dir)
  target_file = Path(output_path)
  target_file.parent.mkdir(parents=True, exist_ok=True)

  print(f"Loading link mapping from: {links_path}")
  with open(links_path, "r", encoding="utf-8") as links_f:
    link_map = json.load(links_f)

  paper_files = sorted(source_dir.glob("*.json"))

  with target_file.open("w", encoding="utf-8") as sink:
    for paper_file in tqdm(paper_files, desc="Processing papers"):
      try:
        # All chunks of a paper are built before any of them is written.
        paper_chunks = process_single_paper(paper_file, link_map)
        for record in paper_chunks:
          sink.write(json.dumps(record, ensure_ascii=False) + "\n")
      except Exception as err:
        print(f"Error in {paper_file}: {err}")

In [27]:
# Build the chunk file: read every paper JSON in data_dir, use the link mapping for
# URLs / fallback years, and write all chunks to a single JSONL file.
build_all_chunks(
    data_dir="/content/drive/MyDrive/NLP/codes/data/jsons",
    links_path="/content/drive/MyDrive/NLP/codes/data/pdf_links_matching.json",
    output_path="/content/drive/MyDrive/NLP/codes/data/chunks/chunks.jsonl"
  )

Loading link mapping from: /content/drive/MyDrive/NLP/codes/data/pdf_links_matching.json


Processing papers:   0%|          | 0/7461 [00:00<?, ?it/s]

In [7]:
# -type f: match regular files only (directories are not counted)
# wc -l: count lines, i.e. the number of files found
!find "/content/drive/MyDrive/NLP/codes/data/jsons" -type f | wc -l

7461


In [4]:
# 1. First, extract the zip archive onto Colab's temporary disk
# -q: quiet mode (keeps the output from flooding the screen)
# -o: overwrite existing files if needed (this is a scratch area, so that is fine)
!unzip -q -o "/content/drive/MyDrive/NLP/jsons_devam.zip" -d "/content/gecici_yukleme_alani"

print("Zip açma işlemi tamamlandı. Şimdi Drive'a aktarılıyor...")

Zip açma işlemi tamamlandı. Şimdi Drive'a aktarılıyor...


In [5]:
# --ignore-existing: THE MOST IMPORTANT PART. A file that already exists on Drive is skipped instead of being added as a "(1)" duplicate.
# -r: recurse into directories
# -v: verbose; list the files being transferred
!rsync -rv --ignore-existing "/content/gecici_yukleme_alani/" "/content/drive/MyDrive/NLP/codes/data/jsons/"

sending incremental file list
Human-Computer Interaction_100.json
Human-Computer Interaction_101.json
Human-Computer Interaction_102.json
Human-Computer Interaction_103.json
Human-Computer Interaction_104.json
Human-Computer Interaction_105.json
Human-Computer Interaction_106.json
Human-Computer Interaction_107.json
Human-Computer Interaction_108.json
Human-Computer Interaction_109.json
Human-Computer Interaction_110.json
Human-Computer Interaction_111.json
Human-Computer Interaction_112.json
Human-Computer Interaction_113.json
Human-Computer Interaction_114.json
Human-Computer Interaction_115.json
Human-Computer Interaction_116.json
Human-Computer Interaction_117.json
Human-Computer Interaction_118.json
Human-Computer Interaction_119.json
Human-Computer Interaction_120.json
Human-Computer Interaction_121.json
Human-Computer Interaction_122.json
Human-Computer Interaction_123.json
Human-Computer Interaction_124.json
Human-Computer Interaction_125.json
Human-Computer Interaction_126.jso

In [16]:
# Count the lines of the link-mapping JSON file (wc -l counts lines, not JSON entries).
!wc -l "/content/drive/MyDrive/NLP/codes/data/pdf_links_matching.json"

7473 /content/drive/MyDrive/NLP/codes/data/pdf_links_matching.json
