In [None]:
# ╔════════════════════════════════════════════════════════════════════╗
# ║  Cell 1 – Extract raw text from ATOP1005.pdf                       ║
# ╚════════════════════════════════════════════════════════════════════╝


# filename: extract_ATOP1005_text.ipynb  (inside the notebook itself)


# 1. Install the helper once (has no effect if already installed)
import sys, subprocess, json, pathlib, textwrap, importlib.util

def _ensure(package: str):
    """Pip-install `package` into the running interpreter unless it is already importable.

    `package` is used both as the module name looked up with
    ``importlib.util.find_spec`` and as the name handed to ``pip install``.
    """
    already_importable = importlib.util.find_spec(package) is not None
    if already_importable:
        return
    print(f"Installing {package} …")
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)

_ensure("pdfplumber")

# 2. Set up paths  ── adapt RELATIVE_PDF if your notebook sits elsewhere
RELATIVE_PDF = r"atop_version_5.1_codesheets\ATOP Version 5.1 Codesheets\ATOP1005.pdf"

nb_dir   = pathlib.Path.cwd()          # kernel working directory (normally the folder that holds the .ipynb)
# RELATIVE_PDF is written with Windows "\" separators.  PureWindowsPath splits on
# both "\" and "/", so joining its parts gives the same path on Windows and a
# correctly nested path on macOS/Linux (where "\" is not a separator).
pdf_path = nb_dir.joinpath(*pathlib.PureWindowsPath(RELATIVE_PDF).parts)
txt_path = pdf_path.with_suffix(".txt")   # e.g.  ATOP1005.txt  in same folder as PDF

if not pdf_path.exists():
    raise FileNotFoundError(
        f"PDF not found at {pdf_path}\n"
        "→ Double-check the relative path from the notebook’s location."
    )

# 3. Extract every page’s text
import pdfplumber, re

with pdfplumber.open(pdf_path) as pdf:
    raw_text = "\n".join(
        page.extract_text() or ""   # keeps order; returns '' if page is image-only
        for page in pdf.pages
    )

# Optional: squeeze every run of 2+ newlines down to exactly one blank line
raw_text_clean = re.sub(r"\n{2,}", "\n\n", raw_text)

# 4. Save to a .txt file for manual inspection
txt_path.write_text(raw_text_clean, encoding="utf-8")
# NOTE: the count below is for the raw text, i.e. before newline squeezing
print(f"✓ Extracted {len(raw_text):,} characters from {pdf_path.name}")
print(f"  Dump saved to: {txt_path.relative_to(nb_dir)}")

# 5. Quick preview (first `preview_lines` lines) so you can see what you have
preview_lines = 300
print("\n" + "="*60 + f"\nPreview (first {preview_lines} lines):\n" + "="*60)
print("\n".join(raw_text_clean.splitlines()[:preview_lines]))


✓ Extracted 14,057 characters from ATOP1005.pdf
  Dump saved to: atop_version_5.1_codesheets\ATOP Version 5.1 Codesheets\ATOP1005.txt

Preview (first 300 lines):
ALLIANCE TREATY OBLIGATIONS AND PROVISIONS (ATOP)
CODING SHEET
Brett Ashley Leeds with Jeffrey M. Ritter, Sara McLaughlin Mitchell, and Andrew G. Long
Department of Political Science, Rice University, P.O. Box 1892-MS 24, Houston, TX 77251-1892
Version 8.0; August, 2000
SECTION I: FORMATION, MEMBERS, AND DURATION
1. ATOP ID: # 1005.
2. COW Alliance # (if applicable): N/A.
3. Signature date(s) (may vary for members in multilateral alliances):
January 3, 1815.
See # 4 below for accession dates.
4. Members of alliance (use COW country codes):
200 United Kingdom
220 France
300 Austria
245 Bavaria (January 13, 1815)
240 Hanover (January 19, 1815)
210 Netherlands (January 31, 1815).
5. Was the alliance signed during war? If so, which war, and which member(s) was at war? (check COW data)
No.
6. Means by which alliance was established

In [43]:
import re
import pandas as pd
from pathlib import Path
import sys
import subprocess
import importlib.util

# Built from path components rather than a "\"-separated string literal:
#   * the old non-raw literal contained the invalid escape "\A" (SyntaxWarning on Python 3.12+);
#   * hard-coded "\" separators only resolve on Windows.
# On Windows the resulting string is identical to the previous value.
GLOBAL_SINGLE_PDF = str(
    Path("atop_version_5.1_codesheets") / "ATOP Version 5.1 Codesheets" / "ATOP1415.1.pdf"
)

# Ensure required packages are installed
def ensure_package(package: str, import_name: str = None):
    """Install `package` with pip unless its module can already be imported.

    Parameters
    ----------
    package : str
        Name passed to ``pip install`` (the distribution name).
    import_name : str, optional
        Module name to probe with ``importlib.util.find_spec``.  Needed when
        the distribution and module names differ; when omitted, a small table
        of known mismatches is consulted and otherwise `package` itself is used.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip invocation exits with a non-zero status.
    """
    # Distribution names whose importable module is named differently.
    # Without this, find_spec("python-dateutil") is always None and pip would
    # be re-run on every execution of the cell.
    known_module_names = {"python-dateutil": "dateutil"}
    module_name = import_name or known_module_names.get(package, package)

    if importlib.util.find_spec(module_name) is None:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        # Let the import system notice the freshly installed files
        importlib.invalidate_caches()

# Make sure the third-party dependencies are present (names as given to pip),
# then import the date parser used by parse_single_date
for _required_package in ("pdfplumber", "pandas", "python-dateutil"):
    ensure_package(_required_package)
from dateutil.parser import parse as _dt_parse



import pdfplumber

# Lookup table for the ATOP coding-sheet questions, keyed "Q1" … "Q61".
# Each entry holds:
#   "short"         – compact identifier (emitted as Question_Short by fast_extract_answers)
#   "full"          – descriptive wording (emitted as Question_Full)
#   "question_text" – the text searched for in the extracted PDF text.  Matching is
#                     whitespace-insensitive (fast_extract_answers flattens all
#                     whitespace), so the "\n" escapes below only mirror line wrapping.
QUESTIONS = {
    "Q1": {
        "short": "ATOP_ID",
        "full": "ATOP ID",
        "question_text": "1. ATOP ID: "
    },
    "Q2": {
        "short": "COW_Alliance_Number",
        "full": "COW Alliance # (if applicable)",
        "question_text": "2. COW Alliance # (if applicable):"
    },
    "Q3": {
        "short": "Signature_Date",
        "full": "Signature date(s) (may vary for members in multilateral alliances)",
        "question_text": "3. Signature date(s) (may vary for members in multilateral alliances):"
    },
    "Q4": {
        "short": "Members",
        "full": "Members of alliance (use COW country codes)",
        "question_text": "4. Members of alliance (use COW country codes):"
    },
    "Q5": {
        "short": "Signed_During_War",
        "full": "Was the alliance signed during war? If so, which war, and which member(s) was at war?",
        "question_text": "5. Was the alliance signed during war? If so, which war, and which member(s) was at war? (check COW data)"
    },
    "Q6": {
    "short": "Establishment_Means",
    "full": (
        "Means by which alliance was established (e.g., treaty, executive agreement, "
        "diplomatic notes, etc.): (Only select \"treaty\" if the\nagreement requires "
        "ratification.)"
    ),
    # NOTE: question_text keeps an explicit \n before the word "agreement" and
    # uses curly quotes around the word treaty (unlike "full" above, which uses straight quotes)
    "question_text": (
        "6. Means by which alliance was established (e.g., treaty, executive agreement, "
        "diplomatic notes, etc.): (Only select “treaty” if the\nagreement requires "
        "ratification.)"
    )
    },
    "Q7": {
        "short": "Public_Secret_Status",
        "full": "Was the treaty public, public but with secret articles, or secret?",
        "question_text": "7. Was the treaty public, public but with secret articles, or secret? If secret articles, describe the content of secret articles."
    },
    "Q8": {
        "short": "Additional_Members_Provisions",
        "full": "Are there provisions for adding additional members mentioned in the treaty?",
        "question_text": "8. Are there provisions for adding additional members mentioned in the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q9": {
        "short": "Specific_Future_Members",
        "full": "Are specific state(s) mentioned as possible future members of the alliance?",
        "question_text": "9. Are specific state(s) mentioned as possible future members of the alliance? (Yes, No) If Yes, which state(s)?"
    },
    "Q10": {
        "short": "Treaty_Duration",
        "full": "Are there specific provisions in the treaty concerning the length of time the treaty is to last?",
        "question_text": "10. Are there specific provisions in the treaty concerning the length of time the treaty is to last? (Yes, No) If Yes, describe."
    },
    "Q11": {
        "short": "Renounce_Conditions",
        "full": "Are there conditions under which members may renounce the treaty?",
        "question_text": "11. Are there conditions under which members may renounce the treaty? If Yes, describe these conditions."
    },
    "Q12": {
        "short": "Renewal_Provisions",
        "full": "Are there specific provisions in the treaty concerning renewal of the treaty?",
        "question_text": "12. Are there specific provisions in the treaty concerning renewal of the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q13": {
        "short": "Times_Renewed",
        "full": "Number of times alliance renewed, dates of renewal, and means by which alliance was renewed",
        "question_text": "13. Number of times alliance renewed, dates of renewal, and means by which alliance was renewed (Do not include automatic\nrenewals):"
    },
    "Q14": {
        "short": "Termination_Date",
        "full": "Termination date(s) and source(s) for termination dates(s)",
        "question_text": "14. Termination date(s) (may vary for members in multilateral alliances) and source(s) for termination dates(s):"
    },
    "Q15": {
        "short": "Termination_Cause",
        "full": "Describe what caused the alliance to terminate",
        "question_text": "15. Describe what caused the alliance to terminate."
    },
    "Q16": {
        "short": "Alliance_Type",
        "full": "Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact)",
        "question_text": "16. Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact): (List all that apply to any\nmember of the alliance.)"
    },
    "Q17": {
        "short": "COW_Alliance_Type",
        "full": "Type of alliance as coded in COW data (if applicable)",
        "question_text": "17. Type of alliance as coded in COW data (if applicable):"
    },
    "Q18": {
        "short": "Obligations",
        "full": "Describe the obligations of the alliance partners",
        "question_text": "18. Describe the obligations of the alliance partners."
    },
    "Q19": {
        "short": "Contingencies",
        "full": "Are any of the obligations contingent upon any of the following",
        "question_text": "19. Are any of the obligations contingent upon any of the following: specific adversary, specific location, specific ongoing conflict,\nnumber of adversaries, noncompliance with a specific demand, attack, nonprovocation of enemy, or prior agreement among partners?\n(List all that apply)"
    },
    "Q20": {
        "short": "Limits_Description",
        "full": "Describe the nature of the limits to obligations listed in #19",
        "question_text": "20. Describe the nature of the limits to obligations listed in #19."
    },
    "Q21": {
        "short": "Additional_War_Provisions",
        "full": "Are there any additional provisions for assistance in the event of war?",
        "question_text": "21. Are there any additional provisions for assistance in the event of war? (e.g., promise not to participate in economic sanctions against\npartner, promise not to aid internal or external enemies, etc.) (Yes, No) If Yes, describe."
    },
    "Q22": {
        "short": "Additional_Limits",
        "full": "Are there any additional limits to the alliance obligations or conditions",
        "question_text": "22. Are there any additional limits to the alliance obligations or conditions under which treaty obligations do not apply that were not\nlisted in #19? (Yes, No) If Yes, describe."
    },
    "Q23": {
        "short": "Target_Threat",
        "full": "Is a specific target/threat mentioned as the object of the treaty?",
        "question_text": "23. Is a specific target/threat mentioned as the object of the treaty? (Yes, No) (Answering Yes to this question does not necessarily mean\nthat obligations are limited only to this adversary.) If Yes, list the target nation(s) (using COW country codes if possible) and describe\nthe general nature of the reference to the target of the treaty."
    },
    "Q24": {
        "short": "Symmetric_Obligations",
        "full": "Are the treaty obligations symmetric?",
        "question_text": "24. Are the treaty obligations symmetric? (That is, do all members commit to the same obligations?) (Yes, No) If No, describe."
    },
    "Q25": {
        "short": "No_Separate_Peace",
        "full": "Does the treaty prohibit members from settling conflicts independently (no separate peace)?",
        "question_text": "25. Does the treaty prohibit members from settling conflicts independently (no separate peace)? (Yes, No) If Yes, describe."
    },
    "Q26": {
        "short": "Consult_Third_Parties",
        "full": "Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties",
        "question_text": "26. Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties (excluding\nno separate peace provisions discussed above)? (Yes, No) If Yes, describe these provisions."
    },
    "Q27": {
        "short": "No_Contrary_Alliances",
        "full": "Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance",
        "question_text": "27. Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance in\nquestion? (Yes, No) If Yes, describe."
    },
    "Q28": {
        "short": "Additional_Obligations_Comments",
        "full": "Additional coder comments regarding treaty obligations",
        "question_text": "28. Additional coder comments regarding treaty obligations:"
    },
    "Q29": {
        "short": "Internal_Conflict_Reference",
        "full": "Does the treaty make reference to the potential for conflict among members of the alliance?",
        "question_text": "29. Does the treaty make reference to the potential for conflict among members of the alliance? (Yes, No) If Yes, describe."
    },
    "Q30": {
        "short": "Dispute_Settlement",
        "full": "Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories?",
        "question_text": "30. Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories? (Yes, No) If Yes, describe."
    },
    "Q31": {
        "short": "Military_Contact",
        "full": "Does the treaty require official contact among the military forces of the participating states?",
        "question_text": "31. Does the treaty require official contact among the military forces of the participating states? (Yes, No) If Yes, describe. (Note\nwhether official contact is required only in the event of hostilities or also during peacetime.)"
    },
    "Q32": {
        "short": "Military_Aid",
        "full": "Does the treaty include provisions for military aid?",
        "question_text": "32. Does the treaty include provisions for military aid? (e.g., grants, loans, transfer of technology, training) (Yes, No) If Yes, describe\nthese provisions."
    },
    "Q33": {
        "short": "Integrated_Command",
        "full": "Does the treaty provide for integrated command of military forces while the alliance is in effect?",
        "question_text": "33. Does the treaty provide for integrated command of military forces while the alliance is in effect (peacetime as well as wartime)?\n(Yes, No) If Yes, describe."
    },
    "Q34": {
        "short": "Force_Subordination",
        "full": "Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities?",
        "question_text": "34. Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities? (Yes, No) If\nYes, describe."
    },
    "Q35": {
        "short": "Organizations_Established",
        "full": "Does the treaty establish any organizations?",
        "question_text": "35. Does the treaty establish any organizations? (include provisions for regularly scheduled meetings) (Yes, No) If Yes, describe."
    },
    "Q36": {
        "short": "Joint_Bases_Troops",
        "full": "Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?",
        "question_text": "36. Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?\n(Yes, No) If Yes, describe."
    },
    "Q37": {
        "short": "Contribution_Levels",
        "full": "Does the treaty specify contribution levels (funds, troops, etc.)?",
        "question_text": "37. Does the treaty specify contribution levels (funds, troops, etc.)? (Yes, No) If Yes, describe."
    },
    "Q38": {
        "short": "Armament_Provisions",
        "full": "Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of weapons, or rules of warfare?",
        "question_text": "38. Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of\nweapons, or rules of warfare? (Yes, No) If Yes, describe such provisions."
    },
    "Q39": {
        "short": "Territory_Acquisition",
        "full": "Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally?",
        "question_text": "39. Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally? (Yes, No) If Yes, describe\nreference."
    },
    "Q40": {
        "short": "Division_of_Gains",
        "full": "Does the treaty include discussion of the division of gains from any future conflict?",
        "question_text": "40. Does the treaty include discussion of the division of gains from any future conflict? (Yes, No) If Yes, describe."
    },
    "Q41": {
        "short": "Demobilization_Withdrawal",
        "full": "Does the treaty include discussion of demobilization/withdrawal following conflict?",
        "question_text": "41. Does the treaty include discussion of demobilization/withdrawal following conflict? (Yes, No) If Yes, describe."
    },
    "Q42": {
        "short": "Other_Coordination",
        "full": "Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and policies?",
        "question_text": "42. Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and\npolicies? (Yes, No) If Yes, describe."
    },
    "Q43": {
        "short": "Additional_Institutionalization_Comments",
        "full": "Additional coder comments regarding references to alliance institutionalization",
        "question_text": "43. Additional coder comments regarding references to alliance institutionalization:"
    },
    "Q44": {
        "short": "Nullifies_Treaties",
        "full": "Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "44. Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are nullified and describe the treaty reference."
    },
    "Q45": {
        "short": "Modifies_Treaties",
        "full": "Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "45. Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are modified and describe the treaty reference."
    },
    "Q46": {
        "short": "Includes_Treaties",
        "full": "Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties?",
        "question_text": "46. Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are included and describe the treaty reference."
    },
    "Q47": {
        "short": "International_Organizations",
        "full": "Does the treaty make any references to international organizations?",
        "question_text": "47. Does the treaty make any references to international organizations? (Yes, No) If Yes, list the international organization and describe\nthe nature of the reference."
    },
    "Q48": {
        "short": "Other_Alliances",
        "full": "Does the treaty make any references to other existing alliances?",
        "question_text": "48. Does the treaty make any references to other existing alliances? (Yes, No) If Yes, list the alliance and describe the nature of the\nreference."
    },
    "Q49": {
        "short": "Companion_Agreements",
        "full": "Are there any companion agreements referenced in the treaty?",
        "question_text": "49. Are there any companion agreements referenced in the treaty? (Yes, No) If Yes, describe the agreements."
    },
    "Q50": {
        "short": "Non_Military_Cooperation",
        "full": "Does the treaty include statements regarding non-military cooperation?",
        "question_text": "50. Does the treaty include statements regarding non-military cooperation (e.g., economic, cultural, scientific exchange)? (Yes, No) If\nYes, describe."
    },
    "Q51": {
        "short": "Resolves_Conflicts",
        "full": "Does the treaty resolve other conflicts among the parties to the alliance?",
        "question_text": "51. Does the treaty resolve other conflicts among the parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q52": {
        "short": "Enforces_External_Settlement",
        "full": "Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance?",
        "question_text": "52. Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q53": {
        "short": "Proposes_Agreements",
        "full": "Does the treaty propose other agreements among the contracting parties?",
        "question_text": "53. Does the treaty propose other agreements among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q54": {
        "short": "Unresolved_Conflicts",
        "full": "Does the treaty mention unresolved conflicts among the contracting parties?",
        "question_text": "54. Does the treaty mention unresolved conflicts among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q55": {
        "short": "Economic_Aid",
        "full": "Does the treaty include provisions for economic aid or other enticements?",
        "question_text": "55. Does the treaty include provisions for economic aid or other enticements (include trade concessions, post war recovery, etc.)? (Yes,\nNo) If Yes, describe these provisions."
    },
    "Q56": {
        "short": "Internal_Politics_Intervention",
        "full": "Does the treaty describe circumstances under which one party may intervene in the internal politics of another party?",
        "question_text": "56. Does the treaty describe circumstances under which one party may intervene in the internal politics of another party or specifically\ncommit the states to non-intervention? (Yes, No) If Yes, describe."
    },
    "Q57": {
        "short": "Additional_Comments",
        "full": "Additional coder comments on this alliance treaty",
        "question_text": "57. Additional coder comments on this alliance treaty:"
    },
    "Q58": {
        "short": "Source_of_Coding",
        "full": "Source of coding information",
        "question_text": "58. Source of coding information (e.g., treaty, treaty and secondary sources, secondary sources only, etc.):"
    },
    "Q59": {
        "short": "Treaty_Citation",
        "full": "Treaty citation",
        "question_text": "59. Treaty citation:"
    },
    "Q60": {
        "short": "Last_Revision_Date",
        "full": "Date of last revision of this coding sheet",
        "question_text": "60. Date of last revision of this coding sheet:"
    },
    "Q61": {
        "short": "Coder",
        "full": "Coder",
        "question_text": "61. Coder:"
    }
}

def parse_single_date(text):
    """
    Return ISO date ('YYYY-MM-DD') if `text` contains a complete date
    (year, month and day). Otherwise return None.

    The parser fills any component missing from the text (e.g. the month and
    day of a bare "1815") from its `default` datetime, which is *today* when
    no default is given.  That would fabricate a date that changes with the
    day the notebook is run, so the text is parsed against two different
    defaults and accepted only when both give the same calendar date.

    Non-string or empty input returns None.
    """
    from datetime import datetime   # local import: keeps the block self-contained

    if not isinstance(text, str):
        return None

    clean = text.strip().rstrip('.')        # drop trailing period
    if not clean:
        return None

    try:
        # fuzzy=True tolerates surrounding words, e.g. 'signed March 9, 1833'
        first = _dt_parse(clean, fuzzy=True, default=datetime(2000, 1, 1))
        second = _dt_parse(clean, fuzzy=True, default=datetime(2001, 2, 2))
    except (ValueError, OverflowError, TypeError):
        return None

    # A differing result means year, month or day came from the default
    if first.date() != second.date():
        return None
    return first.date().isoformat()

# Code first, then name, e.g. "365 Russia 200 United Kingdom."
_CODE_THEN_NAME_RE = re.compile(
    r'\b(\d{3})\s+([A-Z][A-Za-z.\- ]+?)'   # 3-digit code + country name (lazy)
    r'(?='                                 # the name ends right before:
      r'(?:\s+\d{3}\s)'                    #   – another code (e.g. " 200 ")
      r'|[.,;\n]'                          #   – punctuation: . , ; or newline
      r'|\s*\('                            #   – a parenthetical, e.g. " (January 13, 1815)"
      r'|$'                                #   – end-of-string
    r')'
)

# Name first, then code in parentheses, e.g. "Russia (365)"
_NAME_THEN_CODE_RE = re.compile(r'\b([A-Z][A-Za-z.\- ]+?)\s*\(\s*(\d{3})\s*\)')


def extract_countries(text):
    """
    Return a semicolon-separated string of CountryName(code) pairs found in `text`.

    Two layouts are recognised and ALL occurrences are collected:
      * code first  – '365 Russia 200 United Kingdom.'; the name may be followed
        by a parenthetical such as '245 Bavaria (January 13, 1815)'
      * name first  – 'Russia (365)'

    Code-first pairs come first, then name-first pairs, each in order of
    appearance; duplicates are dropped.  Returns '' when nothing matches or
    when `text` is empty/None.
    """
    if not text:
        return ""

    found = [f"{name.strip()}({code})" for code, name in _CODE_THEN_NAME_RE.findall(text)]
    found += [f"{name.strip()}({code})" for name, code in _NAME_THEN_CODE_RE.findall(text)]

    # dict.fromkeys de-duplicates while keeping first-seen order
    return ";".join(dict.fromkeys(found))





def extract_text_from_pdf(pdf_path):
    """Return the text of every page of `pdf_path`, joined with newlines.

    Pages for which pdfplumber yields no text contribute an empty string.
    Runs of two or more newlines are squeezed to exactly two.  Any exception
    is reported on stdout and turned into a ``None`` return value.
    """
    print(f"Extracting text from {pdf_path}")

    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                # extract_text() may give None (e.g. image-only page)
                page_texts.append(page.extract_text() or "")

        joined = "\n".join(page_texts)
        squeezed = re.sub(r"\n{2,}", "\n\n", joined)

        # The reported size is that of the text before newline squeezing
        print(f"✓ Extracted {len(joined):,} characters from {Path(pdf_path).name}")
    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return None

    return squeezed

def categorize_answer(answer_text):
    """Assign a coarse category label to an answer string.

    Checked in this order (first hit wins) on the stripped text:
      * exactly yes / no / n/a / zero, optionally with a trailing period,
        case-insensitive                                   -> "Yes/No+Text"
      * shorter than 100 chars and contains ';'            -> "Coded_List"
      * shorter than 100 chars and contains ',' or ' and ' -> "Multiple"
      * digits only, optionally with a trailing period     -> "Numeric"
      * anything else                                      -> "Text"
    """
    stripped = answer_text.strip()
    lowered = stripped.lower()
    is_short = len(stripped) < 100

    short_answers = ('yes', 'no', 'yes.', 'no.', 'n/a', 'n/a.', 'zero', 'zero.')
    if lowered in short_answers:
        return "Yes/No+Text"

    if is_short and ';' in stripped:
        return "Coded_List"

    if is_short and (' and ' in lowered or ',' in stripped):
        return "Multiple"

    return "Numeric" if re.match(r'^\d+\.?$', stripped) else "Text"

import re

def fast_extract_answers(text, alliance_id=None, alliance_name=None):
    """
    Split a coding-sheet text dump into one row per entry of QUESTIONS.

    Both `text` and every ``question_text`` are whitespace-flattened before
    searching, so slight differences in line breaks don't break question
    matching.  The answer to a question is the flattened text between the end
    of its question text and the start of the next located question (for the
    last located question: up to the first "SECTION" / "Source:" /
    "Date of last revision:" marker, or the end of the text).

    Parameters
    ----------
    text : str
        Extracted text of one coding sheet.
    alliance_id : str, optional
        When omitted it is read from the "1. ATOP ID:" line of `text`
        ("Unknown" if that line is absent).
    alliance_name : str, optional
        Defaults to "Alliance_<alliance_id>".

    Returns
    -------
    list of dict
        One row per key of QUESTIONS, sorted by question number, with keys:
        Alliance_ID, Alliance_Name, Question_ID, Question_Short, Question_Full,
        Answer_YesNo, Answer_Text, Answer_Category, Decoded_Countries, Text_Length.
        Questions whose text is not found get Answer_YesNo "Not Found" and
        Answer_Category "Missing".
    """
    # —————————————— Alliance ID / Name fallbacks ——————————————
    if alliance_id is None:
        id_label = "1. ATOP ID:"
        label_pos = text.find(id_label)
        if label_pos == -1:
            alliance_id = "Unknown"
        else:
            start = label_pos + len(id_label)
            end = text.find("\n", start)
            if end == -1:
                # label is on the last line: take everything that follows
                end = len(text)
            # "# 1005." → "1005" (drop the "#" marker and the sentence period)
            alliance_id = text[start:end].replace("#", "").strip().rstrip(".").strip()
    if alliance_name is None:
        alliance_name = f"Alliance_{alliance_id}"

    # —————————————— Flatten for robust searching ——————————————
    text_search = re.sub(r"\s+", " ", text).strip()

    # —————————————— Locate every question position ——————————————
    question_positions = []
    for q_id, q_info in QUESTIONS.items():
        q_flat = re.sub(r"\s+", " ", q_info["question_text"]).strip()
        pos = text_search.rfind(q_flat)          # the last occurrence wins
        if pos != -1:
            question_positions.append((q_id, pos, len(q_flat), q_info))
    question_positions.sort(key=lambda item: item[1])

    rows = []
    # —————————————— Extract each answer block ——————————————
    for i, (q_id, pos, plen, q_info) in enumerate(question_positions):
        answer_start = pos + plen
        if i + 1 < len(question_positions):
            answer_end = question_positions[i + 1][1]
        else:
            # last question → up to common markers or end
            answer_end = len(text_search)
            for marker in ["SECTION", "Source:", "Date of last revision:"]:
                mpos = text_search.find(marker, answer_start)
                if 0 <= mpos < answer_end:
                    answer_end = mpos

        # slice & basic cleanup
        answer_text = text_search[answer_start:answer_end].strip()
        answer_text = re.sub(r'^#\s*', '', answer_text)  # drop leading "# "
        # Drop a leading "See # N below …" cross-reference.  The text has no
        # newlines left at this point, so stop at the end of that sentence
        # instead of consuming the rest of the answer.
        answer_text = re.sub(
            r'^See\s+#?\s*\d+\s+below[^.]*\.?\s*',
            '',
            answer_text,
            flags=re.I
        )
        answer_text = re.sub(
            r'^SECTION\s+[IVXLCDM]+\s*:[^\n]*\n?',
            '',
            answer_text,
            flags=re.I | re.M
        )
        # remove SECTION headers anywhere (e.g. a header that precedes the next question)
        answer_text = re.sub(
            r'\bSECTION\s+[IVXLCDM]+\s*:\s*[A-Za-z ]+\b',
            '',
            answer_text,
            flags=re.I
        )
        # strip again: removing a trailing header leaves whitespace behind
        answer_text = re.sub(r'\s+', ' ', answer_text).strip()

        # Yes/No flag – whole word only, so "Nonaggression pact", "None" or
        # "Norway" are not mistaken for "No"
        yes_no = re.match(r'(yes|no)\b', answer_text, flags=re.I)
        answer_yesno = yes_no.group(1).capitalize() if yes_no else "N/A"

        # decode any countries
        decoded = extract_countries(answer_text)

        # categorization (Q3/Q14 are always dates; text is replaced by the ISO form when parseable)
        if q_id in ("Q3", "Q14"):
            iso = parse_single_date(answer_text)
            if iso:
                answer_text = iso
            answer_category = "Date"
        elif decoded:
            answer_category = "coded_text"
        else:
            answer_category = categorize_answer(answer_text)

        rows.append({
            "Alliance_ID":      alliance_id,
            "Alliance_Name":    alliance_name,
            "Question_ID":      q_id,
            "Question_Short":   q_info["short"],
            "Question_Full":    q_info["full"],
            "Answer_YesNo":     answer_yesno,
            "Answer_Text":      answer_text,
            "Answer_Category":  answer_category,
            "Decoded_Countries": decoded,
            "Text_Length":      len(answer_text)
        })

    # —————————————— Fill in any missing questions ——————————————
    found = {r["Question_ID"] for r in rows}
    for q_id, q_info in QUESTIONS.items():
        if q_id not in found:
            rows.append({
                "Alliance_ID":      alliance_id,
                "Alliance_Name":    alliance_name,
                "Question_ID":      q_id,
                "Question_Short":   q_info["short"],
                "Question_Full":    q_info["full"],
                "Answer_YesNo":     "Not Found",
                "Answer_Text":      "",
                "Answer_Category":  "Missing",
                "Decoded_Countries": "",
                "Text_Length":      0
            })

    # —————————————— Sort by question number & return ——————————————
    rows.sort(key=lambda r: int(r["Question_ID"][1:]))
    return rows


def process_single_pdf(pdf_path, alliance_id=None, alliance_name=None, save_text=False):
    """Process a single ATOP PDF file and return its answers as a DataFrame.

    Parameters
    ----------
    pdf_path : str or Path
        Location of the coding-sheet PDF.
    alliance_id : str, optional
        When omitted it is taken from the file name: the digits after "ATOP",
        including a ".N" suffix when present (e.g. "ATOP1415.1.pdf" → "1415.1"),
        or "Unknown" when the name does not match.
    alliance_name : str, optional
        Passed through to fast_extract_answers.
    save_text : bool
        When True the extracted text is also written next to the PDF with a
        ".txt" suffix.

    Returns
    -------
    pandas.DataFrame, or None when text extraction failed.
    """
    # Extract text from PDF
    text = extract_text_from_pdf(pdf_path)

    if text is None:
        print(f"Failed to extract text from {pdf_path}")
        return None

    source = Path(pdf_path)

    # Optionally save extracted text
    if save_text:
        txt_path = source.with_suffix('.txt')
        txt_path.write_text(text, encoding='utf-8')
        print(f"Text saved to: {txt_path}")

    # Extract alliance ID from filename if not provided.  The optional
    # "(?:\.\d+)?" keeps a numeric suffix, so sheets such as ATOP1415.1 and
    # ATOP1415.2 do not end up with the same ID.
    if alliance_id is None:
        id_match = re.search(r'ATOP(\d+(?:\.\d+)?)', source.stem)
        alliance_id = id_match.group(1) if id_match else "Unknown"

    # Extract answers using fast method
    rows = fast_extract_answers(text, alliance_id, alliance_name)

    # Convert to DataFrame
    return pd.DataFrame(rows)

def process_multiple_pdfs(pdf_paths, output_csv="atop_extracted_data.csv", save_text_files=False):
    """Process multiple ATOP PDF files and combine the results.

    Parameters
    ----------
    pdf_paths : iterable of str or Path
        PDFs to process, in order. Any iterable is accepted.
    output_csv : str, default "atop_extracted_data.csv"
        Destination of the combined CSV (written only when at least one file
        produced data).
    save_text_files : bool, default False
        Forwarded to ``process_single_pdf`` as ``save_text``.

    Returns
    -------
    pandas.DataFrame or None
        The concatenation of all per-file frames, or ``None`` when no file
        produced data.

    A file counts as failed when ``process_single_pdf`` returns ``None`` or
    raises; failures never abort the batch. The list of failed files is
    printed at the end whether or not any other file succeeded.
    """
    pdf_paths = list(pdf_paths)  # materialise so len() works for generators too
    total = len(pdf_paths)

    all_dfs = []
    failed_files = []

    for i, pdf_path in enumerate(pdf_paths, start=1):
        print(f"\nProcessing file {i}/{total}: {Path(pdf_path).name}")

        try:
            df = process_single_pdf(pdf_path, save_text=save_text_files)
        except Exception as e:
            # Best effort: record the failure and continue with the next file.
            print(f"Error processing {pdf_path}: {str(e)}")
            failed_files.append(pdf_path)
            continue

        if df is not None:
            all_dfs.append(df)
        else:
            failed_files.append(pdf_path)

    # Combine all dataframes
    combined_df = None
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)

        # Save to CSV
        combined_df.to_csv(output_csv, index=False)
        print(f"\n✓ Data from {len(all_dfs)} files saved to {output_csv}")
    else:
        print("No data extracted from any files.")

    # Report failures even when nothing succeeded, so a fully failed batch
    # still ends with a summary of what went wrong.
    if failed_files:
        print(f"\n⚠ Failed to process {len(failed_files)} files:")
        for f in failed_files:
            print(f"  - {Path(f).name}")

    return combined_df

# Example usage functions
def demo_single_pdf():
    """Demo: run the single-file pipeline on ``GLOBAL_SINGLE_PDF``.

    The file is processed with the fixed labels alliance_id="1005" and
    alliance_name="Quadruple Alliance", and the extracted text is saved as
    well. On success a ten-row preview is printed, the frame is written to
    ``atop_1005_extracted.csv`` and returned; otherwise ``None`` is returned.
    """
    extracted = process_single_pdf(
        GLOBAL_SINGLE_PDF,
        alliance_id="1005",
        alliance_name="Quadruple Alliance",
        save_text=True,  # also dump the raw text next to the PDF
    )

    # Nothing to show or save when processing failed.
    if extracted is None:
        return None

    preview_columns = ['Question_ID', 'Question_Short', 'Answer_Text']
    print("\nSample extracted data:")
    print(extracted[preview_columns].head(10))

    extracted.to_csv("atop_1005_extracted.csv", index=False)
    print("\nData saved to atop_1005_extracted.csv")
    return extracted

import glob
from pathlib import Path

def demo_batch_processing():
    """Process only ATOP PDFs that do *not* have a 'v' in their filename.

    Globs ``ATOP*.pdf`` inside the codesheet folder, drops every file whose
    stem contains the letter 'v' (case-insensitive), and hands the rest to
    ``process_multiple_pdfs``, which writes ``all_atop_data.csv``.

    Returns
    -------
    pandas.DataFrame or None
        Whatever ``process_multiple_pdfs`` returned, so the caller can keep
        working with the combined data.
    """
    # 1️⃣ Glob for everything under ATOP*.pdf.
    # glob gives no ordering guarantee; sort so runs are reproducible.
    pdf_pattern = "atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets/ATOP*.pdf"
    all_pdfs = sorted(glob.glob(pdf_pattern))

    # 2️⃣ Exclude any whose stem contains 'v' (case-insensitive)
    pdf_files = [
        f for f in all_pdfs
        if 'v' not in Path(f).stem.lower()
    ]

    print(f"Found {len(pdf_files)} PDF files to process (no 'v' in filename)")

    # 3️⃣ Process as before
    combined_df = process_multiple_pdfs(
        pdf_files,
        output_csv="all_atop_data.csv",
        save_text_files=False
    )

    if combined_df is not None:
        print(f"\nTotal records extracted: {len(combined_df)}")
        print(f"Unique alliances: {combined_df['Alliance_ID'].nunique()}")

    # Hand the result back; without this the caller always received None.
    return combined_df


# Main execution: inside a notebook kernel __name__ is "__main__", so this
# guard passes and the demo below runs every time the cell is executed.
if __name__ == "__main__":
    # Choose one of the demo functions to run
    # (demo_single_pdf() processes one file, demo_batch_processing() the whole folder).
    df = demo_batch_processing()  # df holds whatever the chosen demo returns
    print("\nProcessing complete for version 1 variant. DataFrame ready for analysis.")

Installing python-dateutil...
Found 335 PDF files to process (no 'v' in filename)

Processing file 1/335: ATOP1005.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1005.pdf
✓ Extracted 14,057 characters from ATOP1005.pdf

Processing file 2/335: ATOP1020.1.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1020.1.pdf
✓ Extracted 11,950 characters from ATOP1020.1.pdf

Processing file 3/335: ATOP1020.2.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1020.2.pdf
✓ Extracted 19,250 characters from ATOP1020.2.pdf

Processing file 4/335: ATOP1020.3.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1020.3.pdf
✓ Extracted 19,147 characters from ATOP1020.3.pdf

Processing file 5/335: ATOP1025.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1025.pdf
✓ Extracted 11,866 characters from ATOP1025.pdf

Processing file 6/335: A

In [44]:
import re
import pandas as pd
from pathlib import Path
import sys
import subprocess
import importlib.util

# Raw string so the Windows-style backslashes are taken literally: "\A" is not
# a valid escape sequence and triggers a warning in a normal string literal.
# NOTE(review): "ATOP.pdf" carries no alliance number — confirm this is the intended file.
GLOBAL_SINGLE_PDF = r"atop_version_5.1_codesheets\ATOP Version 5.1 Codesheets\ATOP.pdf"

# Ensure required packages are installed
def ensure_package(package: str):
    """Install ``package`` with pip unless it is already available.

    ``package`` is the pip distribution name. Two checks are made before
    installing:

    1. ``importlib.util.find_spec(package)`` — succeeds when the distribution
       name is also the import name (e.g. ``pandas``).
    2. ``importlib.metadata.version(package)`` — succeeds for installed
       distributions whose import name differs from the pip name
       (e.g. ``python-dateutil`` is imported as ``dateutil``). Without this
       check such packages were re-installed on every run.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip command exits with a non-zero status.
    """
    from importlib import metadata  # local import: only needed by this helper

    if importlib.util.find_spec(package) is not None:
        return

    try:
        metadata.version(package)
        return  # distribution is installed under a different import name
    except metadata.PackageNotFoundError:
        pass

    print(f"Installing {package}...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Install required packages (pip distribution names) before they are imported.
# NOTE(review): pandas is already imported at the top of this cell, so the
# pandas check below cannot rescue a missing install — confirm the ordering.
ensure_package("pdfplumber")
ensure_package("pandas")
ensure_package("python-dateutil")       # distribution that provides the `dateutil` module imported below
from dateutil.parser import parse as _dt_parse



import pdfplumber

# Define all questions with exact text for faster matching
# Catalogue of the codesheet questions, keyed "Q1" … "Q61".
#   short          – column-friendly label (Question_Short in the output)
#   full           – human-readable wording (Question_Full in the output); single line
#   question_text  – the text searched for in the extracted PDF text; the "\n"
#                    characters mirror line breaks in the extracted text and
#                    are flattened to spaces before matching
QUESTIONS = {
    "Q1": {
        "short": "ATOP_ID",
        "full": "ATOP ID",
        "question_text": "1. ATOP ID: "
    },
    "Q2": {
        "short": "COW_Alliance_Number",
        "full": "COW Alliance # (if applicable)",
        "question_text": "2. COW Alliance # (if applicable):"
    },
    "Q3": {
        "short": "Signature_Date",
        "full": "Signature date(s) (may vary for members in multilateral alliances)",
        "question_text": "3. Signature date(s) (may vary for members in multilateral alliances):"
    },
    "Q4": {
        "short": "Members",
        "full": "Members of alliance (use COW country codes)",
        "question_text": "4. Members of alliance (use COW country codes):"
    },
    "Q5": {
        "short": "Signed_During_War",
        "full": "Was the alliance signed during war? If so, which war, and which member(s) was at war?",
        "question_text": "5. Was the alliance signed during war? If so, which war, and which member(s) was at war? (check COW data)"
    },
    "Q6": {
        "short": "Establishment_Means",
        # Display label: kept on one line (no embedded newline) like every other "full".
        "full": (
            "Means by which alliance was established (e.g., treaty, executive agreement, "
            "diplomatic notes, etc.): (Only select \"treaty\" if the agreement requires "
            "ratification.)"
        ),
        # NOTE the explicit  \n  before the word “agreement” and the curly quotes
        "question_text": (
            "6. Means by which alliance was established (e.g., treaty, executive agreement, "
            "diplomatic notes, etc.): (Only select “treaty” if the\nagreement requires "
            "ratification.)"
        )
    },
    "Q7": {
        "short": "Public_Secret_Status",
        "full": "Was the treaty public, public but with secret articles, or secret?",
        "question_text": "7. Was the treaty public, public but with secret articles, or secret? If secret articles, describe the content of secret articles."
    },
    "Q8": {
        "short": "Additional_Members_Provisions",
        "full": "Are there provisions for adding additional members mentioned in the treaty?",
        "question_text": "8. Are there provisions for adding additional members mentioned in the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q9": {
        "short": "Specific_Future_Members",
        "full": "Are specific state(s) mentioned as possible future members of the alliance?",
        "question_text": "9. Are specific state(s) mentioned as possible future members of the alliance? (Yes, No) If Yes, which state(s)?"
    },
    "Q10": {
        "short": "Treaty_Duration",
        "full": "Are there specific provisions in the treaty concerning the length of time the treaty is to last?",
        "question_text": "10. Are there specific provisions in the treaty concerning the length of time the treaty is to last? (Yes, No) If Yes, describe."
    },
    "Q11": {
        "short": "Renounce_Conditions",
        "full": "Are there conditions under which members may renounce the treaty?",
        "question_text": "11. Are there conditions under which members may renounce the treaty? If Yes, describe these conditions."
    },
    "Q12": {
        "short": "Renewal_Provisions",
        "full": "Are there specific provisions in the treaty concerning renewal of the treaty?",
        "question_text": "12. Are there specific provisions in the treaty concerning renewal of the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q13": {
        "short": "Times_Renewed",
        "full": "Number of times alliance renewed, dates of renewal, and means by which alliance was renewed",
        "question_text": "13. Number of times alliance renewed, dates of renewal, and means by which alliance was renewed (Do not include automatic\nrenewals):"
    },
    "Q14": {
        "short": "Termination_Date",
        "full": "Termination date(s) and source(s) for termination dates(s)",
        "question_text": "14. Termination date(s) (may vary for members in multilateral alliances) and source(s) for termination dates(s):"
    },
    "Q15": {
        "short": "Termination_Cause",
        "full": "Describe what caused the alliance to terminate",
        "question_text": "15. Describe what caused the alliance to terminate."
    },
    "Q16": {
        "short": "Alliance_Type",
        "full": "Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact)",
        "question_text": "16. Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact): (List all that apply to any\nmember of the alliance.)"
    },
    "Q17": {
        "short": "COW_Alliance_Type",
        "full": "Type of alliance as coded in COW data (if applicable)",
        "question_text": "17. Type of alliance as coded in COW data (if applicable):"
    },
    "Q18": {
        "short": "Obligations",
        "full": "Describe the obligations of the alliance partners",
        "question_text": "18. Describe the obligations of the alliance partners."
    },
    "Q19": {
        "short": "Contingencies",
        "full": "Are any of the obligations contingent upon any of the following",
        "question_text": "19. Are any of the obligations contingent upon any of the following: specific adversary, specific location, specific ongoing conflict,\nnumber of adversaries, noncompliance with a specific demand, attack, nonprovocation of enemy, or prior agreement among partners?\n(List all that apply)"
    },
    "Q20": {
        "short": "Limits_Description",
        "full": "Describe the nature of the limits to obligations listed in #19",
        "question_text": "20. Describe the nature of the limits to obligations listed in #19."
    },
    "Q21": {
        "short": "Additional_War_Provisions",
        "full": "Are there any additional provisions for assistance in the event of war?",
        "question_text": "21. Are there any additional provisions for assistance in the event of war? (e.g., promise not to participate in economic sanctions against\npartner, promise not to aid internal or external enemies, etc.) (Yes, No) If Yes, describe."
    },
    "Q22": {
        "short": "Additional_Limits",
        "full": "Are there any additional limits to the alliance obligations or conditions",
        "question_text": "22. Are there any additional limits to the alliance obligations or conditions under which treaty obligations do not apply that were not\nlisted in #19? (Yes, No) If Yes, describe."
    },
    "Q23": {
        "short": "Target_Threat",
        "full": "Is a specific target/threat mentioned as the object of the treaty?",
        "question_text": "23. Is a specific target/threat mentioned as the object of the treaty? (Yes, No) (Answering Yes to this question does not necessarily mean\nthat obligations are limited only to this adversary.) If Yes, list the target nation(s) (using COW country codes if possible) and describe\nthe general nature of the reference to the target of the treaty."
    },
    "Q24": {
        "short": "Symmetric_Obligations",
        "full": "Are the treaty obligations symmetric?",
        "question_text": "24. Are the treaty obligations symmetric? (That is, do all members commit to the same obligations?) (Yes, No) If No, describe."
    },
    "Q25": {
        "short": "No_Separate_Peace",
        "full": "Does the treaty prohibit members from settling conflicts independently (no separate peace)?",
        "question_text": "25. Does the treaty prohibit members from settling conflicts independently (no separate peace)? (Yes, No) If Yes, describe."
    },
    "Q26": {
        "short": "Consult_Third_Parties",
        "full": "Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties",
        "question_text": "26. Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties (excluding\nno separate peace provisions discussed above)? (Yes, No) If Yes, describe these provisions."
    },
    "Q27": {
        "short": "No_Contrary_Alliances",
        "full": "Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance",
        "question_text": "27. Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance in\nquestion? (Yes, No) If Yes, describe."
    },
    "Q28": {
        "short": "Additional_Obligations_Comments",
        "full": "Additional coder comments regarding treaty obligations",
        "question_text": "28. Additional coder comments regarding treaty obligations:"
    },
    "Q29": {
        "short": "Internal_Conflict_Reference",
        "full": "Does the treaty make reference to the potential for conflict among members of the alliance?",
        "question_text": "29. Does the treaty make reference to the potential for conflict among members of the alliance? (Yes, No) If Yes, describe."
    },
    "Q30": {
        "short": "Dispute_Settlement",
        "full": "Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories?",
        "question_text": "30. Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories? (Yes, No) If Yes, describe."
    },
    "Q31": {
        "short": "Military_Contact",
        "full": "Does the treaty require official contact among the military forces of the participating states?",
        "question_text": "31. Does the treaty require official contact among the military forces of the participating states? (Yes, No) If Yes, describe. (Note\nwhether official contact is required only in the event of hostilities or also during peacetime.)"
    },
    "Q32": {
        "short": "Military_Aid",
        "full": "Does the treaty include provisions for military aid?",
        "question_text": "32. Does the treaty include provisions for military aid? (e.g., grants, loans, transfer of technology, training) (Yes, No) If Yes, describe\nthese provisions."
    },
    "Q33": {
        "short": "Integrated_Command",
        "full": "Does the treaty provide for integrated command of military forces while the alliance is in effect?",
        "question_text": "33. Does the treaty provide for integrated command of military forces while the alliance is in effect (peacetime as well as wartime)?\n(Yes, No) If Yes, describe."
    },
    "Q34": {
        "short": "Force_Subordination",
        "full": "Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities?",
        "question_text": "34. Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities? (Yes, No) If\nYes, describe."
    },
    "Q35": {
        "short": "Organizations_Established",
        "full": "Does the treaty establish any organizations?",
        "question_text": "35. Does the treaty establish any organizations? (include provisions for regularly scheduled meetings) (Yes, No) If Yes, describe."
    },
    "Q36": {
        "short": "Joint_Bases_Troops",
        "full": "Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?",
        "question_text": "36. Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?\n(Yes, No) If Yes, describe."
    },
    "Q37": {
        "short": "Contribution_Levels",
        "full": "Does the treaty specify contribution levels (funds, troops, etc.)?",
        "question_text": "37. Does the treaty specify contribution levels (funds, troops, etc.)? (Yes, No) If Yes, describe."
    },
    "Q38": {
        "short": "Armament_Provisions",
        "full": "Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of weapons, or rules of warfare?",
        "question_text": "38. Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of\nweapons, or rules of warfare? (Yes, No) If Yes, describe such provisions."
    },
    "Q39": {
        "short": "Territory_Acquisition",
        "full": "Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally?",
        "question_text": "39. Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally? (Yes, No) If Yes, describe\nreference."
    },
    "Q40": {
        "short": "Division_of_Gains",
        "full": "Does the treaty include discussion of the division of gains from any future conflict?",
        "question_text": "40. Does the treaty include discussion of the division of gains from any future conflict? (Yes, No) If Yes, describe."
    },
    "Q41": {
        "short": "Demobilization_Withdrawal",
        "full": "Does the treaty include discussion of demobilization/withdrawal following conflict?",
        "question_text": "41. Does the treaty include discussion of demobilization/withdrawal following conflict? (Yes, No) If Yes, describe."
    },
    "Q42": {
        "short": "Other_Coordination",
        "full": "Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and policies?",
        "question_text": "42. Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and\npolicies? (Yes, No) If Yes, describe."
    },
    "Q43": {
        "short": "Additional_Institutionalization_Comments",
        "full": "Additional coder comments regarding references to alliance institutionalization",
        "question_text": "43. Additional coder comments regarding references to alliance institutionalization:"
    },
    "Q44": {
        "short": "Nullifies_Treaties",
        "full": "Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "44. Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are nullified and describe the treaty reference."
    },
    "Q45": {
        "short": "Modifies_Treaties",
        "full": "Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "45. Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are modified and describe the treaty reference."
    },
    "Q46": {
        "short": "Includes_Treaties",
        "full": "Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties?",
        "question_text": "46. Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are included and describe the treaty reference."
    },
    "Q47": {
        "short": "International_Organizations",
        "full": "Does the treaty make any references to international organizations?",
        "question_text": "47. Does the treaty make any references to international organizations? (Yes, No) If Yes, list the international organization and describe\nthe nature of the reference."
    },
    "Q48": {
        "short": "Other_Alliances",
        "full": "Does the treaty make any references to other existing alliances?",
        "question_text": "48. Does the treaty make any references to other existing alliances? (Yes, No) If Yes, list the alliance and describe the nature of the\nreference."
    },
    "Q49": {
        "short": "Companion_Agreements",
        "full": "Are there any companion agreements referenced in the treaty?",
        "question_text": "49. Are there any companion agreements referenced in the treaty? (Yes, No) If Yes, describe the agreements."
    },
    "Q50": {
        "short": "Non_Military_Cooperation",
        "full": "Does the treaty include statements regarding non-military cooperation?",
        "question_text": "50. Does the treaty include statements regarding non-military cooperation (e.g., economic, cultural, scientific exchange)? (Yes, No) If\nYes, describe."
    },
    "Q51": {
        "short": "Resolves_Conflicts",
        "full": "Does the treaty resolve other conflicts among the parties to the alliance?",
        "question_text": "51. Does the treaty resolve other conflicts among the parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q52": {
        "short": "Enforces_External_Settlement",
        "full": "Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance?",
        "question_text": "52. Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q53": {
        "short": "Proposes_Agreements",
        "full": "Does the treaty propose other agreements among the contracting parties?",
        "question_text": "53. Does the treaty propose other agreements among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q54": {
        "short": "Unresolved_Conflicts",
        "full": "Does the treaty mention unresolved conflicts among the contracting parties?",
        "question_text": "54. Does the treaty mention unresolved conflicts among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q55": {
        "short": "Economic_Aid",
        "full": "Does the treaty include provisions for economic aid or other enticements?",
        "question_text": "55. Does the treaty include provisions for economic aid or other enticements (include trade concessions, post war recovery, etc.)? (Yes,\nNo) If Yes, describe these provisions."
    },
    "Q56": {
        "short": "Internal_Politics_Intervention",
        "full": "Does the treaty describe circumstances under which one party may intervene in the internal politics of another party?",
        "question_text": "56. Does the treaty describe circumstances under which one party may intervene in the internal politics of another party or specifically\ncommit the states to non-intervention? (Yes, No) If Yes, describe."
    },
    "Q57": {
        "short": "Additional_Comments",
        "full": "Additional coder comments on this alliance treaty",
        "question_text": "57. Additional coder comments on this alliance treaty:"
    },
    "Q58": {
        "short": "Source_of_Coding",
        "full": "Source of coding information",
        "question_text": "58. Source of coding information (e.g., treaty, treaty and secondary sources, secondary sources only, etc.):"
    },
    "Q59": {
        "short": "Treaty_Citation",
        "full": "Treaty citation",
        "question_text": "59. Treaty citation:"
    },
    "Q60": {
        "short": "Last_Revision_Date",
        "full": "Date of last revision of this coding sheet",
        "question_text": "60. Date of last revision of this coding sheet:"
    },
    "Q61": {
        "short": "Coder",
        "full": "Coder",
        "question_text": "61. Coder:"
    }
}

def parse_single_date(text):
    """
    Try to read `text` as a single date.

    Surrounding whitespace and trailing periods are removed, then the
    remainder is handed to the dateutil parser in fuzzy mode
    (e.g. 'March 9, 1833.' -> '1833-03-09').

    Returns the ISO string 'YYYY-MM-DD', or None when the parser raises
    ValueError or OverflowError.
    """
    try:
        parsed = _dt_parse(text.strip().rstrip('.'), fuzzy=True)
        iso_date = parsed.date().isoformat()
    except (ValueError, OverflowError):
        return None
    return iso_date

def extract_countries(text):
    """
    Collect every country/code pair in `text` as 'Name(code)' and join them with ';'.

    Two layouts are recognised, and all occurrences of each are gathered:
      * code first  – '365 Russia 200 United Kingdom.'
      * name first  – 'Russia (365)'
    Code-first hits are listed before name-first hits; repeated pairs are
    kept only once, at their first position. Returns '' when nothing matches.
    """
    # Code first: a 3-digit code, then a capitalised name that ends just
    # before another code, before one of . , ; newline, or at end-of-string.
    code_then_name = re.compile(
        r'\b(\d{3})\s+([A-Z][A-Za-z.\- ]+?)'
        r'(?=(?:\s+\d{3}\s)|[.,;\n]|$)'
    )
    # Name first: a capitalised name followed by the code in parentheses.
    name_then_code = re.compile(r'\b([A-Z][A-Za-z.\- ]+?)\s*\(\s*(\d{3})\s*\)')

    labelled = [
        f"{name.strip()}({code})"
        for code, name in code_then_name.findall(text)
    ]
    labelled.extend(
        f"{name.strip()}({code})"
        for name, code in name_then_code.findall(text)
    )

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return ";".join(dict.fromkeys(labelled))





def extract_text_from_pdf(pdf_path):
    """Read all page text of a PDF with pdfplumber.

    Page texts are joined with a newline (a page without extractable text
    contributes an empty string), and runs of two or more newlines are
    collapsed to exactly two. Progress and the raw character count are
    printed. Any exception is reported on stdout and turned into a ``None``
    return value instead of being raised.
    """
    print(f"Extracting text from {pdf_path}")

    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as document:
            for page in document.pages:
                # extract_text() may give None (e.g. image-only page)
                page_texts.append(page.extract_text() or "")

        joined = "\n".join(page_texts)
        collapsed = re.sub(r"\n{2,}", "\n\n", joined)

        # The reported size is that of the text before newline collapsing.
        print(f"✓ Extracted {len(joined):,} characters from {Path(pdf_path).name}")
        return collapsed

    except Exception as err:
        print(f"Error extracting text from {pdf_path}: {str(err)}")
        return None

def categorize_answer(answer_text):
    """Assign a coarse category label to an answer string.

    Checks are applied in this order to the whitespace-stripped text:
      1. exactly yes / no / n/a / zero (any case, optional trailing period)
         -> "Yes/No+Text"
      2. shorter than 100 characters and contains ';' -> "Coded_List"
      3. shorter than 100 characters and contains ',' or ' and '
         -> "Multiple"
      4. digits only, optionally followed by a period -> "Numeric"
      5. anything else -> "Text"
    """
    cleaned = answer_text.strip()
    lowered = cleaned.lower()

    bare_replies = ('yes', 'no', 'yes.', 'no.', 'n/a', 'n/a.', 'zero', 'zero.')
    if lowered in bare_replies:
        return "Yes/No+Text"

    # List-like categories only apply to short answers.
    if len(cleaned) < 100:
        if ';' in cleaned:
            return "Coded_List"
        if ',' in cleaned or ' and ' in lowered:
            return "Multiple"

    if re.match(r'^\d+\.?$', cleaned):
        return "Numeric"

    return "Text"

import re

def fast_extract_answers(text, alliance_id=None, alliance_name=None):
    """
    Fast extraction using whitespace-flattened search so that slight
    differences in line breaks don’t break question matching.

    Every ``question_text`` in QUESTIONS is looked up (last occurrence) in a
    copy of `text` whose whitespace runs are collapsed to single spaces. The
    answer to a question is the text between the end of its question and the
    start of the next located question; the final answer stops at the first
    of "SECTION", "Source:" or "Date of last revision:" if one follows.

    Parameters
    ----------
    text : str
        Extracted codesheet text.
    alliance_id : str, optional
        When None, read from the line following "1. ATOP ID:" in `text`
        ('#' removed), or "Unknown" if that label is absent.
    alliance_name : str, optional
        When None, set to ``f"Alliance_{alliance_id}"``.

    Returns a list of row dicts, sorted by question number, with keys:
      Alliance_ID, Alliance_Name, Question_ID, Question_Short, Question_Full,
      Answer_YesNo, Answer_Text, Answer_Category, Decoded_Countries, Text_Length

    ``Answer_YesNo`` is "Yes"/"No" only when the answer begins with the whole
    word yes/no (any case); answers such as "Nonaggression pact" or "None"
    yield "N/A". Questions that could not be located get a placeholder row
    with Answer_YesNo "Not Found" and Answer_Category "Missing".
    """
    # —————————————— Alliance ID / Name logic ——————————————
    if alliance_id is None:
        id_label = "1. ATOP ID:"
        id_pos = text.find(id_label)
        if id_pos != -1:
            start = id_pos + len(id_label)
            end = text.find("\n", start)
            if end == -1:
                # No newline after the ID: take the rest of the text instead
                # of slicing to -1, which would drop the last character.
                end = len(text)
            alliance_id = text[start:end].strip().replace("#", "").strip()
        else:
            alliance_id = "Unknown"
    if alliance_name is None:
        alliance_name = f"Alliance_{alliance_id}"

    # —————————————— Flatten for robust searching ——————————————
    text_search = re.sub(r"\s+", " ", text).strip()

    # —————————————— Locate every question position ——————————————
    question_positions = []
    for q_id, q_info in QUESTIONS.items():
        q_flat = re.sub(r"\s+", " ", q_info["question_text"]).strip()
        pos = text_search.rfind(q_flat)  # last occurrence wins
        if pos != -1:
            question_positions.append((q_id, pos, len(q_flat), q_info))
    question_positions.sort(key=lambda x: x[1])

    rows = []
    # —————————————— Extract each answer block ——————————————
    for i, (q_id, pos, plen, q_info) in enumerate(question_positions):
        answer_start = pos + plen
        if i + 1 < len(question_positions):
            answer_end = question_positions[i + 1][1]
        else:
            # last question → up to common markers or end
            answer_end = len(text_search)
            for marker in ["SECTION", "Source:", "Date of last revision:"]:
                mpos = text_search.find(marker, answer_start)
                if 0 <= mpos < answer_end:
                    answer_end = mpos

        # slice & basic cleanup
        answer_text = text_search[answer_start:answer_end].strip()
        answer_text = re.sub(r'^#\s*', '', answer_text)  # drop leading "# "
        # NOTE(review): answer_text comes from the flattened text and holds no
        # newlines, so `[^\n]*` in the next two patterns runs to the end of
        # the answer. An answer that starts with "See #N below" or with a
        # SECTION header is therefore emptied entirely — confirm this is intended.
        answer_text = re.sub(
            r'^See\s+#?\s*\d+\s+below[^\n]*\n?',
            '',
            answer_text,
            flags=re.I
        )
        answer_text = re.sub(
            r'^SECTION\s+[IVXLCDM]+\s*:[^\n]*\n?',
            '',
            answer_text,
            flags=re.I | re.M
        )
        # remove SECTION headers anywhere (e.g. inline bleed from the next section)
        answer_text = re.sub(
            r'\bSECTION\s+[IVXLCDM]+\s*:\s*[A-Za-z ]+\b',
            '',
            answer_text,
            flags=re.I
        )
        answer_text = re.sub(r'\s+', ' ', answer_text)

        # Yes/No flag: require a whole leading word so that "Nonaggression",
        # "Norway" or "November" are not misread as "No".
        answer_yesno = "N/A"
        yesno_match = re.match(r'(yes|no)\b', answer_text, flags=re.I)
        if yesno_match:
            answer_yesno = yesno_match.group(1).capitalize()

        # decode any countries
        decoded = extract_countries(answer_text)

        # categorization (with Q3/Q14 → Date override)
        if q_id in ("Q3", "Q14"):
            iso = parse_single_date(answer_text)
            if iso:
                answer_text = iso  # keep the raw text when it is not a single date
            answer_category = "Date"
        else:
            if decoded:
                answer_category = "coded_text"
            else:
                answer_category = categorize_answer(answer_text)

        # final row assembly
        row = {
            "Alliance_ID":      alliance_id,
            "Alliance_Name":    alliance_name,
            "Question_ID":      q_id,
            "Question_Short":   q_info["short"],
            "Question_Full":    q_info["full"],
            "Answer_YesNo":     answer_yesno,
            "Answer_Text":      answer_text,
            "Answer_Category":  answer_category,
            "Decoded_Countries": decoded,
            "Text_Length":      len(answer_text)
        }
        rows.append(row)

    # —————————————— Fill in any missing questions ——————————————
    found = {r["Question_ID"] for r in rows}
    for q_id, q_info in QUESTIONS.items():
        if q_id not in found:
            rows.append({
                "Alliance_ID":      alliance_id,
                "Alliance_Name":    alliance_name,
                "Question_ID":      q_id,
                "Question_Short":   q_info["short"],
                "Question_Full":    q_info["full"],
                "Answer_YesNo":     "Not Found",
                "Answer_Text":      "",
                "Answer_Category":  "Missing",
                "Decoded_Countries": "",
                "Text_Length":      0
            })

    # —————————————— Sort by question number & return ——————————————
    rows.sort(key=lambda r: int(r["Question_ID"][1:]))
    return rows


def process_single_pdf(pdf_path, alliance_id=None, alliance_name=None, save_text=False):
    """
    Turn one ATOP codesheet PDF into a DataFrame of question/answer rows.

    Parameters
    ----------
    pdf_path : str or path-like
        Location of the PDF to read.
    alliance_id : str, optional
        Alliance identifier. When omitted it is taken from the digits that
        follow "ATOP" in the file stem, or "Unknown" if no such digits exist.
    alliance_name : str, optional
        Passed through unchanged to `fast_extract_answers`.
    save_text : bool, default False
        When True, the extracted text is also written next to the PDF with a
        ``.txt`` suffix.

    Returns
    -------
    pandas.DataFrame or None
        One row per question; None when text extraction failed.
    """
    extracted = extract_text_from_pdf(pdf_path)
    if extracted is None:
        print(f"Failed to extract text from {pdf_path}")
        return None

    # Optional side artefact: dump the raw text beside the PDF for inspection.
    if save_text:
        dump_path = Path(pdf_path).with_suffix('.txt')
        dump_path.write_text(extracted, encoding='utf-8')
        print(f"Text saved to: {dump_path}")

    # Fall back to the filename (e.g. "ATOP1010v4" -> "1010") for the ID.
    if alliance_id is None:
        id_match = re.search(r'ATOP(\d+)', Path(pdf_path).stem)
        alliance_id = "Unknown" if not id_match else id_match.group(1)

    return pd.DataFrame(fast_extract_answers(extracted, alliance_id, alliance_name))

def process_multiple_pdfs(pdf_paths, output_csv="atop_extracted_data.csv", save_text_files=False):
    """
    Process several ATOP PDF files and combine the per-file results.

    Parameters
    ----------
    pdf_paths : iterable of str or path-like
        PDFs to process. Any iterable is accepted (list, generator, ...); it
        is materialised once so the progress counter can show a total.
    output_csv : str, default "atop_extracted_data.csv"
        Destination of the combined CSV. Only written when at least one file
        produced data.
    save_text_files : bool, default False
        Forwarded to `process_single_pdf` as `save_text`.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation of all per-file frames, or None when no file produced
        any data.
    """
    # len() is needed for the "i/N" progress line, so one-shot iterables
    # (generators, Path.glob results) are turned into a list up front.
    pdf_paths = list(pdf_paths)
    total = len(pdf_paths)

    all_dfs = []
    failed_files = []

    for i, pdf_path in enumerate(pdf_paths, start=1):
        print(f"\nProcessing file {i}/{total}: {Path(pdf_path).name}")

        try:
            df = process_single_pdf(pdf_path, save_text=save_text_files)
        except Exception as e:
            # Best effort: one broken file must not abort the whole batch.
            print(f"Error processing {pdf_path}: {str(e)}")
            failed_files.append(pdf_path)
            continue

        if df is not None:
            all_dfs.append(df)
        else:
            failed_files.append(pdf_path)

    combined_df = None
    if all_dfs:
        combined_df = pd.concat(all_dfs, ignore_index=True)
        combined_df.to_csv(output_csv, index=False)
        print(f"\n✓ Data from {len(all_dfs)} files saved to {output_csv}")
    else:
        print("No data extracted from any files.")

    # Report failures in every case, including the one where nothing
    # succeeded; that is when the list is most useful.
    if failed_files:
        print(f"\n⚠ Failed to process {len(failed_files)} files:")
        for f in failed_files:
            print(f"  - {Path(f).name}")

    return combined_df

# Example usage functions
def demo_single_pdf():
    """
    Demo: run the extraction on the single PDF named by GLOBAL_SINGLE_PDF.

    Prints the first ten question/answer rows, writes the full result to
    "atop_1005_extracted.csv", and returns the DataFrame. Returns None when
    `process_single_pdf` yields no data.

    NOTE(review): the alliance ID/name and the CSV filename are hard-coded
    for alliance 1005 regardless of which file GLOBAL_SINGLE_PDF points to;
    confirm they match before relying on the output.
    """
    df = process_single_pdf(
        GLOBAL_SINGLE_PDF,
        alliance_id="1005",
        alliance_name="Quadruple Alliance",
        save_text=True,  # also dump the extracted text beside the PDF
    )

    if df is None:
        return None

    # Quick look at what was extracted.
    preview_columns = ['Question_ID', 'Question_Short', 'Answer_Text']
    print("\nSample extracted data:")
    print(df[preview_columns].head(10))

    # Persist the full table.
    df.to_csv("atop_1005_extracted.csv", index=False)
    print("\nData saved to atop_1005_extracted.csv")
    return df

import glob
from pathlib import Path


def demo_v4_processing():
    """
    Process only ATOP PDFs whose filenames end with 'v4' (case-insensitive).

    Globs the codesheet folder for ``ATOP*.pdf``, keeps the files whose stem
    ends in "v4", runs them through `process_multiple_pdfs` (writing
    "all_atop_v4_data.csv") and prints a short summary.

    Returns
    -------
    pandas.DataFrame or None
        The combined frame returned by `process_multiple_pdfs`, so the caller
        can keep working with it; None when nothing was extracted.
    """
    # 1️⃣ Glob for everything under ATOP*.pdf
    pdf_pattern = "atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets/ATOP*.pdf"

    # 2️⃣ Keep only those whose stem ends with 'v4'.
    #    glob returns matches in arbitrary order, so sort to make the
    #    processing order (and the CSV row order) reproducible across runs.
    v4_pdfs = sorted(
        f for f in glob.glob(pdf_pattern)
        if Path(f).stem.lower().endswith("v4")
    )

    print(f"Found {len(v4_pdfs)} ATOP v4 PDF files to process")

    # 3️⃣ Process just the v4 files
    combined_df = process_multiple_pdfs(
        v4_pdfs,
        output_csv="all_atop_v4_data.csv",
        save_text_files=False
    )

    if combined_df is not None:
        print(f"\nTotal records extracted from v4 files: {len(combined_df)}")
        print(f"Unique alliances in v4 set: {combined_df['Alliance_ID'].nunique()}")

    # Hand the result back (mirrors demo_single_pdf) instead of discarding it.
    return combined_df



# Main execution: runs when this code is executed directly (the guard is true
# for a script run and for a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":

    # Pick the demo to run. The batch v4 run is active; to process only the
    # single PDF instead, enable the line below.
    # df = demo_single_pdf()
    # Note: the value returned by demo_v4_processing() is not assigned here.
    demo_v4_processing()
    print("\nProcessing complete for version 4 variant. DataFrame ready for analysis.")

Installing python-dateutil...
Found 409 ATOP v4 PDF files to process

Processing file 1/409: ATOP1010v4.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1010v4.pdf
✓ Extracted 11,395 characters from ATOP1010v4.pdf

Processing file 2/409: ATOP1015v4.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1015v4.pdf
✓ Extracted 10,370 characters from ATOP1015v4.pdf

Processing file 3/409: ATOP1053v4.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1053v4.pdf
✓ Extracted 10,780 characters from ATOP1053v4.pdf

Processing file 4/409: ATOP1145v4.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1145v4.pdf
✓ Extracted 11,017 characters from ATOP1145v4.pdf

Processing file 5/409: ATOP1150v4.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1150v4.pdf
✓ Extracted 13,266 characters from ATOP1150v4.pdf

Processing file 6/409: AT

In [45]:
import re
import pandas as pd
from pathlib import Path
import sys
import subprocess
import importlib.util

# Path (relative to the working directory) of the PDF used by demo_single_pdf().
# Raw string: the Windows separators would otherwise form the invalid escape
# sequence "\A", which Python only tolerates with a deprecation warning.
GLOBAL_SINGLE_PDF = r"atop_version_5.1_codesheets\ATOP Version 5.1 Codesheets\ATOP7050v5.pdf"

# Ensure required packages are installed
def ensure_package(package: str, import_name: "str | None" = None):
    """
    Install `package` with pip unless its module is already importable.

    Parameters
    ----------
    package : str
        Name handed to ``pip install`` (the distribution name).
    import_name : str, optional
        Name of the module to probe for. Needed when the distribution and the
        importable module are named differently. When omitted, a small table
        of known mismatches is consulted, then hyphens are replaced by
        underscores (a hyphenated name can never be imported as-is).

    Raises
    ------
    subprocess.CalledProcessError
        If the pip invocation exits with a non-zero status.
    """
    # Distribution names whose importable module is called something else.
    # Probing "python-dateutil" directly never succeeds, which caused a pip
    # run on every execution.
    known_import_names = {"python-dateutil": "dateutil"}

    module_name = import_name or known_import_names.get(
        package, package.replace("-", "_")
    )

    try:
        already_installed = importlib.util.find_spec(module_name) is not None
    except (ImportError, ValueError):
        # find_spec raises for a dotted name with a missing parent package or
        # for a module with a broken __spec__; treat both as "not installed".
        already_installed = False

    if not already_installed:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Make sure the third-party packages used below are available (pip names)
ensure_package("pdfplumber")
ensure_package("pandas")
ensure_package("python-dateutil")       # PyPI distribution name; the module imported from is `dateutil`
from dateutil.parser import parse as _dt_parse



import pdfplumber

# Catalogue of the 61 codesheet questions, keyed "Q1" … "Q61".
# Each entry holds three strings:
#   "short"         – compact label, copied into the Question_Short column
#   "full"          – readable wording, copied into the Question_Full column
#   "question_text" – the question as it is searched for in the extracted PDF
#                     text. fast_extract_answers collapses every whitespace
#                     run (including the "\n" breaks below) to a single space
#                     on both sides before matching, so line breaks here only
#                     document where the PDF wraps.
# The numeric part of each key is also used to sort the output rows.
QUESTIONS = {
    "Q1": {
        "short": "ATOP_ID",
        "full": "ATOP ID",
        "question_text": "1. ATOP ID: "
    },
    "Q2": {
        "short": "COW_Alliance_Number",
        "full": "COW Alliance # (if applicable)",
        "question_text": "2. COW Alliance # (if applicable):"
    },
    "Q3": {
        "short": "Signature_Date",
        "full": "Signature date(s) (may vary for members in multilateral alliances)",
        "question_text": "3. Signature date(s) (may vary for members in multilateral alliances):"
    },
    "Q4": {
        "short": "Members",
        "full": "Members of alliance (use COW country codes)",
        "question_text": "4. Members of alliance (use COW country codes):"
    },
    "Q5": {
        "short": "Signed_During_War",
        "full": "Was the alliance signed during war? If so, which war, and which member(s) was at war?",
        "question_text": "5. Was the alliance signed during war? If so, which war, and which member(s) was at war? (check COW data)"
    },
    "Q6": {
    "short": "Establishment_Means",
    "full": (
        "Means by which alliance was established (e.g., treaty, executive agreement, "
        "diplomatic notes, etc.): (Only select \"treaty\" if the\nagreement requires "
        "ratification.)"
    ),
    # The search text below uses typographic (curly) quotes around "treaty",
    # whereas "full" above uses straight quotes; it also carries an explicit
    # \n before the word "agreement".
    "question_text": (
        "6. Means by which alliance was established (e.g., treaty, executive agreement, "
        "diplomatic notes, etc.): (Only select “treaty” if the\nagreement requires "
        "ratification.)"
    )
    },
    "Q7": {
        "short": "Public_Secret_Status",
        "full": "Was the treaty public, public but with secret articles, or secret?",
        "question_text": "7. Was the treaty public, public but with secret articles, or secret? If secret articles, describe the content of secret articles."
    },
    "Q8": {
        "short": "Additional_Members_Provisions",
        "full": "Are there provisions for adding additional members mentioned in the treaty?",
        "question_text": "8. Are there provisions for adding additional members mentioned in the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q9": {
        "short": "Specific_Future_Members",
        "full": "Are specific state(s) mentioned as possible future members of the alliance?",
        "question_text": "9. Are specific state(s) mentioned as possible future members of the alliance? (Yes, No) If Yes, which state(s)?"
    },
    "Q10": {
        "short": "Treaty_Duration",
        "full": "Are there specific provisions in the treaty concerning the length of time the treaty is to last?",
        "question_text": "10. Are there specific provisions in the treaty concerning the length of time the treaty is to last? (Yes, No) If Yes, describe."
    },
    "Q11": {
        "short": "Renounce_Conditions",
        "full": "Are there conditions under which members may renounce the treaty?",
        "question_text": "11. Are there conditions under which members may renounce the treaty? If Yes, describe these conditions."
    },
    "Q12": {
        "short": "Renewal_Provisions",
        "full": "Are there specific provisions in the treaty concerning renewal of the treaty?",
        "question_text": "12. Are there specific provisions in the treaty concerning renewal of the treaty? (Yes, No) If Yes, describe these provisions."
    },
    "Q13": {
        "short": "Times_Renewed",
        "full": "Number of times alliance renewed, dates of renewal, and means by which alliance was renewed",
        "question_text": "13. Number of times alliance renewed, dates of renewal, and means by which alliance was renewed (Do not include automatic\nrenewals):"
    },
    "Q14": {
        "short": "Termination_Date",
        "full": "Termination date(s) and source(s) for termination dates(s)",
        "question_text": "14. Termination date(s) (may vary for members in multilateral alliances) and source(s) for termination dates(s):"
    },
    "Q15": {
        "short": "Termination_Cause",
        "full": "Describe what caused the alliance to terminate",
        "question_text": "15. Describe what caused the alliance to terminate."
    },
    "Q16": {
        "short": "Alliance_Type",
        "full": "Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact)",
        "question_text": "16. Type of alliance (defense pact, neutrality pact, nonaggression pact, consultation pact, offense pact): (List all that apply to any\nmember of the alliance.)"
    },
    "Q17": {
        "short": "COW_Alliance_Type",
        "full": "Type of alliance as coded in COW data (if applicable)",
        "question_text": "17. Type of alliance as coded in COW data (if applicable):"
    },
    "Q18": {
        "short": "Obligations",
        "full": "Describe the obligations of the alliance partners",
        "question_text": "18. Describe the obligations of the alliance partners."
    },
    "Q19": {
        "short": "Contingencies",
        "full": "Are any of the obligations contingent upon any of the following",
        "question_text": "19. Are any of the obligations contingent upon any of the following: specific adversary, specific location, specific ongoing conflict,\nnumber of adversaries, noncompliance with a specific demand, attack, nonprovocation of enemy, or prior agreement among partners?\n(List all that apply)"
    },
    "Q20": {
        "short": "Limits_Description",
        "full": "Describe the nature of the limits to obligations listed in #19",
        "question_text": "20. Describe the nature of the limits to obligations listed in #19."
    },
    "Q21": {
        "short": "Additional_War_Provisions",
        "full": "Are there any additional provisions for assistance in the event of war?",
        "question_text": "21. Are there any additional provisions for assistance in the event of war? (e.g., promise not to participate in economic sanctions against\npartner, promise not to aid internal or external enemies, etc.) (Yes, No) If Yes, describe."
    },
    "Q22": {
        "short": "Additional_Limits",
        "full": "Are there any additional limits to the alliance obligations or conditions",
        "question_text": "22. Are there any additional limits to the alliance obligations or conditions under which treaty obligations do not apply that were not\nlisted in #19? (Yes, No) If Yes, describe."
    },
    "Q23": {
        "short": "Target_Threat",
        "full": "Is a specific target/threat mentioned as the object of the treaty?",
        "question_text": "23. Is a specific target/threat mentioned as the object of the treaty? (Yes, No) (Answering Yes to this question does not necessarily mean\nthat obligations are limited only to this adversary.) If Yes, list the target nation(s) (using COW country codes if possible) and describe\nthe general nature of the reference to the target of the treaty."
    },
    "Q24": {
        "short": "Symmetric_Obligations",
        "full": "Are the treaty obligations symmetric?",
        "question_text": "24. Are the treaty obligations symmetric? (That is, do all members commit to the same obligations?) (Yes, No) If No, describe."
    },
    "Q25": {
        "short": "No_Separate_Peace",
        "full": "Does the treaty prohibit members from settling conflicts independently (no separate peace)?",
        "question_text": "25. Does the treaty prohibit members from settling conflicts independently (no separate peace)? (Yes, No) If Yes, describe."
    },
    "Q26": {
        "short": "Consult_Third_Parties",
        "full": "Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties",
        "question_text": "26. Does the treaty have provisions requiring that the contracting parties consult before making commitments to third parties (excluding\nno separate peace provisions discussed above)? (Yes, No) If Yes, describe these provisions."
    },
    "Q27": {
        "short": "No_Contrary_Alliances",
        "full": "Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance",
        "question_text": "27. Does the treaty specify that the contracting parties must not enter into any other alliances that are directed against the alliance in\nquestion? (Yes, No) If Yes, describe."
    },
    "Q28": {
        "short": "Additional_Obligations_Comments",
        "full": "Additional coder comments regarding treaty obligations",
        "question_text": "28. Additional coder comments regarding treaty obligations:"
    },
    "Q29": {
        "short": "Internal_Conflict_Reference",
        "full": "Does the treaty make reference to the potential for conflict among members of the alliance?",
        "question_text": "29. Does the treaty make reference to the potential for conflict among members of the alliance? (Yes, No) If Yes, describe."
    },
    "Q30": {
        "short": "Dispute_Settlement",
        "full": "Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories?",
        "question_text": "30. Does the treaty discuss mediation/arbitration or other means of settling disputes among the signatories? (Yes, No) If Yes, describe."
    },
    "Q31": {
        "short": "Military_Contact",
        "full": "Does the treaty require official contact among the military forces of the participating states?",
        "question_text": "31. Does the treaty require official contact among the military forces of the participating states? (Yes, No) If Yes, describe. (Note\nwhether official contact is required only in the event of hostilities or also during peacetime.)"
    },
    "Q32": {
        "short": "Military_Aid",
        "full": "Does the treaty include provisions for military aid?",
        "question_text": "32. Does the treaty include provisions for military aid? (e.g., grants, loans, transfer of technology, training) (Yes, No) If Yes, describe\nthese provisions."
    },
    "Q33": {
        "short": "Integrated_Command",
        "full": "Does the treaty provide for integrated command of military forces while the alliance is in effect?",
        "question_text": "33. Does the treaty provide for integrated command of military forces while the alliance is in effect (peacetime as well as wartime)?\n(Yes, No) If Yes, describe."
    },
    "Q34": {
        "short": "Force_Subordination",
        "full": "Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities?",
        "question_text": "34. Does the treaty require subordination of the forces of one or more member states to another in the event of hostilities? (Yes, No) If\nYes, describe."
    },
    "Q35": {
        "short": "Organizations_Established",
        "full": "Does the treaty establish any organizations?",
        "question_text": "35. Does the treaty establish any organizations? (include provisions for regularly scheduled meetings) (Yes, No) If Yes, describe."
    },
    "Q36": {
        "short": "Joint_Bases_Troops",
        "full": "Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?",
        "question_text": "36. Does the treaty provide for joint military bases, or for one or more states to place troops in the territory of one or more other states?\n(Yes, No) If Yes, describe."
    },
    "Q37": {
        "short": "Contribution_Levels",
        "full": "Does the treaty specify contribution levels (funds, troops, etc.)?",
        "question_text": "37. Does the treaty specify contribution levels (funds, troops, etc.)? (Yes, No) If Yes, describe."
    },
    "Q38": {
        "short": "Armament_Provisions",
        "full": "Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of weapons, or rules of warfare?",
        "question_text": "38. Does the treaty contain any provisions regarding coordinated increase of armaments, reduction of armaments, prohibition of\nweapons, or rules of warfare? (Yes, No) If Yes, describe such provisions."
    },
    "Q39": {
        "short": "Territory_Acquisition",
        "full": "Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally?",
        "question_text": "39. Does the treaty explicitly permit or prohibit the acquisition of territory not currently held by either ally? (Yes, No) If Yes, describe\nreference."
    },
    "Q40": {
        "short": "Division_of_Gains",
        "full": "Does the treaty include discussion of the division of gains from any future conflict?",
        "question_text": "40. Does the treaty include discussion of the division of gains from any future conflict? (Yes, No) If Yes, describe."
    },
    "Q41": {
        "short": "Demobilization_Withdrawal",
        "full": "Does the treaty include discussion of demobilization/withdrawal following conflict?",
        "question_text": "41. Does the treaty include discussion of demobilization/withdrawal following conflict? (Yes, No) If Yes, describe."
    },
    "Q42": {
        "short": "Other_Coordination",
        "full": "Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and policies?",
        "question_text": "42. Are there any other provisions in the treaty that describe the means through which the states will coordinate their military efforts and\npolicies? (Yes, No) If Yes, describe."
    },
    "Q43": {
        "short": "Additional_Institutionalization_Comments",
        "full": "Additional coder comments regarding references to alliance institutionalization",
        "question_text": "43. Additional coder comments regarding references to alliance institutionalization:"
    },
    "Q44": {
        "short": "Nullifies_Treaties",
        "full": "Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "44. Does the treaty state that it nullifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are nullified and describe the treaty reference."
    },
    "Q45": {
        "short": "Modifies_Treaties",
        "full": "Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties?",
        "question_text": "45. Does the treaty state that it modifies any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are modified and describe the treaty reference."
    },
    "Q46": {
        "short": "Includes_Treaties",
        "full": "Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties?",
        "question_text": "46. Does the treaty state that it includes any existing treaties signed by one or more of the contracting parties? (Yes, No) If Yes, list the\ntreaties/agreements that are included and describe the treaty reference."
    },
    "Q47": {
        "short": "International_Organizations",
        "full": "Does the treaty make any references to international organizations?",
        "question_text": "47. Does the treaty make any references to international organizations? (Yes, No) If Yes, list the international organization and describe\nthe nature of the reference."
    },
    "Q48": {
        "short": "Other_Alliances",
        "full": "Does the treaty make any references to other existing alliances?",
        "question_text": "48. Does the treaty make any references to other existing alliances? (Yes, No) If Yes, list the alliance and describe the nature of the\nreference."
    },
    "Q49": {
        "short": "Companion_Agreements",
        "full": "Are there any companion agreements referenced in the treaty?",
        "question_text": "49. Are there any companion agreements referenced in the treaty? (Yes, No) If Yes, describe the agreements."
    },
    "Q50": {
        "short": "Non_Military_Cooperation",
        "full": "Does the treaty include statements regarding non-military cooperation?",
        "question_text": "50. Does the treaty include statements regarding non-military cooperation (e.g., economic, cultural, scientific exchange)? (Yes, No) If\nYes, describe."
    },
    "Q51": {
        "short": "Resolves_Conflicts",
        "full": "Does the treaty resolve other conflicts among the parties to the alliance?",
        "question_text": "51. Does the treaty resolve other conflicts among the parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q52": {
        "short": "Enforces_External_Settlement",
        "full": "Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance?",
        "question_text": "52. Does the treaty propose/enforce settlement of a conflict not involving parties to the alliance? (Yes, No) If Yes, describe."
    },
    "Q53": {
        "short": "Proposes_Agreements",
        "full": "Does the treaty propose other agreements among the contracting parties?",
        "question_text": "53. Does the treaty propose other agreements among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q54": {
        "short": "Unresolved_Conflicts",
        "full": "Does the treaty mention unresolved conflicts among the contracting parties?",
        "question_text": "54. Does the treaty mention unresolved conflicts among the contracting parties? (Yes, No) If Yes, describe."
    },
    "Q55": {
        "short": "Economic_Aid",
        "full": "Does the treaty include provisions for economic aid or other enticements?",
        "question_text": "55. Does the treaty include provisions for economic aid or other enticements (include trade concessions, post war recovery, etc.)? (Yes,\nNo) If Yes, describe these provisions."
    },
    "Q56": {
        "short": "Internal_Politics_Intervention",
        "full": "Does the treaty describe circumstances under which one party may intervene in the internal politics of another party?",
        "question_text": "56. Does the treaty describe circumstances under which one party may intervene in the internal politics of another party or specifically\ncommit the states to non-intervention? (Yes, No) If Yes, describe."
    },
    "Q57": {
        "short": "Additional_Comments",
        "full": "Additional coder comments on this alliance treaty",
        "question_text": "57. Additional coder comments on this alliance treaty:"
    },
    "Q58": {
        "short": "Source_of_Coding",
        "full": "Source of coding information",
        "question_text": "58. Source of coding information (e.g., treaty, treaty and secondary sources, secondary sources only, etc.):"
    },
    "Q59": {
        "short": "Treaty_Citation",
        "full": "Treaty citation",
        "question_text": "59. Treaty citation:"
    },
    "Q60": {
        "short": "Last_Revision_Date",
        "full": "Date of last revision of this coding sheet",
        "question_text": "60. Date of last revision of this coding sheet:"
    },
    "Q61": {
        "short": "Coder",
        "full": "Coder",
        "question_text": "61. Coder:"
    }
}

def parse_single_date(text):
    """
    Return ISO date ('YYYY-MM-DD') if `text` looks like a date.
    Otherwise return None.

    Parsing is fuzzy, so surrounding words are tolerated
    (e.g. 'Signed March 9, 1833.').

    The result never depends on the day the code is run: dateutil normally
    fills fields that are missing from the text with *today's* values, which
    would turn '1833' into 1833-<current month>-<current day> and a bare
    number into a date in the current year. Here a missing month or day
    becomes 01, and a text without any year is not treated as a date at all.

    Parameters
    ----------
    text : str
        Raw answer text. Anything that is not a string yields None.

    Returns
    -------
    str or None
        ISO-formatted date, or None when no usable date (with a year) is found.
    """
    from datetime import datetime  # local import keeps the block self-contained

    if not isinstance(text, str):
        return None

    clean = text.strip().rstrip('.')        # drop trailing period
    if not clean:
        return None

    try:
        # Parse twice with defaults that differ only in the year: if the two
        # results disagree on the year, the year came from the default rather
        # than from the text.
        dt = _dt_parse(clean, fuzzy=True, default=datetime(2000, 1, 1))
        probe = _dt_parse(clean, fuzzy=True, default=datetime(2001, 1, 1))
    except (ValueError, OverflowError):
        return None

    if dt.year != probe.year:
        return None                         # no year in the text → not a date

    return dt.date().isoformat()

def extract_countries(text):
    """
    Collect every country/code pair mentioned in `text`.

    Two layouts are recognised:
      * code first  – '365 Russia 200 United Kingdom.'
      * name first  – 'Russia (365)'

    Each hit is rendered as 'Name(code)'. Duplicates are dropped while the
    order of first appearance is kept (all code-first hits come before the
    name-first hits). The pairs are joined with ';'. An empty string is
    returned when nothing matches.
    """
    # A 3-digit code, then a capitalised name that ends right before the next
    # code, before one of . , ; or a newline, or at the end of the string.
    code_then_name = r'\b(\d{3})\s+([A-Z][A-Za-z.\- ]+?)(?=(?:\s+\d{3}\s)|[.,;\n]|$)'
    # A capitalised name followed by a 3-digit code in parentheses.
    name_then_code = r'\b([A-Z][A-Za-z.\- ]+?)\s*\(\s*(\d{3})\s*\)'

    # Normalise both layouts to (name, code) tuples, code-first hits first.
    hits = [(name, code) for code, name in re.findall(code_then_name, text)]
    hits.extend(re.findall(name_then_code, text))

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_pairs = dict.fromkeys(f"{name.strip()}({code})" for name, code in hits)
    return ";".join(unique_pairs)





def extract_text_from_pdf(pdf_path):
    """
    Read the text of every page of a PDF with pdfplumber.

    Page texts are joined with newlines (a page without extractable text
    contributes an empty string) and every run of two or more newlines is
    reduced to exactly two.

    Returns the cleaned text, or None if anything goes wrong; the error is
    printed rather than raised so that batch runs can continue.
    """
    print(f"Extracting text from {pdf_path}")

    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                # extract_text() may give None (e.g. image-only page)
                page_texts.append(page.extract_text() or "")
        raw_text = "\n".join(page_texts)

        # Squeeze blank-line runs down to a single blank line.
        cleaned_text = re.sub(r"\n{2,}", "\n\n", raw_text)

        # The reported count refers to the text before squeezing.
        print(f"✓ Extracted {len(raw_text):,} characters from {Path(pdf_path).name}")
        return cleaned_text

    except Exception as e:
        print(f"Error extracting text from {pdf_path}: {str(e)}")
        return None

def categorize_answer(answer_text):
    """
    Assign a coarse category label to an answer string.

    Checks are applied in this order, on the whitespace-stripped text:
      1. a bare yes / no / n/a / zero (optionally with a final period)
         -> "Yes/No+Text"
      2. shorter than 100 characters and contains ';'   -> "Coded_List"
      3. shorter than 100 characters and contains ',' or the word ' and '
         (case-insensitive)                              -> "Multiple"
      4. digits only, optionally followed by a period    -> "Numeric"
      5. anything else                                   -> "Text"
    """
    cleaned = answer_text.strip()
    lowered = cleaned.lower()
    is_short = len(cleaned) < 100

    bare_answers = {'yes', 'no', 'yes.', 'no.', 'n/a', 'n/a.', 'zero', 'zero.'}
    if lowered in bare_answers:
        return "Yes/No+Text"

    if is_short and ';' in cleaned:
        return "Coded_List"

    if is_short and (',' in cleaned or ' and ' in lowered):
        return "Multiple"

    if re.match(r'^\d+\.?$', cleaned):
        return "Numeric"

    return "Text"

import re

# Known cases where one answer runs on into the next question.  An answer is
# sliced up to the next *located* question, so this only happens when the next
# question's heading was not matched.  Each entry is:
#   (source question id, swallowed question id,
#    text that starts the swallowed question's heading,
#    text that ends that heading,
#    chars to skip from the heading start when the end text is absent)
_BLEED_FIXES = (
    ("Q18", "Q19", "19. Are any of the obligations contingent",
     "(List all that apply)", 200),
    ("Q49", "Q50", "50. Does the treaty include statements",
     "(Yes, No)", 150),
)


def _yes_no_flag(answer_text):
    """Return "Yes"/"No" if the answer starts with that prefix (any case), else "N/A"."""
    low = answer_text.lower()
    if low.startswith("yes"):
        return "Yes"
    if low.startswith("no"):
        return "No"
    return "N/A"


def _clean_answer(raw_answer):
    """Strip leading markers and SECTION headers from one sliced answer."""
    answer_text = raw_answer.strip()
    answer_text = re.sub(r'^#\s*', '', answer_text)  # drop leading "# "

    # NOTE(review): the input has already been whitespace-flattened, so it
    # contains no newlines and `[^\n]*` in the next two patterns consumes the
    # rest of the answer.  Behaviour is kept as-is; confirm this is intended.
    answer_text = re.sub(
        r'^See\s+#?\s*\d+\s+below[^\n]*\n?',
        '',
        answer_text,
        flags=re.I
    )
    answer_text = re.sub(
        r'^SECTION\s+[IVXLCDM]+\s*:[^\n]*\n?',
        '',
        answer_text,
        flags=re.I | re.M
    )
    # remove SECTION headers anywhere (e.g. inline bleed)
    answer_text = re.sub(
        r'\bSECTION\s+[IVXLCDM]+\s*:\s*[A-Za-z ]+\b',
        '',
        answer_text,
        flags=re.I
    )
    return re.sub(r'\s+', ' ', answer_text)


def _answer_fields(q_id, answer_text):
    """Build the five answer-derived columns for one question."""
    # Yes/No flag and country decoding look at the text as extracted,
    # i.e. before any date normalisation below.
    yes_no = _yes_no_flag(answer_text)
    decoded = extract_countries(answer_text)

    if q_id in ("Q3", "Q14"):
        # Date questions: store parse_single_date's result when it is truthy.
        iso = parse_single_date(answer_text)
        if iso:
            answer_text = iso
        category = "Date"
    elif decoded:
        category = "coded_text"
    else:
        category = categorize_answer(answer_text)

    return {
        "Answer_YesNo":      yes_no,
        "Answer_Text":       answer_text,
        "Answer_Category":   category,
        "Decoded_Countries": decoded,
        "Text_Length":       len(answer_text),
    }


def _base_row(alliance_id, alliance_name, q_id, q_info):
    """Build the identifying columns shared by found and missing rows."""
    return {
        "Alliance_ID":    alliance_id,
        "Alliance_Name":  alliance_name,
        "Question_ID":    q_id,
        "Question_Short": q_info["short"],
        "Question_Full":  q_info["full"],
    }


def _repair_bleeds(rows):
    """Split answers that swallowed the following question (see _BLEED_FIXES)."""
    by_id = {r["Question_ID"]: r for r in rows}
    for src_id, dst_id, heading_start, heading_end, fallback_skip in _BLEED_FIXES:
        src_row = by_id.get(src_id)
        dst_row = by_id.get(dst_id)
        if src_row is None or dst_row is None:
            continue

        full_text = src_row["Answer_Text"]
        split_pos = full_text.find(heading_start)
        if split_pos == -1:
            continue

        # Source keeps only what precedes the swallowed heading; its derived
        # columns are recomputed so they no longer reflect the bled text.
        src_row.update(_answer_fields(src_id, full_text[:split_pos].strip()))

        # Never clobber an answer the swallowed question already has.
        if dst_row["Answer_Text"]:
            continue

        tail = full_text[split_pos:]
        marker_pos = tail.find(heading_end)
        if marker_pos != -1:
            dst_answer = tail[marker_pos + len(heading_end):].strip()
        else:
            # Fallback: skip a fixed chunk assumed to cover the question text.
            dst_answer = tail[fallback_skip:].strip()
        dst_row.update(_answer_fields(dst_id, dst_answer))


def fast_extract_answers(text, alliance_id=None, alliance_name=None):
    """
    Fast extraction using whitespace-flattened search so that slight
    differences in line breaks don't break question matching.

    Each question in QUESTIONS is located by its (flattened) question text;
    when it occurs more than once the last occurrence is used.  The answer is
    the text between the end of that heading and the start of the next located
    heading (for the last one: up to "SECTION", "Source:",
    "Date of last revision:" or the end of the text).

    Parameters
    ----------
    text : str
        Raw text of one codesheet.
    alliance_id : str, optional
        If None, read from the line following "1. ATOP ID:" in `text`
        ("Unknown" when that label is absent).
    alliance_name : str, optional
        If None, defaults to "Alliance_<alliance_id>".

    Returns
    -------
    list of dict
        One row per entry in QUESTIONS, sorted by question number, with keys
        Alliance_ID, Alliance_Name, Question_ID, Question_Short, Question_Full,
        Answer_YesNo, Answer_Text, Answer_Category, Decoded_Countries,
        Text_Length.  Questions that could not be located get
        Answer_YesNo "Not Found" and Answer_Category "Missing".

    Notes
    -----
    The Q18->Q19 and Q49->Q50 bleed repair runs once, after the placeholder
    rows for missing questions exist.  A bleed only occurs when the swallowed
    question's heading was not located, so running the repair earlier (before
    its row exists) would silently drop the recovered answer.
    """
    # ---------------- Alliance ID / Name ----------------
    if alliance_id is None:
        label = "1. ATOP ID:"
        id_pos = text.find(label)
        if id_pos != -1:
            start = id_pos + len(label)
            end = text.find("\n", start)
            if end == -1:
                # Label is on the final line: take everything to the end
                # rather than slicing with -1 and losing the last character.
                end = len(text)
            alliance_id = text[start:end].strip().replace("#", "").strip()
        else:
            alliance_id = "Unknown"
    if alliance_name is None:
        alliance_name = f"Alliance_{alliance_id}"

    # ---------------- Flatten for robust searching ----------------
    text_search = re.sub(r"\s+", " ", text).strip()

    # ---------------- Locate every question position ----------------
    question_positions = []
    for q_id, q_info in QUESTIONS.items():
        q_flat = re.sub(r"\s+", " ", q_info["question_text"]).strip()
        matches = [m.start() for m in re.finditer(re.escape(q_flat), text_search)]
        if matches:
            # last occurrence wins
            question_positions.append((q_id, matches[-1], len(q_flat), q_info))
    question_positions.sort(key=lambda x: x[1])

    # ---------------- Extract each answer block ----------------
    rows = []
    for i, (q_id, pos, plen, q_info) in enumerate(question_positions):
        answer_start = pos + plen
        if i + 1 < len(question_positions):
            answer_end = question_positions[i + 1][1]
        else:
            # last question -> up to the earliest common marker, or the end
            answer_end = len(text_search)
            for marker in ["SECTION", "Source:", "Date of last revision:"]:
                mpos = text_search.find(marker, answer_start)
                if 0 <= mpos < answer_end:
                    answer_end = mpos

        answer_text = _clean_answer(text_search[answer_start:answer_end])

        row = _base_row(alliance_id, alliance_name, q_id, q_info)
        row.update(_answer_fields(q_id, answer_text))
        rows.append(row)

    # ---------------- Fill in any missing questions ----------------
    found = {r["Question_ID"] for r in rows}
    for q_id, q_info in QUESTIONS.items():
        if q_id not in found:
            placeholder = _base_row(alliance_id, alliance_name, q_id, q_info)
            placeholder.update({
                "Answer_YesNo":      "Not Found",
                "Answer_Text":       "",
                "Answer_Category":   "Missing",
                "Decoded_Countries": "",
                "Text_Length":       0,
            })
            rows.append(placeholder)

    # ---------------- Repair answers that swallowed the next question ----------------
    _repair_bleeds(rows)

    # ---------------- Sort by question number & return ----------------
    rows.sort(key=lambda r: int(r["Question_ID"][1:]))
    return rows


def process_single_pdf(pdf_path, alliance_id=None, alliance_name=None, save_text=False):
    """
    Turn one ATOP codesheet PDF into a DataFrame of extracted answers.

    Steps: extract the text, optionally dump it next to the PDF as ``.txt``,
    derive the alliance id from the file name when none is supplied (digits
    following "ATOP" in the stem, else "Unknown"), then run
    ``fast_extract_answers`` and wrap its rows in a DataFrame.

    Returns None when text extraction failed.
    """
    text = extract_text_from_pdf(pdf_path)
    if text is None:
        print(f"Failed to extract text from {pdf_path}")
        return None

    if save_text:
        # Written alongside the PDF, same stem, .txt suffix.
        txt_path = Path(pdf_path).with_suffix('.txt')
        txt_path.write_text(text, encoding='utf-8')
        print(f"Text saved to: {txt_path}")

    if alliance_id is None:
        id_match = re.search(r'ATOP(\d+)', Path(pdf_path).stem)
        if id_match:
            alliance_id = id_match.group(1)
        else:
            alliance_id = "Unknown"

    answer_rows = fast_extract_answers(text, alliance_id, alliance_name)
    return pd.DataFrame(answer_rows)

def _report_failed_files(failed_files):
    """Print the names of the files that could not be processed, if any."""
    if failed_files:
        print(f"\n⚠ Failed to process {len(failed_files)} files:")
        for f in failed_files:
            print(f"  - {Path(f).name}")


def process_multiple_pdfs(pdf_paths, output_csv="atop_extracted_data.csv", save_text_files=False):
    """
    Process several ATOP PDFs and combine the per-file results.

    Parameters
    ----------
    pdf_paths : iterable of str or Path
        Files to process.  Any iterable is accepted (it is materialised once
        so progress can be shown as "i/total").
    output_csv : str
        Where the combined table is written when at least one file succeeds.
    save_text_files : bool
        Forwarded to ``process_single_pdf`` as ``save_text``.

    Returns
    -------
    DataFrame or None
        Concatenation of every successful file's rows (also written to
        `output_csv`), or None when no file produced data.

    A file counts as failed when ``process_single_pdf`` returns None or
    raises; failures are collected and listed at the end whether or not any
    other file succeeded.
    """
    pdf_paths = list(pdf_paths)
    total = len(pdf_paths)

    all_dfs = []
    failed_files = []

    for index, pdf_path in enumerate(pdf_paths, start=1):
        print(f"\nProcessing file {index}/{total}: {Path(pdf_path).name}")

        try:
            df = process_single_pdf(pdf_path, save_text=save_text_files)
        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            print(f"Error processing {pdf_path}: {str(e)}")
            failed_files.append(pdf_path)
            continue

        if df is None:
            failed_files.append(pdf_path)
        else:
            all_dfs.append(df)

    if not all_dfs:
        print("No data extracted from any files.")
        _report_failed_files(failed_files)
        return None

    combined_df = pd.concat(all_dfs, ignore_index=True)
    combined_df.to_csv(output_csv, index=False)
    print(f"\n✓ Data from {len(all_dfs)} files saved to {output_csv}")

    _report_failed_files(failed_files)
    return combined_df

# Example usage functions
def demo_single_pdf():
    """
    Demo: run the pipeline on the single PDF named by GLOBAL_SINGLE_PDF.

    The file is processed as alliance 1005 ("Quadruple Alliance") with the
    extracted text also saved.  On success a 10-row preview is printed, the
    full table is written to ``atop_1005_extracted.csv`` and the DataFrame is
    returned; on failure the function returns None.
    """
    df = process_single_pdf(
        GLOBAL_SINGLE_PDF,
        alliance_id="1005",
        alliance_name="Quadruple Alliance",
        save_text=True,  # also dump the extracted text
    )

    if df is None:
        return None

    # Quick look at the first few answers
    print("\nSample extracted data:")
    preview_columns = ['Question_ID', 'Question_Short', 'Answer_Text']
    print(df[preview_columns].head(10))

    # Persist the full table
    df.to_csv("atop_1005_extracted.csv", index=False)
    print("\nData saved to atop_1005_extracted.csv")
    return df

import glob
from pathlib import Path


def demo_v5_processing():
    """
    Process only ATOP PDFs whose filename stems end with 'v5' (case-insensitive).

    All ``ATOP*.pdf`` files in the version 5.1 codesheet folder are globbed,
    the v5 variants are kept and processed in sorted order (glob itself
    returns files in arbitrary order), and the combined result is written to
    ``all_atop_v5_data.csv``.

    Returns the combined DataFrame, or None when nothing was extracted.
    """
    # 1. Glob for everything matching ATOP*.pdf
    pdf_pattern = "atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets/ATOP*.pdf"

    # 2. Keep only those whose stem ends with 'v5'; sort for a stable run order
    v5_pdfs = sorted(
        f for f in glob.glob(pdf_pattern)
        if Path(f).stem.lower().endswith("v5")
    )

    print(f"Found {len(v5_pdfs)} ATOP v5 PDF files to process")

    # 3. Process just the v5 files
    combined_df = process_multiple_pdfs(
        v5_pdfs,
        output_csv="all_atop_v5_data.csv",
        save_text_files=False
    )

    if combined_df is not None:
        print(f"\nTotal records extracted from v5 files: {len(combined_df)}")
        print(f"Unique alliances in v5 set: {combined_df['Alliance_ID'].nunique()}")

    return combined_df



# Main execution
if __name__ == "__main__":
    # Entry point: exactly one demo is active at a time.  Swap the commented
    # line below to run the single-PDF demo instead of the v5 batch.
    # df = demo_single_pdf()
    # The call's return value is not bound to a name here, so nothing from the
    # batch run is kept in this scope; results are available via the CSV that
    # demo_v5_processing asks process_multiple_pdfs to write.
    demo_v5_processing()
    print("\nProcessing complete for version 5 variant. DataFrame ready for analysis.")

Installing python-dateutil...
Found 49 ATOP v5 PDF files to process

Processing file 1/49: ATOP1323v5.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1323v5.pdf
✓ Extracted 10,799 characters from ATOP1323v5.pdf

Processing file 2/49: ATOP1358v5.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1358v5.pdf
✓ Extracted 13,593 characters from ATOP1358v5.pdf

Processing file 3/49: ATOP1362v5.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP1362v5.pdf
✓ Extracted 17,183 characters from ATOP1362v5.pdf

Processing file 4/49: ATOP3222v5.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP3222v5.pdf
✓ Extracted 12,481 characters from ATOP3222v5.pdf

Processing file 5/49: ATOP3333v5.pdf
Extracting text from atop_version_5.1_codesheets/ATOP Version 5.1 Codesheets\ATOP3333v5.pdf
✓ Extracted 11,763 characters from ATOP3333v5.pdf

Processing file 6/49: ATOP3367v

In [46]:
import pandas as pd

# Merge the three per-variant extraction CSVs into one table, tagging every
# row with the variant it came from, then export the result as CSV and JSON.

# Input CSVs (relative to the notebook), keyed by the version tag to attach
paths = dict(
    base='all_atop_data.csv',
    v4='all_atop_v4_data.csv',
    v5='all_atop_v5_data.csv',
)

# Output targets
out_csv  = 'ATOP_treaty_text_20250703.csv'
out_json = 'ATOP_treaty_text_20250703.json'

# Read each file and add a `version` column holding its key from `paths`
dfs = []
for version, path in paths.items():
    df = pd.read_csv(path).assign(version=version)
    dfs.append(df)

# Stack everything into a single frame with a fresh 0..n-1 index
merged = pd.concat(dfs, ignore_index=True)

# CSV export (no index column)
merged.to_csv(out_csv, index=False)

# JSON export: a list of records, non-ASCII kept as-is, pretty-printed
merged.to_json(out_json, orient='records', force_ascii=False, indent=2)

print(f"Saved merged data to:\n  • {out_csv}\n  • {out_json}")


Saved merged data to:
  • ATOP_treaty_text_20250703.csv
  • ATOP_treaty_text_20250703.json
