# In [None]:
import os, re, unicodedata, itertools, collections
from pathlib import Path
import pandas as pd

# spaCy for NER
# Import outside the try block: if spaCy itself is not installed, the resulting
# ImportError should surface as-is instead of being reported as a missing model.
import spacy

try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the named model package cannot be found.
    # Fall back to nlp = None so the cells that do not need NER can still run;
    # any code that needs the pipeline must check for None first.
    print("spaCy model 'en_core_web_sm' not found. Install with:\n  python -m spacy download en_core_web_sm")
    nlp = None

# Paths (everything is resolved relative to the current working directory)
PROJECT_ROOT = Path(".").resolve()
DATA_PATH = PROJECT_ROOT.joinpath("key_events_20th_century_text.txt")   # adjust if needed
OUT_DIR = PROJECT_ROOT.joinpath("outputs", "Exercise_1_6")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for banner in (
    f"Project root : {PROJECT_ROOT}",
    f"Data path    : {DATA_PATH} (exists: {DATA_PATH.exists()})",
    f"Output dir   : {OUT_DIR}",
):
    print(banner)

# Stop early with an actionable message when the corpus file is missing.
if not DATA_PATH.exists():
    raise FileNotFoundError("Could not find 'key_events_20th_century_text.txt' at project root. Update DATA_PATH if needed.")

# Bytes that are not valid UTF-8 are dropped (errors="ignore") rather than raising.
with DATA_PATH.open(mode="r", encoding="utf-8", errors="ignore") as source_file:
    raw_text = source_file.read()
print(f"Loaded {len(raw_text):,} characters.\n")
# Preview of the first 800 characters.
print(raw_text[:800] + "\n...")

def non_ascii_report(text, top_n=25):
    """Return the most frequent non-ASCII characters in *text*.

    A character counts as non-ASCII when its code point is above 127.
    The result is a list of ``(character, count)`` tuples, most frequent
    first, holding at most *top_n* entries.
    """
    tally = collections.Counter(ch for ch in text if ord(ch) > 127)
    return tally.most_common(top_n)

def basic_clean(text: str) -> str:
    """Normalize unicode punctuation and whitespace in *text*.

    Steps, in order:

    1. Apply NFKC unicode normalization.
    2. Replace curly quotes, en/em dashes, the non-breaking space and the
       ellipsis character with ASCII equivalents.
    3. Collapse runs of spaces/tabs to a single space.
    4. Remove whitespace at the end of each line.
    5. Collapse three or more consecutive newlines to exactly two, so that
       paragraphs stay separated by one blank line.
    6. Strip leading and trailing whitespace from the whole text.

    Returns the cleaned string.
    """
    # Unicode normalization
    x = unicodedata.normalize("NFKC", text)

    # Replace common unicode punctuation with ASCII equivalents
    # (single pass over the text via str.translate).
    replacements = {
        "\u2018": "'", "\u2019": "'", "\u201C": '"', "\u201D": '"',
        "\u2013": "-", "\u2014": "-", "\u00A0": " ", "\u2026": "...",
    }
    x = x.translate(str.maketrans(replacements))

    # Collapse whitespace
    x = re.sub(r"[ \t]+", " ", x)
    # Strip trailing whitespace per line. The character class deliberately
    # excludes "\n": matching it here (as a plain r"\s+\n" would) swallows
    # blank lines and leaves nothing for the paragraph collapse below.
    x = re.sub(r"[^\S\n]+\n", "\n", x)
    x = re.sub(r"\n{3,}", "\n\n", x).strip()
    return x

# Show the non-ASCII inventory before and after cleaning for comparison.
print("Top non-ASCII chars BEFORE cleaning:", non_ascii_report(raw_text))
clean_text = basic_clean(raw_text)
print("Top non-ASCII chars AFTER  cleaning:", non_ascii_report(clean_text))

# Persist the cleaned text in the output directory of this exercise.
CLEAN_PATH = OUT_DIR.joinpath("key_events_20th_century_text_clean.txt")
with CLEAN_PATH.open("w", encoding="utf-8") as sink:
    sink.write(clean_text)
print(f"\nSaved cleaned text to: {CLEAN_PATH}")

# Optional user-supplied country list: the first candidate file that exists
# and can be read wins. Each non-blank line is one country name.
user_list = None
for fn in ("countries_list.txt", "countries.txt"):
    list_file = Path(fn)
    if not list_file.exists():
        continue
    try:
        raw_lines = list_file.read_text(encoding="utf-8", errors="ignore").splitlines()
        user_list = [ln.strip() for ln in raw_lines if ln.strip()]
        break
    except Exception as e:
        # Report the problem and fall through to the next candidate file.
        print(f"Could not load {fn}: {e}")

if not user_list:
    # No usable file (missing, unreadable or empty): use the built-in names.
    _BUILTIN_COUNTRY_NAMES = (
        # Major & frequently-mentioned (edit as needed)
        "United States", "United Kingdom", "France", "Germany", "Italy", "Spain", "Portugal",
        "Russia", "Soviet Union", "China", "Japan", "India", "Pakistan", "Canada", "Australia",
        "Poland", "Czechoslovakia", "Yugoslavia", "Austria", "Hungary", "Netherlands", "Belgium",
        "Switzerland", "Sweden", "Norway", "Denmark", "Finland", "Ireland", "Greece", "Turkey",
        "Israel", "Palestine", "Egypt", "Iran", "Iraq", "Syria", "Lebanon", "Jordan", "Saudi Arabia",
        "Korea", "North Korea", "South Korea", "Vietnam", "North Vietnam", "South Vietnam", "Taiwan",
        "Czech Republic", "Slovakia", "East Germany", "West Germany",
        "Brazil", "Argentina", "Mexico", "Cuba", "Chile", "Peru", "Colombia",
        "South Africa", "Ethiopia", "Kenya", "Nigeria", "Ghana", "Algeria", "Morocco",
        # Historical empires/states that may appear
        "Ottoman Empire", "Austro-Hungarian Empire", "Russian Empire", "Prussia",
    )
    CANON_COUNTRIES = sorted(set(_BUILTIN_COUNTRY_NAMES))
    print(f"Using built-in canonical list: {len(CANON_COUNTRIES)} countries.")
else:
    # De-duplicate and sort the names read from the user's file.
    CANON_COUNTRIES = sorted(set(user_list))
    print(f"Loaded {len(CANON_COUNTRIES)} countries from your file.")

# Synonyms/variants → canonical
# Keys are lowercase because canonicalize_country lowercases its lookup key.
COUNTRY_SYNONYMS = {
    # US/UK
    "u.s.": "United States",
    "u.s": "United States",
    "u.s.a.": "United States",
    "usa": "United States",
    "us": "United States",
    "america": "United States",
    "united states of america": "United States",
    "u.k.": "United Kingdom",
    "uk": "United Kingdom",
    "great britain": "United Kingdom",
    "britain": "United Kingdom",
    "england": "United Kingdom",
    # Russia/USSR
    "ussr": "Soviet Union",
    "soviet russia": "Soviet Union",
    "russian empire": "Russian Empire",
    # Germany
    "german empire": "Germany",
    "prussian empire": "Prussia",
    "west germany": "West Germany",
    "east germany": "East Germany",
    # China/Taiwan
    "prc": "China",
    "people's republic of china": "China",
    "peoples republic of china": "China",
    "republic of china": "Taiwan",
    "roc": "Taiwan",
    # Korea/Vietnam
    "korean peninsula": "Korea",
    "dprk": "North Korea",
    "rok": "South Korea",
    "democratic republic of vietnam": "North Vietnam",
    "republic of vietnam": "South Vietnam",
    # Other common variants
    "czechia": "Czech Republic",
    "holland": "Netherlands",
    "ivory coast": "Ivory Coast",
}


def canonicalize_country(name: str):
    """Translate a raw entity string into a canonical country name.

    The lookup key is the lowercased name with every character other than
    word characters, whitespace, ``-``, ``&``, ``.`` and ``'`` replaced by a
    space, then trimmed and with whitespace runs collapsed. If that key is
    in ``COUNTRY_SYNONYMS`` the mapped canonical name is returned.

    Otherwise the original name is returned with only its whitespace
    trimmed and collapsed; its letter case is left untouched.

    Returns ``None`` for a falsy *name* (``None`` or ``""``).
    """
    if not name:
        return None
    lowered = name.lower()
    key = re.sub(r"\s+", " ", re.sub(r"[^\w\s\-&\.']", " ", lowered).strip())
    try:
        return COUNTRY_SYNONYMS[key]
    except KeyError:
        # Unknown variant: hand back the name itself, whitespace-normalized.
        return re.sub(r"\s+", " ", name.strip())

# Entity extraction needs a loaded pipeline; fail with the install hint otherwise.
if nlp is None:
    raise RuntimeError("spaCy language model not loaded. Run: python -m spacy download en_core_web_sm")

# Run the pipeline over the cleaned text and show a small sample of entities.
doc = nlp(clean_text)
print(f"Document length (tokens): {len(doc)}")
sample_entities = [(span.text, span.label_) for span in doc.ents[:15]]
print("Sample entities:", sample_entities)

# Keep, per sentence, the text of every entity labelled GPE or LOC.
PLACE_LABELS = ("GPE", "LOC")
sentence_entities = []   # list of tuples: (sentence_text, [entity_texts])
for sent in doc.sents:
    place_names = [ent.text for ent in sent.ents if ent.label_ in PLACE_LABELS]
    if not place_names:
        continue  # sentences without place entities are not recorded
    sentence_entities.append((sent.text, place_names))

print(f"Sentences with (GPE/LOC) entities: {len(sentence_entities)}")
if sentence_entities:
    print("Example:", sentence_entities[0])
else:
    print("Example:", "No sentences with GPE/LOC found.")

# Reduce each sentence's place entities to the canonical countries it mentions.
#
# NER spans can carry a leading English article ("the United States",
# "the Soviet Union"); such spans match neither the synonym table nor the
# canonical list verbatim and would be silently dropped.
# NOTE(review): how often this happens depends on the model and corpus —
# check the "Sample entities" printout to confirm.
LEADING_ARTICLE_RE = re.compile(r"^\s*the\s+", re.IGNORECASE)

# CANON_COUNTRIES is a sorted list; a set gives constant-time membership tests.
canon_lookup = set(CANON_COUNTRIES)

filtered_sentences = []  # (sentence_text, [canonical_country_names])
for sent_text, ents in sentence_entities:
    found = set()  # unique within sentence
    for e in ents:
        without_article = LEADING_ARTICLE_RE.sub("", e)
        # Try the entity verbatim first so every name that matched before
        # still matches; only then retry without the leading article.
        for candidate in (e, without_article):
            c = canonicalize_country(candidate)
            if c in canon_lookup:
                found.add(c)
                break
    if found:
        filtered_sentences.append((sent_text, sorted(found)))

print(f"Sentences with recognized countries (canonical): {len(filtered_sentences)}")
if filtered_sentences:
    print("Example:", filtered_sentences[0])

# %% [markdown]
# ## 6) Build Relationships (country–country co-occurrence per sentence)

# %%
# One (country_a, country_b) tuple per unordered pair of countries that share
# a sentence; names inside each tuple are in sorted order.
pairs = [
    pair
    for _, countries in filtered_sentences
    if len(countries) >= 2
    for pair in itertools.combinations(sorted(countries), 2)
]

print(f"Total raw pairs: {len(pairs)}")

# %%
# Count how often each country pair occurs; the count becomes the "weight" column.
if not pairs:
    # No co-occurrences at all: empty frame with the same three columns.
    df_rel = pd.DataFrame(columns=["country1","country2","weight"])
else:
    df_pairs = pd.DataFrame(pairs, columns=["country1","country2"])
    pair_counts = df_pairs.value_counts()
    df_rel = pair_counts.reset_index(name="weight")

df_rel.head()

# %% [markdown]
# ## 7) Save & Export Outputs

# %%
# Write the weighted country pairs to CSV (no index column) in the output directory.
REL_CSV = OUT_DIR.joinpath("country_relationships.csv")
df_rel.to_csv(REL_CSV, encoding="utf-8", index=False)
print(f"Saved relationships to: {REL_CSV} (rows: {len(df_rel)})")

# Clean text already saved earlier at CLEAN_PATH

# %% [markdown]
# ## 8) Sanity Checks & Notes
#
# - Inspect `country_relationships.csv` for reasonable pairs (e.g., “United States”–“United Kingdom”, “Germany”–“France”, etc.).  
# - If you see unexpected names (e.g., cities being treated as countries), extend the **synonyms** or tighten the **canonical list**.  
# - If Exercise 1.5 produced a final countries list, place it as `countries_list.txt` in the project root (one per line) and rerun this notebook to use it.
#
# **Next (Exercise 1.7):** Use `country_relationships.csv` to create the network visualization (nodes=countries, edges weighted by co-occurrence).
