In [None]:
# Net Migration (SM.POP.NETM.csv) Cleaning
# Output schema: Country | Year | NetMigration

import re
import pandas as pd
from pathlib import Path

# -------- Paths --------
# Project layout is resolved relative to the working directory, which is
# expected to be the notebooks folder directly under the project root.
cwd = Path.cwd()
BASE_DIR = cwd.parent
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Create both data folders up front; a no-op when they already exist.
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROC_DIR.mkdir(exist_ok=True, parents=True)

# -------- Country standardization --------
from utils_country import standardize_country_column, report_unmapped

# -------- Helpers --------
def is_year_col(col) -> bool:
    """Return True when a column header denotes a calendar year in 1900-2100.

    Accepted forms:
      - 1960 (int), 1960.0 (float with no fractional part)
      - "1960" (str), "1960.0" (str), surrounding whitespace ignored

    Non-integral floats (e.g. 1960.5), NaN and infinities are rejected
    rather than being truncated to a year.
    """
    if isinstance(col, float):
        # is_integer() is False for NaN/inf, so those fall out here too.
        return col.is_integer() and 1900 <= col <= 2100
    if isinstance(col, int):
        return 1900 <= col <= 2100
    s = str(col).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return 1900 <= int(float(s)) <= 2100
    return False

def normalize_year_header(c):
    """Normalize a year-like header to '####' (e.g., 1960.0 -> '1960').

    Headers that are not year-like are returned as their stripped string
    form. Floats without an integral value (1960.5, NaN, inf) are left as
    their plain string form instead of being truncated or raising, because
    this function is applied to every column header of a frame.
    """
    if isinstance(c, float):
        if c.is_integer():
            return str(int(c))
        return str(c).strip()
    if isinstance(c, int):
        return str(int(c))
    s = str(c).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return str(int(float(s)))
    return s

def clean_country_name(name: str) -> str:
    """Strip surrounding whitespace from a country name; missing values pass through unchanged."""
    return name if pd.isna(name) else str(name).strip()

def drop_non_countries(df: pd.DataFrame, country_col: str = "Country") -> pd.DataFrame:
    """Remove aggregate/region rows (e.g., World, income groups).

    A row is dropped when its country name contains any aggregate keyword
    (case-insensitive substring match). Keywords such as "africa" or
    "america" also occur inside genuine country names, so a short list of
    protected names is exempt from the keyword filter.

    Args:
        df: Frame holding a country-name column.
        country_col: Name of that column.

    Returns:
        ``df`` itself when ``country_col`` is absent, otherwise a filtered copy.
    """
    if country_col not in df.columns:
        return df
    drop_keywords = [
        "Early-demographic dividend", "IBRD only", "IDA & IBRD total", "IDA blend", "IDA only", "IDA total", "Late-demographic dividend", "OECD members", "Post-demographic dividend", "Pre-demographic dividend",
        "income", "world", "europe", "asia", "africa", "america",
        "caribbean", "euro area", "sub-saharan", "middle east", "north africa",
        "arab world", "east asia", "south asia", "pacific", "latin america",
        "and the caribbean", "heavily indebted", "least developed", "small states",
        "fragile and conflict", "upper middle", "lower middle", "high income", "low income"
    ]
    # Real countries/territories whose names contain a region keyword.
    # Compared case-insensitively against the full, stripped name.
    protected_names = {
        "south africa",
        "central african republic",
        "american samoa",
        "united states of america",
        "caribbean netherlands",
    }
    patt = re.compile("|".join(re.escape(k) for k in drop_keywords), flags=re.I)
    names = df[country_col].fillna("").astype(str)
    is_aggregate = pd.Series(
        [
            patt.search(n) is not None and n.strip().casefold() not in protected_names
            for n in names
        ],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~is_aggregate].copy()

def read_any_csv_first(path: Path) -> pd.DataFrame:
    """Read a World Bank-style CSV, tolerating metadata preambles and odd encodings.

    Attempts, in order, each over utf-8 -> latin1 -> iso-8859-1 -> cp1252:
      1. plain read with the default (C) parser;
      2. default parser skipping the 4-line World Bank metadata preamble;
      3. python parser starting at the auto-detected header row and skipping
         malformed lines, first with ',' and then with ';' as separator.

    The first attempt that parses wins.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame.

    Raises:
        ValueError: if none of the attempts can parse the file.
    """
    encodings = ["utf-8", "latin1", "iso-8859-1", "cp1252"]

    # 1) Plain read.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, low_memory=False)
        except (pd.errors.ParserError, UnicodeDecodeError):
            continue

    # 2) World Bank exports start with 4 metadata lines before the header.
    #    This runs before the lossy python-engine fallbacks so a well-formed
    #    export is never read with rows silently skipped.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, skiprows=4, low_memory=False)
        except Exception:
            continue

    # 3) Tolerant fallback. Locate the header row in the raw text; leading
    #    BOM, whitespace and quotes are ignored so '"Country Name",...' matches.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        header_idx = next(
            (i for i, line in enumerate(f)
             if line.lstrip("\ufeff \t\"'").lower().startswith("country")),
            0,
        )

    # `low_memory` is a C-parser option and is rejected by engine="python",
    # so it must not be passed here. `skiprows` counts raw file lines, which
    # is what `header_idx` is.
    last_error = None
    for sep in (",", ";"):
        for enc in encodings:
            try:
                return pd.read_csv(path, encoding=enc, sep=sep, skiprows=header_idx,
                                   engine="python", on_bad_lines="skip")
            except Exception as exc:
                last_error = exc
    raise ValueError(f"Could not parse CSV file: {path}") from last_error


def to_three_columns_wide_years(df: pd.DataFrame,
                                country_col_candidates=("Country Name","Country","Territory","Location"),
                                value_col_name="Value") -> pd.DataFrame:
    """Reshape a wide year-per-column table into Country | Year | <value> rows.

    The country column is the first entry of ``country_col_candidates`` found
    in ``df``. Year headers are normalized to four-digit strings, the table is
    melted, rows without a value are dropped, years are limited to 1960-2018,
    aggregate rows are removed and country names are standardized.

    Raises:
        ValueError: if no country column or no year column is found.
    """
    # Pick the country column by candidate priority.
    country_col = None
    for candidate in country_col_candidates:
        if candidate in df.columns:
            country_col = candidate
            break
    if country_col is None:
        raise ValueError("Country column not found. Update 'country_col_candidates' if needed.")

    # Look for year headers on the raw labels first; retry on stringified labels.
    if not any(is_year_col(h) for h in df.columns):
        df = df.rename(columns={h: str(h) for h in df.columns})
        if not any(is_year_col(h) for h in df.columns):
            raise ValueError("No year columns detected (e.g., 1960, 1961, ...). Is this the correct file?")

    # After normalization every year header is a plain '####' string.
    df = df.rename(columns={h: normalize_year_header(h) for h in df.columns})
    four_digits = re.compile(r"\d{4}")
    year_cols = [h for h in df.columns if four_digits.fullmatch(str(h))]

    # Wide -> long.
    tidy = df.melt(id_vars=[country_col], value_vars=year_cols,
                   var_name="Year", value_name=value_col_name)
    tidy = tidy.rename(columns={country_col: "Country"})
    tidy["Country"] = tidy["Country"].map(clean_country_name)

    # Integer years, no missing values, project window only.
    tidy["Year"] = tidy["Year"].astype(int)
    tidy = tidy.dropna(subset=[value_col_name])
    in_window = (tidy["Year"] >= 1960) & (tidy["Year"] <= 2018)
    tidy = tidy[in_window]

    # Drop aggregates, then canonicalize the remaining names.
    tidy = drop_non_countries(tidy, "Country")
    tidy = standardize_country_column(tidy, col="Country")

    wanted = ["Country", "Year", value_col_name]
    out = tidy[wanted].sort_values(["Country","Year"]).reset_index(drop=True)
    assert set(out.columns) == {"Country","Year", value_col_name}
    return out

# -------- Build --------
csv_file = RAW_DIR / "SM.POP.NETM.csv"
if not csv_file.exists():
    raise FileNotFoundError(f"File not found: {csv_file}\nPlace 'SM.POP.NETM.csv' under data/raw/")

df_raw = read_any_csv_first(csv_file)

# World Bank exports carry extra helper columns; the reshaper melts only the
# year columns, so they need no manual removal here.
df_netmig = to_three_columns_wide_years(df_raw, value_col_name="NetMigration")

# Replace typographic apostrophes, primes and backticks with a plain ASCII
# apostrophe before saving (Excel-friendly).
df_netmig["Country"] = (
    df_netmig["Country"]
    .astype(str)
    .str.replace(r"[\u2019\u2018\u2032\u00B4`]", "'", regex=True)
)

# -------- Save --------
out_path = PROC_DIR / "net_migration_1960_2018_long.csv"
# UTF-8 with BOM so Excel displays Unicode correctly.
df_netmig.to_csv(out_path, index=False, encoding="utf-8-sig")

# -------- Notify --------
print("✅ Saved:", out_path.as_posix())
print("Rows:", len(df_netmig), " | Columns:", list(df_netmig.columns))

# Optional quick check for unmapped names. This stays best-effort (it must
# never abort the run), but the failure reason is reported instead of hidden.
print("Unmapped (sample):")
try:
    print(report_unmapped(df_netmig, "Country", sample=20))
except Exception as exc:
    print(f"(unmapped-name report unavailable: {exc!r})")

# Rich preview inside a notebook, plain-text preview otherwise.
try:
    from IPython.display import display
    display(df_netmig.head(10))
except Exception:
    print(df_netmig.head(10).to_string(index=False))


In [None]:
# Urban Population (% of total) Cleaning (UN WUP 2018 - File 21)
# Output schema: Country | Year | UrbanPopulationPercentage

import re
import pandas as pd
from pathlib import Path

# ---------- Paths (project layout) ----------
# Project layout is resolved relative to the working directory, which is
# expected to be the notebooks folder directly under the project root.
cwd = Path.cwd()
BASE_DIR = cwd.parent
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Create both data folders up front; a no-op when they already exist.
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROC_DIR.mkdir(exist_ok=True, parents=True)

# ---------- Country standardization ----------
import importlib, utils_country
importlib.reload(utils_country)  # re-execute the module so the notebook does not keep using a previously imported version
from utils_country import standardize_country_column, report_unmapped, canonical_country


# ---------- Helpers ----------
def is_year_col(col) -> bool:
    """Return True when a column header denotes a calendar year in 1900-2100.

    Accepts '1960' (str), '1960.0' (str), 1960 (int), or 1960.0 (float).
    Non-integral floats (e.g. 1960.5), NaN and infinities are rejected
    rather than being truncated to a year.
    """
    if isinstance(col, float):
        # is_integer() is False for NaN/inf, so those fall out here too.
        return col.is_integer() and 1900 <= col <= 2100
    if isinstance(col, int):
        return 1900 <= col <= 2100
    s = str(col).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return 1900 <= int(float(s)) <= 2100
    return False

def normalize_year_header(c):
    """Return the canonical 4-digit year string for a year-like header (e.g., 1950.0 -> '1950').

    Headers that are not year-like are returned as their stripped string
    form. Floats without an integral value (1950.5, NaN, inf) are left as
    their plain string form instead of being truncated or raising, because
    this function is applied to every column header of a frame.
    """
    if isinstance(c, float):
        if c.is_integer():
            return str(int(c))
        return str(c).strip()
    if isinstance(c, int):
        return str(int(c))
    s = str(c).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return str(int(float(s)))
    return s

def clean_country_name(name: str) -> str:
    """Strip surrounding whitespace from a country name; missing values pass through unchanged."""
    return name if pd.isna(name) else str(name).strip()

def drop_non_countries(df: pd.DataFrame, country_col: str = "Country") -> pd.DataFrame:
    """Remove aggregate/region rows (e.g., World, income groups, UN regions).

    A row is dropped when its country name contains any aggregate keyword
    (case-insensitive substring match). Keywords such as "africa" or
    "america" also occur inside genuine country names, so a short list of
    protected names is exempt from the keyword filter.

    Args:
        df: Frame holding a country-name column.
        country_col: Name of that column.

    Returns:
        ``df`` itself when ``country_col`` is absent, otherwise a filtered copy.
    """
    if country_col not in df.columns:
        return df
    drop_keywords = [
        "Early-demographic dividend", "IBRD only", "IDA & IBRD total", "IDA blend", "IDA only", "IDA total", "Late-demographic dividend", "OECD members", "Post-demographic dividend", "Pre-demographic dividend",
        "Less developed regions", "Less developed regions, excluding China", "More developed regions", "OCEANIA", "income", "world", "europe", "asia", "africa", "america",
        "caribbean", "euro area", "sub-saharan", "middle east", "north africa",
        "arab world", "east asia", "south asia", "pacific", "latin america",
        "and the caribbean", "heavily indebted", "least developed", "small states",
        "fragile and conflict", "upper middle", "lower middle", "high income", "low income"
    ]
    # Real countries/territories whose names contain a region keyword.
    # Compared case-insensitively against the full, stripped name.
    protected_names = {
        "south africa",
        "central african republic",
        "american samoa",
        "united states of america",
        "caribbean netherlands",
    }
    patt = re.compile("|".join(re.escape(k) for k in drop_keywords), flags=re.I)
    names = df[country_col].fillna("").astype(str)
    is_aggregate = pd.Series(
        [
            patt.search(n) is not None and n.strip().casefold() not in protected_names
            for n in names
        ],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~is_aggregate].copy()

# ---------- Excel reader (handles header offset & numeric year headers) ----------
def read_population_xlsx(path: Path):
    """Find the data table inside an Excel workbook and return it.

    Every sheet is probed with header rows 0..29 (intro text can push the
    real header down). The first probe that yields a known country column
    and at least three year-like columns wins.

    Args:
        path: Location of the workbook.

    Returns:
        A copy of the matching table restricted to the country column and
        the year columns; column labels are stripped strings.

    Raises:
        ValueError: if no sheet/header combination qualifies.
    """
    country_candidates = ("Region, subregion, country or area", "Country Name", "Country", "Territory", "Location")

    # Open the workbook once and close it deterministically; each probe
    # re-parses a sheet from the already opened file instead of re-opening
    # the path.
    with pd.ExcelFile(path) as xls:
        for sheet in xls.sheet_names:
            for skip in range(0, 30):  # header row can be offset by intro text
                try:
                    df = xls.parse(sheet_name=sheet, header=skip)
                except Exception:
                    # e.g. header row beyond the end of the sheet
                    continue
                if df is None or df.empty:
                    continue

                # Numeric headers such as 1950.0 become "1950.0"; is_year_col accepts that form.
                df.columns = [str(c).strip() for c in df.columns]

                country_col = next((cand for cand in country_candidates if cand in df.columns), None)
                if country_col is None:
                    continue

                year_cols = [c for c in df.columns if is_year_col(c)]
                if len(year_cols) >= 3:
                    keep = [country_col] + year_cols
                    return df[keep].copy()

    raise ValueError("No valid sheet found with clear year columns and a country column.")

# ---------- Wide → Long ----------
def to_three_columns_population(df: pd.DataFrame, value_col_name="UrbanPopulationPercentage") -> pd.DataFrame:
    """Reshape a wide year-per-column table into Country | Year | <value> rows.

    Steps: locate the country column, normalize year headers to four-digit
    strings, melt to long form, drop rows without a value, keep 1960-2018,
    remove aggregate rows, standardize country names, then sort.

    Side effects: prints how many rows were renamed by the standardization
    (with up to 10 distinct before/after pairs) and a sanity lookup of one
    UN short-form name.

    Raises:
        ValueError: if none of the known country column names is present.
    """
    # find country column
    country_col = None
    for cand in ("Region, subregion, country or area", "Country Name", "Country", "Territory", "Location"):
        if cand in df.columns:
            country_col = cand
            break
    if country_col is None:
        raise ValueError("Country column not found.")

    # normalize year headers to '####' strings
    df = df.rename(columns={c: normalize_year_header(c) for c in df.columns})
    year_cols = [c for c in df.columns if re.fullmatch(r"\d{4}", str(c))]

    # melt to long
    long_df = df.melt(id_vars=[country_col], value_vars=year_cols,
                      var_name="Year", value_name=value_col_name)

    # standardize schema
    long_df = long_df.rename(columns={country_col: "Country"})
    long_df["Country"] = long_df["Country"].map(clean_country_name)
    long_df["Year"] = long_df["Year"].astype(int)

    # drop NA, filter to 1960–2018, remove aggregates
    long_df = long_df.dropna(subset=[value_col_name])
    long_df = long_df[(long_df["Year"] >= 1960) & (long_df["Year"] <= 2018)]
    long_df = drop_non_countries(long_df, "Country")

    # --- canonicalize country names and print what changed ---
    # NOTE(review): the comparison below assumes standardize_country_column
    # keeps the row index unchanged — confirm against utils_country.
    before = long_df["Country"].astype(str).copy()
    long_df = standardize_country_column(long_df, col="Country")
    changed = before != long_df["Country"]
    print(f"[standardize] renamed rows: {int(changed.sum())}")
    if changed.any():
        demo = (
            pd.DataFrame({"before": before[changed], "after": long_df.loc[changed, "Country"]})
            .drop_duplicates()
            .head(10)
        )
        print(demo.to_string(index=False))

    # quick sanity for UN short form:
    print("[sanity] UN short →", canonical_country("Dem. People's Republic of Korea"))
    # expected: Korea, Democratic People’s Republic of


    # finalize
    out = long_df[["Country", "Year", value_col_name]].sort_values(["Country", "Year"]).reset_index(drop=True)
    assert set(out.columns) == {"Country", "Year", value_col_name}
    return out

# ---------- Build ----------
xlsx_file = RAW_DIR / "POPDBWUPRev.20181F21.xlsx"
if not xlsx_file.exists():
    raise FileNotFoundError(f"File not found: {xlsx_file}\nPlace 'POPDBWUPRev.20181F21.xlsx' under data/raw/")

df_raw = read_population_xlsx(xlsx_file)
df_up = to_three_columns_population(df_raw, value_col_name="UrbanPopulationPercentage")

# Replace typographic apostrophes, primes and backticks with a plain ASCII
# apostrophe (cosmetic, Excel-friendly).
df_up["Country"] = (
    df_up["Country"]
    .astype(str)
    .str.replace(r"[\u2019\u2018\u2032\u00B4`]", "'", regex=True)
)

# ---------- Save ----------
out_path = PROC_DIR / "urban_population_percentage_1960_2018_long.csv"
# UTF-8 with BOM so Excel displays Unicode correctly.
df_up.to_csv(out_path, index=False, encoding="utf-8-sig")

print("✅ Saved:", out_path.as_posix())
print("Rows:", len(df_up), " | Columns:", list(df_up.columns))

# Optional: list unmapped names so the mapping can be extended. This stays
# best-effort (it must never abort the run), but the failure reason is
# reported instead of hidden.
print("Unmapped (sample):")
try:
    print(report_unmapped(df_up, "Country", sample=20))
except Exception as exc:
    print(f"(unmapped-name report unavailable: {exc!r})")

# Rich preview inside a notebook, plain-text preview otherwise.
try:
    from IPython.display import display
    display(df_up.head(10))
except Exception:
    print(df_up.head(10).to_string(index=False))


In [None]:
# Female Labor Force Participation (SL.TLF.CACT.FE.ZS.csv) Cleaning
# Output schema: Country | Year | FemaleLFPRate

import re
import pandas as pd
from pathlib import Path

# ---------------- Paths ----------------
# Project layout is resolved relative to the working directory, which is
# expected to be the notebooks folder directly under the project root.
cwd = Path.cwd()
BASE_DIR = cwd.parent
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Create both data folders up front; a no-op when they already exist.
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROC_DIR.mkdir(exist_ok=True, parents=True)

# ---------------- Country standardization ----------------
from utils_country import standardize_country_column, report_unmapped

# ---------------- Helpers ----------------
def is_year_col(col) -> bool:
    """Return True when a column header denotes a calendar year in 1900-2100.

    Accepted forms:
      - 1960 (int), 1960.0 (float with no fractional part)
      - "1960" (str), "1960.0" (str), surrounding whitespace ignored

    Non-integral floats (e.g. 1960.5), NaN and infinities are rejected
    rather than being truncated to a year.
    """
    if isinstance(col, float):
        # is_integer() is False for NaN/inf, so those fall out here too.
        return col.is_integer() and 1900 <= col <= 2100
    if isinstance(col, int):
        return 1900 <= col <= 2100
    s = str(col).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return 1900 <= int(float(s)) <= 2100
    return False

def normalize_year_header(c):
    """Convert a year-like header to a 4-digit string (e.g., 1960.0 -> '1960').

    Headers that are not year-like are returned as their stripped string
    form. Floats without an integral value (1960.5, NaN, inf) are left as
    their plain string form instead of being truncated or raising, because
    this function is applied to every column header of a frame.
    """
    if isinstance(c, float):
        if c.is_integer():
            return str(int(c))
        return str(c).strip()
    if isinstance(c, int):
        return str(int(c))
    s = str(c).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return str(int(float(s)))
    return s

def clean_country_name(name: str) -> str:
    """Strip surrounding whitespace from a country name; missing values pass through unchanged."""
    return name if pd.isna(name) else str(name).strip()

def drop_non_countries(df: pd.DataFrame, country_col: str = "Country") -> pd.DataFrame:
    """Filter out regional/aggregate entries (e.g., World, income groups).

    A row is dropped when its country name contains any aggregate keyword
    (case-insensitive substring match). Keywords such as "africa" or
    "america" also occur inside genuine country names, so a short list of
    protected names is exempt from the keyword filter.

    Args:
        df: Frame holding a country-name column.
        country_col: Name of that column.

    Returns:
        ``df`` itself when ``country_col`` is absent, otherwise a filtered copy.
    """
    if country_col not in df.columns:
        return df
    drop_keywords = [
        "Early-demographic dividend", "IBRD only", "IDA & IBRD total", "IDA blend", "IDA only", "IDA total", "Late-demographic dividend", "OECD members", "Post-demographic dividend", "Pre-demographic dividend",
        "income", "world", "europe", "asia", "africa", "america",
        "caribbean", "euro area", "sub-saharan", "middle east", "north africa",
        "arab world", "east asia", "south asia", "pacific", "latin america",
        "and the caribbean", "heavily indebted", "least developed", "small states",
        "fragile and conflict", "upper middle", "lower middle", "high income", "low income"
    ]
    # Real countries/territories whose names contain a region keyword.
    # Compared case-insensitively against the full, stripped name.
    protected_names = {
        "south africa",
        "central african republic",
        "american samoa",
        "united states of america",
        "caribbean netherlands",
    }
    patt = re.compile("|".join(re.escape(k) for k in drop_keywords), flags=re.I)
    names = df[country_col].fillna("").astype(str)
    is_aggregate = pd.Series(
        [
            patt.search(n) is not None and n.strip().casefold() not in protected_names
            for n in names
        ],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~is_aggregate].copy()

def read_any_csv_first(path: Path) -> pd.DataFrame:
    """Read a World Bank-style CSV, tolerating metadata preambles and odd encodings.

    Attempts, in order, each over utf-8 -> latin1 -> iso-8859-1 -> cp1252:
      1. plain read with the default (C) parser;
      2. default parser skipping the 4-line World Bank metadata preamble;
      3. python parser starting at the auto-detected header row and skipping
         malformed lines, first with ',' and then with ';' as separator.

    The first attempt that parses wins.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame.

    Raises:
        ValueError: if none of the attempts can parse the file.
    """
    encodings = ["utf-8", "latin1", "iso-8859-1", "cp1252"]

    # 1) Plain read.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, low_memory=False)
        except (pd.errors.ParserError, UnicodeDecodeError):
            continue

    # 2) World Bank exports start with 4 metadata lines before the header.
    #    This runs before the lossy python-engine fallbacks so a well-formed
    #    export is never read with rows silently skipped.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, skiprows=4, low_memory=False)
        except Exception:
            continue

    # 3) Tolerant fallback. Locate the header row in the raw text; leading
    #    BOM, whitespace and quotes are ignored so '"Country Name",...' matches.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        header_idx = next(
            (i for i, line in enumerate(f)
             if line.lstrip("\ufeff \t\"'").lower().startswith("country")),
            0,
        )

    # `low_memory` is a C-parser option and is rejected by engine="python",
    # so it must not be passed here. `skiprows` counts raw file lines, which
    # is what `header_idx` is.
    last_error = None
    for sep in (",", ";"):
        for enc in encodings:
            try:
                return pd.read_csv(path, encoding=enc, sep=sep, skiprows=header_idx,
                                   engine="python", on_bad_lines="skip")
            except Exception as exc:
                last_error = exc
    raise ValueError(f"Could not parse CSV file: {path}") from last_error

def to_three_columns_wide_years(df: pd.DataFrame,
                                country_col_candidates=("Country Name","Country","Territory","Location"),
                                value_col_name="Value") -> pd.DataFrame:
    """Reshape a wide year-per-column table into Country | Year | <value> rows.

    The country column is the first entry of ``country_col_candidates`` found
    in ``df``. Year headers are normalized to four-digit strings, the table is
    melted, rows without a value are dropped, years are limited to 1960-2018,
    aggregate rows are removed and country names are standardized.

    Raises:
        ValueError: if no country column or no year column is found.
    """
    # Pick the country column by candidate priority.
    country_col = None
    for candidate in country_col_candidates:
        if candidate in df.columns:
            country_col = candidate
            break
    if country_col is None:
        raise ValueError("Country column not found. Update 'country_col_candidates' if needed.")

    # Look for year headers on the raw labels first; retry on stringified labels.
    if not any(is_year_col(h) for h in df.columns):
        df = df.rename(columns={h: str(h) for h in df.columns})
        if not any(is_year_col(h) for h in df.columns):
            raise ValueError("No year columns detected (e.g., 1960, 1961, ...). Is this the correct file?")

    # After normalization every year header is a plain '####' string.
    df = df.rename(columns={h: normalize_year_header(h) for h in df.columns})
    four_digits = re.compile(r"\d{4}")
    year_cols = [h for h in df.columns if four_digits.fullmatch(str(h))]

    # Wide -> long.
    tidy = df.melt(id_vars=[country_col], value_vars=year_cols,
                   var_name="Year", value_name=value_col_name)
    tidy = tidy.rename(columns={country_col: "Country"})
    tidy["Country"] = tidy["Country"].map(clean_country_name)

    # Integer years, no missing values, analysis window only.
    tidy["Year"] = tidy["Year"].astype(int)
    tidy = tidy.dropna(subset=[value_col_name])
    in_window = (tidy["Year"] >= 1960) & (tidy["Year"] <= 2018)
    tidy = tidy[in_window]

    # Drop regional aggregates, then standardize the remaining country names.
    tidy = drop_non_countries(tidy, "Country")
    tidy = standardize_country_column(tidy, col="Country")

    # Final output.
    wanted = ["Country", "Year", value_col_name]
    out = tidy[wanted].sort_values(["Country","Year"]).reset_index(drop=True)
    assert set(out.columns) == {"Country","Year", value_col_name}
    return out

# ---------------- Build ----------------
lfp_file = RAW_DIR / "SL.TLF.CACT.FE.ZS.csv"
if not lfp_file.exists():
    raise FileNotFoundError(f"File not found: {lfp_file}\nPlace 'SL.TLF.CACT.FE.ZS.csv' under data/raw/")

df_raw = read_any_csv_first(lfp_file)
df_f_lfp = to_three_columns_wide_years(df_raw, value_col_name="FemaleLFPRate")

# Replace typographic apostrophes, primes and backticks with a plain ASCII
# apostrophe (cosmetic, Excel-friendly).
df_f_lfp["Country"] = (
    df_f_lfp["Country"]
    .astype(str)
    .str.replace(r"[\u2019\u2018\u2032\u00B4`]", "'", regex=True)
)

# ---------------- Save ----------------
out_path = PROC_DIR / "female_lfp_rate_1960_2018_long.csv"
# UTF-8 with BOM so Excel displays Unicode correctly.
df_f_lfp.to_csv(out_path, index=False, encoding="utf-8-sig")

# ---------------- Notify ----------------
print("✅ Saved:", out_path.as_posix())
print("Rows:", len(df_f_lfp), " | Columns:", list(df_f_lfp.columns))

# Optional: list unmapped names so the mapping can be extended. This stays
# best-effort (it must never abort the run), but the failure reason is
# reported instead of hidden.
print("Unmapped (sample):")
try:
    print(report_unmapped(df_f_lfp, "Country", sample=20))
except Exception as exc:
    print(f"(unmapped-name report unavailable: {exc!r})")

# Rich preview inside a notebook, plain-text preview otherwise.
try:
    from IPython.display import display
    display(df_f_lfp.head(10))
except Exception:
    print(df_f_lfp.head(10).to_string(index=False))


In [None]:
# Fertility Rate (SP.DYN.TFRT.IN.csv) Cleaning
# Output schema: Country | Year | FertilityRate

import re
import pandas as pd
from pathlib import Path

# -------- Relative Paths --------
# Project layout is resolved relative to the working directory, which is
# expected to be the notebooks folder directly under the project root.
cwd = Path.cwd()
BASE_DIR = cwd.parent
DATA_DIR = BASE_DIR.joinpath("data")
RAW_DIR = DATA_DIR.joinpath("raw")
PROC_DIR = DATA_DIR.joinpath("processed")

# Create both data folders up front; a no-op when they already exist.
RAW_DIR.mkdir(exist_ok=True, parents=True)
PROC_DIR.mkdir(exist_ok=True, parents=True)

# -------- Country standardization --------
from utils_country import standardize_country_column, report_unmapped

# -------- Helpers --------
def is_year_col(col) -> bool:
    """Return True when a column header denotes a calendar year in 1900-2100.

    Accepts '1960' (str), '1960.0' (str), 1960 (int), or 1960.0 (float).
    Non-integral floats (e.g. 1960.5), NaN and infinities are rejected
    rather than being truncated to a year.
    """
    if isinstance(col, float):
        # is_integer() is False for NaN/inf, so those fall out here too.
        return col.is_integer() and 1900 <= col <= 2100
    if isinstance(col, int):
        return 1900 <= col <= 2100
    s = str(col).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return 1900 <= int(float(s)) <= 2100
    return False

def normalize_year_header(c):
    """Normalize year-like headers to 4-digit strings (e.g., 1960.0 -> '1960').

    Headers that are not year-like are returned as their stripped string
    form. Floats without an integral value (1960.5, NaN, inf) are left as
    their plain string form instead of being truncated or raising, because
    this function is applied to every column header of a frame.
    """
    if isinstance(c, float):
        if c.is_integer():
            return str(int(c))
        return str(c).strip()
    if isinstance(c, int):
        return str(int(c))
    s = str(c).strip()
    if re.fullmatch(r"\d{4}(?:\.0+)?", s):
        return str(int(float(s)))
    return s

def clean_country_name(name: str) -> str:
    """Strip surrounding whitespace from a country name; missing values pass through unchanged."""
    return name if pd.isna(name) else str(name).strip()

def drop_non_countries(df: pd.DataFrame, country_col: str = "Country") -> pd.DataFrame:
    """Drop aggregates/regions that aren't individual countries.

    A row is dropped when its country name contains any aggregate keyword
    (case-insensitive substring match). Keywords such as "africa" or
    "america" also occur inside genuine country names, so a short list of
    protected names is exempt from the keyword filter.

    Args:
        df: Frame holding a country-name column.
        country_col: Name of that column.

    Returns:
        ``df`` itself when ``country_col`` is absent, otherwise a filtered copy.
    """
    if country_col not in df.columns:
        return df
    drop_keywords = [
        "Early-demographic dividend", "IBRD only", "IDA & IBRD total", "IDA blend", "IDA only", "IDA total", "Late-demographic dividend", "OECD members", "Post-demographic dividend", "Pre-demographic dividend",
        "income", "world", "europe", "asia", "africa", "america",
        "caribbean", "euro area", "sub-saharan", "middle east", "north africa",
        "arab world", "east asia", "south asia", "pacific", "latin america",
        "and the caribbean", "heavily indebted", "least developed", "small states",
        "fragile and conflict", "upper middle", "lower middle", "high income", "low income"
    ]
    # Real countries/territories whose names contain a region keyword.
    # Compared case-insensitively against the full, stripped name.
    protected_names = {
        "south africa",
        "central african republic",
        "american samoa",
        "united states of america",
        "caribbean netherlands",
    }
    patt = re.compile("|".join(re.escape(k) for k in drop_keywords), flags=re.I)
    names = df[country_col].fillna("").astype(str)
    is_aggregate = pd.Series(
        [
            patt.search(n) is not None and n.strip().casefold() not in protected_names
            for n in names
        ],
        index=df.index,
        dtype=bool,
    )
    return df.loc[~is_aggregate].copy()

def read_any_csv_first(path: Path) -> pd.DataFrame:
    """Read a World Bank-style CSV, tolerating metadata preambles and odd encodings.

    Attempts, in order, each over utf-8 -> latin1 -> iso-8859-1 -> cp1252:
      1. plain read with the default (C) parser;
      2. default parser skipping the 4-line World Bank metadata preamble;
      3. python parser starting at the auto-detected header row and skipping
         malformed lines, first with ',' and then with ';' as separator.

    The first attempt that parses wins.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed frame.

    Raises:
        ValueError: if none of the attempts can parse the file.
    """
    encodings = ["utf-8", "latin1", "iso-8859-1", "cp1252"]

    # 1) Plain read.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, low_memory=False)
        except (pd.errors.ParserError, UnicodeDecodeError):
            continue

    # 2) World Bank exports start with 4 metadata lines before the header.
    #    This runs before the lossy python-engine fallbacks so a well-formed
    #    export is never read with rows silently skipped.
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc, skiprows=4, low_memory=False)
        except Exception:
            continue

    # 3) Tolerant fallback. Locate the header row in the raw text; leading
    #    BOM, whitespace and quotes are ignored so '"Country Name",...' matches.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        header_idx = next(
            (i for i, line in enumerate(f)
             if line.lstrip("\ufeff \t\"'").lower().startswith("country")),
            0,
        )

    # `low_memory` is a C-parser option and is rejected by engine="python",
    # so it must not be passed here. `skiprows` counts raw file lines, which
    # is what `header_idx` is.
    last_error = None
    for sep in (",", ";"):
        for enc in encodings:
            try:
                return pd.read_csv(path, encoding=enc, sep=sep, skiprows=header_idx,
                                   engine="python", on_bad_lines="skip")
            except Exception as exc:
                last_error = exc
    raise ValueError(f"Could not parse CSV file: {path}") from last_error

def to_three_columns_wide_years(df: pd.DataFrame,
                                country_col_candidates=("Country Name","Country","Territory","Location"),
                                value_col_name="Value") -> pd.DataFrame:
    """Reshape a wide year-per-column table into Country | Year | <value> rows.

    The country column is the first entry of ``country_col_candidates`` that
    exists in ``df`` (candidate order is the priority, not the file's column
    order). Year headers are normalized to four-digit strings, the table is
    melted, rows without a value are dropped, years are limited to 1960-2018,
    aggregate rows are removed and country names are standardized.

    Args:
        df: Wide table with one column per year.
        country_col_candidates: Acceptable country column names, best first.
        value_col_name: Name of the value column in the output.

    Returns:
        Frame with columns Country, Year, ``value_col_name``, sorted by
        Country then Year.

    Raises:
        ValueError: if no country column or no year column is found.
    """
    # Locate country column by candidate priority, so a file containing both
    # "Country" and "Country Name" always resolves to the preferred one.
    country_col = next((c for c in country_col_candidates if c in df.columns), None)
    if country_col is None:
        raise ValueError("Country column not found. Update 'country_col_candidates' if needed.")

    # Detect year columns without forcing all to str first
    year_cols = [c for c in df.columns if is_year_col(c)]
    if not year_cols:
        df = df.rename(columns={c: str(c) for c in df.columns})
        year_cols = [c for c in df.columns if is_year_col(c)]
        if not year_cols:
            raise ValueError("No year columns detected (e.g., 1960, 1961, ...). Is this the correct file?")

    # Normalize year headers like "1960.0" -> "1960"
    df = df.rename(columns={c: normalize_year_header(c) for c in df.columns})
    year_cols = [c for c in df.columns if re.fullmatch(r"\d{4}", str(c))]

    # Melt to long
    long_df = df.melt(id_vars=[country_col], value_vars=year_cols,
                      var_name="Year", value_name=value_col_name)

    # Standardize schema
    long_df = long_df.rename(columns={country_col: "Country"})
    long_df["Country"] = long_df["Country"].map(clean_country_name)

    # Integer years; drop rows without a value
    long_df["Year"] = long_df["Year"].astype(int)
    long_df = long_df.dropna(subset=[value_col_name])

    # Filter analysis window
    long_df = long_df[(long_df["Year"] >= 1960) & (long_df["Year"] <= 2018)]

    # Remove aggregates
    long_df = drop_non_countries(long_df, "Country")

    # Canonicalize country names
    long_df = standardize_country_column(long_df, col="Country")

    # Final ordering and schema
    out = long_df[["Country", "Year", value_col_name]].sort_values(["Country","Year"]).reset_index(drop=True)
    assert set(out.columns) == {"Country","Year", value_col_name}
    return out

# -------- Creation --------
fertility_file = RAW_DIR / "SP.DYN.TFRT.IN.csv"
if not fertility_file.exists():
    raise FileNotFoundError(f"File not found: {fertility_file}\nPlace 'SP.DYN.TFRT.IN.csv' under data/raw/")

df_raw = read_any_csv_first(fertility_file)

# -------- Build 3-column dataset --------
df_fert = to_three_columns_wide_years(df_raw, value_col_name="FertilityRate")

# Replace typographic apostrophes, primes and backticks with a plain ASCII
# apostrophe (cosmetic, Excel-friendly).
df_fert["Country"] = (
    df_fert["Country"]
    .astype(str)
    .str.replace(r"[\u2019\u2018\u2032\u00B4`]", "'", regex=True)
)

# -------- Save --------
out_path = PROC_DIR / "fertility_rate_1960_2018_long.csv"
# UTF-8 with BOM so Excel displays Unicode correctly.
df_fert.to_csv(out_path, index=False, encoding="utf-8-sig")

# -------- Notification --------
print("✅ Saved:", out_path.as_posix())
print("Rows:", len(df_fert), "| Columns:", list(df_fert.columns))

# Optional quick check for unmapped names. This stays best-effort (it must
# never abort the run), but the failure reason is reported instead of hidden.
print("Unmapped (sample):")
try:
    print(report_unmapped(df_fert, "Country", sample=20))
except Exception as exc:
    print(f"(unmapped-name report unavailable: {exc!r})")

# Rich preview inside a notebook, plain-text preview otherwise.
try:
    from IPython.display import display
    display(df_fert.head(10))
except Exception:
    print(df_fert.head(10).to_string(index=False))
