# Exported from a Jupyter notebook cell.
"""
ADWP processing script (memory-optimized)
- Reads sheet 'Data' from large Excel workbook without fully loading into memory
- Produces 'Output' sheet with dynamic 24-month columns (selected_year -> selected_year+1)
Requirements:
    pip install pandas openpyxl python-dateutil tqdm
Run:
    python adwp_process.py --input "GHA.xlsx" --sheet "Data" --out "ADWP_Output.xlsx" --year 2025 --current_month 8
"""

import argparse
import csv
import os
import tempfile
from datetime import datetime
from dateutil import parser as dateparser

import pandas as pd
from openpyxl import load_workbook
from tqdm import tqdm

# -------------------------
# Helpers / Config
# -------------------------
def month_label(dt):
    """Format a datetime-like value as abbreviated month plus two-digit year.

    For example, any date in January 2025 is rendered as "Jan-25". The value
    only needs to provide a ``strftime`` method.
    """
    label_format = "%b-%y"
    return dt.strftime(label_format)

def first_of_month_from_str(s):
    """Parse a date-like value and snap it to the first day of its month.

    The value is converted with ``str()`` and parsed month-first
    (``dayfirst=False``). Returns None when the value is missing according to
    ``pd.isna`` or when parsing fails for any reason.
    """
    if pd.isna(s):
        return None
    try:
        parsed = dateparser.parse(str(s), dayfirst=False)
        month_start = parsed.replace(day=1)
    except Exception:
        # Best effort: anything that cannot be parsed is treated as "no date".
        month_start = None
    return month_start

# -------------------------
# Step 0: CLI
# -------------------------
def parse_args():
    """Parse the command-line options of the ADWP processing script.

    Returns:
        argparse.Namespace with the attributes ``input``, ``sheet``, ``out``,
        ``year``, ``current_month`` and ``chunksize``.

    ``--current_month`` is restricted to 1-12 here, so an invalid month makes
    argparse print a usage error and exit immediately instead of failing only
    after the workbook has been streamed and processed.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--input", required=True, help="Input Excel file path")
    p.add_argument("--sheet", default="Data", help="Sheet name containing vertical data")
    p.add_argument("--out", default="ADWP_Output.xlsx", help="Output Excel file")
    p.add_argument("--year", type=int, required=True, help="Selected start year (e.g., 2025)")
    p.add_argument(
        "--current_month",
        type=int,
        required=True,
        choices=range(1, 13),
        metavar="MONTH",  # keeps --help from listing all twelve choices
        help="Current month number (1-12)",
    )
    p.add_argument("--chunksize", type=int, default=100000, help="CSV chunk size for pandas")
    return p.parse_args()

# -------------------------
# Step 1: Stream Data sheet -> temp CSV
# -------------------------
def excel_sheet_to_csv(excel_path, sheet_name, csv_path):
    """Stream one worksheet of an Excel workbook into a UTF-8 CSV file.

    The workbook is opened with openpyxl in read-only mode and the rows are
    written one at a time, so the sheet is never held in memory as a whole.
    ``data_only=True`` is passed to openpyxl when loading. Every row is
    written as-is (the first row is assumed to be the header); ``None`` cells
    become empty strings.

    Args:
        excel_path: Path of the input workbook.
        sheet_name: Name of the worksheet to export.
        csv_path: Path of the CSV file to create (overwritten if it exists).

    Raises:
        ValueError: If ``sheet_name`` does not exist in the workbook.
    """
    wb = load_workbook(filename=excel_path, read_only=True, data_only=True)
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet {sheet_name} not found in {excel_path}")
        ws = wb[sheet_name]
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerows(
                ["" if v is None else v for v in row]
                for row in ws.iter_rows(values_only=True)
            )
    finally:
        # Read-only workbooks keep the source file open until closed
        # explicitly, so close on the error paths as well.
        wb.close()

# -------------------------
# Step 2: Chunked normalization -> intermediate compact CSV
# -------------------------
# Columns the Data sheet must provide. Header names are compared after
# stripping surrounding whitespace; update this tuple if the sheet differs.
_REQUIRED_COLUMNS = (
    "PID", "MPP ID", "GCB", "Business Framework", "Business Framework Group",
    "Business Service L2", "Country R1", "Country R2", "Country R3",
    "FTE", "Stack", "Period Month",
)

# Column order of the intermediate CSV.
_INTERMEDIATE_COLUMNS = (
    "PosKey", "PID", "MPP ID", "GCB", "Business Framework", "Business Framework Group",
    "Business Service L2", "Country R1", "Country R2", "Country R3", "Stack",
    "PeriodMonth_iso", "MonthLabel", "FTE", "row_idx",
)


def _is_blank(value):
    """Return True when a cell holds no usable identifier (None, NaN, '', 'nan')."""
    if value is None or pd.isna(value):
        return True
    return str(value).strip() in ("", "nan")


def _build_poskey(row):
    """Build the position key: PID-based when PID is present, else MPP-based."""
    if _is_blank(row["PID"]):
        kind, ident = "MPP", row["MPP ID"]
    else:
        kind, ident = "PID", row["PID"]
    return f"{kind}|{ident}|{row['GCB']}|{row['Business Framework']}"


def process_chunks_to_intermediate(csv_path, tmp_intermediate, chunksize=100000):
    """Normalize the streamed CSV chunk by chunk into a compact intermediate CSV.

    For every chunk this function:
      * strips whitespace from the column names,
      * converts 'Period Month' to the first day of its month and derives the
        ISO string and the month label (both empty when the date is
        missing or unparsable),
      * computes PosKey as ``PID|<PID>|<GCB>|<Business Framework>`` when PID is
        present, otherwise ``MPP|<MPP ID>|<GCB>|<Business Framework>``,
      * coerces FTE to a number (unparsable values become empty),
      * keeps only the last row per (PosKey, MonthLabel) within the chunk,
      * appends the result to ``tmp_intermediate``.

    Duplicates that span several chunks are left for the later aggregation
    step, which can resolve them through the global ``row_idx`` column.

    Args:
        csv_path: CSV produced from the Data sheet; the first row is the header.
        tmp_intermediate: Path of the intermediate CSV to create (overwritten).
        chunksize: Number of CSV rows read per chunk.

    Raises:
        KeyError: If a required column is missing from the CSV header.
    """
    header = pd.read_csv(csv_path, nrows=0).columns
    available = {str(name).strip() for name in header}
    missing = [name for name in _REQUIRED_COLUMNS if name not in available]
    if missing:
        raise KeyError(f"Required column(s) missing from {csv_path}: {missing}")

    out_columns = list(_INTERMEDIATE_COLUMNS)
    # Write the header with pandas too, so header and data rows share the same
    # line terminator.
    pd.DataFrame(columns=out_columns).to_csv(tmp_intermediate, index=False, encoding="utf-8")

    chunk_iter = pd.read_csv(
        csv_path,
        # Only the required columns are used below; skipping the rest keeps
        # each chunk small.
        usecols=lambda name: str(name).strip() in _REQUIRED_COLUMNS,
        chunksize=chunksize,
        dtype=str,
    )

    next_row_idx = 0
    for chunk in tqdm(chunk_iter, desc="Processing chunks"):
        chunk.columns = [str(c).strip() for c in chunk.columns]

        # Global, monotonically increasing row number: lets the aggregation
        # step pick the latest row across chunk boundaries.
        chunk["row_idx"] = range(next_row_idx, next_row_idx + len(chunk))
        next_row_idx += len(chunk)

        month_start = chunk["Period Month"].apply(first_of_month_from_str)
        chunk["PeriodMonth_iso"] = month_start.apply(
            lambda d: d.isoformat() if pd.notna(d) else ""
        )
        chunk["MonthLabel"] = month_start.apply(
            lambda d: month_label(d) if pd.notna(d) else ""
        )

        # With dtype=str, blank cells arrive as float NaN, so blankness must be
        # tested with pd.isna rather than by comparing against '' or 'nan'.
        chunk["PosKey"] = chunk.apply(_build_poskey, axis=1)
        chunk["FTE"] = pd.to_numeric(chunk["FTE"], errors="coerce")

        # Rows are already in file order, so keep="last" keeps the most recent
        # row of each (PosKey, MonthLabel) pair.
        latest = chunk.drop_duplicates(subset=["PosKey", "MonthLabel"], keep="last")
        latest.to_csv(
            tmp_intermediate,
            mode="a",
            header=False,
            index=False,
            columns=out_columns,
            encoding="utf-8",
        )

# -------------------------
# Step 3: Aggregate intermediate -> final pivot
# -------------------------
def _has_pid(pid_values):
    """Return a boolean Series marking rows whose PID holds a real identifier."""
    present = pid_values.notna()
    text = pid_values.astype(object).where(present, "").astype(str).str.strip()
    return present & ~text.isin(["", "nan"])


def _fill_future_months(month_block, current_fte, has_pid, current_month):
    """Fill blank month cells according to the PID / MPP rules.

    * PID rows: when the current month has a value, blank months strictly
      after the current month are set to that value.
    * Rows without PID: from the first month that has a value onwards, blank
      months are set to that first value; months before it stay blank.

    ``month_block`` holds one column per month in chronological order, starting
    with January of the selected year, so column ``current_month - 1`` is the
    current month. A filled copy is returned; the input is not modified.
    """
    filled = month_block.copy()

    future_cols = list(filled.columns[current_month:])
    pid_rows = has_pid & current_fte.notna()
    if future_cols and pid_rows.any():
        future = filled.loc[pid_rows, future_cols]
        # axis=0 aligns the per-row replacement value on the row index.
        filled.loc[pid_rows, future_cols] = future.where(
            future.notna(), current_fte[pid_rows], axis=0
        )

    mpp_rows = ~has_pid
    if mpp_rows.any():
        block = filled.loc[mpp_rows]
        first_value = block.bfill(axis=1).iloc[:, 0]
        # After a forward fill, a cell is non-null exactly when it lies at or
        # after the row's first populated month.
        started = block.ffill(axis=1).notna()
        filled.loc[mpp_rows] = block.where(block.notna() | ~started, first_value, axis=0)

    return filled


def build_final_output_from_intermediate(tmp_intermediate, output_excel, selected_year, current_month):
    """Aggregate the intermediate CSV into the wide 'Output' sheet.

    Steps:
      1. Keep the latest row (highest ``row_idx``) per (PosKey, MonthLabel).
      2. Pivot FTE into 24 month columns, January of ``selected_year`` through
         December of ``selected_year + 1``; months without data are blank.
      3. Take, per PosKey, the last non-null value of every dimension column.
      4. Set the 'FTE' column to the FTE of the current month.
      5. Fill blank month cells (see ``_fill_future_months``).
      6. Sort and write the result to ``output_excel`` as sheet 'Output'.

    Args:
        tmp_intermediate: Path of the intermediate CSV.
        output_excel: Path of the Excel file to write.
        selected_year: First year of the 24-month window.
        current_month: Month number (1-12) within ``selected_year``.

    Raises:
        ValueError: If ``current_month`` is not in 1..12.
    """
    if not 1 <= current_month <= 12:
        raise ValueError(f"current_month must be in 1..12, got {current_month}")

    df = pd.read_csv(tmp_intermediate, dtype=str)
    df["FTE"] = pd.to_numeric(df["FTE"], errors="coerce")
    df["row_idx"] = pd.to_numeric(df["row_idx"], errors="coerce")
    # Duplicates across chunks: the row with the highest row_idx wins.
    df = df.sort_values(by="row_idx").drop_duplicates(
        subset=["PosKey", "MonthLabel"], keep="last"
    )

    labels = [
        month_label(datetime(year, month, 1))
        for year in (selected_year, selected_year + 1)
        for month in range(1, 13)
    ]

    pivot = df.pivot_table(index="PosKey", columns="MonthLabel", values="FTE", aggfunc="last")
    # df is sorted by row_idx, so last() yields the most recent non-null value
    # of every dimension column for each PosKey.
    dims = df.groupby("PosKey").last()

    # reindex adds the months that have no data as blank columns, drops labels
    # outside the window and aligns the rows with dims.
    month_block = pivot.reindex(index=dims.index, columns=labels).astype("float64")
    current_fte = month_block[labels[current_month - 1]]

    # Required column order: ..., PID, GCB, FTE, MPP ID, Stack, then the months.
    ordered = dims[[
        "Business Service L2", "Business Framework Group", "Business Framework",
        "Country R1", "Country R2", "Country R3", "PID", "GCB",
    ]].copy()
    ordered["FTE"] = current_fte
    ordered["MPP ID"] = dims["MPP ID"]
    ordered["Stack"] = dims["Stack"]

    # A blank PID is read back as NaN, so presence is decided with notna()
    # rather than by comparing against '' or 'nan'.
    filled_months = _fill_future_months(
        month_block, current_fte, _has_pid(dims["PID"]), current_month
    )
    ordered = pd.concat([ordered, filled_months], axis=1)

    ordered = ordered.sort_values(
        by=["Business Service L2", "Business Framework Group", "Business Framework", "PID", "GCB"]
    )

    ordered.to_excel(output_excel, index=False, sheet_name="Output")
    print(f"Final output written to {output_excel}")

# -------------------------
# Main
# -------------------------
def main():
    """Run the three pipeline steps: Excel -> CSV -> intermediate CSV -> Output sheet.

    Both working files live in a fresh temporary directory. The directory is
    left in place when the run finishes and its location is printed.
    """
    opts = parse_args()

    # Working files for the two intermediate stages.
    work_dir = tempfile.mkdtemp(prefix="adwp_")
    raw_csv = os.path.join(work_dir, "data_stream.csv")
    compact_csv = os.path.join(work_dir, "intermediate_posmonth.csv")
    print("Temporary dir:", work_dir)

    print("Streaming Excel sheet to CSV (memory-friendly)...")
    excel_sheet_to_csv(opts.input, opts.sheet, raw_csv)

    print("Processing CSV in chunks and building compact intermediate file...")
    process_chunks_to_intermediate(raw_csv, compact_csv, chunksize=opts.chunksize)

    print("Building final pivoted Output sheet (this reads a much smaller intermediate file)...")
    build_final_output_from_intermediate(compact_csv, opts.out, opts.year, opts.current_month)

    # The temporary directory is kept; remove it manually to reclaim disk space.
    print("Done. Temporary files located at", work_dir)

# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
