In [None]:
# ADWP Processing Notebook (memory-friendly)

import os
import tempfile
import csv
from datetime import datetime
from dateutil import parser as dateparser

import pandas as pd
from openpyxl import load_workbook
from ipywidgets import interact, Dropdown
import tkinter as tk
from tkinter import filedialog
from tqdm.notebook import tqdm

# -------------------------
# Helpers
# -------------------------
def month_label(dt):
    """Return *dt* rendered as abbreviated month plus two-digit year (e.g. ``Jan-24``)."""
    label_format = "%b-%y"
    return dt.strftime(label_format)

def first_of_month_from_str(s):
    """Parse *s* as a date and return it snapped to the first day of its month.

    Returns ``None`` when *s* is missing (NaN/None), blank after stripping
    whitespace, or cannot be parsed. Parsing is month-first
    (``dayfirst=False``).
    """
    is_blank = pd.isna(s) or not str(s).strip()
    if is_blank:
        return None
    try:
        # Best effort: any failure while parsing or snapping yields None.
        return dateparser.parse(str(s), dayfirst=False).replace(day=1)
    except Exception:
        return None

# -------------------------
# Step 1: File picker
# -------------------------
# A Tk root must exist for the dialog to open; it is hidden so that only the
# file chooser itself is shown.
root = tk.Tk()
root.withdraw()  # hide root window
try:
    excel_path = filedialog.askopenfilename(
        title="Select Excel file",
        # Legacy *.xls is not offered: the workbook is later opened with
        # openpyxl, which only reads the XML-based formats.
        filetypes=[("Excel files", "*.xlsx *.xlsm")]
    )
finally:
    # Release the hidden Tk window once the dialog has returned (or failed),
    # otherwise it stays alive for the rest of the session.
    root.destroy()

# askopenfilename returns an empty value when the dialog is cancelled.
if not excel_path:
    raise SystemExit("No file selected.")

print("Selected file:", excel_path)

# -------------------------
# Step 2: Dropdown for year and month
# -------------------------
this_year = datetime.today().year
years = [this_year-1, this_year, this_year+1]
# (month number, full month name) pairs for January..December.
months = [(i, datetime(2000, i, 1).strftime("%B")) for i in range(1,13)]

def choose_params(year, month):
    """Store the dropdown selections in module-level globals.

    Called by ``interact`` with the current values of the two dropdowns.
    Sets ``selected_year`` and ``selected_month`` (both ``int``) and echoes
    the selection.
    """
    global selected_year, selected_month
    selected_year = int(year)
    selected_month = int(month)
    print(f"Selected Year = {selected_year}, Current Month = {selected_month}")

interact(
    choose_params,
    year=Dropdown(options=years, description="Year:", value=this_year),
    # ipywidgets interprets option tuples as (label, value), so the pairs are
    # flipped here: the month name is displayed and the month number is what
    # reaches choose_params. The initial value must be one of those values,
    # i.e. the plain month number.
    month=Dropdown(
        options=[(name, number) for number, name in months],
        description="Month:",
        value=datetime.today().month,
    ),
)

# -------------------------
# Step 3: Functions to stream Excel -> CSV
# -------------------------
def excel_sheet_to_csv(excel_path, sheet_name, csv_path):
    """Stream one worksheet of an Excel workbook into a UTF-8 CSV file.

    The workbook is opened read-only with ``data_only=True`` and rows are
    written one at a time, so the sheet is never held in memory as a whole.
    Empty cells (``None``) are written as empty strings.

    Args:
        excel_path: Path of the workbook to read.
        sheet_name: Name of the worksheet to export.
        csv_path: Path of the CSV file to create (overwritten if present).

    Raises:
        ValueError: If *sheet_name* does not exist in the workbook.
    """
    wb = load_workbook(filename=excel_path, read_only=True, data_only=True)
    try:
        if sheet_name not in wb.sheetnames:
            raise ValueError(f"Sheet {sheet_name} not found in {excel_path}")
        ws = wb[sheet_name]
        with open(csv_path, "w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            for row in ws.iter_rows(values_only=True):
                writer.writerow([("" if v is None else v) for v in row])
    finally:
        # Always close the workbook, including when the sheet is missing or
        # the CSV write fails part-way through.
        wb.close()

# -------------------------
# Step 4: Core processing
# -------------------------
def _has_pid(value):
    """Return True when *value* is a usable PID rather than a blank cell.

    Blank CSV fields are read back by ``pd.read_csv`` as NaN even with
    ``dtype=str``, so NaN/None count as "no PID" alongside the empty
    (or whitespace-only) string and the literal text "nan".
    """
    if value is None or pd.isna(value):
        return False
    return str(value).strip() not in ("", "nan")


def _build_pos_key(r):
    """Build the position key for one row: PID-based when a PID exists, else MPP-based."""
    if _has_pid(r["PID"]):
        return f"PID|{r['PID']}|{r['GCB']}|{r['Business Framework']}"
    return f"MPP|{r['MPP ID']}|{r['GCB']}|{r['Business Framework']}"


def run_adwp(input_file, sheet_name, year, current_month, out_file, chunksize=100000):
    """Build the ADWP output workbook from one sheet of an Excel file.

    Steps:
      1. Stream the sheet to a temporary CSV.
      2. Read that CSV in chunks, derive a month label and a position key
         (``PosKey``) per row, and append the needed columns to an
         intermediate CSV.
      3. Keep the last row per (PosKey, month), pivot FTE by month over a
         24-month horizon (``year`` and ``year + 1``), fill gaps, and write
         the result to Excel.

    Gap filling:
      * Rows with a PID: empty months after the current month receive the
        current month's FTE (when that value exists).
      * Rows without a PID: the first non-empty month value is copied into
        every later empty month.

    Args:
        input_file: Path of the source Excel workbook.
        sheet_name: Worksheet to process.
        year: First year of the horizon.
        current_month: Month number (1-12) within *year* used as "current".
        out_file: Output workbook name. A relative name is placed next to
            *input_file*; an absolute path is used as given. A falsy value
            falls back to ``"ADWP_Output.xlsx"``.
        chunksize: Number of CSV rows processed per chunk.

    Returns:
        None. The output path is printed.
    """
    cols_needed = ["PID","MPP ID","GCB","Business Framework","Business Framework Group",
                   "Business Service L2","Country R1","Country R2","Country R3",
                   "Stack","FTE","Period Month"]

    # The temp directory holds full copies of the data; remove it as soon as
    # the de-duplicated frame is in memory, even if processing fails.
    with tempfile.TemporaryDirectory(prefix="adwp_") as tmp_dir:
        csv_path = os.path.join(tmp_dir, "data_stream.csv")
        inter_csv = os.path.join(tmp_dir, "intermediate.csv")
        print("Temp dir:", tmp_dir)

        # 1) Stream Excel to CSV
        print("Streaming Excel to CSV...")
        excel_sheet_to_csv(input_file, sheet_name, csv_path)

        # 2) Process in chunks -> intermediate
        first = True
        row_idx = 0
        for chunk in tqdm(pd.read_csv(csv_path, chunksize=chunksize, dtype=str), desc="Chunks"):
            chunk.columns = [c.strip() for c in chunk.columns]
            # Add any missing needed column as blank, then drop everything
            # else so the intermediate file stays small.
            for col in cols_needed:
                if col not in chunk.columns:
                    chunk[col] = ""
            chunk = chunk[cols_needed].copy()

            chunk["PeriodMonth_dt"] = chunk["Period Month"].apply(first_of_month_from_str)
            chunk["MonthLabel"] = chunk["PeriodMonth_dt"].apply(lambda x: month_label(x) if pd.notna(x) else "")
            chunk["FTE"] = pd.to_numeric(chunk["FTE"], errors="coerce")

            # PosKey
            chunk["PosKey"] = chunk.apply(_build_pos_key, axis=1)

            # Global row counter preserves the original row order across chunks.
            chunk["row_idx"] = range(row_idx, row_idx+len(chunk))
            row_idx += len(chunk)

            if first:
                chunk.to_csv(inter_csv, index=False, mode="w")
                first = False
            else:
                chunk.to_csv(inter_csv, index=False, mode="a", header=False)

        # 3) Read intermediate fully (only the needed columns remain)
        df = pd.read_csv(inter_csv, dtype=str)

    df["FTE"] = pd.to_numeric(df["FTE"], errors="coerce")
    df["row_idx"] = pd.to_numeric(df["row_idx"], errors="coerce")

    # Keep last row per PosKey+MonthLabel
    df.sort_values("row_idx", inplace=True)
    df = df.drop_duplicates(subset=["PosKey","MonthLabel"], keep="last")

    # Horizon labels: every month of `year` and `year + 1`
    months_list, labels = [], []
    for y in (year, year+1):
        for m in range(1,13):
            d = datetime(y,m,1)
            months_list.append(d)
            labels.append(month_label(d))

    # Pivot
    pivot = df.pivot_table(index="PosKey", columns="MonthLabel", values="FTE", aggfunc="last")
    for lbl in labels:
        if lbl not in pivot.columns:
            pivot[lbl] = pd.NA
    pivot = pivot[labels]

    # Dimensions: groupby().last() takes the last non-null value of each
    # column per PosKey.
    dims = df.groupby("PosKey").last().reset_index()

    final = dims[["Business Service L2","Business Framework Group","Business Framework",
                  "Country R1","Country R2","Country R3","PID","GCB","MPP ID","Stack"]].copy()
    final.index = dims["PosKey"]

    # FTE = current month's value
    curr_dt = datetime(year, current_month, 1)
    curr_lbl = month_label(curr_dt)
    final["FTE"] = pivot[curr_lbl] if curr_lbl in pivot.columns else pd.NA

    # Merge month cols (aligned on the PosKey index)
    for lbl in labels:
        final[lbl] = pivot[lbl]

    # Forward-fill logic
    for idx, row in final.iterrows():
        month_vals = row[labels].copy()
        if _has_pid(row["PID"]):
            # PID rows: push the current month's FTE into empty future months.
            val = row["FTE"]
            if pd.notna(val):
                for i, d in enumerate(months_list):
                    if d > curr_dt and pd.isna(month_vals.iloc[i]):
                        month_vals.iloc[i] = val
        else:
            # Non-PID rows: carry the first known value into later gaps.
            first_valid = month_vals.first_valid_index()
            if first_valid is not None:
                fv = month_vals[first_valid]
                start_i = labels.index(first_valid)
                for i in range(start_i, len(labels)):
                    if pd.isna(month_vals.iloc[i]):
                        month_vals.iloc[i] = fv
        for lbl in labels:
            final.at[idx, lbl] = month_vals[lbl]

    # Order
    ordered = final[["Business Service L2","Business Framework Group","Business Framework",
                     "Country R1","Country R2","Country R3","PID","GCB","FTE","MPP ID","Stack"]+labels]
    ordered = ordered.sort_values(by=["Business Service L2","Business Framework Group","Business Framework","PID","GCB"])

    # Write: honour out_file; os.path.join keeps an absolute out_file as-is
    # and places a relative one beside the input workbook.
    out_path = os.path.join(os.path.dirname(input_file), out_file or "ADWP_Output.xlsx")
    ordered.to_excel(out_path,index=False,sheet_name="Output")
    print("Output written to:", out_path)

# -------------------------
# Step 5: Run
# -------------------------
# After selecting year & month in dropdowns, run this cell:
# run_adwp(excel_path, "Data", selected_year, selected_month, "ADWP_Output.xlsx")
