In [None]:
"""
Build Qwen-style optimization dataset from Italy_data_actual.csv

- Uses your existing MILP model (milp_mcp_server.py)
- For each day in the CSV:
    * builds EnergyDataRecord list
    * calls milp_solve_from_records(...) to get the optimal schedule
    * writes a JSONL dataset with fields: "problem", "solution"

Each line of the output file (train_qwen_milp_italy.jsonl) will look like:
{
  "problem": "<text describing the MILP instance>",
  "solution": "<short explanation + JSON with the optimal schedules + objective value>"
}
"""

import json
from pathlib import Path
from typing import List

import pandas as pd

# ---- adjust these imports to your actual package layout ----
# If your project is a package `agentic_energy`, this is likely correct:
from agentic_energy.schemas import (
    BatteryParams,
    EnergyDataRecord,
    SolveFromRecordsRequest,
)
from agentic_energy.milp.milp_mcp_server import milp_solve_from_records
# ------------------------------------------------------------


# ---------- CONFIG ----------

# Input CSV. main() reads the "timestamps", "prices" and "consumption" columns.
DATA_CSV = Path("Italy_data_actual.csv")   # path to your CSV
# Output dataset: one JSON object per line with "problem" and "solution" keys.
OUTPUT_JSONL = Path("train_qwen_milp_italy.jsonl")

# Battery used for every generated example. The same object is passed to the
# solver and printed into the problem text, so prompt and solution always agree.
DEFAULT_BATTERY = BatteryParams(
    capacity_kwh=50.0,       # energy capacity (kWh)
    max_charge_kw=25.0,      # charge power limit (kW)
    max_discharge_kw=25.0,   # discharge power limit (kW)
    eta_c=0.95,          # charge efficiency (may be `efficiency_charge` in your schema)
    eta_d=0.95,          # discharge efficiency (may be `efficiency_discharge`)
    soc_min=0.10,            # lower SoC bound, described to the model as a fraction of capacity
    soc_max=0.90,            # upper SoC bound, described to the model as a fraction of capacity
    soc_init=0.50,           # initial SoC, described to the model as a fraction of capacity
)

DT_HOURS = 1.0          # time-step length in hours; the Italy data is assumed hourly
ALLOW_EXPORT = False    # set True to let the MILP (and the prompt) include grid export
MAX_DAYS = None         # None = use every day; an int keeps only the first N days (sorted by date)


# ---------- TEXT BUILDERS ----------

def build_problem_text(
    date_str: str,
    battery: BatteryParams,
    df_day: pd.DataFrame,
    dt_hours: float,
    allow_export: bool,
) -> str:
    """
    Build the 'problem' text for Qwen: description of battery + 1-day time series.

    Args:
        date_str: Label printed on the "Date:" line.
        battery: Object exposing capacity_kwh, max_charge_kw, max_discharge_kw,
            eta_c, eta_d, soc_min, soc_max and soc_init; values are printed as-is.
        df_day: One day of data with columns "timestamps", "prices" and
            "consumption". An optional "t_index" column supplies the period
            labels; when it is absent the rows are numbered 1..T in order.
        dt_hours: Time-step length in hours, printed as-is.
        allow_export: When True, the export variable, the export term of the
            objective, the equality balance and the "export_kw" output key are
            included in the text.

    Returns:
        The full prompt as a single string.

    Raises:
        KeyError: If "timestamps", "prices" or "consumption" is missing.
    """
    T = len(df_day)

    lines: List[str] = [
        "We consider a single battery participating in a day-ahead electricity market.\n",
        f"Date: {date_str}\n",
        f"Time step: dt = {dt_hours} hours.\n",
        f"Number of periods: T = {T}.\n\n",
        "Battery parameters:\n",
        f"- Energy capacity: {battery.capacity_kwh} kWh\n",
        f"- Max charge power: {battery.max_charge_kw} kW\n",
        f"- Max discharge power: {battery.max_discharge_kw} kW\n",
        f"- Charge efficiency (eta_c): {battery.eta_c}\n",
        f"- Discharge efficiency (eta_d): {battery.eta_d}\n",
        f"- Minimum SoC (fraction of capacity): {battery.soc_min}\n",
        f"- Maximum SoC (fraction of capacity): {battery.soc_max}\n",
        f"- Initial SoC (fraction of capacity): {battery.soc_init}\n",
        f"- Allow exporting to the grid: {'YES' if allow_export else 'NO'}\n\n",
        "At each time t we have:\n"
        "- p(t): electricity price in $/MWh (or chosen units)\n"
        "- D(t): demand in kW that must be supplied by the grid plus battery.\n\n",
    ]

    # Decision variables; the export variable only exists when export is allowed.
    lines.append(
        "The control variables at each time t are:\n"
        "- c_t : charge power (kW)\n"
        "- d_t : discharge power (kW)\n"
        "- imp_t : net import from the grid (kW)\n"
        + ("- exp_t : net export to the grid (kW)\n" if allow_export else "")
        + "- SoC_t : state-of-charge (kWh)\n"
        "- binary indicators y_c_t, y_d_t to prevent simultaneous charge & discharge.\n\n"
    )

    # Objective.
    lines.append("The objective is to MINIMIZE total net energy cost over the day:\n")
    if allow_export:
        lines.append("  sum_t dt * ( p(t) * imp_t  -  p(t) * exp_t )\n\n")
    else:
        lines.append("  sum_t dt * ( p(t) * imp_t )\n\n")

    # Constraints.
    lines.append("Subject to, for all t:\n")
    lines.append("- 0 ≤ c_t ≤ max_charge_kw\n")
    lines.append("- 0 ≤ d_t ≤ max_discharge_kw\n")
    lines.append("- SoC_{t+1} = SoC_t + (eta_c * c_t * dt - d_t * dt / eta_d)\n")
    lines.append("- soc_min * capacity_kwh ≤ SoC_t ≤ soc_max * capacity_kwh\n")
    if allow_export:
        lines.append(
            "- imp_t - exp_t + d_t = D(t) + c_t  (supply-demand balance)\n"
        )
    else:
        lines.append(
            "- imp_t + d_t ≥ D(t) + c_t  (no export, only net import)\n"
        )
    lines.append("- y_c_t + y_d_t ≤ 1 (no simultaneous charge & discharge)\n\n")

    # Time-series table. Iterate column-wise rather than with iterrows():
    # iterrows() coerces every row to a single dtype, which can turn an integer
    # period index into a float ("1.0") when all columns are numeric.
    if "t_index" in df_day.columns:
        period_labels = df_day["t_index"]
    else:
        period_labels = range(1, T + 1)

    lines.append("Here is the time series data for this day:\n")
    lines.append("t, timestamp, price, demand\n")
    lines.extend(
        f"{t}, {stamp}, {price:.4f}, {demand:.4f}\n"
        for t, stamp, price, demand in zip(
            period_labels,
            df_day["timestamps"],
            df_day["prices"],
            df_day["consumption"],
        )
    )

    # Task statement and required answer format.
    lines.append(
        "\nTask: Solve this optimization problem and output the optimal trajectories "
        "for the day: charge_kw[t], discharge_kw[t], import_kw[t]"
    )
    if allow_export:
        lines.append(", export_kw[t]")
    lines.append(", and soc[t].\n")
    lines.append(
        "Provide your final answer as a JSON object with keys "
        "\"charge_kw\", \"discharge_kw\", \"import_kw\""
    )
    if allow_export:
        lines.append(", \"export_kw\"")
    lines.append(", \"soc\".\n")

    return "".join(lines)


def build_solution_text(sol, allow_export: bool) -> str:
    """
    Build the 'solution' text: short explanation + JSON schedules + objective.

    Args:
        sol: Solver response exposing charge_kw, discharge_kw, import_kw and
            objective_cost; soc and export_kw are read with getattr and
            serialized as null when absent. Presumably a SolveResponse from the
            project schemas — any object with these attributes works.
        allow_export: When True, an "export_kw" entry is included.

    Returns:
        The explanation, the schedules as indented JSON, and the objective
        value, concatenated into one string.
    """
    # Key order mirrors the answer format requested in the problem prompt:
    # charge_kw, discharge_kw, import_kw, [export_kw], soc.
    schedules = {
        "charge_kw": sol.charge_kw,
        "discharge_kw": sol.discharge_kw,
        "import_kw": sol.import_kw,
    }
    if allow_export:
        schedules["export_kw"] = getattr(sol, "export_kw", None)
    schedules["soc"] = getattr(sol, "soc", None)

    explanation_lines: List[str] = [
        "We solve the described mixed-integer linear program (MILP) with the given "
        "battery parameters and time series of prices and demand.\n",
        "The optimal policy charges during low-price periods and discharges during "
        "high-price periods, subject to power and SoC constraints.\n\n",
        "The optimal schedules are:\n",
        json.dumps(schedules, indent=2),
        "\n\n",
        f"Total minimum cost over the horizon (objective value): {sol.objective_cost}.\n",
    ]
    return "".join(explanation_lines)


# ---------- MAIN DATASET BUILDER ----------

def main():
    """
    Build the JSONL dataset: one (problem, solution) example per calendar day.

    Reads DATA_CSV, splits it into days, solves each day with
    milp_solve_from_records using DEFAULT_BATTERY / DT_HOURS / ALLOW_EXPORT,
    and writes one JSON object per line to OUTPUT_JSONL. Days whose solver
    status is not "optimal" (case-insensitive) are reported and skipped.

    Raises:
        FileNotFoundError: If DATA_CSV does not exist.
        KeyError: If the CSV lacks a "timestamps", "prices" or "consumption"
            column.
    """
    if not DATA_CSV.exists():
        raise FileNotFoundError(f"CSV file not found: {DATA_CSV}")

    df = pd.read_csv(DATA_CSV)

    # Fail early with a clear message instead of a bare pandas KeyError later.
    missing = [
        col for col in ("timestamps", "prices", "consumption")
        if col not in df.columns
    ]
    if missing:
        raise KeyError(f"{DATA_CSV} is missing required column(s): {missing}")

    # Parse timestamps and add date + index within day
    df["timestamps"] = pd.to_datetime(df["timestamps"])
    df["date"] = df["timestamps"].dt.date

    # Sort by time just in case
    df = df.sort_values("timestamps").reset_index(drop=True)

    # For each day, assign t_index = 1..T
    df["t_index"] = df.groupby("date").cumcount() + 1

    # Keep the full count separately so the "Found" message is not affected
    # by the MAX_DAYS truncation.
    all_dates = sorted(df["date"].unique())
    selected_dates = all_dates if MAX_DAYS is None else all_dates[:MAX_DAYS]

    print(f"Found {len(all_dates)} unique days in {DATA_CSV}.")
    print(f"Building dataset for {len(selected_dates)} days...")

    # Group once; get_group() then avoids re-scanning the whole frame per day.
    by_date = df.groupby("date", sort=False)

    n_written = 0
    with OUTPUT_JSONL.open("w", encoding="utf-8") as f_out:
        for d in selected_dates:
            df_day = by_date.get_group(d)

            # Build EnergyDataRecord list for this day
            # (add other fields here if your EnergyDataRecord has them).
            records: List[EnergyDataRecord] = [
                EnergyDataRecord(
                    timestamps=stamp.isoformat(),
                    prices=float(price),
                    consumption=float(load),
                )
                for stamp, price, load in zip(
                    df_day["timestamps"], df_day["prices"], df_day["consumption"]
                )
            ]

            solve_req = SolveFromRecordsRequest(
                battery=DEFAULT_BATTERY,
                records=records,
                dt_hours=DT_HOURS,
                allow_export=ALLOW_EXPORT,
                solver=None,       # let cvxpy pick GUROBI/CPLEX/etc if installed
                solver_opts=None,
            )

            # Run the MILP solver
            sol = milp_solve_from_records(solve_req)

            # Only exactly-optimal solutions become training targets.
            if str(sol.status).lower() != "optimal":
                print(f"Skipping {d}: solver status = {sol.status}")
                continue

            example = {
                "problem": build_problem_text(
                    date_str=str(d),
                    battery=DEFAULT_BATTERY,
                    df_day=df_day,
                    dt_hours=DT_HOURS,
                    allow_export=ALLOW_EXPORT,
                ),
                "solution": build_solution_text(sol, allow_export=ALLOW_EXPORT),
            }

            f_out.write(json.dumps(example) + "\n")
            n_written += 1

    print(f"Done. Wrote {n_written} examples to {OUTPUT_JSONL}")


# Build the dataset when this file is run as a script (or executed as a
# notebook cell); importing it as a module does not trigger the build.
if __name__ == "__main__":
    main()


In [2]:
!nvidia-smi

/bin/bash: line 1: nvidia-smi: command not found
