In [None]:
import sys
import os

# Make the directory above the current working directory importable.
# NOTE(review): presumably the notebook lives one level below the project
# root -- confirm against the repository layout.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, os.pardir))

# Register the path only once so re-running the cell does not grow sys.path.
if not any(entry == project_root for entry in sys.path):
    sys.path.append(project_root)

In [3]:
import numpy as np
import pandas as pd
from scipy.io import loadmat
from pathlib import Path
from typing import Optional, List, Dict, Any
import warnings

# Silence every FutureWarning raised in this process. The filter is not
# restricted to pandas: no `module=` or `message=` pattern is given, so
# FutureWarnings from any library are hidden as well.
warnings.filterwarnings("ignore", category=FutureWarning)


# Denominator of the state-of-health ratio computed for each discharge cycle.
RATED_CAPACITY = 2.0  # Rated capacity of the 18650 battery in Ah

def _extract_discharge_features(data: Dict[str, Any], cycle_idx: int) -> Optional[Dict[str, Any]]:
    """
    Extracts high-fidelity features from a single discharge cycle.
    Calculates Internal Resistance (IR) using dV/dI.
    """
    try:
        # Flatten arrays to 1D
        V = np.array(data["Voltage_measured"]).flatten()
        I = np.array(data["Current_measured"]).flatten()
        T = np.array(data["Temperature_measured"]).flatten()
        t = np.array(data["Time"]).flatten()
        
        # Get the raw object
        cap_raw = data.get("Capacity")
        
        
        if cap_raw is None:
            return None
            
        # Convert to numpy array safely. This handles both [1.5] (array) and 1.5 (scalar)
        cap_arr = np.array(cap_raw).flatten()
        
       
        if cap_arr.size == 0:
            return None
            
        # Extract the value
        cap = float(cap_arr[0])
       
        
    except (KeyError, IndexError, ValueError) as e:
        return None

    # PHYSICS FEATURE: Internal Resistance (IR) 
    # IR ≈ ΔVoltage / ΔCurrent at the very start of discharge
    # We look at the first 5 data points
    if len(V) < 6 or len(I) < 6:
        internal_resistance = np.nan
    else:
        delta_v = np.abs(np.diff(V[:5]))
        delta_i = np.abs(np.diff(I[:5]))
        
        # Avoid division by zero
        with np.errstate(divide="ignore", invalid="ignore"):
            ir_samples = np.where(delta_i > 1e-6, delta_v / delta_i, np.nan)
        
        internal_resistance = float(np.nanmedian(ir_samples))

    #STATISTICAL FEATURES 
    return {
        "cycle_index": cycle_idx,
        "type": "discharge",
        "discharge_capacity": cap,
        "soh": cap / RATED_CAPACITY,             # State of Health (0.0 - 1.0)
        "avg_voltage_load": float(np.mean(V)),   # Average voltage under load
        "min_voltage_load": float(np.min(V)),    # End-of-discharge voltage
        "max_temperature": float(np.max(T)),     # Peak temp (safety metric)
        "avg_temperature": float(np.mean(T)),    # Thermal stress metric
        "time_to_discharge": float(t[-1] - t[0]),# How long it lasted
        "internal_resistance": internal_resistance
    }

def _extract_impedance_features(data: Dict[str, Any], cycle_idx: int) -> Optional[Dict[str, Any]]:
    """
    Extracts Re (Electrolyte Resistance) and Rct (Charge Transfer Resistance)
    from Impedance cycles.
    """
    try:
        if "Re" not in data or "Rct" not in data:
            return None
        
        re = float(np.array(data["Re"]).flatten()[-1])
        rct = float(np.array(data["Rct"]).flatten()[-1])
    except (KeyError, IndexError, ValueError):
        return None

    return {
        "cycle_index": cycle_idx, 
        "type": "impedance", 
        "re": re, 
        "rct": rct
    }

def _merge_impedance(df_dis: pd.DataFrame, df_imp: pd.DataFrame) -> pd.DataFrame:
    """
    Merges Impedance data (which happens rarely) into Discharge data (which happens often).
    Uses 'backward' merge_asof to attach the *most recent* impedance test to the current cycle.
    """
    if df_imp.empty:
        df_dis["re"] = np.nan
        df_dis["rct"] = np.nan
        return df_dis

    df_dis_sorted = df_dis.sort_values("cycle_index")
    df_imp_sorted = df_imp.sort_values("cycle_index")

    # Merge: For every discharge cycle, find the NEAREST PREVIOUS impedance cycle
    merged = pd.merge_asof(
        df_dis_sorted,
        df_imp_sorted[["cycle_index", "re", "rct"]],
        on="cycle_index",
        direction="backward" 
    )
    
    # Forward fill any remaining gaps (if discharge happened before first impedance)
    merged["re"] = merged["re"].ffill().bfill()
    merged["rct"] = merged["rct"].ffill().bfill()

    return merged

def process_battery_file(mat_path: Path, battery_id: str) -> pd.DataFrame:
    """
    Build the per-discharge-cycle feature table for one battery .mat file.

    Args:
        mat_path: Path of the .mat file to load.
        battery_id: Name of the top-level variable expected to hold the
            cycle data (e.g. 'B0005'); also written to the "battery_id"
            column of the result.

    Returns:
        One row per usable discharge cycle, with impedance readings merged
        in and "battery_id" as the first column. An empty DataFrame when no
        discharge cycle could be extracted.

    Raises:
        KeyError: If the file has no variable containing a 'cycle' field.
    """
    # simplify_cells=True turns MATLAB structs into plain dicts.
    mat = loadmat(str(mat_path), simplify_cells=True)

    # The cycle data normally sits under the battery id. If it does not, fall
    # back to the first top-level struct that has a "cycle" field. This is a
    # structural check; no variable is stringified.
    if battery_id in mat:
        cycle_key = battery_id
    else:
        cycle_key = next(
            (
                key
                for key, value in mat.items()
                if isinstance(value, dict) and "cycle" in value
            ),
            None,
        )
    if cycle_key is None:
        raise KeyError(f"No variable with a 'cycle' field found in {mat_path}")

    cycles = mat[cycle_key]["cycle"]
    # A struct array with a single element is simplified to one dict rather
    # than a list of dicts; normalise so the loop sees cycles, not dict keys.
    if isinstance(cycles, dict):
        cycles = [cycles]

    discharge_rows = []
    impedance_rows = []

    for idx, cycle in enumerate(cycles):
        if not isinstance(cycle, dict):
            # Not a cycle record; skip it but keep idx aligned with the file.
            continue

        cycle_type = str(cycle.get("type", "")).strip().lower()
        data = cycle.get("data", {})

        # Other cycle types are ignored.
        if cycle_type == "discharge":
            feat = _extract_discharge_features(data, idx)
            if feat:
                discharge_rows.append(feat)
        elif cycle_type == "impedance":
            feat = _extract_impedance_features(data, idx)
            if feat:
                impedance_rows.append(feat)

    if not discharge_rows:
        print(f"Warning: No discharge cycles found for {battery_id}")
        return pd.DataFrame()

    df_dis = pd.DataFrame(discharge_rows)
    df_imp = pd.DataFrame(impedance_rows)

    # Merge the two data streams
    df = _merge_impedance(df_dis, df_imp)

    df.insert(0, "battery_id", battery_id)
    return df

def main(
    raw_dir: Optional[Path] = None,
    processed_file: Optional[Path] = None,
    batteries: Optional[List[str]] = None,
) -> None:
    """
    Process a set of battery .mat files and write the combined features to CSV.

    Args:
        raw_dir: Directory containing "<battery_id>.mat" files. Defaults to
            "../data/Battery/1. BatteryAgingARC-FY08Q4" (relative to the
            current working directory).
        processed_file: Destination CSV. Defaults to
            "../data/processed/final_battery_features.csv".
        batteries: Battery ids to process. Defaults to
            ["B0005", "B0006", "B0007", "B0018"].

    Missing files are reported and skipped. Batteries that yield no rows are
    left out of the output. When nothing was extracted, no CSV is written.
    """
    raw_dir = Path("../data/Battery/1. BatteryAgingARC-FY08Q4") if raw_dir is None else Path(raw_dir)
    processed_file = (
        Path("../data/processed/final_battery_features.csv")
        if processed_file is None
        else Path(processed_file)
    )
    batteries = ["B0005", "B0006", "B0007", "B0018"] if batteries is None else list(batteries)

    # Ensure processed directory exists
    processed_file.parent.mkdir(parents=True, exist_ok=True)

    all_dfs = []

    print(f"Starting processing for: {batteries}")

    for batt_id in batteries:
        file_path = raw_dir / f"{batt_id}.mat"
        if not file_path.exists():
            print(f"Skipping {batt_id}: File not found at {file_path}")
            continue

        print(f"Processing {batt_id}...")
        df = process_battery_file(file_path, batt_id)
        print(f"  -> Extracted {len(df)} cycles.")

        # Skip empty frames so a run without usable data is not reported as
        # a success and does not produce an empty CSV.
        if not df.empty:
            all_dfs.append(df)

    if all_dfs:
        final_df = pd.concat(all_dfs, ignore_index=True)
        final_df.to_csv(processed_file, index=False)
        print(f"\nSuccess! Saved processed data to: {processed_file}")
        print(final_df.head())
    else:
        print("\nNo data processed.")


if __name__ == "__main__":
    main()

Starting processing for: ['B0005', 'B0006', 'B0007', 'B0018']
Processing B0005...
  -> Extracted 168 cycles.
Processing B0006...
  -> Extracted 168 cycles.
Processing B0007...
  -> Extracted 168 cycles.
Processing B0018...
  -> Extracted 132 cycles.

Success! Saved processed data to: ..\data\processed\final_battery_features.csv
  battery_id  cycle_index       type  discharge_capacity       soh  \
0      B0005            1  discharge            1.856487  0.928244   
1      B0005            3  discharge            1.846327  0.923164   
2      B0005            5  discharge            1.835349  0.917675   
3      B0005            7  discharge            1.835263  0.917631   
4      B0005            9  discharge            1.834646  0.917323   

   avg_voltage_load  min_voltage_load  max_temperature  avg_temperature  \
0          3.529829          2.612467        38.982181        32.572328   
1          3.537320          2.587209        39.033398        32.725235   
2          3.543737     

The cycle_index values above skip the even numbers because, in the NASA dataset, those cycles are charge cycles (putting energy into the cell). We only care about discharge cycles.