## **2.1 Load Libraries & Segmentation**

In [1]:
# === 1. Import Libraries ===
import os
import random
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# === 2. Define Project Paths ===
# Every artefact of this phase is written below result/phase_<N>/,
# split into a "plot" and a "data" sub-folder.
PHASE_NUMBER = 2

RESULT_DIR = f"result/phase_{PHASE_NUMBER}"
PLOT_DIR, DATA_DIR = (os.path.join(RESULT_DIR, leaf) for leaf in ("plot", "data"))

# Create both output folders up front; folders that already exist are left alone.
for _out_dir in (PLOT_DIR, DATA_DIR):
    os.makedirs(_out_dir, exist_ok=True)

print(f"Phase {PHASE_NUMBER} directories created/verified:")
print(f"  Plots: {PLOT_DIR}")
print(f"  Data:  {DATA_DIR}")

# === 3. Set Random Seed for Reproducibility ===
# The same seed is applied to the stdlib generator and to NumPy's global generator.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
print(f"Random seed set to {SEED}")

# === 4. Matplotlib Plotting Settings (for IEEE) ===
def setup_ieee_plots():
    """Install the notebook's IEEE-oriented figure style into Matplotlib's rcParams.

    The settings are applied globally via ``plt.rcParams.update`` and cover
    figure size and DPI, serif fonts and font sizes, line and marker sizes,
    and a dashed, semi-transparent grid. A confirmation line is printed
    once the settings are in place. Takes no arguments and returns ``None``.
    """
    figure_style = {
        'figure.figsize': (8, 5),
        'figure.dpi': 300,
    }
    text_style = {
        'font.family': 'serif',
        'font.size': 12,
        'axes.titlesize': 14,
        'axes.labelsize': 12,
        'xtick.labelsize': 10,
        'ytick.labelsize': 10,
        'legend.fontsize': 10,
    }
    line_style = {
        'lines.linewidth': 2,
        'lines.markersize': 5,
    }
    grid_style = {
        'grid.alpha': 0.3,
        'grid.linestyle': '--',
        'axes.grid': True,
    }

    # The groups share no keys, so the merge order does not affect the result.
    plt.rcParams.update({**figure_style, **text_style, **line_style, **grid_style})
    print("IEEE plot settings applied.")

setup_ieee_plots()

# === 5. Load Phase 1 Segmentation ===
cycles_path = Path('./result/phase_1/data/cycles.pkl')

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load a cycles.pkl that was produced locally by a trusted run; never
# point this at a file obtained from an untrusted source.
try:
    with open(cycles_path, 'rb') as f:
        cycles = pickle.load(f)
except FileNotFoundError as exc:
    # Same exception type as before, but with a message that says what is missing.
    raise FileNotFoundError(
        f"Phase 1 segmentation file not found: {cycles_path}. "
        "Generate it before running this notebook."
    ) from exc

# The summary below indexes cycles[0] and cycles[-1]; fail with a clear message
# instead of a bare IndexError when the segmentation is empty.
if len(cycles) == 0:
    raise ValueError(f"{cycles_path} contains no discharge cycles; nothing to process.")

# Each entry is read through its 'start_idx' and 'end_idx' keys.
print(f"\nLoaded {len(cycles)} discharge cycles from {cycles_path}")
print(f"  First cycle: start_idx={cycles[0]['start_idx']}, end_idx={cycles[0]['end_idx']}")
print(f"  Last cycle: start_idx={cycles[-1]['start_idx']}, end_idx={cycles[-1]['end_idx']}")

Phase 2 directories created/verified:
  Plots: result/phase_2\plot
  Data:  result/phase_2\data
Random seed set to 42
IEEE plot settings applied.

Loaded 1241 discharge cycles from result\phase_1\data\cycles.pkl
  First cycle: start_idx=12966, end_idx=16645
  Last cycle: start_idx=20662901, end_idx=20664702


## **2.2 Define Per-Cycle Feature Computations**

In [2]:
# Sampling interval in seconds, consumed by the per-cycle feature computations.
# NOTE(review): assumes the raw data is uniformly sampled at this interval -- confirm.
dt = 1.0

# Assemble the summary first, then emit it in a single print call.
_parameter_summary = (
    "Feature computation parameters defined:",
    f"  Sampling interval (dt): {dt} s",
    f"  Features to compute: 8 (capacity_Ah, energy_Wh, duration_s, v_min, v_max, v_mean, i_rms, dVdt_abs_mean)",
)
print("\n".join(_parameter_summary))

Feature computation parameters defined:
  Sampling interval (dt): 1.0 s
  Features to compute: 8 (capacity_Ah, energy_Wh, duration_s, v_min, v_max, v_mean, i_rms, dVdt_abs_mean)


## **2.3 Compute Features for All Discharge Cycles**

In [3]:
raw_data_path = './data/data.csv'
print(f"Loading raw data from {raw_data_path}...")

# The CSV has no header row; the two columns are named Current and Voltage here.
df_raw = pd.read_csv(raw_data_path, header=None, names=['Current', 'Voltage'])
print(f"Loaded {len(df_raw)} rows")


def _compute_cycle_features(current, voltage, dt):
    """Compute the eight scalar features of one discharge cycle.

    Parameters
    ----------
    current, voltage : 1-D numpy arrays of equal, non-zero length
        Samples of a single cycle.
        NOTE(review): presumably amperes and volts, given the Ah/Wh feature
        names -- confirm against the data source.
    dt : float
        Sampling interval in seconds.

    Returns
    -------
    dict
        Keys: capacity_Ah, energy_Wh, duration_s, v_min, v_max, v_mean,
        i_rms, dVdt_abs_mean (in that order).

    Raises
    ------
    ValueError
        If the segment is empty or the two arrays differ in length.
    """
    n_samples = len(current)
    if n_samples == 0 or n_samples != len(voltage):
        raise ValueError(
            f"Invalid cycle segment: {n_samples} current samples, {len(voltage)} voltage samples"
        )

    abs_current = np.abs(current)
    dV = np.diff(voltage)

    # A one-sample cycle has no voltage step, so the derivative is undefined.
    # Yield NaN explicitly rather than through a "mean of empty slice" warning.
    if len(dV) > 0:
        dVdt_abs_mean = np.mean(np.abs(dV / dt))
    else:
        dVdt_abs_mean = float('nan')

    return {
        # Rectangle-rule integrals; /3600 converts seconds to hours.
        'capacity_Ah': np.sum(abs_current) * dt / 3600.0,
        'energy_Wh': np.sum(voltage * abs_current) * dt / 3600.0,
        'duration_s': n_samples * dt,
        'v_min': np.min(voltage),
        'v_max': np.max(voltage),
        'v_mean': np.mean(voltage),
        'i_rms': np.sqrt(np.mean(current**2)),
        'dVdt_abs_mean': dVdt_abs_mean,
    }


# Pull the columns out once: slicing NumPy arrays inside the loop avoids
# repeating two pandas .loc lookups for every cycle.
current_all = df_raw['Current'].to_numpy()
voltage_all = df_raw['Voltage'].to_numpy()
n_rows = len(df_raw)

features_list = []

print(f"\nComputing features for {len(cycles)} discharge cycles...")
for i, cycle in enumerate(cycles):
    start_idx = cycle['start_idx']
    end_idx = cycle['end_idx']

    # Reject indices outside the raw data instead of silently truncating or
    # wrapping around; such a cycle would produce misleading features.
    if not (0 <= start_idx <= end_idx < n_rows):
        raise ValueError(
            f"Cycle {i + 1} has invalid bounds start_idx={start_idx}, end_idx={end_idx} "
            f"for raw data with {n_rows} rows"
        )

    # end_idx is inclusive: the previous .loc label slice included it, and
    # positional slicing on the default 0..n-1 index needs end_idx + 1 to match.
    segment = slice(start_idx, end_idx + 1)
    current = current_all[segment]
    voltage = voltage_all[segment]

    # cycle_idx is 1-based and comes first so the column order is unchanged.
    row = {'cycle_idx': i + 1}
    row.update(_compute_cycle_features(current, voltage, dt))
    features_list.append(row)

    if (i + 1) % 200 == 0 or (i + 1) == len(cycles):
        print(f"  Processed {i + 1}/{len(cycles)} cycles")

df_features = pd.DataFrame(features_list)
print(f"\nFeature extraction complete: {len(df_features)} cycles")
print(f"Feature table shape: {df_features.shape}")
print(f"\nFirst 5 cycles:")
print(df_features.head())

Loading raw data from ./data/data.csv...
Loaded 22714175 rows

Computing features for 1241 discharge cycles...
  Processed 200/1241 cycles
  Processed 400/1241 cycles
  Processed 600/1241 cycles
  Processed 800/1241 cycles
  Processed 1000/1241 cycles
  Processed 1200/1241 cycles
  Processed 1241/1241 cycles

Feature extraction complete: 1241 cycles
Feature table shape: (1241, 9)

First 5 cycles:
   cycle_idx  capacity_Ah  energy_Wh  duration_s    v_min    v_max    v_mean  \
0          1     3.270184  11.696697      3680.0  3.00000  4.08156  3.576643   
1          2     3.266630  11.682788      3676.0  3.00000  4.08171  3.576276   
2          3     3.265743  11.679512      3675.0  3.00000  4.08232  3.576245   
3          4     3.266631  11.682687      3676.0  2.99985  4.08324  3.576245   
4          5     3.255080  11.641008      3663.0  3.00000  4.08309  3.576131   

      i_rms  dVdt_abs_mean  
0  3.199528       0.000362  
1  3.199529       0.000363  
2  3.199531       0.000360  
3  

## **2.4 Assemble Feature Table & Basic QA (NaNs, Infs, Ranges)**

In [4]:
print("=== Feature Table Quality Assurance ===\n")

# --- Missing values: one count per column, plus the grand total ---
nan_counts = df_features.isna().sum()
print("NaN counts per column:")
print(nan_counts)
print(f"\nTotal NaNs: {nan_counts.sum()}")

# --- Infinite values: np.isinf is applied to the numeric columns only ---
numeric_part = df_features.select_dtypes(include=[np.number])
inf_counts = np.isinf(numeric_part).sum()
print("\nInf counts per column:")
print(inf_counts)
print(f"\nTotal Infs: {inf_counts.sum()}")

# --- Descriptive statistics of every column ---
print("\n=== Feature Statistics ===")
print(df_features.describe())

# --- Observed [min, max] interval of each feature (cycle_idx excluded) ---
print("\n=== Feature Ranges ===")
feature_cols = ['capacity_Ah', 'energy_Wh', 'duration_s', 'v_min', 'v_max', 'v_mean', 'i_rms', 'dVdt_abs_mean']
for col in feature_cols:
    column = df_features[col]
    lowest, highest = column.min(), column.max()
    print(f"{col:20s}: [{lowest:.6f}, {highest:.6f}]")

print("\nQuality assurance complete")

=== Feature Table Quality Assurance ===

NaN counts per column:
cycle_idx        0
capacity_Ah      0
energy_Wh        0
duration_s       0
v_min            0
v_max            0
v_mean           0
i_rms            0
dVdt_abs_mean    0
dtype: int64

Total NaNs: 0

Inf counts per column:
cycle_idx        0
capacity_Ah      0
energy_Wh        0
duration_s       0
v_min            0
v_max            0
v_mean           0
i_rms            0
dVdt_abs_mean    0
dtype: int64

Total Infs: 0

=== Feature Statistics ===
         cycle_idx  capacity_Ah    energy_Wh   duration_s        v_min  \
count  1241.000000  1241.000000  1241.000000  1241.000000  1241.000000   
mean    621.000000     2.720129     9.633363  3061.181305     3.000215   
std     358.390151     0.390261     1.426536   439.049050     0.008109   
min       1.000000     1.600867     5.569795  1802.000000     2.999690   
25%     311.000000     2.675526     9.456238  3011.000000     3.000000   
50%     621.000000     2.820412     9.9943

## **2.5 Save Features**

In [5]:
# Persist the feature table as CSV in this phase's data folder.
# The DataFrame index is not written.
features_path = os.path.join(DATA_DIR, 'features.csv')
df_features.to_csv(features_path, index=False)

# Report what was written.
column_names = list(df_features.columns)
n_cycles = len(df_features)
print(f"Features saved to: {features_path}")
print(f"  Shape: {df_features.shape}")
print(f"  Columns: {column_names}")
print(f"  Cycles: {n_cycles}")

Features saved to: result/phase_2\data\features.csv
  Shape: (1241, 9)
  Columns: ['cycle_idx', 'capacity_Ah', 'energy_Wh', 'duration_s', 'v_min', 'v_max', 'v_mean', 'i_rms', 'dVdt_abs_mean']
  Cycles: 1241
