In [None]:

from pathlib import Path

import pandas as pd


# ---------------------------------------------------------------------------
# Load and clean the raw CSV.
#
# Produces:
#   df           - cleaned DataFrame with descriptive column names, numeric
#                  feature columns, an integer label (1 = "stable",
#                  0 = "unstable"), no NaNs and no duplicate rows.
#   feature_cols - Index of every column except "stability_label".
# ---------------------------------------------------------------------------

# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory so the notebook runs on other machines.
path = "/Users/omanand/Downloads/Data_for_UCI_named.csv"

raw_df = pd.read_csv(path)

# Raw header -> descriptive column name.
column_names = {
    "tau1": "time_const1",
    "tau2": "time_const2",
    "tau3": "time_const3",
    "tau4": "time_const4",
    "p1": "power_out1",
    "p2": "power_out2",
    "p3": "power_out3",
    "p4": "power_out4",
    "g1": "generator_load1",
    "g2": "generator_load2",
    "g3": "generator_load3",
    "g4": "generator_load4",
    "stab": "stability_index",
    "stabf": "stability_label",
}

# Text label -> integer class code.
label_codes = {"stable": 1, "unstable": 0}


def _normalise_label(value):
    """Strip whitespace and lower-case string labels; pass other values through."""
    return value.strip().lower() if isinstance(value, str) else value


# rename() returns a new frame, so raw_df stays untouched for inspection.
df = raw_df.rename(columns=column_names)

# Normalise before mapping so "Stable" or " unstable " are not lost as NaN.
label_text = df["stability_label"].map(_normalise_label)
encoded_labels = label_text.map(label_codes)

# Labels that are present but unknown map to NaN and are removed by dropna()
# below; report them instead of letting them vanish silently.
unrecognised = encoded_labels.isna() & label_text.notna()
if unrecognised.any():
    print(
        f"Warning: dropping {int(unrecognised.sum())} row(s) with unrecognised "
        f"stability labels: {sorted(label_text[unrecognised].astype(str).unique())}"
    )

df["stability_label"] = encoded_labels

# NOTE(review): feature_cols includes "stability_index". If the label is
# derived from that column, it leaks the target and should be excluded before
# modelling - confirm against the dataset documentation.
feature_cols = df.columns.drop("stability_label")

# Non-numeric feature values become NaN and are dropped with the other
# incomplete rows.
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")

# De-duplicate after coercion so rows that are equal once parsed collapse too,
# then pin the label to an integer dtype (it is float if any NaN appeared).
df = (
    df.dropna()
    .drop_duplicates()
    .astype({"stability_label": "int64"})
)

# Persist the cleaned frame and report what was written.
# NOTE(review): absolute, machine-specific path; consider deriving it from a
# configurable data directory.
output_path = '/Users/omanand/Auralis/data/processed/cleaned_data.csv'

# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists before writing.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

df.to_csv(output_path, index=False)
print(f"Cleaned Data saved at {output_path}")
print(f"Shape after cleaning: {df.shape}")
print(f"Feature columns: {list(feature_cols)}")


# --- Summary report: size, class balance, feature statistics, NaN audit ---

# Overall dimensions of the cleaned frame.
print("\n === Dataset Summary ===")
print(f"Total rows: {len(df)}")
print(f"Total columns: {len(df.columns)}")
print(f"Feature columns ({len(feature_cols)}): {list(feature_cols)}")

# Share of each class as a percentage of all rows.
class_counts = df["stability_label"].value_counts(normalize=True).mul(100)
print("\n Stability Label Distribution (%):")
status_by_code = {1: "Stable"}  # every other code is reported as "Unstable"
for label, pct in class_counts.items():
    status = status_by_code.get(label, "Unstable")
    print(f"  {status}: {pct:.2f}%")

# Descriptive statistics, one row per feature column.
print("\n Feature Statistics:")
feature_stats = df[feature_cols].describe().transpose()
print(feature_stats)

# Final audit: list any column that still contains missing values.
remaining_nans = df.isna().sum()
columns_with_nans = remaining_nans[remaining_nans > 0]
if columns_with_nans.empty:
    print("\nNo remaining NaNs in dataset.")
else:
    print("\nColumns with remaining NaNs:")
    print(columns_with_nans)






Cleaned Data saved at /Users/omanand/Auralis/data/processed/cleaned_data.csv
Shape after cleaning: (10000, 14)
Feature columns: ['time_const1', 'time_const2', 'time_const3', 'time_const4', 'power_out1', 'power_out2', 'power_out3', 'power_out4', 'generator_load1', 'generator_load2', 'generator_load3', 'generator_load4', 'stability_index']

 === Dataset Summary ===
Total rows: 10000
Total columns: 14
Feature columns (13): ['time_const1', 'time_const2', 'time_const3', 'time_const4', 'power_out1', 'power_out2', 'power_out3', 'power_out4', 'generator_load1', 'generator_load2', 'generator_load3', 'generator_load4', 'stability_index']

 Stability Label Distribution (%):
  Unstable: 63.80%
  Stable: 36.20%

 Feature Statistics:
                   count      mean       std       min       25%       50%  \
time_const1      10000.0  5.250000  2.742548  0.500793  2.874892  5.250004   
time_const2      10000.0  5.250001  2.742549  0.500141  2.875140  5.249981   
time_const3      10000.0  5.250004  