Data Cleaning

In [None]:
import pandas as pd

In [None]:

# Read each quality class from its own CSV export.
good_df = pd.read_csv("good.csv")
low_bad_df = pd.read_csv("low bad.csv")
high_bad_df = pd.read_csv("high bad.csv")

# Tag every row with the class of the file it was read from, so the label
# is still available once the three frames are stacked together.
for class_label, class_df in (
    ('good', good_df),
    ('low_bad', low_bad_df),
    ('high_bad', high_bad_df),
):
    class_df['Quality'] = class_label

# Stack the frames in the order good, low_bad, high_bad; ignore_index=True
# replaces the per-file indexes with one fresh 0..n-1 index.
frames = [good_df, low_bad_df, high_bad_df]
combined_df = pd.concat(frames, ignore_index=True)

# Show the first rows as a quick sanity check.
print(combined_df.head())

In [None]:
# Normalise the column labels: trim surrounding whitespace, lower-case,
# and swap spaces for underscores (e.g. "Set Time" becomes "set_time").
raw_labels = combined_df.columns
tidy_labels = raw_labels.str.strip().str.lower().str.replace(" ", "_")
combined_df.columns = tidy_labels

# Parse set_time into datetimes; dayfirst=True makes ambiguous dates
# read as day/month rather than month/day.
set_time_parsed = pd.to_datetime(combined_df['set_time'], dayfirst=True)
combined_df['set_time'] = set_time_parsed


In [None]:
# Preview all columns
print("All columns:\n", combined_df.columns.tolist())

# Group columns by naming pattern. These are plain substring tests, so a
# name containing both "_sp" and "_pv" would land in both lists.
sp_columns = [col for col in combined_df.columns if "_sp" in col]
pv_columns = [col for col in combined_df.columns if "_pv" in col]

# Metadata columns (manually listed)
meta_columns = ['vyp_batch', 'part', 'set_time', 'quality']

# Everything not claimed by a group above. Filtering in DataFrame column
# order (instead of subtracting sets) keeps the result reproducible from run
# to run; dict.fromkeys drops repeated names while preserving that order.
grouped_columns = set(sp_columns) | set(pv_columns) | set(meta_columns)
other_columns = [
    col
    for col in dict.fromkeys(combined_df.columns)
    if col not in grouped_columns
]

# Print summaries
print(f"\nSet Point (SP) columns: {len(sp_columns)} →", sp_columns)
print(f"\nProcess Variable (PV) columns: {len(pv_columns)} →", pv_columns)
print(f"\nOther columns: {len(other_columns)} →", other_columns)


In [None]:
# Lag-1 feature for every SP/PV column. combined_df stacks three separate
# files, so the shift is done within each 'quality' group: the first row of
# a file gets NaN instead of the last value of the previous file.
source_class = combined_df['quality']
for col in sp_columns + pv_columns:
    combined_df[f"{col}_lag1"] = (
        combined_df[col].groupby(source_class, sort=False).shift(1)
    )


In [None]:
# 3-row rolling mean for every SP/PV column. combined_df stacks three
# separate files, so the window is applied within each 'quality' group and
# never averages rows from two different files; the first two rows of each
# group are NaN because the window is not yet full.
source_class = combined_df['quality']
for col in sp_columns + pv_columns:
    combined_df[f"{col}_roll3"] = (
        combined_df[col]
        .groupby(source_class, sort=False)
        .transform(lambda values: values.rolling(window=3).mean())
    )


In [None]:
# Row-to-row change for every SP/PV column. combined_df stacks three
# separate files, so the difference is taken within each 'quality' group:
# the first row of a file gets NaN rather than a jump measured against the
# last row of the previous file.
source_class = combined_df['quality']
for col in sp_columns + pv_columns:
    combined_df[f"{col}_delta"] = (
        combined_df[col].groupby(source_class, sort=False).diff()
    )


In [None]:
# One-hot encode 'part': the column is replaced by one indicator column
# per distinct value.
categorical_columns = ['part']
combined_df = pd.get_dummies(data=combined_df, columns=categorical_columns)


In [None]:
# Discard, in place, every row that has a missing value in any column.
# This removes the leading rows left NaN by the lag/rolling/delta features,
# and also any row that was already incomplete in the source data.
combined_df.dropna(axis=0, how='any', inplace=True)


In [None]:
# Write the cleaned feature table to disk; index=False leaves the row
# index out of the file.
output_path = "combined_cleaned_features.csv"
combined_df.to_csv(output_path, index=False)
