Data Cleaning

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [21]:

# Load the three datasets
good_df = pd.read_csv("good.csv")
low_bad_df = pd.read_csv("low bad.csv")
high_bad_df = pd.read_csv("high bad.csv")

# Add a new column to identify the quality class
good_df['Quality'] = 'good'
low_bad_df['Quality'] = 'low_bad'
high_bad_df['Quality'] = 'high_bad'

# Combine all three into a single dataset
combined_df = pd.concat([good_df, low_bad_df, high_bad_df], ignore_index=True)

# Preview the combined dataset
print(combined_df.head())

        VYP batch         Part        Set Time  FFTE Feed solids SP  \
0  102_2019_07_02  Yeast - BRN  2/07/2019 0:10                 50.0   
1  102_2019_07_02  Yeast - BRN  2/07/2019 0:10                 50.0   
2  102_2019_07_02  Yeast - BRN  2/07/2019 0:10                 50.0   
3  102_2019_07_02  Yeast - BRN  2/07/2019 0:10                 50.0   
4  102_2019_07_02  Yeast - BRN  2/07/2019 0:10                 50.0   

   FFTE Production solids SP  FFTE Steam pressure SP  TFE Out flow SP  \
0                      41.09                  118.44          2186.05   
1                      41.09                  118.44          2186.05   
2                      41.09                  118.44          2186.05   
3                      41.09                  118.44          2186.05   
4                      41.09                  118.44          2186.05   

   TFE Production solids SP  TFE Vacuum pressure SP  TFE Steam pressure SP  \
0                      67.0                  -79.82     

In [22]:
# Clean column names
combined_df.columns = (
    combined_df.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# Convert Set Time to datetime
combined_df['set_time'] = pd.to_datetime(combined_df['set_time'], dayfirst=True)


In [23]:
# Preview all columns
print("All columns:\n", combined_df.columns.tolist())

# Create lists based on column naming patterns
sp_columns = [col for col in combined_df.columns if "_sp" in col]
pv_columns = [col for col in combined_df.columns if "_pv" in col]

# Metadata columns (manually listed)
meta_columns = ['vyp_batch', 'part', 'set_time', 'quality']

# Optional: Remaining system-specific or ungrouped columns
other_columns = list(set(combined_df.columns) - set(sp_columns) - set(pv_columns) - set(meta_columns))

# Print summaries
print(f"\nSet Point (SP) columns: {len(sp_columns)} →", sp_columns)
print(f"\nProcess Variable (PV) columns: {len(pv_columns)} →", pv_columns)
print(f"\nOther columns: {len(other_columns)} →", other_columns)


All columns:
 ['vyp_batch', 'part', 'set_time', 'ffte_feed_solids_sp', 'ffte_production_solids_sp', 'ffte_steam_pressure_sp', 'tfe_out_flow_sp', 'tfe_production_solids_sp', 'tfe_vacuum_pressure_sp', 'tfe_steam_pressure_sp', 'extract_tank_level', 'ffte_discharge_density', 'ffte_discharge_solids', 'ffte_feed_flow_rate_pv', 'ffte_feed_solids_pv', 'ffte_heat_temperature_1', 'ffte_heat_temperature_2', 'ffte_heat_temperature_3', 'ffte_production_solids_pv', 'ffte_steam_pressure_pv', 'tfe_input_flow_pv', 'tfe_level', 'tfe_motor_current', 'tfe_motor_speed', 'tfe_out_flow_pv', 'tfe_product_out_temperature', 'tfe_production_solids_pv', 'tfe_production_solids_density', 'tfe_steam_pressure_pv', 'tfe_steam_temperature', 'tfe_tank_level', 'tfe_temperature', 'tfe_vacuum_pressure_pv', 'quality']

Set Point (SP) columns: 8 → ['ffte_feed_solids_sp', 'ffte_production_solids_sp', 'ffte_steam_pressure_sp', 'tfe_out_flow_sp', 'tfe_production_solids_sp', 'tfe_vacuum_pressure_sp', 'tfe_steam_pressure_sp', 'tf

In [24]:
for col in sp_columns + pv_columns:
    combined_df[f"{col}_lag1"] = combined_df[col].shift(1)


In [25]:
for col in sp_columns + pv_columns:
    combined_df[f"{col}_roll3"] = combined_df[col].rolling(window=3).mean()


In [26]:
for col in sp_columns + pv_columns:
    combined_df[f"{col}_delta"] = combined_df[col].diff()


In [27]:
combined_df = pd.get_dummies(combined_df, columns=['part'])


In [28]:
combined_df.dropna(inplace=True)


In [29]:
combined_df.to_csv("combined_cleaned_features.csv", index=False)


In [30]:
# Rolling mean over 3 steps
for col in sp_columns + pv_columns:
    combined_df[f"{col}_roll3"] = combined_df[col].rolling(window=3).mean()

# Rate of change (delta)
for col in sp_columns + pv_columns:
    combined_df[f"{col}_delta"] = combined_df[col].diff()


In [31]:
# One-hot encode yeast type
#combined_df = pd.get_dummies(combined_df, columns=['part'])

# Drop nulls from lag/rolling/diff
combined_df.dropna(inplace=True)


In [32]:

# Define features and labels
exclude = ['vyp_batch', 'set_time', 'quality']
X = combined_df.drop(columns=exclude)
y = combined_df['quality']

# 70% training, 30% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


In [33]:
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

        good       0.97      0.98      0.97      3990
    high_bad       0.98      0.98      0.98      4349
     low_bad       0.96      0.91      0.94       773

    accuracy                           0.97      9112
   macro avg       0.97      0.96      0.96      9112
weighted avg       0.97      0.97      0.97      9112

[[3901   64   25]
 [  77 4267    5]
 [  53   13  707]]


SP Recommendation via Regression


In [34]:
# Filter only 'good' quality batches
df_good = combined_df[combined_df['quality'] == 'good'].copy()

# Select SP targets to predict
sp_targets = [
    'ffte_feed_solids_sp',
    'ffte_production_solids_sp',
    'tfe_production_solids_sp',
    'tfe_steam_pressure_sp'
]

# Define input features (exclude meta and targets)
exclude_cols = ['vyp_batch', 'set_time', 'quality'] + sp_targets
X = df_good.drop(columns=exclude_cols)
y = df_good[sp_targets]


In [42]:

# Wrap base regressor for multi-target prediction
model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42))
model.fit(X, y) 

# Predict SPs
y_pred = model.predict(X)

# Evaluate performance for each SP target
for i, col in enumerate(y.columns):
    print(f"\n📍 {col}")
    print("MAE:", mean_absolute_error(y[col], y_pred[:, i]))
    mse = mean_squared_error(y[col], y_pred[:, i])
    rmse = np.sqrt(mse)
    print("RMSE:", rmse)



📍 ffte_feed_solids_sp
MAE: 0.003064235223341878
RMSE: 0.06770407091326419

📍 ffte_production_solids_sp
MAE: 0.00124665363212656
RMSE: 0.017264871918907812

📍 tfe_production_solids_sp
MAE: 0.05355090239133742
RMSE: 0.26167651005185627

📍 tfe_steam_pressure_sp
MAE: 0.03754699954880431
RMSE: 0.6150874175332319
