In [2]:
import pandas as pd

df = pd.read_csv("vegemite.csv")
df.head()


Unnamed: 0,FFTE Feed tank level SP,FFTE Production solids SP,FFTE Steam pressure SP,TFE Out flow SP,TFE Production solids SP,TFE Vacuum pressure SP,TFE Steam pressure SP,TFE Steam temperature SP,FFTE Feed flow SP,FFTE Out steam temp SP,...,TFE Out flow PV,TFE Product out temperature,TFE Production solids PV,TFE Production solids density,TFE Steam pressure PV,TFE Steam temperature,TFE Tank level,TFE Temperature,TFE Vacuum pressure PV,Class
0,50.0,40.74,125.0,2897.65,69.0,-80.0,125.0,80.0,10130.0,49.88,...,2577.82,0.0,62.87,1.19,125.03,72.68,83.68,71.0,-77.74,1
1,50.0,40.74,125.0,2897.65,69.0,-79.45,125.0,80.0,10130.0,49.88,...,1551.88,0.0,67.98,1.19,125.03,64.66,82.95,69.0,-79.49,1
2,50.0,40.74,125.0,2897.65,69.0,-71.54,125.0,80.0,10130.0,49.88,...,1564.85,0.0,70.38,1.19,124.71,65.63,82.77,72.0,-75.28,1
3,50.0,40.74,125.0,2897.65,69.0,-68.44,125.0,80.0,10130.0,49.88,...,1512.97,0.0,69.18,1.19,124.71,74.35,82.55,76.0,-69.65,1
4,50.0,39.0,90.0,2694.62,64.0,-80.0,120.0,80.0,9300.0,49.88,...,1795.72,0.0,41.14,0.9,119.91,70.82,83.96,71.0,-75.28,1


In [3]:
# Step 1.1: Shuffle the dataset
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
df_shuffled.head()


Unnamed: 0,FFTE Feed tank level SP,FFTE Production solids SP,FFTE Steam pressure SP,TFE Out flow SP,TFE Production solids SP,TFE Vacuum pressure SP,TFE Steam pressure SP,TFE Steam temperature SP,FFTE Feed flow SP,FFTE Out steam temp SP,...,TFE Out flow PV,TFE Product out temperature,TFE Production solids PV,TFE Production solids density,TFE Steam pressure PV,TFE Steam temperature,TFE Tank level,TFE Temperature,TFE Vacuum pressure PV,Class
0,50.0,43.0,122.0,2609.3,69.0,-53.42,120.0,80.0,10300.0,50.12,...,943.58,0.0,54.33,1.26,120.23,67.88,82.52,67.0,-79.13,2
1,50.0,41.5,100.0,2038.55,65.0,-80.0,120.0,80.0,9400.0,50.0,...,1840.47,0.0,61.01,1.22,119.91,69.76,41.13,78.0,-72.11,0
2,50.0,40.5,115.0,2119.48,39.5,-46.55,120.0,80.0,9230.0,50.0,...,2756.81,0.0,50.27,1.19,0.0,51.14,83.38,62.0,-72.82,0
3,50.0,40.5,125.0,1982.74,63.0,-76.34,120.0,80.0,9400.0,50.0,...,1763.94,0.0,54.19,1.22,119.91,63.41,86.41,70.0,-77.74,0
4,50.0,43.0,90.0,2897.65,5.0,-45.0,100.0,80.0,10130.0,49.28,...,988.33,0.0,74.35,1.27,119.91,82.29,79.2,82.0,-70.36,1


In [4]:
# Split 1000 rows with approximately equal class distribution
df_class_0 = df_shuffled[df_shuffled['Class'] == 0].sample(n=334, random_state=42)
df_class_1 = df_shuffled[df_shuffled['Class'] == 1].sample(n=333, random_state=42)
df_class_2 = df_shuffled[df_shuffled['Class'] == 2].sample(n=333, random_state=42)

# Combine to form the test set
df_test = pd.concat([df_class_0, df_class_1, df_class_2]).sample(frac=1, random_state=42).reset_index(drop=True)

# Create training set by excluding the test set
df_train = df_shuffled.drop(df_test.index).reset_index(drop=True)

# Show class distribution in test set and shape of training set
df_test['Class'].value_counts(), df_train.shape

(Class
 0    334
 1    333
 2    333
 Name: count, dtype: int64,
 (14237, 47))

In [5]:
# Redefine constant_columns and remove them
constant_columns = [col for col in df_train.columns if df_train[col].nunique() == 1]

# Remove constant columns from both train and test sets
df_train = df_train.drop(columns=constant_columns)
df_test = df_test.drop(columns=constant_columns)

# Confirm removal
constant_columns, df_train.shape, df_test.shape


(['TFE Steam temperature SP', 'TFE Product out temperature'],
 (14237, 45),
 (1000, 45))

In [6]:
# Convert identified columns to categorical
categorical_cols = ['FFTE Feed tank level SP', 'FFTE Pump 1', 'FFTE Pump 1 - 2', 'FFTE Pump 2', 'TFE Motor speed']
df_train[categorical_cols] = df_train[categorical_cols].astype('category')
df_test[categorical_cols] = df_test[categorical_cols].astype('category')


In [7]:
# Step 1.3.3: Check class distribution in the training set
df_train['Class'].value_counts(normalize=True) * 100

Class
2    49.462668
1    33.075788
0    17.461544
Name: proportion, dtype: float64

In [8]:
# Redefine features and target from df_train
X_train = df_train.drop(columns=['Class'])
y_train = df_train['Class']
X_test = df_test.drop(columns=['Class'])
y_test = df_test['Class']

# Step 1.3.4: Create composite features
X_train['FFTE_Temperature_avg'] = X_train[['FFTE Heat temperature 1', 'FFTE Heat temperature 2', 'FFTE Heat temperature 3']].mean(axis=1)
X_train['TFE_Pressure_diff'] = X_train['TFE Steam pressure PV'] - X_train['TFE Vacuum pressure PV']

X_test['FFTE_Temperature_avg'] = X_test[['FFTE Heat temperature 1', 'FFTE Heat temperature 2', 'FFTE Heat temperature 3']].mean(axis=1)
X_test['TFE_Pressure_diff'] = X_test['TFE Steam pressure PV'] - X_test['TFE Vacuum pressure PV']

# Check updated shape
X_train.shape, X_test.shape


((14237, 46), (1000, 46))

In [9]:
# Step 2.1: Feature Selection - Check feature importance using RandomForest
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Use RandomForest to get feature importances
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Create a dataframe of feature importances
feature_importances = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display top 15 features
feature_importances.head(15)


Unnamed: 0,Feature,Importance
3,TFE Out flow SP,0.067582
10,Extract tank Out flow PV,0.044629
11,FFTE Discharge density,0.042642
7,FFTE Feed flow SP,0.040032
28,FFTE Temperature 3 - 1,0.03864
13,FFTE Feed flow rate PV,0.037315
15,FFTE Heat temperature 1,0.036935
38,TFE Production solids density,0.036878
29,FFTE Temperature 3 - 2,0.034532
24,FFTE Temperature 1 - 1,0.033276


In [13]:
# Re-import required models from previous steps (since session lost variable state)
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# Re-train the models
dt_model = DecisionTreeClassifier(random_state=42, class_weight='balanced').fit(X_train, y_train)
lr_model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42).fit(X_train, y_train)
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced').fit(X_train, y_train)
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train, y_train)
svm_model = SVC(class_weight='balanced', probability=True, random_state=42).fit(X_train, y_train)

# Store models in a dictionary for evaluation
trained_models = {
    'Decision Tree': dt_model,
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model,
    'SVM': svm_model
}

# Step 2.3 & 2.4: Evaluate models and show classification reports & confusion matrices
from sklearn.metrics import classification_report, confusion_matrix
evaluation_results = {}

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    
    # Store results
    evaluation_results[name] = {
        'report': report,
        'confusion_matrix': cm
    }

import pandas as pd
comparison_df = pd.DataFrame({
    model: {
        'Precision': round(evaluation_results[model]['report']['weighted avg']['precision'], 3),
        'Recall': round(evaluation_results[model]['report']['weighted avg']['recall'], 3),
        'F1-Score': round(evaluation_results[model]['report']['weighted avg']['f1-score'], 3),
        'Accuracy': round(evaluation_results[model]['report']['accuracy'], 3)
    }
    for model in evaluation_results
}).T

print("Model Comparison", comparison_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Comparison                      Precision  Recall  F1-Score  Accuracy
Decision Tree            0.998   0.998     0.998     0.998
Logistic Regression      0.505   0.511     0.504     0.511
Random Forest            1.000   1.000     1.000     1.000
Gradient Boosting        0.938   0.935     0.935     0.935
SVM                      0.570   0.498     0.424     0.498


In [14]:
import joblib

# Save the best model
joblib.dump(rf_model, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [15]:
# Step 3: Load the saved model and evaluate on the unseen test set
import joblib

# Save the model (in case it's not saved yet)
joblib.dump(rf_model, 'random_forest_model.pkl')

# Load the model
loaded_model = joblib.load('random_forest_model.pkl')

# Predict on the 1000 unseen test samples
y_pred_1000 = loaded_model.predict(X_test)

# Generate evaluation report
report_1000 = classification_report(y_test, y_pred_1000, output_dict=True)
report_1000_summary = {
    'Precision': round(report_1000['weighted avg']['precision'], 3),
    'Recall': round(report_1000['weighted avg']['recall'], 3),
    'F1-Score': round(report_1000['weighted avg']['f1-score'], 3),
    'Accuracy': round(report_1000['accuracy'], 3)
}

report_1000_summary


{'Precision': 1.0, 'Recall': 1.0, 'F1-Score': 1.0, 'Accuracy': 1.0}

In [16]:
from sklearn.metrics import classification_report
import pandas as pd

# Dictionary of your trained models
trained_models = {
    'Decision Tree': dt_model,
    'Logistic Regression': lr_model,
    'Random Forest': rf_model,
    'Gradient Boosting': gb_model, 
    'SVM': svm_model
}

# Evaluate each model on X_test
model_eval_results = {}

for name, model in trained_models.items():
    y_pred = model.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True)
    model_eval_results[name] = {
        'Precision': round(report['weighted avg']['precision'], 3),
        'Recall': round(report['weighted avg']['recall'], 3),
        'F1-Score': round(report['weighted avg']['f1-score'], 3),
        'Accuracy': round(report['accuracy'], 3)
    }

# Create comparison DataFrame
results_df = pd.DataFrame(model_eval_results).T
print(results_df)


                     Precision  Recall  F1-Score  Accuracy
Decision Tree            0.998   0.998     0.998     0.998
Logistic Regression      0.505   0.511     0.504     0.511
Random Forest            1.000   1.000     1.000     1.000
Gradient Boosting        0.938   0.935     0.935     0.935
SVM                      0.570   0.498     0.424     0.498


In [17]:
# Step 4: Develop rules from ML model using only SP (Set Point) features

# Extract only 'SP' columns from the original training set
sp_columns = [col for col in df_train.columns if col.endswith('SP')]
X_train_sp = df_train[sp_columns]
y_train_sp = df_train['Class']

# Train a decision tree on SP features
from sklearn.tree import DecisionTreeClassifier, export_text

sp_tree = DecisionTreeClassifier(max_depth=5, class_weight='balanced', random_state=42)
sp_tree.fit(X_train_sp, y_train_sp)

# Generate textual representation of the decision tree
tree_rules = export_text(sp_tree, feature_names=list(X_train_sp.columns))
print(tree_rules)


|--- FFTE Feed flow SP <= 10165.00
|   |--- TFE Out flow SP <= 2249.11
|   |   |--- FFTE Steam pressure SP <= 119.98
|   |   |   |--- TFE Out flow SP <= 2154.76
|   |   |   |   |--- FFTE Feed flow SP <= 9395.00
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- FFTE Feed flow SP >  9395.00
|   |   |   |   |   |--- class: 0
|   |   |   |--- TFE Out flow SP >  2154.76
|   |   |   |   |--- TFE Out flow SP <= 2215.20
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- TFE Out flow SP >  2215.20
|   |   |   |   |   |--- class: 0
|   |   |--- FFTE Steam pressure SP >  119.98
|   |   |   |--- TFE Out flow SP <= 2036.03
|   |   |   |   |--- FFTE Production solids SP <= 39.75
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- FFTE Production solids SP >  39.75
|   |   |   |   |   |--- class: 0
|   |   |   |--- TFE Out flow SP >  2036.03
|   |   |   |   |--- TFE Out flow SP <= 2184.69
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- TFE Out flow SP >  2184.69
|   |   |   |   |  

In [22]:
from sklearn.tree import export_graphviz
import graphviz

# Export tree as DOT data
dot_data = export_graphviz(
    sp_tree,
    out_file=None,
    feature_names=sp_columns,
    class_names=['Class 0', 'Class 1', 'Class 2'],
    filled=True,
    rounded=True,
    special_characters=True,
    max_depth=5
)

# Visualize as a Graphviz PDF or PNG
graph = graphviz.Source(dot_data)
graph.render("sp_decision_tree", format="png")  # Saves as sp_decision_tree.pdf

'sp_decision_tree.png'