# Load & Shuffle the datset



In [38]:
import pandas as pd

# Load the dataset
vegemite_df = pd.read_csv('vegemite.csv')

# Shuffle the dataset
vegemite_df = vegemite_df.sample(frac=1, random_state=42).reset_index(drop=True)


# Randomly Select 1000 Data Points with Equal Class Distribution

In [2]:
from sklearn.model_selection import train_test_split

# Separate 1000 data points with near equal distribution
vegemite_sampled, vegemite_remaining = train_test_split(
    vegemite_df, test_size=1000, stratify=vegemite_df['Class'], random_state=42)

# Ensure at least 300 samples from each class
class_distribution = vegemite_sampled['Class'].value_counts()



In [6]:
# Check for constant value columns
constant_columns = [col for col in vegemite_df.columns if vegemite_df[col].nunique() == 1]
constant_columns

['TFE Steam temperature SP', 'TFE Product out temperature']

In [12]:
# Remove constant value columns
df = vegemite_df.drop(columns=constant_columns)

In [13]:
# Identify columns with few unique integer values (e.g., less than 10 unique values)
few_values_columns = [col for col in df.columns if df[col].dtype == 'int64' and df[col].nunique() < 10]

# Convert these columns to categorical
df[few_values_columns] = df[few_values_columns].astype('category')

few_values_columns


['Class']

# Checking Class Distribution and Performing Undersampling/Oversampling or Adjusting Class Weights:

In [15]:
from sklearn.utils import resample

# Check class distribution
class_counts = df['Class'].value_counts()
class_counts

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
2,7548
1,5047
0,2642


### Class 2 has significantly more samples than class 0 with class one being in between. We have to use oversamples or undersampling make it more balanced

Instead of using oversampling or undersampling to balance the data, we can use both so that the the data meets in the middle.

Using both oversampling and undersampling we get a more evenly distributed data seen in the result below

In [20]:
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

# Separate features and target
X = df.drop('Class', axis=1)
y = df['Class']

# Undersample the majority class (Class 2)
rus = RandomUnderSampler(sampling_strategy={2: 5000}, random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

# Oversample the minority class (Class 0)
smote = SMOTE(sampling_strategy={0: 5000}, random_state=42)
X_final, y_final = smote.fit_resample(X_resampled, y_resampled)

# Check the new class distribution
pd.Series(y_final).value_counts()



Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
1,5047
0,5000
2,5000


### Number of features in the dataset

In [21]:
num_features = df.shape[1]
num_features


45

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42),
    "Logistic Regression": LogisticRegression(random_state=42),
    "k-NN": KNeighborsClassifier()
}

# Train models and evaluate
results = {}
for name, model in models.items():
    model.fit(X_final, y_final)
    y_pred = model.predict(X_final)
    results[name] = {
        "classification_report": classification_report(y_final, y_pred),
        "confusion_matrix": confusion_matrix(y_final, y_pred)
    }

for name, result in results.items():
    print(f"\nModel: {name}")
    print(result["classification_report"])
    print(result["confusion_matrix"])


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       1.00      1.00      1.00      5047
           2       1.00      1.00      1.00      5000

    accuracy                           1.00     15047
   macro avg       1.00      1.00      1.00     15047
weighted avg       1.00      1.00      1.00     15047

[[5000    0    0]
 [   4 5043    0]
 [   0    0 5000]]

Model: Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5000
           1       1.00      1.00      1.00      5047
           2       1.00      1.00      1.00      5000

    accuracy                           1.00     15047
   macro avg       1.00      1.00      1.00     15047
weighted avg       1.00      1.00      1.00     15047

[[5000    0    0]
 [   4 5043    0]
 [   0    0 5000]]

Model: SVM
              precision    recall  f1-score   support

           0     

# Generate a full table with all the algorithms and their respective metrics to evaluate the best performing model

In [27]:
# Define the metrics for each model
data = {
    'Metric': ['Precision (Class 0)', 'Precision (Class 1)', 'Precision (Class 2)',
               'Recall (Class 0)', 'Recall (Class 1)', 'Recall (Class 2)',
               'F1-Score (Class 0)', 'F1-Score (Class 1)', 'F1-Score (Class 2)',
               'Accuracy', 'Macro Average Precision', 'Macro Average Recall',
               'Macro Average F1-Score', 'Weighted Average Precision',
               'Weighted Average Recall', 'Weighted Average F1-Score'],

    'Decision Tree': [1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
                       1.00, 1.00, 1.00, 1.00, 1.00, 1.00],

    'Random Forest': [1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00,
                       1.00, 1.00, 1.00, 1.00, 1.00, 1.00],

    'SVM': [0.42, 0.53, 0.53, 0.80, 0.05, 0.53, 0.55, 0.09, 0.53, 0.46,
            0.49, 0.46, 0.39, 0.49, 0.46, 0.39],

    'Logistic Regression': [0.44, 0.42, 0.48, 0.65, 0.18, 0.54, 0.53, 0.25, 0.51, 0.46,
                             0.45, 0.46, 0.43, 0.45, 0.46, 0.43],

    'k-NN': [0.87, 0.90, 0.93, 0.97, 0.87, 0.85, 0.92, 0.88, 0.88, 0.90,
             0.90, 0.90, 0.89, 0.90, 0.90, 0.89]
}


In [29]:
# Create a DataFrame
df_comparison = pd.DataFrame(data)

# Set 'Metric' column as the index
df_comparison.set_index('Metric', inplace=True)

# Display the table
df_comparison


Unnamed: 0_level_0,Decision Tree,Random Forest,SVM,Logistic Regression,k-NN
Metric,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Precision (Class 0),1.0,1.0,0.42,0.44,0.87
Precision (Class 1),1.0,1.0,0.53,0.42,0.9
Precision (Class 2),1.0,1.0,0.53,0.48,0.93
Recall (Class 0),1.0,1.0,0.8,0.65,0.97
Recall (Class 1),1.0,1.0,0.05,0.18,0.87
Recall (Class 2),1.0,1.0,0.53,0.54,0.85
F1-Score (Class 0),1.0,1.0,0.55,0.53,0.92
F1-Score (Class 1),1.0,1.0,0.09,0.25,0.88
F1-Score (Class 2),1.0,1.0,0.53,0.51,0.88
Accuracy,1.0,1.0,0.46,0.46,0.9


**Analysis**

Decision Tree and Random Forest both models achieve perfect scores across all metrics, indicating very high performance.

The k-NN model performs well with high precision, recall, and F1-scores, particularly for Class 0 and Class 2, but slightly lower for Class 1 compared to Decision Tree and Random Forest.

The SVM model has lower performance across all metrics, especially in recall and F1-score for Classes 1 and 2.

The logistice regression model shows decent performance but does not match the Decision Tree, Random Forest, or k-NN models in terms of precision and recall, especially for Class 1.

The k-NN model provides a good balance of precision and recall, making it a strong candidate among the models tested.

# Save the model


In [34]:
import joblib

final_model = models["k-NN"]
joblib.dump(final_model, 'knn_model.pkl')


['knn_model.pkl']

# Step 3: ML to AI



In [39]:
import joblib

# Load the saved k-NN model
model = joblib.load('knn_model.pkl')

# Prepare the Test Data
X_test = vegemite_remaining.drop('Class', axis=1)
y_test = vegemite_remaining['Class']

missing_columns = [col for col in X_final.columns if col not in X_test.columns]
if missing_columns:
    X_test = X_test.drop(columns=missing_columns, errors='ignore')

X_test = X_test[X_final.columns.intersection(X_test.columns)]

# Predict on test data
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)


array([[165,   7,   2],
       [ 23, 292,  16],
       [ 38,  41, 416]])

In [40]:
classification_report(y_test, y_pred)

'              precision    recall  f1-score   support\n\n           0       0.73      0.95      0.82       174\n           1       0.86      0.88      0.87       331\n           2       0.96      0.84      0.90       495\n\n    accuracy                           0.87      1000\n   macro avg       0.85      0.89      0.86      1000\nweighted avg       0.89      0.87      0.87      1000\n'

In [43]:
model_names = ["Decision Tree", "Random Forest", "SVM", "Logistic Regression", "k-NN"]
for name in model_names:
    if name != "k-NN":
        y_pred = model.predict(X_test)
        "\nModel: {name}"
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))


[[165   7   2]
 [ 23 292  16]
 [ 38  41 416]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82       174
           1       0.86      0.88      0.87       331
           2       0.96      0.84      0.90       495

    accuracy                           0.87      1000
   macro avg       0.85      0.89      0.86      1000
weighted avg       0.89      0.87      0.87      1000

[[165   7   2]
 [ 23 292  16]
 [ 38  41 416]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82       174
           1       0.86      0.88      0.87       331
           2       0.96      0.84      0.90       495

    accuracy                           0.87      1000
   macro avg       0.85      0.89      0.86      1000
weighted avg       0.89      0.87      0.87      1000

[[165   7   2]
 [ 23 292  16]
 [ 38  41 416]]
              precision    recall  f1-score   support

           0       0.73      0.95      0.82  