# With treating the unknown class as separate class

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from scipy.stats import mode  # For majority voting
import pandas as pd
import numpy as np

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col] - 1  # Adjust class labels

X_train_spec, X_test_spec, y_train, y_test = train_test_split(X_spectrum, y, test_size=0.2, random_state=42, stratify=y)
X_train_cat, X_test_cat, _, _ = train_test_split(X_categorical, y, test_size=0.2, random_state=42, stratify=y)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train_spec, y_train)

xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train_cat, y_train)

y_pred_rf_train = rf_model.predict_proba(X_train_spec)
y_pred_xgb_train = xgb_model.predict_proba(X_train_cat)

y_pred_rf_test = rf_model.predict_proba(X_test_spec)
y_pred_xgb_test = xgb_model.predict_proba(X_test_cat)

X_meta_train = np.hstack((y_pred_rf_train, y_pred_xgb_train))
X_meta_test = np.hstack((y_pred_rf_test, y_pred_xgb_test))

meta_model = LogisticRegression(max_iter=500, random_state=42)
meta_model.fit(X_meta_train, y_train)

y_pred_combined = meta_model.predict(X_meta_test)

accuracy = accuracy_score(y_test, y_pred_combined)
classification_rep = classification_report(y_test, y_pred_combined, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_combined)

print(f"High-Level Fusion (Bagging + Boosting via Stacking) Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Parameters: { "use_label_encoder" } are not used.



High-Level Fusion (Bagging + Boosting via Stacking) Accuracy: 0.7067

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.69      0.75        13
           1       1.00      1.00      1.00        12
           2       0.60      0.75      0.67        12
           3       0.50      0.14      0.22         7
           4       0.63      0.80      0.71        15
           5       0.62      0.80      0.70        10
           6       0.67      0.50      0.57         4
           7       1.00      0.00      0.00         2

    accuracy                           0.71        75
   macro avg       0.73      0.59      0.58        75
weighted avg       0.72      0.71      0.68        75


Confusion Matrix:
 [[ 9  0  1  0  3  0  0  0]
 [ 0 12  0  0  0  0  0  0]
 [ 0  0  9  0  3  0  0  0]
 [ 0  0  0  1  1  4  1  0]
 [ 1  0  2  0 12  0  0  0]
 [ 0  0  2  0  0  8  0  0]
 [ 0  0  0  1  0  1  2  0]
 [ 1  0  1  0  0  0  0  0]]


# Comparison between XGboost and Bagging approaches

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

X = np.hstack((X_spectrum, X_categorical))
y = df[target_col] - 1  # Adjust class labels

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

classification_rf = classification_report(y_test, y_pred_rf, zero_division=1)
classification_xgb = classification_report(y_test, y_pred_xgb, zero_division=1)

conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)

print(f"Bagging (Random Forest) Accuracy: {accuracy_rf:.4f}")
print("\nClassification Report (Random Forest):\n", classification_rf)
print("\nConfusion Matrix (Random Forest):\n", conf_matrix_rf)

print("\n" + "="*50 + "\n")

print(f"Boosting (XGBoost) Accuracy: {accuracy_xgb:.4f}")
print("\nClassification Report (XGBoost):\n", classification_xgb)
print("\nConfusion Matrix (XGBoost):\n", conf_matrix_xgb)


Parameters: { "use_label_encoder" } are not used.



Bagging (Random Forest) Accuracy: 0.6800

Classification Report (Random Forest):
               precision    recall  f1-score   support

           0       0.83      0.77      0.80        13
           1       0.85      0.92      0.88        12
           2       0.47      0.75      0.58        12
           3       0.67      0.29      0.40         7
           4       0.79      0.73      0.76        15
           5       0.70      0.70      0.70        10
           6       0.25      0.25      0.25         4
           7       1.00      0.00      0.00         2

    accuracy                           0.68        75
   macro avg       0.69      0.55      0.55        75
weighted avg       0.71      0.68      0.67        75


Confusion Matrix (Random Forest):
 [[10  1  1  0  1  0  0  0]
 [ 0 11  0  0  0  0  1  0]
 [ 0  1  9  0  2  0  0  0]
 [ 0  0  1  2  0  3  1  0]
 [ 0  0  4  0 11  0  0  0]
 [ 0  0  2  0  0  7  1  0]
 [ 0  0  2  1  0  0  1  0]
 [ 2  0  0  0  0  0  0  0]]


Boosting (XG

# Assigning a class label to the unknown class

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col]

df_known = df[df[target_col] != 8]  
X_known = np.hstack((X_spectrum[df_known.index], X_categorical[df_known.index]))
y_known = df_known[target_col] - 1 

X_train_known, X_test_known, y_train_known, y_test_known = train_test_split(X_known, y_known, test_size=0.2, random_state=42, stratify=y_known)

xgb_semi_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_semi_model.fit(X_train_known, y_train_known)

df_unknown = df[df[target_col] == 8]
X_unknown = np.hstack((X_spectrum[df_unknown.index], X_categorical[df_unknown.index]))

y_pred_unknown_probs = xgb_semi_model.predict_proba(X_unknown)
y_pred_unknown = np.argmax(y_pred_unknown_probs, axis=1)  # Get the most likely class

confidence_threshold = 0.65  # Threshold for confident predictions
max_confidences = np.max(y_pred_unknown_probs, axis=1)
reliable_indices = max_confidences >= confidence_threshold

df_unknown.loc[reliable_indices, target_col] = y_pred_unknown[reliable_indices] + 1  # Convert back to original label range

df_updated = pd.concat([df_known, df_unknown])

X_updated = np.hstack((X_spectrum[df_updated.index], X_categorical[df_updated.index]))
y_updated = df_updated[target_col] - 1  # Adjust class labels

X_train, X_test, y_train, y_test = train_test_split(X_updated, y_updated, test_size=0.2, random_state=42, stratify=y_updated)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_rf_train = rf_model.predict_proba(X_train)
y_pred_xgb_train = xgb_model.predict_proba(X_train)

y_pred_rf_test = rf_model.predict_proba(X_test)
y_pred_xgb_test = xgb_model.predict_proba(X_test)

X_meta_train = np.hstack((y_pred_rf_train, y_pred_xgb_train))
X_meta_test = np.hstack((y_pred_rf_test, y_pred_xgb_test))

meta_model = LogisticRegression(max_iter=500, random_state=42)
meta_model.fit(X_meta_train, y_train)

y_pred_combined = meta_model.predict(X_meta_test)

accuracy = accuracy_score(y_test, y_pred_combined)
classification_rep = classification_report(y_test, y_pred_combined, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_combined)

print(f"High-Level Fusion (Bagging + Boosting with Semi-Supervised Learning) Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



High-Level Fusion (Bagging + Boosting with Semi-Supervised Learning) Accuracy: 0.7200

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.69      0.75        13
           1       1.00      1.00      1.00        12
           2       0.56      0.83      0.67        12
           3       0.50      0.14      0.22         7
           4       0.75      0.75      0.75        16
           5       0.62      0.80      0.70        10
           6       0.67      0.50      0.57         4
           7       1.00      0.00      0.00         1

    accuracy                           0.72        75
   macro avg       0.74      0.59      0.58        75
weighted avg       0.73      0.72      0.70        75


Confusion Matrix:
 [[ 9  0  2  0  2  0  0  0]
 [ 0 12  0  0  0  0  0  0]
 [ 0  0 10  0  2  0  0  0]
 [ 0  0  1  1  0  4  1  0]
 [ 1  0  3  0 12  0  0  0]
 [ 0  0  2  0  0  8  0  0]
 [ 0  0  0  1  0  1  2  0]
 [ 1  0  0  0  0  0  0  0]]


In [37]:
df_unknown

Unnamed: 0,class,id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,...,color_red,color_white,color_yellow,transparency_o,transparency_s,transparency_t,device_id_802215F15396F9FE,device_id_B0236F1F2D02C632,device_id_B02EFC17B97B46B2,device_id_D02B30ACFD92433E
363,1,3,1.339342,1.338869,1.338375,1.337812,1.337198,1.336585,1.336034,1.335569,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
364,1,5,1.404761,1.405914,1.4072,1.408563,1.410021,1.411614,1.413385,1.415335,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
365,8,8,0.312587,0.312626,0.312672,0.312711,0.312744,0.31278,0.31283,0.312899,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
366,3,15,0.240503,0.240845,0.241202,0.241559,0.241913,0.242265,0.242618,0.24297,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
367,8,16,0.487288,0.487556,0.487789,0.48797,0.488109,0.488228,0.488354,0.4885,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
368,5,18,0.710654,0.710555,0.710445,0.710305,0.710153,0.71002,0.709941,0.709929,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
369,8,20,0.766737,0.771991,0.777389,0.782837,0.788306,0.793763,0.799186,0.804556,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
370,3,21,0.566854,0.570564,0.5744,0.578296,0.582233,0.586186,0.590141,0.594083,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
371,5,22,0.523741,0.528294,0.533018,0.537834,0.542705,0.547592,0.552464,0.557294,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
372,5,25,0.141744,0.146658,0.152082,0.158028,0.164529,0.171596,0.17923,0.187434,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [38]:
df_known

Unnamed: 0,class,id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,...,color_red,color_white,color_yellow,transparency_o,transparency_s,transparency_t,device_id_802215F15396F9FE,device_id_B0236F1F2D02C632,device_id_B02EFC17B97B46B2,device_id_D02B30ACFD92433E
0,1,1,0.782157,0.754932,0.730555,0.709195,0.690707,0.674890,0.661358,0.649536,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,2,0.948156,0.947904,0.947607,0.947237,0.946815,0.946384,0.945992,0.945662,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,4,0.285657,0.285247,0.284841,0.284432,0.284022,0.283619,0.283231,0.282856,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1,5,0.296280,0.295966,0.295631,0.295269,0.294887,0.294499,0.294121,0.293758,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1,6,0.171568,0.171535,0.171500,0.171459,0.171414,0.171371,0.171338,0.171317,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,7,20,0.260549,0.260703,0.260859,0.260962,0.260995,0.260955,0.260871,0.260763,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
359,7,21,0.608727,0.608820,0.608965,0.609044,0.609021,0.608883,0.608687,0.608465,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
360,7,22,0.510561,0.510711,0.510857,0.510904,0.510834,0.510646,0.510399,0.510129,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
361,7,23,0.365356,0.365554,0.365715,0.365774,0.365726,0.365576,0.365372,0.365148,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [39]:
print(np.unique(y_updated))
counts = pd.Series(y_updated).value_counts()
print(counts)

[0 1 2 3 4 5 6 7]
class
4    79
0    64
1    60
2    60
5    51
3    36
6    20
7     3
Name: count, dtype: int64


In [40]:
max_confidences

array([0.8392367 , 0.7701982 , 0.64054996, 0.8154724 , 0.44257867,
       0.9746046 , 0.5946572 , 0.95481944, 0.9110425 , 0.9712498 ],
      dtype=float32)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
wr_raw_cols = [col for col in df.columns if col.startswith("wr_raw_") and "+ 740" in col]
sample_raw_cols = [col for col in df.columns if col.startswith("sample_raw_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[wr_raw_cols] = df[wr_raw_cols].fillna(df[wr_raw_cols].median())
df[sample_raw_cols] = df[sample_raw_cols].fillna(df[sample_raw_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler_wr = StandardScaler()
X_wr_raw = scaler_wr.fit_transform(df[wr_raw_cols])

scaler_sample = StandardScaler()
X_sample_raw = scaler_sample.fit_transform(df[sample_raw_cols])

scaler_spectrum = StandardScaler()
X_spectrum = scaler_spectrum.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)

X_categorical = df.drop(columns=[target_col, "id"]).values  
X_extended = np.hstack((X_wr_raw, X_sample_raw, X_categorical)) 
y = df[target_col]

df_known = df[df[target_col] != 8] 
X_known = np.hstack((X_spectrum[df_known.index], X_extended[df_known.index]))
y_known = df_known[target_col] - 1  

X_train_known, X_test_known, y_train_known, y_test_known = train_test_split(
    X_known, y_known, test_size=0.2, random_state=42, stratify=y_known
)

xgb_semi_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_semi_model.fit(X_train_known, y_train_known)

df_unknown = df[df[target_col] == 8]
X_unknown = np.hstack((X_spectrum[df_unknown.index], X_extended[df_unknown.index]))

y_pred_unknown_probs = xgb_semi_model.predict_proba(X_unknown)
y_pred_unknown = np.argmax(y_pred_unknown_probs, axis=1) 

confidence_threshold = 0.60 
max_confidences = np.max(y_pred_unknown_probs, axis=1)
reliable_indices = max_confidences >= confidence_threshold

df_unknown.loc[reliable_indices, target_col] = y_pred_unknown[reliable_indices] + 1
df_updated = pd.concat([df_known, df_unknown])

X_updated = np.hstack((X_spectrum[df_updated.index], X_extended[df_updated.index]))
y_updated = df_updated[target_col] - 1  

X_train, X_test, y_train, y_test = train_test_split(X_updated, y_updated, test_size=0.2, random_state=42, stratify=y_updated)

rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier(n_estimators=300, learning_rate=0.1, max_depth=5, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_rf_train = rf_model.predict_proba(X_train)
y_pred_xgb_train = xgb_model.predict_proba(X_train)

y_pred_rf_test = rf_model.predict_proba(X_test)
y_pred_xgb_test = xgb_model.predict_proba(X_test)

X_meta_train = np.hstack((y_pred_rf_train, y_pred_xgb_train))
X_meta_test = np.hstack((y_pred_rf_test, y_pred_xgb_test))

meta_model = LogisticRegression(max_iter=500, random_state=42)
meta_model.fit(X_meta_train, y_train)

y_pred_combined = meta_model.predict(X_meta_test)

accuracy = accuracy_score(y_test, y_pred_combined)
classification_rep = classification_report(y_test, y_pred_combined, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_combined)

print(f"Bagging + Boosting with Extended Features (wr_raw + sample_raw) & Semi-Supervised Learning Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Bagging + Boosting with Extended Features (wr_raw + sample_raw) & Semi-Supervised Learning Accuracy: 0.7067

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.62      0.73        13
           1       0.92      1.00      0.96        12
           2       0.56      0.83      0.67        12
           3       0.50      0.14      0.22         7
           4       0.71      0.75      0.73        16
           5       0.62      0.80      0.70        10
           6       0.67      0.50      0.57         4
           7       1.00      0.00      0.00         1

    accuracy                           0.71        75
   macro avg       0.73      0.58      0.57        75
weighted avg       0.72      0.71      0.69        75


Confusion Matrix:
 [[ 8  1  2  0  2  0  0  0]
 [ 0 12  0  0  0  0  0  0]
 [ 0  0 10  0  2  0  0  0]
 [ 0  0  1  1  0  4  1  0]
 [ 1  0  3  0 12  0  0  0]
 [ 0  0  2  0  0  8  0  0]
 [ 0  0  0  1  0  1  2  0]
 [ 0  0

In [42]:
max_confidences

array([0.8392367 , 0.7701982 , 0.64054996, 0.8154724 , 0.44257867,
       0.9746046 , 0.5946572 , 0.95481944, 0.9110425 , 0.9712498 ],
      dtype=float32)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col]

df_known = df[df[target_col] != 8]  
X_known = np.hstack((X_spectrum[df_known.index], X_categorical[df_known.index]))
y_known = df_known[target_col] - 1  

X_train_known, X_test_known, y_train_known, y_test_known = train_test_split(
    X_known, y_known, test_size=0.2, random_state=42, stratify=y_known
)

xgb_param_grid = {
    "n_estimators": [100, 300, 500],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 7]
}

xgb_search = GridSearchCV(XGBClassifier(eval_metric="mlogloss", use_label_encoder=False, random_state=42),
                          param_grid=xgb_param_grid, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)

xgb_search.fit(X_train_known, y_train_known)
best_xgb_params = xgb_search.best_params_

xgb_semi_model = XGBClassifier(**best_xgb_params, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_semi_model.fit(X_train_known, y_train_known)

df_unknown = df[df[target_col] == 8]
X_unknown = np.hstack((X_spectrum[df_unknown.index], X_categorical[df_unknown.index]))

y_pred_unknown_probs = xgb_semi_model.predict_proba(X_unknown)
y_pred_unknown = np.argmax(y_pred_unknown_probs, axis=1) 

confidence_threshold = 0.60
max_confidences = np.max(y_pred_unknown_probs, axis=1)
reliable_indices = max_confidences >= confidence_threshold

df_unknown.loc[reliable_indices, target_col] = y_pred_unknown[reliable_indices] + 1  

df_updated = pd.concat([df_known, df_unknown])

X_updated = np.hstack((X_spectrum[df_updated.index], X_categorical[df_updated.index]))
y_updated = df_updated[target_col] - 1 

class_counts = pd.Series(y_updated).value_counts()
print("Class Distribution in Updated Dataset:\n", class_counts)

if any(class_counts < 2):
    X_train, X_test, y_train, y_test = train_test_split(
        X_updated, y_updated, test_size=0.2, random_state=42, shuffle=True
    )
    print("\nUsing shuffle=True (No Stratification) due to class imbalance.")
else:
    X_train, X_test, y_train, y_test = train_test_split(
        X_updated, y_updated, test_size=0.2, random_state=42, stratify=y_updated
    )
    print("\nUsing Stratification for Train-Test Split.")

rf_param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 5, 10]
}

rf_search = GridSearchCV(RandomForestClassifier(random_state=42),
                         param_grid=rf_param_grid, cv=3, scoring="accuracy", verbose=1, n_jobs=-1)

rf_search.fit(X_train, y_train)
best_rf_params = rf_search.best_params_

rf_model = RandomForestClassifier(**best_rf_params, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier(**best_xgb_params, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_rf_train = rf_model.predict_proba(X_train)
y_pred_xgb_train = xgb_model.predict_proba(X_train)

y_pred_rf_test = rf_model.predict_proba(X_test)
y_pred_xgb_test = xgb_model.predict_proba(X_test)

X_meta_train = np.hstack((y_pred_rf_train, y_pred_xgb_train))
X_meta_test = np.hstack((y_pred_rf_test, y_pred_xgb_test))

meta_model = LogisticRegression(max_iter=500, random_state=42)
meta_model.fit(X_meta_train, y_train)

y_pred_combined = meta_model.predict(X_meta_test)

accuracy = accuracy_score(y_test, y_pred_combined)
classification_rep = classification_report(y_test, y_pred_combined, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_combined)

print(f"\nOptimized XGBoost Parameters for Class 8 Labeling: {best_xgb_params}")
print(f"Optimized Random Forest Parameters for Final Classification: {best_rf_params}")

print(f"\nFinal Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

final_results = pd.DataFrame({
    "Model": ["RandomForest (Bagging)", "XGBoost (Boosting)", "Meta-Classifier (Logistic Regression)"],
    "Final Accuracy": [accuracy, accuracy, accuracy]
})
final_results.to_csv("optimized_model_results.csv", index=False)
print("\nResults saved as: optimized_model_results.csv")


Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Class Distribution in Updated Dataset:
 class
4    79
0    65
2    61
1    60
5    51
3    36
6    20
7     1
Name: count, dtype: int64

Using shuffle=True (No Stratification) due to class imbalance.
Fitting 3 folds for each of 27 candidates, totalling 81 fits


Parameters: { "use_label_encoder" } are not used.




Optimized XGBoost Parameters for Class 8 Labeling: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}
Optimized Random Forest Parameters for Final Classification: {'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}

Final Model Accuracy: 0.7733

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.69      0.73        16
           1       0.71      0.67      0.69        15
           2       0.90      0.90      0.90        10
           3       0.60      0.60      0.60         5
           4       0.81      0.87      0.84        15
           5       0.75      1.00      0.86         9
           6       0.75      0.60      0.67         5

    accuracy                           0.77        75
   macro avg       0.76      0.76      0.76        75
weighted avg       0.77      0.77      0.77        75


Confusion Matrix:
 [[11  3  0  0  2  0  0]
 [ 1 10  0  1  0  2  1]
 [ 0  0  9  0  1  0  0]
 [ 0  1  1  3  0  0 

In [73]:
max_confidences

array([0.77961946, 0.6923859 , 0.6561258 , 0.87704754, 0.51657057,
       0.9749156 , 0.7455382 , 0.9660479 , 0.9749097 , 0.86143905],
      dtype=float32)

In [74]:
df_known

Unnamed: 0,class,id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,...,color_red,color_white,color_yellow,transparency_o,transparency_s,transparency_t,device_id_802215F15396F9FE,device_id_B0236F1F2D02C632,device_id_B02EFC17B97B46B2,device_id_D02B30ACFD92433E
0,1,1,0.782157,0.754932,0.730555,0.709195,0.690707,0.674890,0.661358,0.649536,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1,2,0.948156,0.947904,0.947607,0.947237,0.946815,0.946384,0.945992,0.945662,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1,4,0.285657,0.285247,0.284841,0.284432,0.284022,0.283619,0.283231,0.282856,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,1,5,0.296280,0.295966,0.295631,0.295269,0.294887,0.294499,0.294121,0.293758,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,1,6,0.171568,0.171535,0.171500,0.171459,0.171414,0.171371,0.171338,0.171317,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
358,7,20,0.260549,0.260703,0.260859,0.260962,0.260995,0.260955,0.260871,0.260763,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
359,7,21,0.608727,0.608820,0.608965,0.609044,0.609021,0.608883,0.608687,0.608465,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
360,7,22,0.510561,0.510711,0.510857,0.510904,0.510834,0.510646,0.510399,0.510129,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
361,7,23,0.365356,0.365554,0.365715,0.365774,0.365726,0.365576,0.365372,0.365148,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


In [75]:
df_unknown

Unnamed: 0,class,id,spectrum_0 + 740,spectrum_1 + 740,spectrum_2 + 740,spectrum_3 + 740,spectrum_4 + 740,spectrum_5 + 740,spectrum_6 + 740,spectrum_7 + 740,...,color_red,color_white,color_yellow,transparency_o,transparency_s,transparency_t,device_id_802215F15396F9FE,device_id_B0236F1F2D02C632,device_id_B02EFC17B97B46B2,device_id_D02B30ACFD92433E
363,1,3,1.339342,1.338869,1.338375,1.337812,1.337198,1.336585,1.336034,1.335569,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
364,1,5,1.404761,1.405914,1.4072,1.408563,1.410021,1.411614,1.413385,1.415335,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
365,1,8,0.312587,0.312626,0.312672,0.312711,0.312744,0.31278,0.31283,0.312899,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
366,3,15,0.240503,0.240845,0.241202,0.241559,0.241913,0.242265,0.242618,0.24297,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
367,8,16,0.487288,0.487556,0.487789,0.48797,0.488109,0.488228,0.488354,0.4885,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
368,5,18,0.710654,0.710555,0.710445,0.710305,0.710153,0.71002,0.709941,0.709929,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
369,3,20,0.766737,0.771991,0.777389,0.782837,0.788306,0.793763,0.799186,0.804556,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
370,3,21,0.566854,0.570564,0.5744,0.578296,0.582233,0.586186,0.590141,0.594083,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
371,5,22,0.523741,0.528294,0.533018,0.537834,0.542705,0.547592,0.552464,0.557294,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
372,5,25,0.141744,0.146658,0.152082,0.158028,0.164529,0.171596,0.17923,0.187434,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df = df[df[target_col] != 8]

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col] - 1 

X_final = np.hstack((X_spectrum, X_categorical))
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)
rf_model.fit(X_train, y_train)

xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=3, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_rf_train = rf_model.predict_proba(X_train)
y_pred_xgb_train = xgb_model.predict_proba(X_train)

y_pred_rf_test = rf_model.predict_proba(X_test)
y_pred_xgb_test = xgb_model.predict_proba(X_test)

X_meta_train = np.hstack((y_pred_rf_train, y_pred_xgb_train))
X_meta_test = np.hstack((y_pred_rf_test, y_pred_xgb_test))

meta_model = LogisticRegression(max_iter=500, random_state=42)
meta_model.fit(X_meta_train, y_train)

y_pred_combined = meta_model.predict(X_meta_test)

accuracy = accuracy_score(y_test, y_pred_combined)
classification_rep = classification_report(y_test, y_pred_combined, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_combined)

print(f"\nFinal Optimized Model Accuracy (Dropping Unknown Class 8): {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

final_results = pd.DataFrame({
    "Model": ["RandomForest (Bagging)", "XGBoost (Boosting)", "Meta-Classifier (Logistic Regression)"],
    "Final Accuracy": [accuracy, accuracy, accuracy]
})

final_results

Parameters: { "use_label_encoder" } are not used.




Final Optimized Model Accuracy (Dropping Unknown Class 8): 0.7397

Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85        13
           1       1.00      0.92      0.96        12
           2       0.62      0.67      0.64        12
           3       0.50      0.14      0.22         7
           4       0.76      0.87      0.81        15
           5       0.62      0.80      0.70        10
           6       0.50      0.50      0.50         4

    accuracy                           0.74        73
   macro avg       0.69      0.68      0.67        73
weighted avg       0.73      0.74      0.72        73


Confusion Matrix:
 [[11  0  1  0  1  0  0]
 [ 0 11  0  0  0  0  1]
 [ 1  0  8  0  3  0  0]
 [ 0  0  0  1  0  5  1]
 [ 1  0  1  0 13  0  0]
 [ 0  0  2  0  0  8  0]
 [ 0  0  1  1  0  0  2]]


Unnamed: 0,Model,Final Accuracy
0,RandomForest (Bagging),0.739726
1,XGBoost (Boosting),0.739726
2,Meta-Classifier (Logistic Regression),0.739726


# Boosting only approach (without 8 class)

In [None]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df = df[df[target_col] != 8]

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col] - 1 

X_final = np.hstack((X_spectrum, X_categorical))
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

xgb_model = XGBClassifier(n_estimators=500, learning_rate=0.1, max_depth=3, eval_metric="mlogloss", use_label_encoder=False, random_state=42)
xgb_model.fit(X_train, y_train)

y_pred_xgb = xgb_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_xgb)
classification_rep = classification_report(y_test, y_pred_xgb, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_xgb)

print(f"\nFinal Model Accuracy (Boosting-Only, Dropping Unknown Class 8): {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

final_results = pd.DataFrame({
    "Model": ["XGBoost (Boosting)"],
    "Final Accuracy": [accuracy]
})
final_results.to_csv("boosting_only_model_results.csv", index=False)
print("\nResults saved as: boosting_only_model_results.csv")


Parameters: { "use_label_encoder" } are not used.




Final Model Accuracy (Boosting-Only, Dropping Unknown Class 8): 0.7123

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.77      0.80        13
           1       1.00      0.92      0.96        12
           2       0.57      0.67      0.62        12
           3       0.50      0.14      0.22         7
           4       0.75      0.80      0.77        15
           5       0.62      0.80      0.70        10
           6       0.40      0.50      0.44         4

    accuracy                           0.71        73
   macro avg       0.67      0.66      0.64        73
weighted avg       0.71      0.71      0.70        73


Confusion Matrix:
 [[10  0  2  0  1  0  0]
 [ 0 11  0  0  0  0  1]
 [ 1  0  8  0  3  0  0]
 [ 0  0  0  1  0  4  2]
 [ 1  0  2  0 12  0  0]
 [ 0  0  2  0  0  8  0]
 [ 0  0  0  1  0  1  2]]

Results saved as: boosting_only_model_results.csv


# Bagging only approach (without class 8)

In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

file_path = "Data Fusion Assignment 2025 Data.csv"
df = pd.read_csv(file_path)

spectrum_cols = [col for col in df.columns if col.startswith("spectrum_") and "+ 740" in col]
categorical_cols = ["color", "transparency", "device_id"]
target_col = "class"

df = df[df[target_col] != 8]

df[spectrum_cols] = df[spectrum_cols].fillna(df[spectrum_cols].median())
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

scaler = StandardScaler()
X_spectrum = scaler.fit_transform(df[spectrum_cols])

encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_cols])
encoded_cats_df = pd.DataFrame(encoded_cats, columns=encoder.get_feature_names_out(categorical_cols))

df = df.drop(columns=categorical_cols).reset_index(drop=True)
df = pd.concat([df, encoded_cats_df], axis=1)
X_categorical = df.drop(columns=[target_col, "id"]).values

y = df[target_col] - 1 

X_final = np.hstack((X_spectrum, X_categorical))
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.2, random_state=42, stratify=y
)

rf_model = RandomForestClassifier(n_estimators=200, max_depth=10, min_samples_split=5, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred_rf)
classification_rep = classification_report(y_test, y_pred_rf, zero_division=1)
conf_matrix = confusion_matrix(y_test, y_pred_rf)

print(f"\nFinal Model Accuracy (Bagging-Only, Dropping Unknown Class 8): {accuracy:.4f}")
print("\nClassification Report:\n", classification_rep)
print("\nConfusion Matrix:\n", conf_matrix)

final_results = pd.DataFrame({
    "Model": ["RandomForest (Bagging)"],
    "Final Accuracy": [accuracy]
})
final_results.to_csv("bagging_only_model_results.csv", index=False)
print("\nResults saved as: bagging_only_model_results.csv")



Final Model Accuracy (Bagging-Only, Dropping Unknown Class 8): 0.6712

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.92      0.92        13
           1       0.79      0.92      0.85        12
           2       0.37      0.58      0.45        12
           3       1.00      0.14      0.25         7
           4       0.73      0.73      0.73        15
           5       0.75      0.60      0.67        10
           6       0.33      0.25      0.29         4

    accuracy                           0.67        73
   macro avg       0.70      0.59      0.59        73
weighted avg       0.72      0.67      0.66        73


Confusion Matrix:
 [[12  0  0  0  1  0  0]
 [ 0 11  0  0  0  0  1]
 [ 1  1  7  0  3  0  0]
 [ 0  1  2  1  0  2  1]
 [ 0  0  4  0 11  0  0]
 [ 0  0  4  0  0  6  0]
 [ 0  1  2  0  0  0  1]]

Results saved as: bagging_only_model_results.csv
