##### The University of Melbourne, School of Computing and Information Systems
# COMP30027 Machine Learning, 2025 Semester 1

## Project 2: Traffic Sign Prediction


<h2>Read me</h2>

+ Running the code below produces the final results for most of the report.
+ For SVM:
    + To produce the graphs that use a scaler, run the commented-out code instead.
+ For Random Forest:
    + Run the commented-out code to get the results without tuning.

## Importing Data

In [None]:
# Importing tools
import pandas as pd
# Metadata tables for the train and test splits.
train_df = pd.read_csv("./train/train_metadata.csv")
test_df = pd.read_csv("./test/test_metadata.csv")

# Training columns that later cells refer to by name.
train_img_path, Y_train = train_df['image_path'], train_df['ClassId']

In [None]:
# Preview the first five rows of the training metadata.
train_df.head(n=5)

In [None]:
# Row counts, shown as (test, train).
(len(test_df.index), len(train_df.index))

## Preprocessing

#### Using OpenCV

In [None]:
import os
import cv2
import numpy as np

In [None]:
# for SVM 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
# Distinct class ids in ascending order, so the first/last entries are min/max.
class_ids = sorted(train_df['ClassId'].unique())
# One unit-wide bin per integer class id; "+ 2" closes the last bin on the right.
bins = np.arange(class_ids[0], class_ids[-1] + 2)

sns.histplot(train_df['ClassId'], bins=bins, color="crimson")
plt.title("Histogram of class ID")
plt.show()

In [None]:
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score
# Metrics handed to cross_validate: plain accuracy plus macro-averaged
# precision / recall / F1. zero_division=0 scores an undefined ratio as 0.
scoring = {
    'accuracy': 'accuracy',
    **{
        f'{metric_name}_macro': make_scorer(metric_fn, average='macro', zero_division=0)
        for metric_name, metric_fn in (
            ('precision', precision_score),
            ('recall', recall_score),
            ('f1', f1_score),
        )
    },
}

In [None]:
# Candidate values of the SVM regularisation parameter C swept in the tuning cells.
C_values = [
    0.1,
    1,
    10,
    50,
]

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict, cross_validate
from sklearn.neighbors import KNeighborsClassifier

# Shared cross-validation splitter: two stratified folds, no shuffling
# (shuffle=False is the default, spelled out here for clarity).
skf = StratifiedKFold(n_splits=2, shuffle=False)

## My own HOG dataset

In [None]:
# HOG descriptor built with OpenCV's constructor defaults.
hog = cv2.HOGDescriptor()
# Target size handed to cv2.resize before the descriptor is computed.
# NOTE(review): presumably chosen to match the descriptor's default window — confirm.
IMAGE_SIZE = (64, 128)


def extract_feature(img_path):
    """Compute the flattened HOG descriptor of a single image.

    Args:
        img_path: Path of the image file, loaded with ``cv2.imread``.

    Returns:
        The result of ``hog.compute`` on the image resized to ``IMAGE_SIZE``,
        flattened to one dimension.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``img_path``.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        # Fail here with the offending path rather than inside cv2.resize.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    image = cv2.resize(image, IMAGE_SIZE)
    return hog.compute(image).flatten()


features = []
sign_class = []
# Every row must yield a feature vector: later cells pair X_hog with Y_train
# row by row, so silently skipping an image would misalign features and labels.
for _, row in train_df.iterrows():
    img_path = os.path.join('./train/', row['image_path'])
    features.append(extract_feature(img_path))
    # The label is the sign's class id, not the image path.
    sign_class.append(row['ClassId'])

X_hog = np.array(features)
Y_hog = np.array(sign_class)

### using SVM model

In [None]:
# Standardised RBF-kernel SVM on the HOG features, scored with the shared
# cross-validation splitter and metric set.
model = make_pipeline(
    StandardScaler(),
    SVC(kernel='rbf', C=10, gamma='scale'),
)
scores = cross_validate(estimator=model, X=X_hog, y=Y_train, scoring=scoring, cv=skf)

In [None]:
# Raw per-fold results first, then the mean of each metric to 3 decimals.
print("Cross-validation scores:", scores)

for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

##### Testset SVM HOG

In [None]:
# Build the HOG feature matrix for the test images.
test_X_feat = []
for rel_path in test_df['image_path']:
    img_path = os.path.join('./test/', rel_path)
    feature = extract_feature(img_path)
    if feature is None:
        continue
    test_X_feat.append(feature)

X_test_hog = np.array(test_X_feat)

# Refit the SVM pipeline on all training rows and label the test set.
y_test_pred = model.fit(X_hog, Y_train).predict(X_test_hog)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_HOG_SVM.csv', index=False)

#### confusion matrix svm hog

In [None]:
# Out-of-fold predictions for every training row, compared with the true labels.
y_pred = cross_val_predict(estimator=model, X=X_hog, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

# Large canvas so every class tick stays legible; x tick labels are rotated.
fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("Confusion Matrix")
plt.tight_layout()
plt.show()

In [None]:
# Hold out a stratified 20% of the HOG features once and reuse that split for
# every value of C. The fixed random_state keeps the split, and therefore the
# plotted curve, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_hog, Y_train, test_size=0.2, stratify=Y_train, random_state=42
)
mean_scores = []

# One fit per C on the training part, scored by accuracy on the held-out part.
for C in C_values:
    model = make_pipeline(
        StandardScaler(),
        SVC(kernel='rbf', C=C, gamma='scale', class_weight='balanced')
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mean_scores.append(accuracy)

# Plotting (log-scaled x axis because the C values span orders of magnitude)
plt.figure(figsize=(10, 5))
plt.plot(C_values, mean_scores, marker='o', color='green')
plt.xscale('log')
plt.xlabel('C (Regularization Parameter)')
plt.ylabel('Test Accuracy')
plt.title('Effect of C on SVM Performance (Train/Test Split)')
plt.grid(True)
plt.show()

### using random forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Default random forest on the HOG features. The forest is seeded (same seed as
# the other forests in this notebook) so that the cross-validated scores here
# and the cross-validated predictions used for the confusion matrix come from
# identically built models and are repeatable across runs.
rf_hog = make_pipeline(RandomForestClassifier(random_state=43))
scores = cross_validate(rf_hog, X_hog, Y_train, cv = skf, scoring=scoring)

In [None]:
# Mean of each cross-validated metric, rounded to 3 decimals.
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

##### test set RF

In [None]:
# Fit the random forest on all training rows and predict the test set with
# that same estimator. `model` must not be used here: at this point it refers
# to an SVM pipeline from an earlier cell, not to the forest.
rf_hog.fit(X_hog, Y_train)
y_test_pred = rf_hog.predict(X_test_hog)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv('./test/test_metadata_RF_HOG.csv', index=False)

#### Confusion matrix for random forest

In [None]:
# Out-of-fold random-forest predictions and the resulting confusion counts.
y_pred = cross_val_predict(estimator=rf_hog, X=X_hog, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

In [None]:
# Draw the confusion matrix held in `cm` on a large canvas.
fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("Random Forest Confusion Matrix on HOG dataset ")
plt.tight_layout()
plt.show()

In [None]:

# Fit the random-forest pipeline on every training row.
rf_hog.fit(X=X_hog, y=Y_train)

In [None]:

# Label the test images with the fitted forest and export the metadata.
y_test_pred = rf_hog.predict(X=X_test_hog)
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_HOG_RF.csv', index=False)

## HOG_PCA data set

In [None]:
# Provided HOG_PCA features for the test split; the image path column is not a feature.
X_hog_test = pd.read_csv('./test/Features/hog_pca.csv').drop(columns=['image_path'])

### SVM model

In [None]:

# Provided HOG_PCA features for the training split; the image path column is not a feature.
X_hog_train = pd.read_csv('./train/Features/hog_pca.csv').drop(columns=['image_path'])


In [None]:
# RBF-kernel SVM on the HOG_PCA features, without feature scaling.
SVM_HOG_PCA = make_pipeline(
    SVC(kernel='rbf', C=10, gamma='scale'),
)
# Variant with a scaler in front of the SVM:
# SVM_HOG_PCA = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=10, gamma='scale'))
scores = cross_validate(estimator=SVM_HOG_PCA, X=X_hog_train, y=Y_train, scoring=scoring, cv=skf)

In [None]:
# Per-fold accuracy first, then the mean of each metric to 3 decimals.
print(scores['test_accuracy'])
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

In [None]:
# Out-of-fold SVM predictions on HOG_PCA and the resulting confusion counts.
y_pred = cross_val_predict(estimator=SVM_HOG_PCA, X=X_hog_train, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

In [None]:
# Draw the confusion matrix held in `cm` on a large canvas.
fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("SVM Confusion Matrix on the provided HOG_PCA dataset ")
plt.tight_layout()
plt.show()

In [None]:
# Mean cross-validated accuracy of an unscaled, class-balanced RBF SVM for each C.
mean_scores = []
for C in C_values:
    model = make_pipeline(
        SVC(kernel='rbf', C=C, gamma='scale', class_weight='balanced'),
    )
    scores = cross_val_score(estimator=model, X=X_hog_train, y=Y_train, scoring='accuracy', cv=skf)
    mean_scores.append(scores.mean())

## For the SVC-with-scaler variant, use the loop below IN PLACE OF the loop above
## (running both would append eight scores for four C values).
# for C in C_values:
#     model = make_pipeline(StandardScaler(),SVC(kernel='rbf', C=C, gamma='scale', class_weight='balanced'))
#     scores = cross_val_score(model, X_hog_train, Y_train, cv=skf, scoring='accuracy')
#     mean_scores.append(scores.mean())

# Accuracy against C, with a log-scaled x axis.
plt.figure(figsize=(10, 5))
plt.plot(C_values, mean_scores, color='orange', marker='o')
plt.xscale('log')
plt.title('Effect of C on SVM Performance on HOG_PCA')
plt.xlabel('C (Regularization Parameter)')
plt.ylabel('Cross-Validated Accuracy')
plt.grid(True)
plt.show()

#### Test set SVM HOG_PCA

In [None]:
# Fit on the whole training set, then label the test features.
y_test_pred = SVM_HOG_PCA.fit(X_hog_train, Y_train).predict(X_hog_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_HOG_PCA_SVM.csv', index=False)

### KNN on HOG_PCA

In [None]:
# Cross-validated accuracy of KNN on HOG_PCA for k = 1..9.
neighbors = np.arange(1, 10)
train_accuracy = np.empty(len(neighbors))
# Loop over K values
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)

    # cross_val_score fits its own clone of the estimator on each fold, so a
    # separate knn.fit() on the full data beforehand would be wasted work.
    train_accuracy[i] = cross_val_score(knn, X_hog_train, Y_train, cv = skf).mean()

plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy', color = 'crimson')
plt.title("Training dataset Accuracy vs n-neigbors")
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

In [None]:
# 1-nearest-neighbour classifier on HOG_PCA, fitted on all training rows.
knn_HOG_PCA = KNeighborsClassifier(n_neighbors=1).fit(X_hog_train, Y_train)

# Out-of-fold predictions and the resulting confusion counts.
y_pred = cross_val_predict(estimator=knn_HOG_PCA, X=X_hog_train, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(20,20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("KNN Confusion Matrix on the provided HOG_PCA dataset ")
plt.tight_layout()
plt.show()

In [None]:

# Cross-validate the KNN classifier and report the mean of each metric to 3 decimals.
scores = cross_validate(estimator=knn_HOG_PCA, X=X_hog_train, y=Y_train, scoring=scoring, cv=skf)
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

#### Test KNN HOG_PCA

In [None]:
# Fit on the whole training set, then label the test features.
y_test_pred = knn_HOG_PCA.fit(X_hog_train, Y_train).predict(X_hog_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_HOG_PCA_KNN.csv', index=False)

## COLOR dataset

In [None]:
# Provided colour-histogram features for the test split; the image path column is not a feature.
X_color_test = pd.read_csv('./test/Features/color_histogram.csv').drop(columns=['image_path'])

### SVM

In [None]:
# Provided colour-histogram features for the training split (image path column removed).
X_color = pd.read_csv('./train/Features/color_histogram.csv').drop(columns=['image_path'])

# Class-balanced RBF SVM without feature scaling.
SVM_color = make_pipeline(
    SVC(kernel='rbf', C=100, gamma='scale', class_weight='balanced'),
)

# Cross-validated metrics, then out-of-fold predictions for the confusion matrix.
scores = cross_validate(estimator=SVM_color, X=X_color, y=Y_train, scoring=scoring, cv=skf)
y_pred = cross_val_predict(estimator=SVM_color, X=X_color, y=Y_train, cv=skf)

In [None]:
# Raw per-fold results first, then the mean of each metric to 3 decimals.
print("Cross-validation scores:", scores)
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

In [None]:
# Mean cross-validated accuracy of a class-balanced RBF SVM for each C.
mean_scores = []
for C in C_values:
    model = make_pipeline(
        SVC(kernel='rbf', C=C, gamma='scale', class_weight='balanced'),
    )
    scores = cross_val_score(estimator=model, X=X_color, y=Y_train, scoring='accuracy', cv=skf)
    mean_scores.append(scores.mean())

# Accuracy against C, with a log-scaled x axis.
plt.figure(figsize=(10, 5))
plt.plot(C_values, mean_scores, color='crimson', marker='o')
plt.xscale('log')
plt.title('Effect of C on SVM Performance on color_dataset')
plt.xlabel('C (Regularization Parameter)')
plt.ylabel('Cross-Validated Accuracy')
plt.grid(True)
plt.show()

In [None]:
# Confusion counts between the true labels and the predictions held in `y_pred`.
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("Confusion Matrix for SVM on color_histogram")
plt.tight_layout()
plt.show()

### test svm color

In [None]:
# Fit on the whole training set, then label the test features.
y_test_pred = SVM_color.fit(X_color, Y_train).predict(X_color_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_Color_SVM.csv', index=False)

### KNN-classifier

In [None]:

# 1-nearest-neighbour classifier on the colour histograms, fitted on all
# training rows and then cross-validated with the shared metric set.
KNN_Color = KNeighborsClassifier(n_neighbors=1).fit(X_color, Y_train)
scores = cross_validate(estimator=KNN_Color, X=X_color, y=Y_train, scoring=scoring, cv=skf)

In [None]:
# Mean of each cross-validated metric, rounded to 3 decimals.
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

In [None]:
# Out-of-fold KNN predictions and the resulting confusion counts.
y_pred = cross_val_predict(estimator=KNN_Color, X=X_color, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("KNN Confusion Matrix on the provided color histogram ")
plt.tight_layout()
plt.show()

In [None]:


# Cross-validated accuracy of KNN on the colour histograms for k = 1..9.
neighbors = np.arange(1, 10)
train_accuracy = np.empty(len(neighbors))
# Loop over K values
for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)

    # cross_val_score fits its own clone of the estimator on each fold, so a
    # separate knn.fit() on the full data beforehand would be wasted work.
    train_accuracy[i] = cross_val_score(knn, X_color, Y_train, cv = skf).mean()

plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy', color = 'crimson')
plt.title("Training dataset Accuracy vs n-neigbors")
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.grid(True)
plt.show()

In [None]:
# Fit on the whole training set, then label the test features.
y_test_pred = KNN_Color.fit(X_color, Y_train).predict(X_color_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_KNN_Color.csv', index=False)

### Random forest on color histogram

In [None]:
# Seeded random forest on the colour histograms with explicitly set hyperparameters.
rf_color = make_pipeline(
    RandomForestClassifier(
        n_estimators=200,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=43,
    )
)
# To evaluate the default-settings forest instead, use:
# rf_color= make_pipeline(RandomForestClassifier(random_state=43))

rf_color_scores = cross_validate(estimator=rf_color, X=X_color, y=Y_train, scoring=scoring, cv=skf)


In [None]:
# Mean of each cross-validated metric, rounded to 3 decimals.
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(rf_color_scores[metric_key].mean(), 3))

In [None]:
# Out-of-fold random-forest predictions and the resulting confusion counts.
y_pred = cross_val_predict(estimator=rf_color, X=X_color, y=Y_train, cv=skf)
cm = metrics.confusion_matrix(y_true=Y_train, y_pred=y_pred)

fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm).plot(ax=ax, xticks_rotation=90)

plt.title("Random Forest Confusion Matrix on the provided color histogram ")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and split/leaf minimums on the colour
# histograms. Parameter names carry the pipeline step prefix that
# make_pipeline derives from the estimator class name.
pipeline = make_pipeline(RandomForestClassifier(random_state=42))
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_color, Y_train)
print("Best Hyperparameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, which
# is accuracy here, so it is labelled as accuracy rather than F1.
print("Best CV Accuracy:", grid_search.best_score_)

### rf color testing

In [None]:

# Fit on the whole training set, then label the test features.
y_test_pred = rf_color.fit(X_color, Y_train).predict(X_color_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_Color_RF.csv', index=False)

## HOG_PCA Random forest

In [None]:
from sklearn.ensemble import RandomForestClassifier


# Seeded random forest on the HOG_PCA features with explicitly set hyperparameters.
rf_HOG_PCA = make_pipeline(
    RandomForestClassifier(
        n_estimators=200,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=43,
    )
)

## To evaluate the default-settings forest instead, use:
# rf_HOG_PCA = make_pipeline(RandomForestClassifier(random_state=43))

# Cross-validate and report the mean of each metric to 3 decimals.
scores = cross_validate(estimator=rf_HOG_PCA, X=X_hog_train, y=Y_train, scoring=scoring, cv=skf)
for metric_label, metric_key in (
    ("Accuracy:", 'test_accuracy'),
    ("Precision (macro):", 'test_precision_macro'),
    ("Recall (macro):", 'test_recall_macro'),
    ("F1 Score (macro):", 'test_f1_macro'),
):
    print(metric_label, round(scores[metric_key].mean(), 3))

In [None]:
## confusion matrix
# Out-of-fold random-forest predictions on HOG_PCA and the resulting confusion counts.
y_pred = cross_val_predict(rf_HOG_PCA, X_hog_train, Y_train, cv=skf)
cm = metrics.confusion_matrix(Y_train, y_pred)
# Square 20x20 canvas, matching every other confusion-matrix figure in this
# notebook (the previous 81x20 size stretched the square matrix).
fig, ax = plt.subplots(figsize=(20, 20))
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax, xticks_rotation=90)
plt.title("Random forest Confusion Matrix on the provided HOG_PCA dataset ")
plt.tight_layout()
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over forest size and split/leaf minimums on the HOG_PCA
# features. Parameter names carry the pipeline step prefix that make_pipeline
# derives from the estimator class name.
pipeline = make_pipeline(RandomForestClassifier(random_state=42))
param_grid = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__min_samples_split': [2, 5, 10],
    'randomforestclassifier__min_samples_leaf': [1, 2, 4],
}
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
    verbose=2,
    n_jobs=-1
)
grid_search.fit(X_hog_train, Y_train)
print("Best Hyperparameters:", grid_search.best_params_)
# best_score_ is the mean cross-validated value of the `scoring` metric, which
# is accuracy here, so it is labelled as accuracy rather than F1.
print("Best CV Accuracy:", grid_search.best_score_)

#### test HOG_PCA RF

In [None]:

# Fit on the whole training set, then label the test features.
y_test_pred = rf_HOG_PCA.fit(X_hog_train, Y_train).predict(X_hog_test)

# Attach the predictions to the test metadata and export it.
test_df['ClassId'] = y_test_pred
test_df.to_csv(path_or_buf='./test/test_metadata_HOG_PCA_RF.csv', index=False)