# Training and Evaluation with All Types of Classifiers

#### The CSV files are located in the data directory, and all necessary imports other than the classifiers are done here.



In [None]:
import pandas as pd
import os
from google.colab import drive
import pandas as pd
import numpy as np
import random
import copy
import matplotlib.pyplot as plt
import time
# Mount Google Drive so the project folder is reachable from this Colab runtime.
drive.mount('/content/drive')

# Project root on Drive and the sub-folder that holds the CSV files.
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject_2"
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')

# Full paths of the three CSV files stored inside DATA_FOLDER.
PROCESSED_CSV_FILE, ONEHOT_CSV_FILE, PCA_CSV_PATH = (
    os.path.join(DATA_FOLDER, csv_name)
    for csv_name in (
        'Processed_Fruits_Data.csv',
        'One_Hot_Processed_Fruits_Data.csv',
        'PCA_Processed_Fruits.csv',
    )
)

Mounted at /content/drive


In [None]:
# Initialize
# Seed Python's built-in random module, then load the CSV referenced by
# ONEHOT_CSV_FILE (the file uses ';' as its field separator).
random.seed(42)
df = pd.read_csv(ONEHOT_CSV_FILE, sep=";")

#### Separation of the test and training dataset
##### Each fruit type is split into 450 training and 150 test samples. There is no separate validation set because GridSearchCV is used for hyperparameter selection, and it relies on cross-validation.



In [None]:
# Preprocessing
# Drop the raw text columns; they are not used as model inputs.
df.drop(columns=["Image_path", "Text", "Label"], inplace=True)

# Balanced split: every class contributes the same number of rows.
N_TRAIN = 450
N_TEST = 150
N_TOTAL = N_TRAIN + N_TEST

train_dfs = []
test_dfs = []

categories = ['banana', 'tomato', 'apple', 'orange', 'tangerine']
for category in categories:
    # Draw N_TOTAL rows of this class in a reproducible random order.
    subset = (
        df[df["Fruit"] == category]
        .sample(N_TOTAL, random_state=42)
        .reset_index(drop=True)
    )

    # First N_TRAIN rows go to training, the remaining N_TEST rows to testing.
    train_subset, test_subset = subset.iloc[:N_TRAIN], subset.iloc[N_TRAIN:N_TOTAL]
    train_dfs.append(train_subset)
    test_dfs.append(test_subset)

# Concatenate the per-class pieces and shuffle so the classes are interleaved.
df_train = pd.concat(train_dfs)
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_test = pd.concat(test_dfs)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Train Shape: {df_train.shape}")
print(f"Test Shape:  {df_test.shape}")

# Normalization
# Columns are grouped by name: the two numeric features, plus every column
# whose name contains "img" or "text".
numerical_cols = ["Weight", "Price"]
image_cols = [name for name in df_train.columns if "img" in name]
text_cols = [name for name in df_train.columns if "text" in name]
columns_to_normalize = numerical_cols + image_cols + text_cols
# Everything else, except the target column, is treated as categorical.
categorical_cols = [
    name
    for name in df_train.columns
    if not (name == "Fruit" or name in columns_to_normalize)
]

epsilon = 1e-8  # To prevent division by zero
for column in columns_to_normalize:
    # Statistics come from the training split only and are reused for the
    # test split, so no test information leaks into the scaling.
    train_values = df_train[column]
    mean, std = train_values.mean(), train_values.std()
    df_train[column] = (train_values - mean) / (std + epsilon)
    df_test[column] = (df_test[column] - mean) / (std + epsilon)

# Separate the target from the features
target_col = 'Fruit'

X_train, y_train = df_train.drop(columns=[target_col]), df_train[target_col]
X_test, y_test = df_test.drop(columns=[target_col]), df_test[target_col]


Train Shape: (2250, 493)
Test Shape:  (750, 493)


In [None]:
# Classifiers are taken from the library sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

###### The benchmarking function is used for every type of classifier; it measures the training time and the accuracy on the test data.



In [None]:
# One dict per benchmarked classifier; filled by benchmarking() and turned
# into the final comparison table later on.
results = []


def benchmarking(name, model, params):
    """Fit ``model`` on the training split, score it on the test split, record the outcome.

    Reads the module-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``
    and appends one entry to the module-level ``results`` list.

    Args:
        name: Label stored under "Classifier" and printed when finished.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            (re)fitted in place.
        params: Free-form description stored under "Hyperparameters/Choices".

    Returns:
        None. The result is recorded in ``results``.
    """
    # perf_counter() is a monotonic clock intended for measuring durations,
    # so the measurement is not affected by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predict once and reuse the predictions for scoring.
    y_pred = model.predict(X_test)
    accuracy_on_test = accuracy_score(y_test, y_pred)

    results.append({
        "Classifier": name,
        "Training time": round(training_time, 5),
        "Test Accuracy": round(accuracy_on_test, 5),
        "Hyperparameters/Choices": params,
    })
    print(f"Finished {name}")


In [None]:
# Logistic Regression (Linear)
# In the lecture, lambda is the coefficient of the 1/2||w|| regularization term, whereas
# sklearn puts C on the error term. C is therefore the inverse regularization strength
# (1/lambda): the smaller C is, the more the regularization term dominates.
lr_param_grid = {'C': [0.1, 1, 10, 20, 30, 40]}
lr_grid = GridSearchCV(LogisticRegression(max_iter=1000), param_grid=lr_param_grid, cv=5)
lr_grid.fit(X_train, y_train)

# Benchmark the estimator with the best cross-validated C.
lr_best_c = lr_grid.best_params_['C']
benchmarking("Logistic Regression", lr_grid.best_estimator_, f"Best C={lr_best_c}")


Finished Logistic Regression


In [None]:
# Logistic Regression (Non-linear)
# Degree-2 polynomial expansion, e.g. [x_1, x_2] -> [1, x_1, x_2, x_1^2, x_1x_2, x_2^2].
# C=0.1 is reused because it was the best value for the linear model;
# running degree 3-4 crashes the RAM.
poly_steps = [
    ('poly', PolynomialFeatures(degree=2)),
    ('logit', LogisticRegression(max_iter=1000, C=0.1)),
]
poly_logit = Pipeline(poly_steps)
benchmarking("Non-linear Logit", poly_logit, "Degree=2 Polynomial, C=0.1")

Finished Non-linear Logit


In [None]:
# Soft-margin SVM (Linear)
# In the formula, C is the coefficient of the error part:
# min 0.5||w|| + C * (margin errors)
svm_linear_param_grid = {'C': [0.01, 0.1, 1]}
svm_linear_grid = GridSearchCV(SVC(kernel='linear'), param_grid=svm_linear_param_grid, cv=5)
svm_linear_grid.fit(X_train, y_train)

# Benchmark the estimator with the best cross-validated C.
svm_linear_best_c = svm_linear_grid.best_params_['C']
benchmarking("Soft-margin SVM", svm_linear_grid.best_estimator_, f"Best C={svm_linear_best_c}")

Finished Soft-margin SVM


In [None]:
# SVM with Kernel Trick
# The RBF kernel from the lecture is used for the kernel trick;
# both C and gamma are searched.
svm_kernel_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
svm_kernel_grid = GridSearchCV(SVC(kernel='rbf'), param_grid=svm_kernel_param_grid, cv=5)
svm_kernel_grid.fit(X_train, y_train)

# Benchmark the estimator with the best cross-validated (C, gamma) pair.
benchmarking("Kernel SVM", svm_kernel_grid.best_estimator_, f"Best {svm_kernel_grid.best_params_}")

Finished Kernel SVM


In [None]:
# k-Nearest Neighbor
# No model is trained: each point is assigned the majority class
# among its "k" nearest points.
knn_param_grid = {'n_neighbors': [3, 5, 7, 9]}
knn_grid = GridSearchCV(KNeighborsClassifier(), param_grid=knn_param_grid, cv=5)
knn_grid.fit(X_train, y_train)

# Benchmark the estimator with the best cross-validated k.
knn_best_k = knn_grid.best_params_['n_neighbors']
benchmarking("k-NN", knn_grid.best_estimator_, f"Best k={knn_best_k}")

Finished k-NN


In [None]:
# Naive Bayes
# Features such as weight and price are continuous, so a distribution type
# has to be chosen for them; the Gaussian variant is used here.
nb_model = GaussianNB()
benchmarking("Naive Bayes", nb_model, "Gaussian Distribution")

Finished Naive Bayes


In [None]:
# Random Forest
# n_estimators = number of trees in the forest
rf_param_grid = {'n_estimators': [50, 100, 200]}
rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid=rf_param_grid, cv=5)
rf_grid.fit(X_train, y_train)

# Benchmark the estimator with the best cross-validated number of trees.
rf_best_n = rf_grid.best_params_['n_estimators']
benchmarking("Random Forest", rf_grid.best_estimator_, f"Best N={rf_best_n}")

Finished Random Forest


In [None]:
# Final Table
# Build a DataFrame from the collected results, ordered from the most to the
# least accurate classifier, and print it without the index column.
results_df = pd.DataFrame(results).sort_values(by="Test Accuracy", ascending=False)
print(results_df.to_string(index=False))

         Classifier  Training time  Test Accuracy         Hyperparameters/Choices
    Soft-margin SVM        0.25728        0.98533                      Best C=0.1
      Random Forest       10.65200        0.98267                      Best N=200
Logistic Regression        1.27567        0.98133                      Best C=0.1
         Kernel SVM        0.78344        0.96933 Best {'C': 10, 'gamma': 'auto'}
   Non-linear Logit      109.83466        0.96800      Degree=2 Polynomial, C=0.1
        Naive Bayes        0.03308        0.91467           Gaussian Distribution
               k-NN        0.02130        0.87333                        Best k=3


#### **Evaluations of the Results**
Hyperparameters are selected by passing predetermined candidate values to GridSearchCV, which tries every combination and keeps the one with the best cross-validated accuracy.
##### **Training Times**
k-NN and Naive Bayes take the least time, which is expected: k-NN has no real training phase, and Naive Bayes only involves straightforward probability computations.
Non-linear Logit takes the most time. This is due to the increase in feature-space dimensionality when generating degree-2 polynomial features. Random Forest also takes significant time because of the overhead of building 200 individual decision trees.

##### **Accuracy**
Soft-margin SVM achieved the highest accuracy ($98.53\%$) with $C = 0.1$, suggesting that the data is well separated by a linear boundary. Random Forest ($98.27\%$) and Logistic Regression ($98.13\%$) also reach high accuracy. The strong result of the simple Logistic Regression algorithm indicates that the relationship between the features and the target is predominantly linear.
Kernel SVM and Non-linear Logit performed slightly worse ($96.93\%$ and $96.80\%$ respectively) than their linear versions. This may indicate that the added complexity of the non-linear transformations causes overfitting, or that the decision boundary is naturally linear.
