# Training and Evaluation of All Classifier Types with PCA

#### The CSV file paths are set up from the Drive directory, and all necessary imports except the classifiers are done here.



In [1]:
import pandas as pd
import os
from google.colab import drive
import pandas as pd
import numpy as np
import random
import copy
import matplotlib.pyplot as plt
import time
# Mount Google Drive so the project folder is reachable from the Colab runtime.
drive.mount('/content/drive')

# Project locations on the mounted drive.
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject_2"
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')

# Paths of the three CSV variants of the fruit dataset
# (only PCA_CSV_PATH is read in the cells shown in this notebook).
PROCESSED_CSV_FILE, ONEHOT_CSV_FILE, PCA_CSV_PATH = (
    os.path.join(DATA_FOLDER, csv_name)
    for csv_name in (
        'Processed_Fruits_Data.csv',
        'One_Hot_Processed_Fruits_Data.csv',
        'PCA_Processed_Fruits.csv',
    )
)

Mounted at /content/drive


In [2]:
# Initialize: fix the stdlib random seed and load the PCA-processed dataset.
random.seed(42)
df = pd.read_csv(PCA_CSV_PATH)  # comma is the default separator

#### Separation of the test and training datasets
##### Each fruit class is split into 450 training and 150 test samples. There is no separate validation set, since GridSearchCV is used for hyperparameter selection and it relies on cross-validation.



In [3]:
# Build a class-balanced train/test split of the PCA data:
# every fruit class contributes N_TRAIN rows to training and N_TEST rows to testing.
N_TRAIN = 450
N_TEST  = 150
N_TOTAL = N_TRAIN + N_TEST

# Name of the label column; every other column is treated as a feature.
target_col = 'Fruit'
categories = ['banana', 'tomato', 'apple', 'orange', 'tangerine']
train_df_pca = []
test_df_pca = []

for category in categories:
    subset = df[df[target_col] == category]

    # Fail with a readable message instead of pandas' generic sampling error
    # when a class does not have enough rows for the requested split.
    if len(subset) < N_TOTAL:
        raise ValueError(
            f"Class '{category}' has only {len(subset)} rows, "
            f"but {N_TOTAL} are required ({N_TRAIN} train + {N_TEST} test)."
        )

    # Seeded draw of N_TOTAL rows in random order; slicing the shuffled rows
    # then yields disjoint train and test parts for this class.
    subset = subset.sample(N_TOTAL, random_state=42).reset_index(drop=True)

    train_subset_pca = subset.iloc[:N_TRAIN]
    test_subset_pca  = subset.iloc[N_TRAIN : N_TOTAL]

    train_df_pca.append(train_subset_pca)
    test_df_pca.append(test_subset_pca)

# Concatenating and shuffling, so rows are no longer grouped by class
df_pca_train = pd.concat(train_df_pca).sample(frac=1, random_state=42).reset_index(drop=True)
df_pca_test  = pd.concat(test_df_pca).sample(frac=1, random_state=42).reset_index(drop=True)

# Sanity check: both splits are exactly balanced across the classes.
assert len(df_pca_train) == N_TRAIN * len(categories)
assert len(df_pca_test) == N_TEST * len(categories)

# Separate Features (X) and Target (y)
X_train = df_pca_train.drop(columns=[target_col])
y_train = df_pca_train[target_col]
X_test = df_pca_test.drop(columns=[target_col])
y_test = df_pca_test[target_col]

print("Success!")
print(f"PCA Train Shape: {X_train.shape}")


Success!
PCA Train Shape: (2250, 45)


In [4]:
# Classifiers are taken from the library sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

###### A single benchmarking function is used for every classifier; it measures the training time and the accuracy on the test data.



In [5]:
# Collected benchmark rows, one dict per classifier; filled by benchmarking().
results=[]
def benchmarking(name,model, params):
  """Fit a classifier, time the fit, score it on the test set and record the result.

  The model is fitted in place on the notebook-level X_train / y_train and
  evaluated on X_test / y_test. One row is appended to the notebook-level
  `results` list.

  Args:
    name: Label of the classifier, stored under "Classifier".
    model: Estimator exposing fit(X, y) and predict(X).
    params: Description of the chosen hyperparameters, stored under
      "Hyperparameters/Choices".

  Returns:
    None. The outcome is appended to `results` and a progress line is printed.
  """
  # perf_counter is a monotonic, high-resolution clock, so the measured
  # interval is not affected by system clock adjustments.
  start_time = time.perf_counter()
  model.fit(X_train, y_train)
  end_time = time.perf_counter()

  # Predict once and reuse the predictions for scoring.
  y_pred = model.predict(X_test)
  accuracy_on_test = accuracy_score(y_test, y_pred)
  results.append({
        "Classifier": name,
        "Training time": round(end_time - start_time, 5),
        "Test Accuracy": round(accuracy_on_test, 5),
        "Hyperparameters/Choices": params
    })
  print(f"Finished {name}")


In [6]:
# Logistic Regression (linear)
# In the lecture, lambda was the coefficient of the 1/2||w|| regularization part.
# sklearn instead puts its coefficient C on the error part, so C is the inverse
# regularization term (1 / lambda): when C is small, the regularization term dominates.
lr_param_grid = {'C': [0.1, 1, 10, 20, 30, 40]}
lr_grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid=lr_param_grid,
    cv=5,
)
lr_grid.fit(X_train, y_train)

lr_best_c = lr_grid.best_params_['C']
benchmarking("Logistic Regression", lr_grid.best_estimator_, f"Best C={lr_best_c}")


Finished Logistic Regression


In [7]:
# Logistic Regression (non-linear)
# The inputs are expanded with degree-2 polynomial terms before the linear model,
# e.g. for two features: x -> [1, x_1, x_2, x_1^2, x_1x_2, x_2^2].
# C=0.1 is fixed here on the grounds that it was the best value for the linear model;
# degrees 3-4 are not tried because running them crashes the RAM.
# NOTE(review): the results table in this notebook reports "Best C=1" for the linear
# Logistic Regression, so the C=0.1 rationale may be stale -- confirm.
poly_logit_steps = [
    ('poly', PolynomialFeatures(degree=2)),
    ('logit', LogisticRegression(max_iter=1000, C=0.1)),
]
poly_logit = Pipeline(steps=poly_logit_steps)
benchmarking("Non-linear Logit", poly_logit, "Degree=2 Polynomial, C=0.1")

Finished Non-linear Logit


In [8]:
# Soft-margin SVM (linear kernel)
# In the objective, C is the coefficient of the error part:
#   min 0.5||w||^2 + C * (margin errors)
svm_linear_param_grid = {'C': [0.01, 0.1, 1]}
svm_linear_grid = GridSearchCV(
    estimator=SVC(kernel='linear'),
    param_grid=svm_linear_param_grid,
    cv=5,
)
svm_linear_grid.fit(X_train, y_train)

svm_linear_best_c = svm_linear_grid.best_params_['C']
benchmarking("Soft-margin SVM", svm_linear_grid.best_estimator_, f"Best C={svm_linear_best_c}")

Finished Soft-margin SVM


In [9]:
# SVM with the kernel trick
# The RBF kernel from the lecture is used; C and gamma are tuned jointly.
svm_kernel_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
}
svm_kernel_grid = GridSearchCV(
    estimator=SVC(kernel='rbf'),
    param_grid=svm_kernel_param_grid,
    cv=5,
)
svm_kernel_grid.fit(X_train, y_train)

svm_kernel_best = svm_kernel_grid.best_params_
benchmarking("Kernel SVM", svm_kernel_grid.best_estimator_, f"Best {svm_kernel_best}")

Finished Kernel SVM


In [10]:
# k-Nearest Neighbor
# No model is trained: a point is assigned the majority class among its "k" nearest points.
knn_param_grid = {'n_neighbors': [3, 5, 7, 9]}
knn_grid = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_param_grid,
    cv=5,
)
knn_grid.fit(X_train, y_train)

knn_best_k = knn_grid.best_params_['n_neighbors']
benchmarking("k-NN", knn_grid.best_estimator_, f"Best k={knn_best_k}")

Finished k-NN


In [11]:
# Naive Bayes
# The input features are continuous, so a distribution type has to be chosen for them;
# a Gaussian is used here.
nb_model = GaussianNB()
benchmarking("Naive Bayes", nb_model, "Gaussian Distribution")

Finished Naive Bayes


In [12]:
# Random Forest
# n_estimators is the number of trees; the seed is fixed for repeatable forests.
rf_param_grid = {'n_estimators': [50, 100, 200]}
rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    cv=5,
)
rf_grid.fit(X_train, y_train)

rf_best_n = rf_grid.best_params_['n_estimators']
benchmarking("Random Forest", rf_grid.best_estimator_, f"Best N={rf_best_n}")

Finished Random Forest


In [13]:
# Final table: one row per benchmarked classifier, most accurate first.
results_df = pd.DataFrame(results).sort_values(by="Test Accuracy", ascending=False)
print(results_df.to_string(index=False))

         Classifier  Training time  Test Accuracy          Hyperparameters/Choices
Logistic Regression        0.33282        0.91333                         Best C=1
    Soft-margin SVM        0.12495        0.90267                       Best C=0.1
         Kernel SVM        0.16696        0.89333 Best {'C': 10, 'gamma': 'scale'}
   Non-linear Logit       13.35631        0.88267       Degree=2 Polynomial, C=0.1
      Random Forest        4.14986        0.87867                       Best N=200
               k-NN        0.00435        0.82533                         Best k=3
        Naive Bayes        0.00997        0.75600            Gaussian Distribution


#### **Evaluations of the Results and comparison of PCA vs without PCA**
The hyperparameters are selected by passing predetermined candidate values to GridSearchCV, which tries every combination and keeps the one with the best cross-validated accuracy.
##### **Training Times**
With the application of PCA, the feature space was reduced, leading to faster training across all models compared to the original dataset.
k-NN and Naive Bayes are again the fastest methods.
Non-linear Logit and Random Forest again required the most time, but PCA decreased the Non-linear Logit training time from over 100 seconds to about 13 seconds.
##### **Accuracy**
Some information is lost when the data is projected onto the lower-dimensional space.
Logistic Regression is the most accurate method with PCA at $91.33\%$, suggesting that the principal components retained a strong linear relationship with the target.
The largest decrease in performance occurs with Naive Bayes, which drops to $75.60\%$. We assumed a Gaussian distribution for the continuous features, and this assumption may not hold for the PCA-transformed data.