In [1]:
import pandas as pd
import os
from google.colab import drive
import pandas as pd
import numpy as np
import random
import copy
import matplotlib.pyplot as plt
import time
drive.mount('/content/drive')
MY_DRIVE_PATH = "/content/drive/MyDrive/MLProject"
DATA_FOLDER = os.path.join(MY_DRIVE_PATH, 'Data google sheet')
PROCESSED_CSV_FILE = os.path.join(DATA_FOLDER, 'Processed_Fruits_Data.csv')
ONEHOT_CSV_FILE = os.path.join(DATA_FOLDER, 'One_Hot_Processed_Fruits_Data.csv')

Mounted at /content/drive


In [2]:
# Initialize
df = pd.read_csv(ONEHOT_CSV_FILE, sep = ";")
random.seed(42)

In [3]:
# Preprocessing
# Removing unnecessary text columns
df.drop(columns=["Image_path","Text","Label"], inplace=True)

# Keeping number of items same for each class
N_TRAIN = 450
N_VAL   = 50
N_TEST  = 100
N_TOTAL = N_TRAIN + N_VAL + N_TEST

train_dfs = []
val_dfs = []
test_dfs = []

categories = ['banana', 'tomato', 'apple', 'orange', 'tangerine']
for category in categories:
    subset = df[df["Fruit"] == category]
    subset = subset.sample(N_TOTAL, random_state=42).reset_index(drop=True)

    train_subset = subset.iloc[:N_TRAIN]
    val_subset   = subset.iloc[N_TRAIN : N_TRAIN + N_VAL]
    test_subset  = subset.iloc[N_TRAIN + N_VAL : N_TOTAL]

    train_dfs.append(train_subset)
    val_dfs.append(val_subset)
    test_dfs.append(test_subset)

# 3. Concatenating and shuffling
df_train = pd.concat(train_dfs).sample(frac=1, random_state=42).reset_index(drop=True)
df_val   = pd.concat(val_dfs).sample(frac=1, random_state=42).reset_index(drop=True)
df_test  = pd.concat(test_dfs).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Train Shape: {df_train.shape}")
print(f"Val Shape:   {df_val.shape}")
print(f"Test Shape:  {df_test.shape}")

# Normalization
numerical_cols = ["Weight","Price"]
image_cols = [column for column in df_train.columns if "img" in column]
text_cols = [column for column in df_train.columns if "text" in column]
categorical_cols = [column for column in df_train.columns if (column not in numerical_cols + image_cols + text_cols) and (column != "Fruit")] # We don't want the target
columns_to_normalize = numerical_cols + image_cols + text_cols

epsilon = 1e-8  # To prevent division by zero
for column in columns_to_normalize:
    mean = df_train[column].mean()
    std = df_train[column].std()
    df_train[column] = (df_train[column] - mean) / (std + epsilon)
    df_val[column]   = (df_val[column] - mean) / (std + epsilon)
    df_test[column]  = (df_test[column] - mean) / (std + epsilon)

# Removing the target
target_col = 'Fruit'

X_train = df_train.drop(columns=[target_col])
y_train = df_train[target_col]

X_val = df_val.drop(columns=[target_col])
y_val = df_val[target_col]

X_test = df_test.drop(columns=[target_col])
y_test = df_test[target_col]


Train Shape: (2250, 493)
Val Shape:   (250, 493)
Test Shape:  (500, 493)


In [4]:
def sigmoid(x):
  return 1 / (1 + np.exp(-x))

def train_logistic_regression(X_train,y_train,X_val,y_val,class_name,selected_cols,epochs=1000,learning_rate=0.001,regularization_lambda=0.0001,patience=5):
  train = np.array(X_train[selected_cols])
  val = np.array(X_val[selected_cols])
  num_features = train.shape[1]
  N = len(train)

  # Adding a column of ones for the bias term
  ones_train  = np.ones((train.shape[0], 1))
  train_augmented = np.concatenate((ones_train, train), axis=1)
  ones_val    = np.ones((val.shape[0], 1))
  val_augmented   = np.concatenate((ones_val, val), axis=1)

  # Initialize weigths
  weights = np.zeros(np.shape(train_augmented)[1])

  # Creating labels +1 and -1 like the slides
  train_labels = np.where(y_train == class_name, 1, -1)
  val_labels = np.where(y_val == class_name, 1, -1)

  train_losses = []
  val_losses = []

  # Early stopping
  best_val_loss = float('inf')
  patience_counter = 0
  best_weights = copy.deepcopy(weights)

  for epoch in range(epochs):
    train_loss = 0
    val_loss = 0
    # Random shuffling for SGD
    indices = np.arange(N)
    np.random.shuffle(indices)
    for i in range(N):
      wTx = np.dot(weights, train_augmented[i])
      prediction = sigmoid(wTx)
      train_loss += np.log(1 + np.exp(-train_labels[i] * wTx))
      loss_gradient = -train_labels[i] * train_augmented[i] / (1 + np.exp(train_labels[i] * wTx))
      regularization_gradient = regularization_lambda * weights
      gradient = loss_gradient + regularization_gradient
      weights = weights - learning_rate * gradient
    train_loss /= N
    # Calculating all scores at once
    val_wTx = np.dot(val_augmented, weights)
    val_loss = np.mean(np.log(1 + np.exp(-val_labels * val_wTx)))
    if val_loss < best_val_loss:
      best_val_loss = val_loss
      best_weights = copy.deepcopy(weights)
      patience_counter = 0
    else:
      patience_counter += 1

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    if epoch % 50 == 0:
      print(f"Epoch {epoch+1}/{epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f}")
    if patience_counter >= patience:
      print("Stopped with early stopping")
      break
  return best_weights, train_losses, val_losses

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import matplotlib.pyplot as plt
import numpy as np

def train_sklearn_logistic_regression(X_train, y_train, X_val, y_val, class_name,
                                      selected_cols=None,
                                      C=10000, # lambda = 0.0001
                                      max_iter=1000):

    X_train_subset = X_train[selected_cols]
    X_val_subset   = X_val[selected_cols]

    y_train_bin = np.where(y_train == class_name, 1, 0) # Sklearn prefers 0/1 for binary
    y_val_bin   = np.where(y_val == class_name, 1, 0)

    model = LogisticRegression(
        penalty='l2',
        C=C,
        solver='lbfgs',
        max_iter=max_iter,
        random_state=42
        )

    model.fit(X_train_subset, y_train_bin)

    # Calculating the loss with our function
    train_z = model.decision_function(X_train_subset)
    val_z   = model.decision_function(X_val_subset)

    # Converting labels to +1/-1
    y_train_pm1 = np.where(y_train_bin == 1, 1, -1)
    y_val_pm1   = np.where(y_val_bin == 1, 1, -1)

    # Calculate mean log-loss manually
    train_loss = np.mean(np.log(1 + np.exp(-y_train_pm1 * train_z)))
    val_loss   = np.mean(np.log(1 + np.exp(-y_val_pm1 * val_z)))

    print(f"Sklearn Final Results for {class_name}:")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    return model, train_loss, val_loss

In [6]:
def plot_losses(train_losses, val_losses, class_name, features_used="all", implementation="ours"):
  plt.plot(train_losses, label='Training Loss')
  plt.plot(val_losses, label='Validation Loss')
  plt.title(f"Training curve for class : {class_name}, using {features_used} features")
  plt.savefig(f"{implementation}_training_curve_{class_name}_{features_used}.png")

In [7]:
col_labels = ["image", "text", "categorical", "numerical", "all"]
cols_list = [image_cols, text_cols, categorical_cols, numerical_cols, X_train.columns]
categories = ['banana', 'tomato', 'apple', 'orange', 'tangerine']

trained_models = {label: {} for label in col_labels}
sklearn_models = {label: {} for label in col_labels}
history = {label: {} for label in col_labels} # For losses
runtimes = {label: {} for label in col_labels}

for i, feature_name in enumerate(col_labels):
    current_cols = cols_list[i]
    print(f"\n=== Training Models using {feature_name.upper()} features ===")
    for target in categories:
        print(f"  > Training One-vs-All for: {target}")
        start_time = time.time()
        weights, train_losses, val_losses = train_logistic_regression(
            X_train, y_train,
            X_val, y_val,
            class_name=target,
            selected_cols=current_cols,
            epochs=1000,
            learning_rate=0.0001,
            patience=10
        )
        end_time = time.time()
        our_time = end_time - start_time
        trained_models[feature_name][target] = weights
        plot_losses(train_losses, val_losses, target, features_used=feature_name, implementation="ours")
        plt.close()
        start_time = time.time()
        sklearn_weigths, sklearn_losses, sklearn_val_losses = train_sklearn_logistic_regression(
            X_train, y_train,
            X_val, y_val,
            class_name=target,
            selected_cols=current_cols,
            C=10000
        )
        end_time = time.time()
        sk_time = end_time - start_time
        history[feature_name][target] = {
            'ours': {
                'train': train_losses, # This is a list
                'val': val_losses
            },
            'sklearn': {
                'train': sklearn_losses, # This is a single float
                'val': sklearn_val_losses
            }
        }
        runtimes[feature_name][target] = {
            'ours': our_time,
            'sklearn': sk_time
        }
        sklearn_models[feature_name][target] = sklearn_weigths
print("\nAll models trained and stored!")


=== Training Models using IMAGE features ===
  > Training One-vs-All for: banana
Epoch 1/1000 - Train Loss: 0.6795 - Val Loss: 0.6703
Epoch 51/1000 - Train Loss: 0.4330 - Val Loss: 0.5008
Stopped with early stopping
Sklearn Final Results for banana:
Train Loss: 0.2920 | Val Loss: 1.4148
  > Training One-vs-All for: tomato
Epoch 1/1000 - Train Loss: 0.6815 - Val Loss: 0.6666
Epoch 51/1000 - Train Loss: 0.4493 - Val Loss: 0.4520
Epoch 101/1000 - Train Loss: 0.4343 - Val Loss: 0.4396
Epoch 151/1000 - Train Loss: 0.4279 - Val Loss: 0.4345
Epoch 201/1000 - Train Loss: 0.4237 - Val Loss: 0.4317
Epoch 251/1000 - Train Loss: 0.4207 - Val Loss: 0.4301
Epoch 301/1000 - Train Loss: 0.4184 - Val Loss: 0.4291
Epoch 351/1000 - Train Loss: 0.4164 - Val Loss: 0.4285
Epoch 401/1000 - Train Loss: 0.4148 - Val Loss: 0.4281
Epoch 451/1000 - Train Loss: 0.4134 - Val Loss: 0.4278
Epoch 501/1000 - Train Loss: 0.4121 - Val Loss: 0.4277
Epoch 551/1000 - Train Loss: 0.4110 - Val Loss: 0.4276
Epoch 601/1000 - T

In [8]:
def predict_multiclass_unified(X, models_dict, feature_type, class_names, feature_cols, implementation='ours'):
    # Multiclass prediction by taking the one with highest probability.
    if isinstance(X, pd.DataFrame):
        X_subset = X[feature_cols].values
        X_subset_df = X[feature_cols]

    all_probs = []
    for target in class_names:
        if implementation == 'ours':
            weights = models_dict[feature_type][target]
            # Adding bias
            m = X_subset.shape[0]
            ones = np.ones((m, 1))
            X_augmented = np.concatenate((ones, X_subset), axis=1)

            # Calculate Z and Sigmoid
            z = np.dot(X_augmented, weights)
            prob = sigmoid(z)
            all_probs.append(prob)

        else:
            model = models_dict[feature_type][target]
            # Get probability of 1
            prob = model.predict_proba(X_subset_df)[:, 1]
            all_probs.append(prob)

    all_probs = np.array(all_probs).T
    # Selecting index with highest probability
    predictions_idx = np.argmax(all_probs, axis=1)
    predictions_str = [class_names[i] for i in predictions_idx]

    return np.array(predictions_str), all_probs

In [9]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score,precision_score,recall_score
from sklearn.preprocessing import label_binarize
import pandas as pd
results_table = []
implementations = ['ours', 'sklearn']

print(f"{'IMPL':<8} | {'FEATURE':<12} | {'ACC':<6} | {'F1':<6} | {'AUC':<6} | {'TIME (s)':<8}")
print("-" * 65)

for feature_name in col_labels:
    current_cols = cols_list[col_labels.index(feature_name)]

    # Total training time
    time_ours = sum([runtimes[feature_name][t]['ours'] for t in categories])
    time_sk   = sum([runtimes[feature_name][t]['sklearn'] for t in categories])

    dicts_to_test = {
        'ours': (trained_models, time_ours),
        'sklearn': (sklearn_models, time_sk)
    }

    for implementation_name, (model_dict, total_time) in dicts_to_test.items():

        y_pred, y_probs = predict_multiclass_unified(
            X_test,
            model_dict,
            feature_type=feature_name,
            class_names=categories,
            feature_cols=current_cols,
            implementation=implementation_name
        )

        # 2. Calculate Metrics
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, average='weighted', zero_division=0)
        rec  = recall_score(y_test, y_pred, average='weighted', zero_division=0)
        f1  = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        y_test_bin = label_binarize(y_test, classes=categories)
        try:
            auc = roc_auc_score(y_test_bin, y_probs, multi_class='ovr', average='weighted')
        except ValueError:
            auc = 0.0

        # AUC Calculation
        from sklearn.preprocessing import label_binarize
        y_test_bin = label_binarize(y_test, classes=categories)
        try:
            auc = roc_auc_score(y_test_bin, y_probs, multi_class='ovr', average='weighted')
        except ValueError:
            auc = 0.0

        print(f"{implementation_name:<8} | {feature_name:<12} | {acc:.3f}  | {prec:.3f}  | {rec:.3f}  | {f1:.3f}  | {auc:.3f}  | {total_time:.4f}")

        results_table.append({
            'Implementation': implementation_name,
            'Features': feature_name,
            'Accuracy': acc,
            'Precision': prec,
            'Recall': rec,
            'F1 Score': f1,
            'AUC': auc,
            'Total Training Time (s)': total_time
        })

df_results = pd.DataFrame(results_table)
df_results = df_results.sort_values(by=['Features', 'Implementation'])

print("\n=== FINAL HEAD-TO-HEAD COMPARISON ===")
print(df_results[['Features', 'Implementation', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC', 'Total Training Time (s)']].to_string(index=False))

ours_avg_time = df_results[df_results['Implementation']=='ours']['Total Training Time (s)'].mean()
sk_avg_time = df_results[df_results['Implementation']=='sklearn']['Total Training Time (s)'].mean()
print(f"Our average train time : {ours_avg_time}")
print(f"Sklearn average train time : {sk_avg_time}")
print(f"\nAverage Speedup (Sklearn vs Ours): {ours_avg_time / sk_avg_time:.2f}x faster")

IMPL     | FEATURE      | ACC    | F1     | AUC    | TIME (s)
-----------------------------------------------------------------
ours     | image        | 0.464  | 0.481  | 0.464  | 0.464  | 0.764  | 105.5174
sklearn  | image        | 0.586  | 0.585  | 0.586  | 0.579  | 0.810  | 0.9866
ours     | text         | 0.820  | 0.828  | 0.820  | 0.822  | 0.974  | 231.4222
sklearn  | text         | 0.808  | 0.811  | 0.808  | 0.809  | 0.959  | 12.5014
ours     | categorical  | 0.908  | 0.932  | 0.908  | 0.904  | 0.981  | 193.0227
sklearn  | categorical  | 0.918  | 0.942  | 0.918  | 0.914  | 0.983  | 0.1090
ours     | numerical    | 0.838  | 0.843  | 0.838  | 0.833  | 0.833  | 188.3523
sklearn  | numerical    | 0.830  | 0.833  | 0.830  | 0.827  | 0.833  | 0.1474
ours     | all          | 0.990  | 0.990  | 0.990  | 0.990  | 0.998  | 175.9349
sklearn  | all          | 0.976  | 0.977  | 0.976  | 0.976  | 0.995  | 1.3620

=== FINAL HEAD-TO-HEAD COMPARISON ===
   Features Implementation  Accuracy  Prec

In [10]:
import glob

plot_files = glob.glob('*_training_curve_*.png')
print(f"Found {len(plot_files)} plot files:")
for f in plot_files:
    print(f)

Found 25 plot files:
ours_training_curve_tangerine_text.png
ours_training_curve_orange_all.png
ours_training_curve_apple_all.png
ours_training_curve_tomato_numerical.png
ours_training_curve_orange_numerical.png
ours_training_curve_apple_image.png
ours_training_curve_tomato_categorical.png
ours_training_curve_apple_categorical.png
ours_training_curve_banana_numerical.png
ours_training_curve_banana_all.png
ours_training_curve_apple_numerical.png
ours_training_curve_tangerine_image.png
ours_training_curve_tangerine_categorical.png
ours_training_curve_tomato_image.png
ours_training_curve_banana_image.png
ours_training_curve_apple_text.png
ours_training_curve_tomato_all.png
ours_training_curve_orange_image.png
ours_training_curve_tangerine_numerical.png
ours_training_curve_orange_categorical.png
ours_training_curve_banana_text.png
ours_training_curve_orange_text.png
ours_training_curve_banana_categorical.png
ours_training_curve_tangerine_all.png
ours_training_curve_tomato_text.png


In [11]:
import zipfile

zip_filename = 'training_plots.zip'
with zipfile.ZipFile(zip_filename, 'w') as zipf:
    for file in plot_files:
        zipf.write(file)

print(f"Successfully created '{zip_filename}' containing {len(plot_files)} files.")

Successfully created 'training_plots.zip' containing 25 files.


In [12]:
from google.colab import files
files.download(zip_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>