# Perform the Classification using Bayes Classifier and Naive Bayes Classifier

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

## Step 1: Load the dataset and prepare the train, validation, and test Data

In [None]:
# Read the complete Date Fruit dataset from disk
df = pd.read_csv('Date_Fruit_Datasets.csv')

# Preview the first rows of the full (not yet split) dataset
print(df.head())

# 'Class' holds the label; every remaining column is treated as a feature
X = df.drop(columns='Class')
y = df['Class']

n_features = X.shape[1]
print("\nFeatures (X) shape:", X.shape)
print("Target  (y) shape:", y.shape)
print("\nNumber of features:", n_features)
print("Number of classes :", y.nunique())
print("Classes           :", sorted(y.unique()))

# Stage 1: keep 60% for training, hold out 40% (validation + test).
# Stratifying on y keeps the class proportions equal in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Stage 2: halve the held-out 40% into 20% validation and 20% test,
# again stratified so each part mirrors the class balance.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

# Report how many samples (and what share of the data) each split received
n_total = len(X)
n_train = len(X_train)
n_val = len(X_val)
n_test = len(X_test)
print("\nSplit Summary:")
print(f"  Training set  : {n_train} samples  ({n_train / n_total * 100:.1f}%)")
print(f"  Validation set: {n_val} samples  ({n_val / n_total * 100:.1f}%)")
print(f"  Test set      : {n_test} samples  ({n_test / n_total * 100:.1f}%)")

# Per-class sample counts (ordered by class label) to confirm stratification
print("\nClass distribution per split:")
train_counts = y_train.value_counts().sort_index().tolist()
print(f"  Train :  {train_counts}")
val_counts = y_val.value_counts().sort_index().tolist()
print(f"  Val   :  {val_counts}")
test_counts = y_test.value_counts().sort_index().tolist()
print(f"  Test  :  {test_counts}")

## Bayes classifier implementation steps

### 1. Compute mean and covariance matrix of each class
### 2. Compute the prior probability of each class
### 3. For each test example/sample and for each class, calculate likelihood, prior probability, total probability, and then posterior probability
### 4. Now, for each test sample, we have c posterior probabilities, where c is the number of classes/targets. Find the class whose posterior probability is maximum and assign that class label to the test sample.
### 5. Get the prediction label for each test sample and evaluate the performance metrics

In [None]:
# Bayes classifier training: estimate one Gaussian (mean, covariance) and
# one prior per class from the training split.

# Distinct class labels present in the training targets
classes = np.unique(y_train)
n_classes = len(classes)

# Per-class parameter stores, keyed by class label
class_means, class_covs, class_priors = {}, {}, {}

separator = "=" * 70
n_train_samples = len(X_train)

# Steps 1 & 2: mean vector, covariance matrix, and prior for every class
print("Training Bayes Classifier...\n")
print(separator)
print("PARAMETER ESTIMATION FOR EACH CLASS")
print(separator)

for cls in classes:
    # Training rows whose label equals the current class
    in_class = y_train == cls
    X_class = X_train[in_class]
    n_in_class = len(X_class)

    # Mean vector (μ_i): column-wise average of the class samples
    class_means[cls] = np.mean(X_class, axis=0)

    # Covariance matrix (Σ_i): rowvar=False because columns are the variables
    class_covs[cls] = np.cov(X_class, rowvar=False)

    # Prior P(C_i): fraction of training samples that belong to this class
    class_priors[cls] = n_in_class / n_train_samples

    print(f"\nClass: {cls}")
    print(f"  Number of samples: {n_in_class}")
    print(f"  Prior probability: {class_priors[cls]:.4f}")
    print(f"  Mean vector shape: {class_means[cls].shape}")
    print(f"  Covariance matrix shape: {class_covs[cls].shape}")

print("\n" + separator)
print("Training completed!")
print(separator)

In [None]:
# Function to predict using Bayes Classifier
def bayes_predict(X_data, class_means, class_covs, class_priors, classes):
    """
    Predict class labels using a Gaussian Bayes classifier.

    Each class is scored by log p(x | μ_i, Σ_i) + log P(C_i) and the class
    with the highest score is returned. Scoring is done in log space because
    raw Gaussian densities underflow to exactly 0.0 for samples far from
    every class mean (especially with many features); with plain
    probabilities such samples would all collapse onto the first class.

    Parameters:
    -----------
    X_data : array-like
        Input features, one row per sample
    class_means : dict
        Mean vectors for each class, keyed by class label
    class_covs : dict
        Covariance matrices for each class, keyed by class label
    class_priors : dict
        Prior probabilities for each class, keyed by class label
    classes : array-like
        List of class labels; defines the order used to break ties
        (the earliest class wins)

    Returns:
    --------
    predictions : array
        Predicted class labels, one per input sample
    """
    features = np.asarray(X_data)
    # Convert once so that a plain Python list of labels can be fancy-indexed
    class_labels = np.asarray(classes)

    n_samples = len(X_data)
    n_classes = len(class_labels)

    # Unnormalised log-posterior of every class for every sample
    # Shape: (n_samples, n_classes)
    log_joint = np.empty((n_samples, n_classes))

    # Step 3: log-likelihood + log-prior for each class
    for idx, cls in enumerate(classes):
        # log p(x | μ_i, Σ_i) for all samples at once
        log_likelihood = multivariate_normal.logpdf(
            features,
            mean=class_means[cls],
            cov=class_covs[cls],
            allow_singular=True
        )

        # A zero prior legitimately maps to -inf; silence the log(0) warning
        with np.errstate(divide='ignore'):
            log_prior = np.log(class_priors[cls])

        log_joint[:, idx] = log_likelihood + log_prior

    # Step 4: pick the class with the maximum posterior. The evidence p(x)
    # is identical for every class of a given sample, so dividing by it
    # cannot change the argmax and the normalisation is skipped.
    predictions_idx = np.argmax(log_joint, axis=1)
    predictions = class_labels[predictions_idx]

    return predictions

# Bundle the fitted parameters once so both calls share identical arguments
bayes_params = (class_means, class_covs, class_priors, classes)

# Label the validation split with the Bayes classifier
print("\nPredicting on Validation Data...")
y_val_pred_bayes = bayes_predict(X_val, *bayes_params)
print(f"Validation predictions shape: {y_val_pred_bayes.shape}")

# Label the test split with the Bayes classifier
print("\nPredicting on Test Data...")
y_test_pred_bayes = bayes_predict(X_test, *bayes_params)
print(f"Test predictions shape: {y_test_pred_bayes.shape}")

## Performance metrics evaluation for Bayes classifier

### Find confusion matrix, accuracy, precision, recall, and f1-score for validation and test data

In [None]:
# Performance metrics evaluation shared by both classifiers

def evaluate_performance(y_true, y_pred, dataset_name="Dataset"):
    """
    Print a metrics report for one set of predictions and return the values.

    The report contains the confusion matrix, accuracy, and macro- and
    micro-averaged precision, recall and F1-score.

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    dataset_name : str
        Name of the dataset (used in the report heading)

    Returns:
    --------
    metrics : dict
        Keys: 'confusion_matrix', 'accuracy', 'precision_macro',
        'precision_micro', 'recall_macro', 'recall_micro', 'f1_macro',
        'f1_micro'
    """
    rule = "=" * 70

    def _macro_and_micro(scorer):
        # Evaluate one sklearn scorer under both averaging schemes
        macro_value = scorer(y_true, y_pred, average='macro')
        micro_value = scorer(y_true, y_pred, average='micro')
        return macro_value, micro_value

    print("\n" + rule)
    print(f"PERFORMANCE METRICS - {dataset_name}")
    print(rule)

    # Confusion matrix
    conf_mat = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:")
    print(conf_mat)

    # Overall accuracy, shown as a fraction and as a percentage
    acc = accuracy_score(y_true, y_pred)
    print(f"\nAccuracy: {acc:.4f} ({acc * 100:.2f}%)")

    # Precision
    prec_macro, prec_micro = _macro_and_micro(precision_score)
    print(f"\nPrecision (Macro): {prec_macro:.4f}")
    print(f"Precision (Micro): {prec_micro:.4f}")

    # Recall
    rec_macro, rec_micro = _macro_and_micro(recall_score)
    print(f"\nRecall (Macro): {rec_macro:.4f}")
    print(f"Recall (Micro): {rec_micro:.4f}")

    # F1-score
    f1_mac, f1_mic = _macro_and_micro(f1_score)
    print(f"\nF1-Score (Macro): {f1_mac:.4f}")
    print(f"F1-Score (Micro): {f1_mic:.4f}")

    print(rule)

    metrics = {
        'confusion_matrix': conf_mat,
        'accuracy': acc,
        'precision_macro': prec_macro,
        'precision_micro': prec_micro,
        'recall_macro': rec_macro,
        'recall_micro': rec_micro,
        'f1_macro': f1_mac,
        'f1_micro': f1_mic
    }
    return metrics

# Section banner for the Bayes classifier results
hash_rule = "#" * 70
print("\n" + hash_rule)
print("# BAYES CLASSIFIER EVALUATION")
print(hash_rule)

# Score the Bayes classifier on the validation split
val_metrics_bayes = evaluate_performance(
    y_val,
    y_val_pred_bayes,
    dataset_name="Validation Set (Bayes Classifier)",
)

# Score the Bayes classifier on the test split
test_metrics_bayes = evaluate_performance(
    y_test,
    y_test_pred_bayes,
    dataset_name="Test Set (Bayes Classifier)",
)

## Naive Bayes classifier (Using GaussianNB)

In [None]:
# Fit scikit-learn's Gaussian Naive Bayes on the training split
nb_rule = "=" * 70
print("\n" + nb_rule)
print("TRAINING NAIVE BAYES CLASSIFIER (GaussianNB)")
print(nb_rule)

# fit() returns the estimator itself, so construction and fitting can be chained
nb = GaussianNB().fit(X_train, y_train)

print("\nNaive Bayes model trained successfully!")
print(f"Number of classes: {len(nb.classes_)}")
print(f"Classes: {nb.classes_}")

# Labels predicted for the validation split
y_val_pred_nb = nb.predict(X_val)
print(f"\nValidation predictions shape: {y_val_pred_nb.shape}")

# Labels predicted for the test split
y_test_pred_nb = nb.predict(X_test)
print(f"Test predictions shape: {y_test_pred_nb.shape}")

## Performance metrics evaluation for Naive Bayes classifier

### Find confusion matrix, accuracy, precision, recall, and f1-score for validation and test data

In [None]:
# Performance metrics for the Naive Bayes classifier

# Section banner for the Naive Bayes results
nb_hash_rule = "#" * 70
print("\n" + nb_hash_rule)
print("# NAIVE BAYES CLASSIFIER EVALUATION")
print(nb_hash_rule)

# Score the Naive Bayes classifier on the validation split
val_metrics_nb = evaluate_performance(
    y_val,
    y_val_pred_nb,
    dataset_name="Validation Set (Naive Bayes Classifier)",
)

# Score the Naive Bayes classifier on the test split
test_metrics_nb = evaluate_performance(
    y_test,
    y_test_pred_nb,
    dataset_name="Test Set (Naive Bayes Classifier)",
)

## Comparison of Bayes and Naive Bayes Classifiers

In [None]:
# Side-by-side comparison of the two classifiers on validation and test data
cmp_rule = "=" * 70
print("\n" + cmp_rule)
print("COMPARISON: BAYES vs NAIVE BAYES CLASSIFIERS")
print(cmp_rule)

# Row labels and the matching keys of the metric dictionaries, in table order
metric_names = ['Accuracy', 'Precision (Macro)', 'Precision (Micro)',
                'Recall (Macro)', 'Recall (Micro)', 'F1-Score (Macro)', 'F1-Score (Micro)']
metric_keys = ['accuracy', 'precision_macro', 'precision_micro',
               'recall_macro', 'recall_micro', 'f1_macro', 'f1_micro']

# One table column per (classifier, split) result dictionary
result_columns = {
    'Bayes (Val)': val_metrics_bayes,
    'Bayes (Test)': test_metrics_bayes,
    'Naive Bayes (Val)': val_metrics_nb,
    'Naive Bayes (Test)': test_metrics_nb,
}

comparison_df = pd.DataFrame({
    'Metric': metric_names,
    **{
        column: [scores[key] for key in metric_keys]
        for column, scores in result_columns.items()
    },
})

print("\n")
print(comparison_df.to_string(index=False))
print("\n" + cmp_rule)

# Headline test accuracies plus the modelling difference between the two
bayes_test_accuracy = test_metrics_bayes['accuracy']
nb_test_accuracy = test_metrics_nb['accuracy']
print("\nSUMMARY:")
print("-" * 70)
print(f"Bayes Classifier Test Accuracy: {bayes_test_accuracy:.4f}")
print(f"Naive Bayes Classifier Test Accuracy: {nb_test_accuracy:.4f}")
print("\nKey Difference:")
print("- Bayes Classifier: Uses full covariance matrix (captures feature dependencies)")
print("- Naive Bayes Classifier: Assumes feature independence (diagonal covariance)")
print(cmp_rule)