# Naive Bayes

## Importing the libraries

In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from scipy.stats import uniform

In [None]:
# Load the dataset
data = pd.read_csv('cancer_data.csv')

In [None]:
# Step 1: Remove the ID column
#data.drop(columns=['ID'], inplace=True)

In [None]:
# Step 2: Encode the Diagnosis column (M = 1, B = 0)
label_encoder = LabelEncoder()
data['diagnosis'] = label_encoder.fit_transform(data['diagnosis'])

In [None]:
# Step 3: Choose Diagnosis as the target variable
X = data.drop(columns=['diagnosis'])  # Features
y = data['diagnosis']  # Target

In [None]:
# Step 4: Split the data into train (80%), test (15%), and validation (5%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  # 0.25 * 20% = 5%

In [None]:
# Step 5: Hyperparameter Tuning for Naive Bayes
# Define the parameter distribution for Randomized Search
param_dist = {
    'var_smoothing': uniform(1e-9, 1e-1)  # Smoothing parameter for variance
}

In [None]:
# Initialize the Naive Bayes model
nb_model = GaussianNB()

In [None]:
# Set up Randomized Search with Cross-Validation
random_search = RandomizedSearchCV(
    estimator=nb_model,              # Base model
    param_distributions=param_dist,  # Parameter distribution to sample from
    n_iter=100,                      # Number of parameter settings to sample
    scoring='accuracy',              # Evaluation metric
    cv=10,                          # 10-fold cross-validation
    verbose=1,                      # Print progress
    n_jobs=-1,                      # Use all available CPU cores
    random_state=42                 # Seed for reproducibility
)

In [None]:
# Fit Randomized Search on the training data
random_search.fit(X_train, y_train)

In [None]:
# Retrieve the best parameters and best score
best_params = random_search.best_params_
best_score = random_search.best_score_

In [None]:
print("Best Parameters:", best_params)
print("Best Cross-Validation Accuracy:", best_score)

In [None]:
# Step 6: Train the final model with the best parameters
final_model = random_search.best_estimator_
final_model.fit(X_train, y_train)

In [None]:
# Step 7: Evaluate the model on the test set
y_test_pred = final_model.predict(X_test)
y_test_pred_prob = final_model.predict_proba(X_test)[:, 1]  # Probabilities for ROC-AUC

In [None]:
test_accuracy = accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1 = f1_score(y_test, y_test_pred)
test_roc_auc = roc_auc_score(y_test, y_test_pred_prob)

print("\nTest Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1-Score: {test_f1:.4f}")
print(f"ROC-AUC: {test_roc_auc:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_test_pred))

In [None]:
# Step 8: Predict on the unseen validation set
y_val_pred = final_model.predict(X_val)
y_val_pred_prob = final_model.predict_proba(X_val)[:, 1]

val_accuracy = accuracy_score(y_val, y_val_pred)
val_precision = precision_score(y_val, y_val_pred)
val_recall = recall_score(y_val, y_val_pred)
val_f1 = f1_score(y_val, y_val_pred)
val_roc_auc = roc_auc_score(y_val, y_val_pred_prob)

print("\nValidation Set Performance:")
print(f"Accuracy: {val_accuracy:.4f}")
print(f"Precision: {val_precision:.4f}")
print(f"Recall: {val_recall:.4f}")
print(f"F1-Score: {val_f1:.4f}")
print(f"ROC-AUC: {val_roc_auc:.4f}")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

### Results Summary

#### Hyperparameter Tuning:
- **Best Parameters**: `{'var_smoothing': 0.037}`
- **Best CV Accuracy**: **62.86%**

#### Test Set Performance:
- **Accuracy**: **58.82%**
- **Precision**: **0.00%** (no positive predictions)
- **Recall**: **0.00%**
- **F1-Score**: **0.00%**
- **ROC-AUC**: **56.11%**
- **Confusion Matrix**:
  ```
  [[50  1]
   [34  0]]
  ```

#### Validation Set Performance:
- **Accuracy**: **68.97%**
- **Precision**: **0.00%** (no positive predictions)
- **Recall**: **0.00%**
- **F1-Score**: **0.00%**
- **ROC-AUC**: **61.67%**
- **Confusion Matrix**:
  ```
  [[20  0]
   [ 9  0]]
  ```

### Key Issues:
- The model predicts **all samples as benign**, resulting in **0.00% precision, recall, and F1-score**.
- Low accuracy and ROC-AUC indicate poor performance.

### Reasons:
- **Imbalanced Data**: The dataset may have more benign cases, causing the model to favor the majority class.
- **Model Limitations**: Naive Bayes assumes feature independence, which may not fit this dataset well.

### Next Steps:
1. **Handle Class Imbalance**: Use oversampling, undersampling, or class weights.
2. **Try Other Models**: Use Logistic Regression, Random Forest, or SVM.
3. **Feature Engineering**: Explore additional features or transformations.