In [1]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# --- Step 1: Load Your PROCESSED Data ---
# Both splits are read with the 'id' column as the row index. If either file
# is missing, a hint about the expected location is printed and the original
# FileNotFoundError is re-raised so the cell still fails loudly.
try:
    train_df = pd.read_csv(
        './Data/salary.train.processed.csv',
        index_col='id',
    )
    test_df = pd.read_csv(
        './Data/salary.test.processed.csv',
        index_col='id',
    )
except FileNotFoundError:
    print(
        "Error: Could not find the processed CSV files.",
        "Please make sure 'salary.train.processed.csv' and 'salary.test.processed.csv' are in the './Data/' folder.",
        sep="\n",
    )
    raise

# --- Step 2: Separate Features (X) and Target (y) ---
# 'label' is the target column; every other column is kept as a feature.
# The same split is applied to the training and the testing frame.
X_train = train_df.drop(columns='label')
y_train = train_df['label']
X_test = test_df.drop(columns='label')
y_test = test_df['label']

# Report (rows, columns) of each feature matrix as a quick sanity check.
print(
    f"Training features shape: {X_train.shape}",
    f"Testing features shape: {X_test.shape}",
    sep="\n",
)

# --- Step 3: Initialize and Train the Naive Bayes Model ---
# GaussianNB is chosen on the premise that the preprocessed features are
# continuous (scaled) values.
# NOTE(review): confirm that premise holds for every feature column; any
# binary / one-hot columns would not match the Gaussian assumption.
nb_model = GaussianNB()

print("\nTraining the Naive Bayes model...")
nb_model.fit(X=X_train, y=y_train)  # fits in place; nb_model is now trained
print("Model training complete!")

# --- Step 4: Evaluate the Model ---
# Predict labels for the held-out test features, then compare them against
# the true test labels.
print("\nEvaluating the model on the test set...")
y_pred_nb = nb_model.predict(X_test)

# Overall accuracy, printed as a percentage with two decimals.
accuracy_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
print(f"\nNaive Bayes Model Accuracy on Test Data: {accuracy_nb * 100:.2f}%")

# Per-class precision / recall / f1 / support breakdown.
print("\nNaive Bayes Classification Report:")
print(classification_report(y_true=y_test, y_pred=y_pred_nb))

Training features shape: (16720, 89)
Testing features shape: (4180, 89)

Training the Naive Bayes model...
Model training complete!

Evaluating the model on the test set...

Naive Bayes Model Accuracy on Test Data: 62.13%

Naive Bayes Classification Report:
              precision    recall  f1-score   support

         0.0       0.91      0.38      0.54      2416
         1.0       0.53      0.95      0.68      1764

    accuracy                           0.62      4180
   macro avg       0.72      0.67      0.61      4180
weighted avg       0.75      0.62      0.60      4180

