<a href="https://colab.research.google.com/github/Anni1808/MINed_Hackoholics/blob/main/NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix, classification_report

# Load the training and test datasets
train_data = pd.read_csv('/content/sample_data/final_merged_dataset.csv')
test_data = pd.read_csv('/content/sample_data/aligned_test.csv')

# Separate features (X) and target (y) for training data
X = train_data.drop('Type', axis=1)  # Features
y = train_data['Type']  # Target

# Remove unnamed columns if they exist
X = X.loc[:, ~X.columns.str.contains('^Unnamed')]
test_data = test_data.loc[:, ~test_data.columns.str.contains('^Unnamed')]

# Select only numeric columns
numeric_cols = X.select_dtypes(include=[np.number]).columns
X_numeric = X[numeric_cols]
X_test_numeric = test_data[numeric_cols]

# Check for and handle infinite values
X_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)

# Handle missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_numeric = pd.DataFrame(
    imputer.fit_transform(X_numeric),
    columns=X_numeric.columns,
    index=X_numeric.index
)
X_test_numeric = pd.DataFrame(
    imputer.transform(X_test_numeric),
    columns=X_test_numeric.columns,
    index=X_test_numeric.index
)

# Feature scaling
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X_numeric),
    columns=X_numeric.columns,
    index=X_numeric.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_numeric),
    columns=X_test_numeric.columns,
    index=X_test_numeric.index
)

# Encode target variable
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Split the training data into training and validation subsets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_encoded, test_size=0.2, random_state=42)

# Define the parameter grid for hyperparameter tuning (Naive Bayes)
param_grid = {
    'var_smoothing': np.logspace(-11, -1, num=11)  # Smoothing parameter for Naive Bayes
}

# Create a Naive Bayes Classifier
nb_classifier = GaussianNB()

# Create a KFold object for cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Create a GridSearchCV object for hyperparameter tuning
grid_search = GridSearchCV(
    estimator=nb_classifier,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf
)

# Fit the model with hyperparameter tuning on the training subset
grid_search.fit(X_train, y_train)

# Get the best model from the grid search
best_nb_model = grid_search.best_estimator_

# Evaluate the model on the validation subset
y_val_pred = best_nb_model.predict(X_val)

# Calculate performance metrics on the validation subset
precision, recall, f1, _ = precision_recall_fscore_support(y_val, y_val_pred, average='weighted')
accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

# Convert target names to strings explicitly
target_names = [str(cls) for cls in le.classes_]

# Print performance metrics
print("\nValidation Set Performance Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Accuracy: {accuracy:.4f}")
print("\nConfusion Matrix (Validation Data):")
print(conf_matrix)

# Detailed classification report
print("\nDetailed Classification Report (Validation Data):")
print(classification_report(y_val, y_val_pred, target_names=target_names))

# Make predictions on the external test data
y_test_pred_encoded = best_nb_model.predict(X_test_scaled)

# Decode predictions back to original labels
y_test_pred = le.inverse_transform(y_test_pred_encoded)

# Create a DataFrame for predictions
predictions_df = pd.DataFrame({'SHA256': test_data['SHA256'], 'Predictions': y_test_pred})

# Save predictions to a CSV file
predictions_df.to_csv('predictions_nb.csv', index=False)

print("\nPredictions saved to predictions_nb.csv")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_numeric.replace([np.inf, -np.inf], np.nan, inplace=True)



Validation Set Performance Metrics:
Precision: 0.9443
Recall: 0.9421
F1 Score: 0.9421
Accuracy: 0.9421

Confusion Matrix (Validation Data):
[[36  0  4  0  1  0  0]
 [ 0 43  0  2  1  0  0]
 [ 0  2 40  0  1  0  0]
 [ 0  2  0 32  1  0  0]
 [ 0  0  0  0 36  0  0]
 [ 0  0  0  0  0  6  0]
 [ 0  0  0  0  0  0 35]]

Detailed Classification Report (Validation Data):
              precision    recall  f1-score   support

           0       1.00      0.88      0.94        41
           1       0.91      0.93      0.92        46
           2       0.91      0.93      0.92        43
           3       0.94      0.91      0.93        35
           4       0.90      1.00      0.95        36
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00        35

    accuracy                           0.94       242
   macro avg       0.95      0.95      0.95       242
weighted avg       0.94      0.94      0.94       242


Predictions saved to predictions_nb.csv
