In [3]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter
import numpy as np

# Load the dataset from the 'result.xlsx' file
file_path = '../result.xlsx'  # Replace with your actual file path
data = pd.read_excel(file_path)

# Assuming the target variable is in the last column
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# Print the original class distribution
print(f"Original class distribution: {Counter(y)}")

# Initialize RandomUnderSampler
undersampler = RandomUnderSampler(random_state=42)

# Apply RandomUnderSampler to the whole dataset
X_resampled, y_resampled = undersampler.fit_resample(X, y)

# Print the new class distribution after undersampling
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Initialize the classifier
clf = RandomForestClassifier(random_state=42)

# Initialize StratifiedKFold with 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics for each fold
accuracies = []
precisions = []
recalls = []
f1s = []
confusion_matrices = []

# Perform cross-validation on the resampled dataset
for train_index, test_index in skf.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Train the classifier on the resampled training data
    clf.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate metrics for this fold
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average='binary', zero_division=0))
    recalls.append(recall_score(y_test, y_pred, average='binary', zero_division=0))
    f1s.append(f1_score(y_test, y_pred, average='binary', zero_division=0))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))

# Print average metrics
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")

# Print average confusion matrix
average_confusion_matrix = np.mean(confusion_matrices, axis=0)
print("\nAverage Confusion Matrix:\n", average_confusion_matrix)


Original class distribution: Counter({0: 6844, 1: 690})
Resampled class distribution: Counter({0: 690, 1: 690})
Average Accuracy: 0.6833
Average Precision: 0.6844
Average Recall: 0.6841
Average F1 Score: 0.6830

Average Confusion Matrix:
 [[47.1 21.9]
 [21.8 47.2]]
