In [1]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from collections import Counter
import numpy as np

# Load the dataset from the 'result.xlsx' file
file_path = '../result.xlsx'  # Replace with your actual file path
data = pd.read_excel(file_path)

# Assuming the target variable is in the last column
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# Print the original class distribution
print(f"Original class distribution: {Counter(y)}")

# Combine features and target for processing
data_combined = pd.concat([X, y], axis=1)

# Group by feature columns and filter records based on the rule
filtered_records = []
counter = 0

# Iterate through groups to apply the filtering rule
for name, group in data_combined.groupby(list(X.columns)):
    bug_counts = group[y.name].value_counts()
    if len(bug_counts) > 1:
        # Determine the class with the larger cardinality
        larger_class = bug_counts.idxmax()
        # Filter out entries not in the larger class
        filtered_group = group[group[y.name] == larger_class]
        filtered_records.append(filtered_group)
        counter += 1
    else:
        filtered_records.append(group)

# Combine all filtered records back into a single DataFrame
filtered_data = pd.concat(filtered_records)

# Print the total number of groups where filtering was applied
print(f"Total number of groups where filtering was applied: {counter}")

# Extract features and target from the filtered data
X_filtered = filtered_data.iloc[:, :-1]
y_filtered = filtered_data.iloc[:, -1]

# Print the new class distribution after filtering
print(f"Filtered class distribution: {Counter(y_filtered)}")

# Initialize SMOTE and apply it once to the entire filtered dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_filtered, y_filtered)

# Print the resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Initialize the classifier
clf = RandomForestClassifier(random_state=42)

# Initialize StratifiedKFold with 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics for each fold
accuracies = []
precisions = []
recalls = []
f1s = []
confusion_matrices = []

# Perform cross-validation on the resampled dataset
for train_index, test_index in skf.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Train the classifier
    clf.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate metrics for this fold
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average='binary', zero_division=0))
    recalls.append(recall_score(y_test, y_pred, average='binary', zero_division=0))
    f1s.append(f1_score(y_test, y_pred, average='binary', zero_division=0))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))

# Print average metrics
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")

# Print average confusion matrix
average_confusion_matrix = np.mean(confusion_matrices, axis=0)
print("\nAverage Confusion Matrix:\n", average_confusion_matrix)


Original class distribution: Counter({0: 6844, 1: 690})
Total number of groups where filtering was applied: 118
Filtered class distribution: Counter({0: 6776, 1: 640})
Resampled class distribution: Counter({0: 6776, 1: 6776})
Average Accuracy: 0.9458
Average Precision: 0.9313
Average Recall: 0.9628
Average F1 Score: 0.9468

Average Confusion Matrix:
 [[629.4  48.2]
 [ 25.2 652.4]]
