In [1]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from collections import Counter
import numpy as np

# Load the dataset from the 'result.xlsx' file
file_path = '../result.xlsx'  # Replace with your actual file path
data = pd.read_excel(file_path)

# Assuming the target variable is in the last column
X = data.iloc[:, :-1]  # Features
y = data.iloc[:, -1]   # Target variable

# Print the original class distribution
print(f"Original class distribution: {Counter(y)}")

# Initialize Random Undersampler
rus = RandomUnderSampler(random_state=42)

# Apply Random Undersampling to the original dataset
X_resampled, y_resampled = rus.fit_resample(X, y)

# Convert resampled data back to DataFrame for compatibility with indexing
X_resampled = pd.DataFrame(X_resampled, columns=X.columns)
y_resampled = pd.Series(y_resampled, name=y.name)

# Print the resampled class distribution
print(f"Resampled class distribution: {Counter(y_resampled)}")

# Initialize the Logistic Regression classifier
clf = LogisticRegression(random_state=42)

# Initialize StratifiedKFold with 10 folds
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Store metrics for each fold
accuracies = []
precisions = []
recalls = []
f1s = []
confusion_matrices = []

# Perform cross-validation
for train_index, test_index in skf.split(X_resampled, y_resampled):
    X_train, X_test = X_resampled.iloc[train_index], X_resampled.iloc[test_index]
    y_train, y_test = y_resampled.iloc[train_index], y_resampled.iloc[test_index]
    
    # Train the Logistic Regression classifier
    clf.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = clf.predict(X_test)
    
    # Calculate metrics for this fold
    accuracies.append(accuracy_score(y_test, y_pred))
    precisions.append(precision_score(y_test, y_pred, average='binary', zero_division=0))
    recalls.append(recall_score(y_test, y_pred, average='binary', zero_division=0))
    f1s.append(f1_score(y_test, y_pred, average='binary', zero_division=0))
    confusion_matrices.append(confusion_matrix(y_test, y_pred))

# Print average metrics
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average Precision: {np.mean(precisions):.4f}")
print(f"Average Recall: {np.mean(recalls):.4f}")
print(f"Average F1 Score: {np.mean(f1s):.4f}")

# Print average confusion matrix
average_confusion_matrix = np.mean(confusion_matrices, axis=0)
print("\nAverage Confusion Matrix:\n", average_confusion_matrix)


Original class distribution: Counter({0: 6844, 1: 690})
Resampled class distribution: Counter({0: 690, 1: 690})


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy: 0.6870
Average Precision: 0.7073
Average Recall: 0.6406
Average F1 Score: 0.6710

Average Confusion Matrix:
 [[50.6 18.4]
 [24.8 44.2]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt