In [None]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, roc_curve, auc
from sklearn.model_selection import KFold, cross_val_score, ShuffleSplit
from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression


In [None]:
# Build a reproducible toy problem: 1000 rows, 5 features, 2 classes.
X, y = make_classification(
    n_samples=1000,
    n_features=5,
    n_classes=2,
    random_state=42,
)

# Holdout method: keep 20% of the rows aside for evaluation, train on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train the logistic-regression classifier on the training portion only.
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred = lr.predict(X_test)


In [None]:
# Unpack the 2x2 confusion matrix (rows = actual class, columns = predicted class)
# into its four cells.
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
n_test = len(y_test)

# Accuracy and its complement, the error rate.
accuracy = accuracy_score(y_test, y_pred)
error_rate = 1 - accuracy

# Rates derived directly from the confusion-matrix cells.
# Recall is the same quantity as sensitivity (true-positive rate).
sensitivity = tp / (tp + fn)
recall = sensitivity
specificity = tn / (tn + fp)
precision = tp / (tp + fp)

# F-measure (harmonic mean of precision and recall).
f_measure = f1_score(y_test, y_pred)

# Kappa statistic: agreement between predictions and labels, corrected for the
# agreement expected by chance given the marginal class frequencies.
observed_agreement = (tp + tn) / n_test
chance_positive = ((tp + fp) / n_test) * ((tp + fn) / n_test)
chance_negative = ((tn + fp) / n_test) * ((tn + fn) / n_test)
chance_agreement = chance_positive + chance_negative
kappa = (observed_agreement - chance_agreement) / (1 - chance_agreement)

# ROC curve and AUC, computed from the predicted probability of the positive class.
positive_scores = lr.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

# Report every metric.
print("Confusion Matrix:\n", np.array([[tn, fp], [fn, tp]]))
print("Accuracy: {:.3f}".format(accuracy))
print("Error Rate: {:.3f}".format(error_rate))
print("Kappa Statistic: {:.3f}".format(kappa))
print("Sensitivity: {:.3f}".format(sensitivity))
print("Specificity: {:.3f}".format(specificity))
print("Precision: {:.3f}".format(precision))
print("Recall: {:.3f}".format(recall))
print("F-measure: {:.3f}".format(f_measure))
print("AUC: {:.3f}".format(roc_auc))



In [None]:
# Estimate accuracy with 5-fold cross-validation.
# Fold-local names are used on purpose: reusing X_train / X_test / lr / y_pred here
# would overwrite the holdout split and model, and every later cell would then
# silently operate on whatever the last fold left behind.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
kfold_scores = []
for train_index, test_index in kf.split(X):
    fold_model = LogisticRegression(random_state=42).fit(X[train_index], y[train_index])
    fold_accuracy = accuracy_score(y[test_index], fold_model.predict(X[test_index]))
    kfold_scores.append(fold_accuracy)
    print("Accuracy: {:.2f}".format(fold_accuracy))

# The cross-validated estimate is the average over the folds, not any single fold.
print("Mean accuracy over {} folds: {:.3f} (+/- {:.3f})".format(
    len(kfold_scores), np.mean(kfold_scores), np.std(kfold_scores)))

In [None]:
# Bootstrap estimate of accuracy.
# Each iteration fits the model on a resample drawn with replacement and scores it
# on the out-of-bag rows (the rows the resample did not pick). Scoring on a fixed
# subset of the same data the resample is drawn from would let training rows leak
# into the evaluation and inflate the scores.
n_bootstraps = 1000
bootstrapped_scores = []
rng = np.random.RandomState(42)
all_rows = np.arange(len(y))
for i in range(n_bootstraps):
    indices = rng.randint(0, len(y), len(y))
    X_bootstrapped = X[indices]
    y_bootstrapped = y[indices]
    oob_rows = np.setdiff1d(all_rows, indices)
    # Skip degenerate draws: nothing left to score on, or a single class to fit on.
    if oob_rows.size == 0 or np.unique(y_bootstrapped).size < 2:
        continue
    boot_model = LogisticRegression(random_state=42).fit(X_bootstrapped, y_bootstrapped)
    bootstrapped_scores.append(accuracy_score(y[oob_rows], boot_model.predict(X[oob_rows])))

# Compute the bootstrap confidence interval.
# A 95% percentile interval leaves 2.5% in each tail (5% / 95% would be a 90% interval).
sorted_scores = np.sort(np.array(bootstrapped_scores))
lower_bound, upper_bound = np.percentile(sorted_scores, [2.5, 97.5])
print("Bootstrap confidence interval (95%): [{:.3f}, {:.3f}]".format(lower_bound, upper_bound))


In [None]:
# Repeated k-fold cross-validation: run 5-fold CV 10 times, reshuffling the rows
# before each repeat so that every repeat partitions the data differently.
n_splits = 5
n_repeats = 10
repeated_scores = []
for i in range(n_repeats):
    # The seed is derived from the repeat index: results stay reproducible, but the
    # splits differ between repeats. Re-splitting one splitter that has a fixed
    # integer seed would replay the exact same splits n_repeats times.
    rs = KFold(n_splits=n_splits, shuffle=True, random_state=42 + i)
    for train_index, test_index in rs.split(X):
        # Fold-local names keep the holdout split and model from earlier intact.
        fold_model = LogisticRegression(random_state=42).fit(X[train_index], y[train_index])
        fold_accuracy = accuracy_score(y[test_index], fold_model.predict(X[test_index]))
        repeated_scores.append(fold_accuracy)
        print("Accuracy: {:.2f}".format(fold_accuracy))

# Summarise all n_repeats * n_splits fold scores.
print("Mean accuracy over {} fits: {:.3f} (+/- {:.3f})".format(
    len(repeated_scores), np.mean(repeated_scores), np.std(repeated_scores)))

# Upsample the minority class using bootstrap sampling.
# Work out which label is actually the smaller class instead of assuming it is 1;
# otherwise a dataset whose minority is label 0 would have its larger class shrunk.
# On a tie, label 1 is the class that gets resampled.
n_positive = np.sum(y == 1)
n_negative = np.sum(y == 0)
minority_label, majority_label = (1, 0) if n_positive <= n_negative else (0, 1)

X_minority = X[y == minority_label]
y_minority = y[y == minority_label]
X_majority = X[y == majority_label]
y_majority = y[y == majority_label]

# Draw minority rows with replacement until both classes have the same size.
n_samples = len(y_majority)
X_resampled, y_resampled = resample(X_minority, y_minority, replace=True, n_samples=n_samples, random_state=42)
X_balanced = np.concatenate((X_majority, X_resampled))
y_balanced = np.concatenate((y_majority, y_resampled))


In [None]:
import matplotlib.pyplot as plt
# plot_confusion_matrix was removed from scikit-learn (1.2); ConfusionMatrixDisplay
# is the supported replacement.
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix for the current model on the current test split.
disp = ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, lr.predict(X_test)),
    display_labels=lr.classes_,
).plot()
disp.ax_.set_title("Confusion Matrix")
plt.show()

# Recompute the ROC curve from the same model and test split as the confusion
# matrix above, so both figures describe the same classifier and data.
plot_fpr, plot_tpr = roc_curve(y_test, lr.predict_proba(X_test)[:, 1])[:2]
plot_auc = auc(plot_fpr, plot_tpr)

# Plot the ROC curve and AUC
plt.figure()
plt.plot(plot_fpr, plot_tpr, color='darkorange', lw=2, label='ROC curve (AUC = %0.2f)' % plot_auc)
# Diagonal reference line: the ROC of a classifier that guesses at random.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()