In [None]:
from google.colab import drive
import pandas as pd
import numpy as np
import seaborn as sns
# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import metrics
from sklearn.metrics import accuracy_score, recall_score, make_scorer, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load Data

In [5]:
# Step 1: Data Preparation
# Load your data into X and y variables before running the cells below.
# They are left as placeholders here, so the notebook cannot run end-to-end
# until both are assigned.
# Later cells read X.columns and y.values, so X should be a pandas DataFrame of
# features and y a pandas Series of labels. The sensitivity/specificity cell
# scores with pos_label=1 and pos_label=0, i.e. it expects labels coded 0/1.
# X = ...
# y = ...

In [None]:
# Standardize the features: fit the scaler on X and transform X in one call,
# then wrap the scaled array in a DataFrame that reuses X's column names.
scaler = StandardScaler()
x_norm = scaler.fit_transform(X)
df_x_norm = pd.DataFrame(data=x_norm, columns=X.columns)

# Step 2: Model Training and Cross-Validation


In [None]:
# 10-fold cross-validation with shuffling; the fixed seed keeps the fold
# assignment identical between runs.
kfold = KFold(n_splits=10, random_state=0, shuffle=True)

# L1-penalised logistic regression. The liblinear solver shuffles the data
# internally, so random_state is pinned (matching the other estimators in this
# notebook) to make the scores and coefficients reproducible.
lr = LogisticRegression(solver='liblinear', max_iter=1000, penalty='l1', C=1, random_state=0)

# Evaluate model with cross-validation: mean and standard deviation of the
# per-fold accuracy, printed as percentages.
results = cross_val_score(lr, df_x_norm.values, y.values, scoring='accuracy', cv=kfold)
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean() * 100.0, results.std() * 100.0))

# Fit the model on the full data set to inspect its coefficients
lr.fit(df_x_norm, y)
print(lr.coef_)

# Step 3: Hyperparameter Tuning with GridSearchCV


In [None]:
# Candidate hyperparameters: C from 0.10 up to about 0.30 in steps of 0.01,
# crossed with max_iter from 1000 to 10000 in steps of 1000.
param_dict = {
    "C": np.arange(0.1, 0.31, 0.01),
    'max_iter': range(1000, 11000, 1000),
}
grid_lr = LogisticRegression(penalty='l1', solver='liblinear', random_state=0)

# Exhaustive search over the grid with 10-fold cross-validation, using the
# estimator's default scorer; the best refitted estimator is printed.
clf = GridSearchCV(grid_lr, param_dict, cv=10)
clf.fit(df_x_norm.values, y.values)
print(clf.best_estimator_)

# Step 4: Plotting Regularization Path


In [None]:
def plot_reg_path(c_range):
    """Plot the L1 logistic-regression coefficient path over a range of C values.

    For every value in ``c_range`` the model is refitted on the notebook-level
    ``df_x_norm`` / ``y`` and its flattened coefficients are recorded. Each
    coefficient is then drawn as one line against ``log10(C)``.

    Args:
        c_range: Array-like of positive inverse-regularization strengths (C).
    """
    model = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=0)

    def _coefficients_at(c_value):
        # Refit the shared estimator at this C and snapshot its coefficients.
        model.set_params(C=c_value)
        model.fit(df_x_norm, y)
        return model.coef_.ravel().copy()

    # One row per C value, one column per feature.
    coef_path = np.array([_coefficients_at(c_value) for c_value in c_range])

    log_c = np.log10(c_range)
    plt.plot(log_c, coef_path, marker="o")
    plt.title("Logistic Regression Lasso Regularization Path")
    plt.xlabel("log(C)")
    plt.ylabel("Coefficients")
    plt.axis("tight")
    plt.show()



# Step 5: Plotting Regularization Path with AUC


In [None]:
def plot_reg_path_auc(c_range, cv):
    """Plot mean cross-validated ROC-AUC against the penalty strength 1/C.

    For every value in ``c_range`` an L1-penalised logistic regression is
    scored with ``cross_val_score`` (``scoring='roc_auc'``) on the
    notebook-level ``df_x_norm`` / ``y``, and the mean fold score is plotted
    against ``1 / C``.

    Args:
        c_range: Array-like of positive inverse-regularization strengths (C).
            Lists and tuples are accepted as well as NumPy arrays.
        cv: Cross-validation setting forwarded unchanged to
            ``cross_val_score`` (e.g. a number of folds or a splitter).
    """
    # Convert once up front so that 1 / c_values works for plain Python
    # sequences too, instead of failing after all the CV work has been done.
    c_values = np.asarray(c_range, dtype=float)

    clf = LogisticRegression(penalty="l1", solver="liblinear", max_iter=1000, random_state=0)
    auc_l = []
    for c in c_values:
        clf.set_params(C=c)
        # cross_val_score clones the estimator and fits the clone on each
        # training fold, so a separate fit on the full data is not needed.
        auc = cross_val_score(clf, df_x_norm, y, scoring='roc_auc', cv=cv)
        auc_l.append(auc.mean())

    plt.plot(1 / c_values, auc_l, marker="o")
    plt.xlabel("Penalty")
    plt.ylabel("ROC-AUC")
    plt.title("Logistic Regression Regularization Path")
    plt.axis("tight")
    plt.show()

# Generate plots
# Coefficient paths over seven log-spaced C values from 1e-3 to 1e3.
cs = np.logspace(-3, 3, num=7)
plot_reg_path(c_range=cs)

# Mean 10-fold ROC-AUC for C from 0.10 up to about 1.00 in steps of 0.01.
cs_range = np.arange(0.1, 1.01, 0.01)
plot_reg_path_auc(c_range=cs_range, cv=10)

# Step 6: Logistic Regression with Cross-Validation

In [None]:
# Only one candidate C is supplied, so the model is fitted with C=0.11.
Cs = [0.11]
lr_cv = LogisticRegressionCV(cv=10, solver='liblinear', max_iter=1000, penalty='l1', Cs=Cs, random_state=1314)
lr_cv.fit(df_x_norm, y)

# Calculate and print performance metrics
roc_auc_scores = cross_val_score(lr_cv, df_x_norm, y, scoring='roc_auc', cv=10)
print('The average ROC-AUC is:', round(roc_auc_scores.mean(), 2))
# The scores come from scoring='roc_auc', so report them as ROC-AUC (mean and
# two standard deviations across folds) rather than as accuracy.
print("ROC-AUC: %0.2f (+/- %0.2f)" % (roc_auc_scores.mean(), roc_auc_scores.std() * 2))


# Step 7: Sensitivity and Specificity


In [None]:
# Sensitivity is recall on the class labelled 1; specificity is recall on the
# class labelled 0.
sensitivity = make_scorer(recall_score, pos_label=1)
specificity = make_scorer(recall_score, pos_label=0)

# Score each metric with 10-fold cross-validation and print its mean over folds.
for metric_label, metric_scorer in (("Sensitivity:", sensitivity), ("Specificity:", specificity)):
    fold_scores = cross_val_score(lr_cv, df_x_norm, y, scoring=metric_scorer, cv=10)
    print(metric_label, fold_scores.mean())


# Step 8: Feature Importance Visualization


In [None]:
# Pair each feature name with its fitted coefficient, then keep only the
# features whose coefficient is non-zero.
df_coef_nozero = pd.DataFrame({'FeatureName': X.columns, 'Coefficient': lr_cv.coef_.ravel()})
df_coef_nozero = df_coef_nozero.loc[df_coef_nozero['Coefficient'].ne(0)]

# Bar chart of the remaining coefficients, one bar per feature.
plt.figure(figsize=(15, 10))
sns.barplot(data=df_coef_nozero, x='FeatureName', y='Coefficient', palette="GnBu_d")
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('Coefficient')
plt.xticks(rotation=25)
plt.show()