In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt


# Read the raw table from the CSV file in the working directory.
data = pd.read_csv('your_dataset.csv')


# Feature matrix: every column except the label column 'target'.
X = data.drop('target', axis=1)
# Label vector: the 'target' column on its own.
y = data.loc[:, 'target']


In [None]:
# Preprocessing: separate numerical and categorical columns, scale the former,
# one-hot encode the latter, and stack both into one dense feature matrix.
num_cols = X.select_dtypes(include=np.number).columns
cat_cols = X.select_dtypes(include='object').columns

# NOTE(review): both transformers below are fitted on the full dataset, and the
# train/test split happens in a later cell, so test-set statistics leak into the
# features. Consider splitting first and fitting on the training rows only.

# Scale numerical columns (skipped when there are none, because fitting a
# scaler on zero features raises an error).
scaler = StandardScaler()
if len(num_cols) > 0:
    X_num = scaler.fit_transform(X[num_cols])
else:
    X_num = np.empty((len(X), 0))

# One-hot encode categorical columns, dropping the first level of each.
# The dense-output flag was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the new
# spelling first and fall back for older installations.
try:
    encoder = OneHotEncoder(drop='first', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(drop='first', sparse=False)

# An all-numeric dataset has no object columns; skip encoding in that case.
if len(cat_cols) > 0:
    X_cat = encoder.fit_transform(X[cat_cols])
else:
    X_cat = np.empty((len(X), 0))

# Combine processed data: scaled numeric columns first, then the dummy columns.
X_processed = np.hstack([X_num, X_cat])


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Fit a logistic regression classifier with default settings on the training
# split (fit returns the estimator itself, so it can be bound directly).
model = LogisticRegression().fit(X_train, y_train)

# Score the held-out rows: keep column 1 of predict_proba, i.e. the estimated
# probability of the class stored at model.classes_[1].
y_probs = model.predict_proba(X_test)[:, 1]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Sweep the decision threshold from 0.1 to 0.9 and record accuracy, precision
# and recall of the thresholded predictions at each step.
# linspace is used instead of a float-step arange so the number of grid points
# (9) is explicit and cannot drift with floating-point rounding.
thresholds = np.linspace(0.1, 0.9, 9)
accuracies, precisions, recalls = [], [], []

# Loop through thresholds
for t in thresholds:
    # A sample is predicted positive when its score reaches the threshold.
    y_pred_custom = (y_probs >= t).astype(int)
    accuracies.append(accuracy_score(y_test, y_pred_custom))
    # At high thresholds there may be no predicted positives, which makes
    # precision undefined; zero_division=0 reports 0 explicitly instead of
    # emitting an UndefinedMetricWarning on every such iteration.
    precisions.append(precision_score(y_test, y_pred_custom, zero_division=0))
    recalls.append(recall_score(y_test, y_pred_custom, zero_division=0))

# Plot metrics vs thresholds
plt.figure(figsize=(10, 6))
plt.plot(thresholds, accuracies, label='Accuracy')
plt.plot(thresholds, precisions, label='Precision')
plt.plot(thresholds, recalls, label='Recall')
plt.xlabel('Threshold')
plt.ylabel('Metrics')
plt.title('Metrics vs Thresholds')
plt.legend()
plt.show()


In [None]:
# ROC curve of the test-set scores, with the AUC reported in the legend.
auc_label = 'ROC Curve (AUC = {:.2f})'.format(roc_auc_score(y_test, y_probs))
fpr, tpr, _ = roc_curve(y_test, y_probs)

plt.plot(fpr, tpr, label=auc_label)
plt.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a visual reference
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()


In [None]:
# Test Logistic Regression on each feature: fit a one-column model per feature
# and report its ROC AUC on the held-out rows.
#
# This cell uses its own variable names and a fresh estimator on purpose. It
# must not overwrite X_train / X_test / y_train / y_test / y_probs or refit the
# shared `model`, otherwise re-running an earlier cell afterwards would
# silently use single-feature results.

# The row partition depends only on the number of rows, test_size and the seed,
# so split the full matrix once and slice columns inside the loop instead of
# re-splitting on every iteration.
X_train_all, X_test_all, y_train_feat, y_test_feat = train_test_split(
    X_processed, y, test_size=0.2, random_state=42
)

for col in range(X_processed.shape[1]):
    # New, unfitted estimator with the same hyperparameters as `model`.
    feature_model = LogisticRegression(**model.get_params())
    # Indexing with [col] keeps the 2-D (n_samples, 1) shape the estimator expects.
    feature_model.fit(X_train_all[:, [col]], y_train_feat)
    feature_probs = feature_model.predict_proba(X_test_all[:, [col]])[:, 1]
    print(f"Feature {col + 1}: ROC AUC = {roc_auc_score(y_test_feat, feature_probs):.2f}")
