In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

In [3]:
# 1. Load Dataset
file_path = r'C:\Users\thega\Downloads\CreditDataset.csv'  # Updated file path
data = pd.read_csv(file_path)

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\thega\\Downloads\\CreditDataset.csv'

In [None]:
# 2. Data Exploration
print("Dataset Info:")
print(data.info())
print("\nFirst 5 rows:")
print(data.head())

In [None]:
# 3. Preprocessing
# Handling missing values (example: fillna or dropna)
data = data.dropna()  # Drop rows with missing values (adjust as needed)

In [None]:
# Encoding categorical columns if present
for col in data.select_dtypes(include='object').columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [None]:
# Separating features (X) and target (y)
target_column = 'target_column_name'  # Replace with your target column
X = data.drop(target_column, axis=1)
y = data[target_column]

In [None]:
# Splitting into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 4. Model Definitions and Training
models = {
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "k-NN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [None]:
# 5. Evaluation Metrics and Training
all_metrics = []  # To store results for each model

plt.figure(figsize=(10, 8))  # For ROC curve

for model_name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    # Predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else None

    # Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None

    # Print Metrics
    print(f"\nModel: {model_name}")
    print(f"Accuracy: {acc:.2f}")
    print(f"Precision: {prec:.2f}")
    print(f"Recall: {rec:.2f}")
    print(f"F1 Score: {f1:.2f}")
    print(f"AUC-ROC: {auc:.2f}" if auc is not None else "AUC-ROC: Not Available")

    # Store Metrics for Comparison
    all_metrics.append({
        "Model": model_name,
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1,
        "AUC-ROC": auc
    })

    # ROC Curve
    if y_proba is not None:
        fpr, tpr, thresholds = roc_curve(y_test, y_proba)
        plt.plot(fpr, tpr, label=f"{model_name} (AUC = {auc:.2f})")

# Plot ROC Curve
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve Comparison")
plt.legend()
plt.show()

In [None]:
# 6. Display Comparison Table
metrics_df = pd.DataFrame(all_metrics)
print("\nModel Performance Comparison:")
print(metrics_df)