In [5]:
import os
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, auc
)
import joblib

In [8]:
# ---------------------- User variables ----------------------
CSV_PATH = "german_credit_data.csv"  # change if needed
RANDOM_STATE = 42
TEST_SIZE = 0.2
RESULTS_DIR = "results"
os.makedirs(RESULTS_DIR, exist_ok=True)

In [9]:
# ---------------------- Load data ----------------------
df = pd.read_csv(CSV_PATH)
print("Loaded data shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nFirst 5 rows:\n", df.head())

Loaded data shape: (1000, 10)
Columns: ['Unnamed: 0', 'Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Credit amount', 'Duration', 'Purpose']

First 5 rows:
    Unnamed: 0  Age     Sex  Job Housing Saving accounts Checking account  \
0           0   67    male    2     own             NaN           little   
1           1   22  female    2     own          little         moderate   
2           2   49    male    1     own          little              NaN   
3           3   45    male    2    free          little           little   
4           4   53    male    2    free          little           little   

   Credit amount  Duration              Purpose  
0           1169         6             radio/TV  
1           5951        48             radio/TV  
2           2096        12            education  
3           7882        42  furniture/equipment  
4           4870        24                  car  


In [10]:
# ---------------------- Identify target ----------------------
# Common Kaggle column name: 'Risk' with values like 'good'/'bad'
# If your dataset uses a different target column, change TARGET_COL accordingly.
TARGET_COL = None
for candidate in ["Risk", "risk", "Target", "target", "Class", "class"]:
    if candidate in df.columns:
        TARGET_COL = candidate
        break
if TARGET_COL is None:
    # fallback: try to infer last column as target
    TARGET_COL = df.columns[-1]

print(f"\nUsing target column: {TARGET_COL}")

# Map target to binary 0/1 if needed
if df[TARGET_COL].dtype == object:
    unique_vals = df[TARGET_COL].unique()
    print("Target unique values:", unique_vals)
    # Map 'good'->0, 'bad'->1 if present
    if set(map(str.lower, unique_vals)) >= {"good", "bad"}:
        df[TARGET_COL] = df[TARGET_COL].map(lambda x: 1 if str(x).lower() == "bad" else 0)
    else:
        # generic mapping: most frequent -> 0, others -> 1
        most_freq = df[TARGET_COL].value_counts().idxmax()
        df[TARGET_COL] = (df[TARGET_COL] != most_freq).astype(int)
else:
    # If numeric already, ensure binary 0/1
    unique_vals = sorted(df[TARGET_COL].unique())
    if len(unique_vals) > 2:
        print("Warning: Target appears to have >2 classes; this script is for binary classification.")
    # if values other than 0/1 exist, map min->0, max->1
    if set(unique_vals) != {0,1} and len(unique_vals) == 2:
        mapping = {unique_vals[0]: 0, unique_vals[1]: 1}
        df[TARGET_COL] = df[TARGET_COL].map(mapping)

print("\nTarget value counts:\n", df[TARGET_COL].value_counts())


Using target column: Purpose
Target unique values: ['radio/TV' 'education' 'furniture/equipment' 'car' 'business'
 'domestic appliances' 'repairs' 'vacation/others']

Target value counts:
 Purpose
1    663
0    337
Name: count, dtype: int64


In [11]:
# ---------------------- Feature separation ----------------------
# Remove target & ID-like columns if present
possible_id_cols = [c for c in df.columns if c.lower() in ("id", "index")]
X = df.drop(columns=[TARGET_COL] + possible_id_cols)
y = df[TARGET_COL].copy()

# Detect numeric vs categorical:
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# Some integer-coded categorical features (small number of unique values) -> treat as categorical
for col in numeric_cols[:]:  # iterate over a copy
    if X[col].nunique() <= 10 and X[col].dtype in [np.int64, np.int32, np.int16, np.int8]:
        cat_cols.append(col)
        numeric_cols.remove(col)

print("\nNumeric columns:", numeric_cols)
print("Categorical columns:", cat_cols)



Numeric columns: ['Unnamed: 0', 'Age', 'Credit amount', 'Duration']
Categorical columns: ['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Job']


In [13]:
# ---------------------- Preprocessing pipelines ----------------------
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="MISSING")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, cat_cols)
], remainder="drop")

# ---------------------- Train/test split ----------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)
print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")


Train shape: (800, 9), Test shape: (200, 9)


In [14]:
# ---------------------- Helper: evaluate model ----------------------
def evaluate_model(name, model, X_test, y_test, show_plots=True):
    y_pred = model.predict(X_test)
    y_prob = None
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_prob = model.decision_function(X_test)
    else:
        y_prob = np.zeros(len(y_test))

    pr = precision_score(y_test, y_pred, zero_division=0)
    rc = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_prob)

    print(f"\n=== {name} ===")
    print("Precision: {:.4f}, Recall: {:.4f}, F1: {:.4f}, ROC-AUC: {:.4f}".format(pr, rc, f1, roc_auc))
    print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    if show_plots:
        fpr, tpr, _ = roc_curve(y_test, y_prob)
        plt.figure(figsize=(6,5))
        plt.plot(fpr, tpr, label=f"{name} (AUC = {roc_auc:.3f})")
        plt.plot([0,1],[0,1], linestyle="--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve â€” {name}")
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(os.path.join(RESULTS_DIR, f"roc_{name.replace(' ','_')}.png"))
        plt.close()

    return {"precision": pr, "recall": rc, "f1": f1, "roc_auc": roc_auc}

In [15]:
# ---------------------- Models to train ----------------------
models = {
    "LogisticRegression": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("clf", LogisticRegression(max_iter=2000, random_state=RANDOM_STATE, solver="liblinear"))
    ]),
    "DecisionTree": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("clf", DecisionTreeClassifier(random_state=RANDOM_STATE))
    ]),
    "RandomForest": Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("clf", RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1))
    ])
}

trained_models = {}
results = {}


In [16]:
# ---------------------- Train and evaluate ----------------------
for name, pipeline in models.items():
    print(f"\nTraining: {name} ...")
    pipeline.fit(X_train, y_train)
    trained_models[name] = pipeline
    # Evaluate on test set
    results[name] = evaluate_model(name, pipeline, X_test, y_test, show_plots=True)

# Save evaluation summary to CSV
pd.DataFrame(results).T.to_csv(os.path.join(RESULTS_DIR, "model_results_summary.csv"))
print("\nSaved results summary to:", os.path.join(RESULTS_DIR, "model_results_summary.csv"))


Training: LogisticRegression ...

=== LogisticRegression ===
Precision: 0.6910, Recall: 0.9248, F1: 0.7910, ROC-AUC: 0.5868

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.18      0.27        67
           1       0.69      0.92      0.79       133

    accuracy                           0.68       200
   macro avg       0.62      0.55      0.53       200
weighted avg       0.64      0.68      0.62       200

Confusion Matrix:
 [[ 12  55]
 [ 10 123]]

Training: DecisionTree ...

=== DecisionTree ===
Precision: 0.7111, Recall: 0.7218, F1: 0.7164, ROC-AUC: 0.5699

Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.42      0.42        67
           1       0.71      0.72      0.72       133

    accuracy                           0.62       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.62      0.62      0.62       200

Confusion Matrix

In [17]:
# ---------------------- Train and evaluate ----------------------
for name, pipeline in models.items():
    print(f"\nTraining: {name} ...")
    pipeline.fit(X_train, y_train)
    trained_models[name] = pipeline
    # Evaluate on test set
    results[name] = evaluate_model(name, pipeline, X_test, y_test, show_plots=True)

# Save evaluation summary to CSV
pd.DataFrame(results).T.to_csv(os.path.join(RESULTS_DIR, "model_results_summary.csv"))
print("\nSaved results summary to:", os.path.join(RESULTS_DIR, "model_results_summary.csv"))


Training: LogisticRegression ...

=== LogisticRegression ===
Precision: 0.6910, Recall: 0.9248, F1: 0.7910, ROC-AUC: 0.5868

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.18      0.27        67
           1       0.69      0.92      0.79       133

    accuracy                           0.68       200
   macro avg       0.62      0.55      0.53       200
weighted avg       0.64      0.68      0.62       200

Confusion Matrix:
 [[ 12  55]
 [ 10 123]]

Training: DecisionTree ...

=== DecisionTree ===
Precision: 0.7111, Recall: 0.7218, F1: 0.7164, ROC-AUC: 0.5699

Classification Report:
               precision    recall  f1-score   support

           0       0.43      0.42      0.42        67
           1       0.71      0.72      0.72       133

    accuracy                           0.62       200
   macro avg       0.57      0.57      0.57       200
weighted avg       0.62      0.62      0.62       200

Confusion Matrix

In [18]:
# ---------------------- Feature importance (Random Forest) ----------------------
# We need feature names after preprocessing:
rf_pipeline = trained_models.get("RandomForest")
if rf_pipeline is not None:
    pre = rf_pipeline.named_steps["preprocessor"]
    # numeric feature names
    num_features = numeric_cols
    # categorical feature names after one-hot:
    ohe = pre.named_transformers_["cat"].named_steps["onehot"]
    # OneHotEncoder may not be present if no categorical columns; handle that
    cat_feature_names = []
    if cat_cols:
        # build categories list
        categories = ohe.categories_
        for col, cats in zip(cat_cols, categories):
            cat_feature_names += [f"{col}__{str(c)}" for c in cats]

    feature_names = num_features + cat_feature_names

    rf = rf_pipeline.named_steps["clf"]
    importances = rf.feature_importances_
    fi = pd.Series(importances, index=feature_names).sort_values(ascending=False)
    print("\nTop 20 feature importances (Random Forest):\n", fi.head(20))
    fi.head(50).to_csv(os.path.join(RESULTS_DIR, "rf_feature_importances.csv"))
    print("Saved RF feature importances to:", os.path.join(RESULTS_DIR, "rf_feature_importances.csv"))

    # Plot top 15
    plt.figure(figsize=(8,6))
    fi.head(15).sort_values().plot(kind="barh")
    plt.title("Random Forest - Top 15 feature importances")
    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, "rf_top15_importances.png"))
    plt.close()


Top 20 feature importances (Random Forest):
 Credit amount                  0.213254
Unnamed: 0                     0.186137
Age                            0.164982
Duration                       0.129676
Checking account__little       0.024020
Job__2                         0.022395
Saving accounts__little        0.021771
Checking account__MISSING      0.021134
Housing__own                   0.021129
Checking account__moderate     0.019999
Sex__male                      0.019587
Saving accounts__MISSING       0.019019
Sex__female                    0.018103
Saving accounts__moderate      0.017940
Job__1                         0.017909
Job__3                         0.017230
Housing__free                  0.015189
Housing__rent                  0.014746
Saving accounts__quite rich    0.010976
Checking account__rich         0.009663
dtype: float64
Saved RF feature importances to: results/rf_feature_importances.csv


In [19]:
# ---------------------- Coefficients (Logistic Regression) ----------------------
lr_pipeline = trained_models.get("LogisticRegression")
if lr_pipeline is not None:
    # get feature names same as above using the preprocessor
    pre = lr_pipeline.named_steps["preprocessor"]
    num_features = numeric_cols
    cat_feature_names = []
    if cat_cols:
        ohe = pre.named_transformers_["cat"].named_steps["onehot"]
        categories = ohe.categories_
        for col, cats in zip(cat_cols, categories):
            cat_feature_names += [f"{col}__{str(c)}" for c in cats]
    feature_names = num_features + cat_feature_names

    lr = lr_pipeline.named_steps["clf"]
    coef = lr.coef_.ravel()
    coef_series = pd.Series(coef, index=feature_names).sort_values()
    print("\nTop positive coefficients (Logistic Regression):\n", coef_series.tail(10))
    print("\nTop negative coefficients (Logistic Regression):\n", coef_series.head(10))
    coef_series.to_csv(os.path.join(RESULTS_DIR, "lr_coefficients.csv"))
    print("Saved LR coefficients to:", os.path.join(RESULTS_DIR, "lr_coefficients.csv"))


Top positive coefficients (Logistic Regression):
 Job__3                         0.094614
Sex__female                    0.132497
Checking account__moderate     0.209732
Job__1                         0.216020
Saving accounts__little        0.282796
Job__2                         0.297910
Duration                       0.357214
Saving accounts__quite rich    0.364479
Checking account__rich         0.386537
Housing__own                   0.425232
dtype: float64

Top negative coefficients (Logistic Regression):
 Job__0                      -0.499224
Credit amount               -0.433417
Housing__free               -0.401975
Checking account__little    -0.364665
Saving accounts__rich       -0.220266
Saving accounts__moderate   -0.176320
Saving accounts__MISSING    -0.141369
Checking account__MISSING   -0.122284
Age                         -0.099309
Unnamed: 0                  -0.077771
dtype: float64
Saved LR coefficients to: results/lr_coefficients.csv


In [20]:
# ---------------------- Hyperparameter tuning example (Random Forest) ----------------------
print("\nStarting a quick GridSearchCV on RandomForest (small grid)...")
param_grid = {
    "clf__n_estimators": [100, 200],
    "clf__max_depth": [None, 6, 12],
    "clf__min_samples_split": [2, 5]
}
grid = GridSearchCV(trained_models["RandomForest"], param_grid, cv=5, scoring="roc_auc", n_jobs=-1, verbose=0)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Best CV ROC-AUC:", grid.best_score_)
best_rf = grid.best_estimator_
# evaluate tuned model
results["RandomForest_tuned"] = evaluate_model("RandomForest_tuned", best_rf, X_test, y_test, show_plots=True)



Starting a quick GridSearchCV on RandomForest (small grid)...
Best params: {'clf__max_depth': 12, 'clf__min_samples_split': 2, 'clf__n_estimators': 100}
Best CV ROC-AUC: 0.6225716282320056

=== RandomForest_tuned ===
Precision: 0.7029, Recall: 0.9248, F1: 0.7987, ROC-AUC: 0.5864

Classification Report:
               precision    recall  f1-score   support

           0       0.60      0.22      0.33        67
           1       0.70      0.92      0.80       133

    accuracy                           0.69       200
   macro avg       0.65      0.57      0.56       200
weighted avg       0.67      0.69      0.64       200

Confusion Matrix:
 [[ 15  52]
 [ 10 123]]


In [21]:
# ---------------------- Save best model to disk ----------------------
# Choose final model by highest test ROC-AUC found above
best_model_name = max(results.items(), key=lambda t: t[1]["roc_auc"])[0]
print("\nBest model by test ROC-AUC:", best_model_name)
best_model_obj = trained_models.get(best_model_name, None)
if best_model_name == "RandomForest_tuned":
    best_model_obj = best_rf

if best_model_obj is not None:
    joblib.dump(best_model_obj, os.path.join(RESULTS_DIR, f"{best_model_name}.joblib"))
    print("Saved best model to:", os.path.join(RESULTS_DIR, f"{best_model_name}.joblib"))

print("\nAll done. Check the results folder for saved plots, CSVs, and the model.")


Best model by test ROC-AUC: RandomForest
Saved best model to: results/RandomForest.joblib

All done. Check the results folder for saved plots, CSVs, and the model.
