## Surrogate Model for Own Blackbox: Logistic Regression

In [1]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, r2_score
import pandas as pd

In [8]:
df = pd.read_csv('../Data/black_box_2_data.csv')

# Define feature set (X) and target (y)
X = df.drop(columns=['ID', 'Pred_default (y_hat)', 'Default (y)', 'PD', 'Group'])  # Dropping non-feature columns
y = df['Pred_default (y_hat)']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=5000)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_rep)

# Inspect feature importance (coefficients in logistic regression)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Logistic Regression Coefficients):\n", feature_importance)

Accuracy: 0.83
Precision: 0.54
Recall: 0.15
F1 Score: 0.24

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.97      0.90      1220
           1       0.54      0.15      0.24       266

    accuracy                           0.83      1486
   macro avg       0.69      0.56      0.57      1486
weighted avg       0.79      0.83      0.78      1486


Feature Importance (Logistic Regression Coefficients):
            Feature  Coefficient
7     Credit event     1.525902
6  Monthly payment     0.566443
4     Down payment     0.337965
5    Loan duration     0.006958
3   Funding amount     0.000274
2        Car price    -0.000238
1              Age    -0.005337
0       Job tenure    -0.029138
8          Married    -0.440746
9        Homeowner    -0.757404


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Surrogate Model for Own Blackbox: Linear Regression

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define feature set (X) and target (y, in this case PD)
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])  # Dropping non-feature columns
y = df['PD']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
lin_reg = LinearRegression()

# Train the model
lin_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lin_reg.predict(X_test)

# Performance metrics for regression
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lin_reg.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Linear Regression Coefficients):\n", feature_importance)


Mean Squared Error: 0.1171
R^2 Score: 0.1468

Feature Importance (Linear Regression Coefficients):
            Feature  Coefficient
6  Monthly payment     0.317430
7     Credit event     0.302075
4     Down payment     0.083060
5    Loan duration     0.001265
3   Funding amount     0.000026
2        Car price    -0.000021
1              Age    -0.001245
0       Job tenure    -0.002829
8          Married    -0.060669
9        Homeowner    -0.082519


## Surrogate Model for Own Blackbox: L1 Logistic Regression

In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [9]:
df

Unnamed: 0,ID,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner,Default (y),Pred_default (y_hat),PD,Group
0,1,34,55,4875,3087,0,36,0.047895,0,1,1,0,0,0.001356,1
1,2,5,29,13000,13000,0,60,0.091667,0,0,0,1,0,0.234995,0
2,3,14,38,17190,14190,0,60,0.088235,0,0,0,0,0,0.000045,0
3,4,16,37,22773,23568,0,48,0.110084,0,1,1,0,0,0.000017,1
4,5,1,61,7700,8526,0,48,0.123404,0,1,0,1,1,0.981289,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7425,7426,9,38,4502,4856,0,48,0.083846,1,0,0,1,1,0.997078,0
7426,7427,1,33,22900,17900,0,60,0.098000,0,0,0,0,0,0.000179,0
7427,7428,3,27,25858,25972,0,24,0.181667,0,0,0,0,0,0.025598,0
7428,7429,6,32,12300,11300,0,72,0.091963,0,0,1,0,0,0.003357,0


In [5]:
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])
y = df['Pred_default (y_hat)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter = 5000)  # 'saga' solver supports L1 penalty
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.8250336473755047


In [7]:
feature_names = X_train.columns if isinstance(X_train, pd.DataFrame) else [f'feature_{i}' for i in range(X_train.shape[1])]

# Extracting coefficients
coefficients = model.coef_.flatten()  # Flatten in case of binary classification (1 class)

# Creating a DataFrame of feature names and their corresponding coefficients
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sorting by the absolute value of the coefficients (importance)
feature_importance['Importance'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance)





     Feature  Coefficient  Importance
3  feature_3     1.581649    1.581649
2  feature_2    -1.433423    1.433423
9  feature_9    -0.350635    0.350635
0  feature_0    -0.282494    0.282494
7  feature_7     0.226682    0.226682
8  feature_8    -0.221162    0.221162
5  feature_5     0.156162    0.156162
6  feature_6     0.102157    0.102157
4  feature_4     0.092581    0.092581
1  feature_1    -0.076958    0.076958


In [10]:
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
from sklearn.model_selection import train_test_split

# Define feature set (X) and target (y, in this case PD)
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])  # Dropping non-feature columns
y = df['PD']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Lasso model with L1 regularization (you can adjust alpha for more/less regularization)
lasso_reg = Lasso(alpha=0.01, max_iter=10000)

# Train the model
lasso_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lasso_reg.predict(X_test)

# Performance metrics for regression
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lasso_reg.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Lasso Coefficients):\n", feature_importance)


Mean Squared Error: 0.1208
R^2 Score: 0.1197

Feature Importance (Lasso Coefficients):
            Feature  Coefficient
5    Loan duration     0.000870
3   Funding amount     0.000025
4     Down payment     0.000000
6  Monthly payment     0.000000
7     Credit event     0.000000
2        Car price    -0.000019
1              Age    -0.001761
0       Job tenure    -0.003442
8          Married    -0.029921
9        Homeowner    -0.047341
