## Surrogate Model: Logistic Regression

In [None]:
# Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, r2_score
import pandas as pd

In [None]:
df = pd.read_excel('../Data/dataproject2024.xlsx', sheet_name='Sheet1')

# Define feature set (X) and target (y)
X = df.drop(columns=['ID', 'Pred_default (y_hat)', 'Default (y)', 'PD', 'Group'])  # Dropping non-feature columns
y = df['Pred_default (y_hat)']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_rep)

# Inspect feature importance (coefficients in logistic regression)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Logistic Regression Coefficients):\n", feature_importance)

## Surrogate Model: Linear Regression

In [20]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define feature set (X) and target (y, in this case PD)
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])  # Dropping non-feature columns
y = df['PD']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
lin_reg = LinearRegression()

# Train the model
lin_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lin_reg.predict(X_test)

# Performance metrics for regression
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lin_reg.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Linear Regression Coefficients):\n", feature_importance)


Mean Squared Error: 0.0253
R^2 Score: 0.4233

Feature Importance (Linear Regression Coefficients):
            Feature  Coefficient
7     Credit event     0.315471
6  Monthly payment     0.291974
4     Down payment     0.081968
5    Loan duration     0.001142
3   Funding amount     0.000026
2        Car price    -0.000021
1              Age    -0.001269
0       Job tenure    -0.002953
9        Homeowner    -0.064714
8          Married    -0.072892


## Surrogate Model: L1 Logistic Regression

In [5]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [10]:
df = pd.read_excel('../Data/dataproject2024.xlsx', sheet_name='Sheet1')

X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])
y = df['Pred_default (y_hat)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [11]:
df

Unnamed: 0,ID,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner,Default (y),Pred_default (y_hat),PD,Group
0,1,34,55,4875,3087,0,36,0.047895,0,1,1,0,0,0.024480,1
1,2,5,29,13000,13000,0,60,0.091667,0,0,0,1,0,0.331661,0
2,3,14,38,17190,14190,0,60,0.088235,0,0,0,0,0,0.187505,0
3,4,16,37,22773,23568,0,48,0.110084,0,1,1,0,0,0.035441,1
4,5,1,61,7700,8526,0,48,0.123404,0,1,0,1,0,0.340883,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7425,7426,9,38,4502,4856,0,48,0.083846,1,0,0,1,0,0.217708,0
7426,7427,1,33,22900,17900,0,60,0.098000,0,0,0,0,0,0.063937,0
7427,7428,3,27,25858,25972,0,24,0.181667,0,0,0,0,1,0.630863,0
7428,7429,6,32,12300,11300,0,72,0.091963,0,0,1,0,0,0.067551,0


In [7]:
X

Unnamed: 0,Job tenure,Age,Car price,Funding amount,Down payment,Loan duration,Monthly payment,Credit event,Married,Homeowner
0,34,55,4875,3087,0,36,0.047895,0,1,1
1,5,29,13000,13000,0,60,0.091667,0,0,0
2,14,38,17190,14190,0,60,0.088235,0,0,0
3,16,37,22773,23568,0,48,0.110084,0,1,1
4,1,61,7700,8526,0,48,0.123404,0,1,0
...,...,...,...,...,...,...,...,...,...,...
7425,9,38,4502,4856,0,48,0.083846,1,0,0
7426,1,33,22900,17900,0,60,0.098000,0,0,0
7427,3,27,25858,25972,0,24,0.181667,0,0,0
7428,6,32,12300,11300,0,72,0.091963,0,0,1


In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter = 5000)  # 'saga' solver supports L1 penalty
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9030955585464334


In [9]:
feature_names = X_train.columns if isinstance(X_train, pd.DataFrame) else [f'feature_{i}' for i in range(X_train.shape[1])]

# Extracting coefficients
coefficients = model.coef_.flatten()  # Flatten in case of binary classification (1 class)

# Creating a DataFrame of feature names and their corresponding coefficients
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sorting by the absolute value of the coefficients (importance)
feature_importance['Importance'] = np.abs(feature_importance['Coefficient'])
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display the top features
print(feature_importance)



     Feature  Coefficient  Importance
3  feature_3     2.863811    2.863811
2  feature_2    -2.568378    2.568378
0  feature_0    -1.004674    1.004674
8  feature_8    -0.515137    0.515137
7  feature_7     0.417501    0.417501
9  feature_9    -0.353209    0.353209
1  feature_1    -0.308576    0.308576
5  feature_5     0.201102    0.201102
6  feature_6     0.098410    0.098410
4  feature_4     0.057245    0.057245
