In [8]:
# Import necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, r2_score
import pandas as pd

In [16]:
df = pd.read_excel('../Data/dataproject2024.xlsx', sheet_name='Sheet1')

# Define feature set (X) and target (y)
X = df.drop(columns=['ID', 'Pred_default (y_hat)', 'Default (y)', 'PD', 'Group'])  # Dropping non-feature columns
y = df['Pred_default (y_hat)']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the logistic regression model
log_reg = LogisticRegression(max_iter=1000)

# Train the model
log_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = log_reg.predict(X_test)

# Performance metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)


# Print the metrics
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print("\nClassification Report:\n", classification_rep)

# Inspect feature importance (coefficients in logistic regression)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0]
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Logistic Regression Coefficients):\n", feature_importance)

Accuracy: 0.90
Precision: 0.68
Recall: 0.34
F1 Score: 0.46

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.98      0.95      1309
           1       0.68      0.34      0.46       177

    accuracy                           0.90      1486
   macro avg       0.80      0.66      0.70      1486
weighted avg       0.89      0.90      0.89      1486


Feature Importance (Logistic Regression Coefficients):
            Feature  Coefficient
7     Credit event     2.864450
6  Monthly payment     0.620591
4     Down payment     0.249530
5    Loan duration     0.008626
3   Funding amount     0.000499
2        Car price    -0.000435
1              Age    -0.023819
0       Job tenure    -0.103128
9        Homeowner    -0.675827
8          Married    -1.036126


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Define feature set (X) and target (y, in this case PD)
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])  # Dropping non-feature columns
y = df['PD']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
lin_reg = LinearRegression()

# Train the model
lin_reg.fit(X_train, y_train)

# Predict on the test set
y_pred = lin_reg.predict(X_test)

# Performance metrics for regression
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the metrics
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Feature importance (coefficients)
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lin_reg.coef_
}).sort_values(by='Coefficient', ascending=False)

print("\nFeature Importance (Linear Regression Coefficients):\n", feature_importance)


Mean Squared Error: 0.0253
R^2 Score: 0.4233

Feature Importance (Linear Regression Coefficients):
            Feature  Coefficient
7     Credit event     0.315471
6  Monthly payment     0.291974
4     Down payment     0.081968
5    Loan duration     0.001142
3   Funding amount     0.000026
2        Car price    -0.000021
1              Age    -0.001269
0       Job tenure    -0.002953
9        Homeowner    -0.064714
8          Married    -0.072892


In [19]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


In [23]:
X = df.drop(columns=['ID', 'PD', 'Default (y)', 'Pred_default (y_hat)', 'Group'])
y = df['Pred_default (y_hat)']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [25]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = LogisticRegression(penalty='l1', solver='saga', C=1.0, max_iter = 5000)  # 'saga' solver supports L1 penalty
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 0.9030955585464334
