# Logistic Regression Model
### Loan Default Prediction

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix, classification_report

from preprocessing import load_and_clean_data, preprocess_features

## 1. Load and Prepare Data

In [None]:
# Read the raw loan records through the project's cleaning helper.
data_path = 'loan_data.csv'
df = load_and_clean_data(data_path)

# Report the size of the cleaned result before any feature work.
cleaned_shape = df.shape
print(f'Shape after cleaning: {cleaned_shape}')

# Last expression of the cell: the notebook renders the first rows.
df.head()

In [None]:

# Run the project's feature preprocessing on the cleaned frame.
# NOTE(review): this rebinds `df`, so re-running the cell preprocesses already
# preprocessed data -- re-run the load cell first when iterating.
df = preprocess_features(df)

# Separate the predictors from the 'loan_status' target column.
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
# Stratifying on y keeps the class proportions the same in both splits; without
# it an unlucky draw can under-represent the minority class in the test set
# and distort the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f'Train set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

## 2. Feature Scaling

In [None]:

# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into the fitted scaler.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print('Scaling done.')

## 3. Train the Model

In [None]:
# Logistic regression classifier fitted on the scaled training features.
model = LogisticRegression(
    class_weight='balanced',  # reweight classes to offset unequal class counts
    max_iter=1000,            # iteration cap for the solver
    random_state=42,          # fixed seed
)
model.fit(X_train_scaled, y_train)
print('Model trained.')

## 4. Evaluate the Model

In [None]:
# Training split: hard class predictions and their accuracy.
y_train_pred = model.predict(X_train_scaled)
train_acc = accuracy_score(y_train, y_train_pred)

# Test split: hard class predictions and their accuracy.
y_test_pred = model.predict(X_test_scaled)
test_acc = accuracy_score(y_test, y_test_pred)

# ROC-AUC is computed from scores rather than hard labels, so take the
# second column of predict_proba as the score.
# NOTE(review): presumably column 1 is the "default" class -- confirm against
# model.classes_.
test_probabilities = model.predict_proba(X_test_scaled)
y_test_prob = test_probabilities[:, 1]
roc_auc = roc_auc_score(y_test, y_test_prob)

print(f'Train Accuracy: {train_acc * 100:.2f}%')
print(f'Test Accuracy:  {test_acc * 100:.2f}%')
print(f'ROC-AUC Score:  {roc_auc:.4f}')

In [None]:
# Per-class precision / recall / F1 summary for the test split.
print('Classification Report:')
# NOTE(review): target_names are applied in sorted-label order, so this assumes
# the first label means "no default" and the second "default" -- confirm
# against the encoding of loan_status.
report = classification_report(
    y_test,
    y_test_pred,
    target_names=['No Default', 'Default'],
)
print(report)

## 5. Confusion Matrix

In [None]:
# Counts of actual vs. predicted classes on the test split.
cm = confusion_matrix(y_test, y_test_pred)

# The same tick labels are used on both axes.
# NOTE(review): assumes the label order is (no default, default) -- confirm.
class_labels = ['No Default', 'Default']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,   # write the count inside every cell
    fmt='d',      # counts are integers
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set(
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix - Logistic Regression',
)
plt.tight_layout()
plt.show()

## 6. ROC Curve

In [None]:
# Points of the ROC curve, computed from the test-split scores.
fpr, tpr, thresholds = roc_curve(y_test, y_test_prob)

fig, ax = plt.subplots(figsize=(6, 4))

# Model curve, with the previously computed AUC shown in the legend.
roc_label = f'ROC Curve (AUC = {roc_auc:.4f})'
ax.plot(fpr, tpr, color='blue', label=roc_label)

# Diagonal reference line for comparison.
diagonal = [0, 1]
ax.plot(diagonal, diagonal, color='gray', linestyle='--', label='Random Baseline')

ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve - Logistic Regression',
)
ax.legend()
plt.tight_layout()
plt.show()

## 7. Feature Importance (Coefficients)

In [None]:
# Pair each training column with its fitted coefficient. The features were
# standardized before fitting, so coefficient magnitudes are on a common scale.
feature_names = list(X_train.columns)
coefficients = model.coef_[0]

# One row per feature, ordered by absolute coefficient (smallest first, so the
# largest bars end up at the top of the horizontal bar chart).
coef_df = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coef=lambda frame: frame['Coefficient'].abs())
    .sort_values('Abs_Coef', ascending=True)
)

# Positive coefficients are drawn in salmon, all others in sky blue.
colors = []
for value in coef_df['Coefficient']:
    colors.append('salmon' if value > 0 else 'skyblue')

fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(coef_df['Feature'], coef_df['Coefficient'], color=colors)
ax.axvline(0, color='black', linewidth=0.5)
ax.set(
    xlabel='Coefficient Value',
    title='Logistic Regression - Feature Coefficients',
)
plt.tight_layout()
plt.show()

# Text summary of the ten largest coefficients by magnitude.
print('\nTop 10 most important features:')
top_features = coef_df.sort_values('Abs_Coef', ascending=False).head(10)
print(top_features[['Feature', 'Coefficient']].to_string(index=False))