In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

In [None]:
# Read the loan records from the CSV next to the notebook and preview the
# first five rows.
df = pd.read_csv(filepath_or_buffer='loan_data.csv')
df.head(n=5)

In [None]:
# Handle missing values.
# The filled column is assigned back to `df` rather than calling
# `df[col].fillna(..., inplace=True)`: the in-place form operates on an
# intermediate Series, triggers a chained-assignment FutureWarning on recent
# pandas 2.x releases, and does not update `df` when Copy-on-Write is active.
for col in ['employment_length', 'annual_income']:
    # Median imputation for the two numeric columns.
    df[col] = df[col].fillna(df[col].median())

# Drop rows whose loan_status is missing.
df = df.dropna(subset=['loan_status'])

In [None]:
def remove_outliers(df: pd.DataFrame, column: str, factor: float = 1.5) -> pd.DataFrame:
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``column`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    column : str
        Name of the column the fences are computed on.
    factor : float, default 1.5
        Multiplier applied to the IQR. The default reproduces the previous
        hard-coded behaviour; larger values keep more rows.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the filter, with their original index labels.
        Rows where ``column`` is NaN are excluded, because comparisons
        against NaN evaluate to False.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = factor * (q3 - q1)
    lower_bound = q1 - spread
    upper_bound = q3 + spread

    inside_fences = (values >= lower_bound) & (values <= upper_bound)
    return df[inside_fences]

# Filter one column at a time; the second pass computes its quartiles on the
# rows that survived the first.
for outlier_col in ('annual_income', 'loan_amount'):
    df = remove_outliers(df, outlier_col)

In [None]:
# Convert categorical variables to numerical.
# NOTE: Series.map yields NaN for any label that is not a key of the mapping,
# so values other than 'Approved' / 'Denied' end up as NaN here.
df['loan_status'] = df['loan_status'].map({'Approved': 1, 'Denied': 0})
df = pd.get_dummies(df, columns=['employment_type', 'loan_purpose'], drop_first=True)

# Feature Engineering
# Dividing by a zero income would produce +/-inf (or NaN for 0 / 0). Mask zero
# incomes first so both ratios are NaN for those rows, which pandas
# aggregations skip by default.
income_nonzero = df['annual_income'].replace(0, np.nan)
df['debt_to_income_ratio'] = df['monthly_debt'] / (income_nonzero / 12)  # monthly debt / monthly income
df['loan_to_income_ratio'] = df['loan_amount'] / income_nonzero

In [None]:
# Correlation matrix
# numeric_only=True restricts the computation to numeric columns; without it
# pandas >= 2.0 raises if any non-numeric column is still present in `df`.
plt.figure(figsize=(12, 8))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm', center=0)
plt.title('Correlation Matrix')
plt.show()

# Distribution plots: three histograms with a KDE overlay plus the class counts
# on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
hist_panels = [
    ('annual_income', 'Annual Income Distribution', axes[0, 0]),
    ('loan_amount', 'Loan Amount Distribution', axes[0, 1]),
    ('credit_score', 'Credit Score Distribution', axes[1, 0]),
]
for hist_col, hist_title, hist_ax in hist_panels:
    sns.histplot(df[hist_col], kde=True, ax=hist_ax)
    hist_ax.set_title(hist_title)

sns.countplot(x='loan_status', data=df, ax=axes[1, 1])
axes[1, 1].set_title('Loan Status Distribution')
plt.tight_layout()
plt.show()

In [None]:
# Mean, median and standard deviation of selected columns for each
# loan_status value.
summary_funcs = ['mean', 'median', 'std']
summary_cols = ['annual_income', 'loan_amount', 'credit_score', 'debt_to_income_ratio']
status_stats = df.groupby('loan_status').agg(
    {summary_col: summary_funcs for summary_col in summary_cols}
)
status_stats

In [None]:
# Build the modelling inputs here so the notebook runs top to bottom: no
# earlier cell defines X, y or the scaled train/test matrices that this cell
# and the following ones rely on.
# NOTE(review): using every non-target column as a feature and an 80/20
# stratified split are assumptions — confirm against the intended setup.
X = df.drop(columns=['loan_status'])
y = df['loan_status']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply the same transform to
# the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic regression model.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_scaled, y_train)

y_pred_log = log_reg.predict(X_test_scaled)
# Column 1 of predict_proba corresponds to log_reg.classes_[1].
y_pred_proba_log = log_reg.predict_proba(X_test_scaled)[:, 1]

print("Logistic Regression Results:")
print(classification_report(y_test, y_pred_log))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_proba_log):.4f}")

In [None]:
# Random forest with 100 trees of depth <= 10, fitted on X_train_scaled.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train_scaled, y_train)

# Hard class predictions and the score of the second class for the test rows.
y_pred_rf = rf_model.predict(X_test_scaled)
y_pred_proba_rf = rf_model.predict_proba(X_test_scaled)[:, 1]

rf_auc = roc_auc_score(y_test, y_pred_proba_rf)
print("\nRandom Forest Results:", classification_report(y_test, y_pred_rf), sep="\n")
print(f"ROC AUC Score: {rf_auc:.4f}")

In [None]:
# Pair each column of X with the forest's importance score, largest first.
importance_table = pd.DataFrame(
    {'feature': X.columns, 'importance': rf_model.feature_importances_}
)
feature_importance = importance_table.sort_values(by='importance', ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
plt.figure(figsize=(10, 6))
sns.barplot(data=feature_importance.head(10), x='importance', y='feature')
plt.title('Top 10 Feature Importances')
plt.show()

In [None]:
# ROC points and AUC for each model, computed before any drawing.
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_proba_log)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)
auc_log = roc_auc_score(y_test, y_pred_proba_log)
auc_rf = roc_auc_score(y_test, y_pred_proba_rf)

# Both curves on one figure, with the diagonal as the chance-level reference.
plt.figure(figsize=(10, 6))
plt.plot(fpr_log, tpr_log, label=f'Logistic Regression (AUC = {auc_log:.4f})')
plt.plot(fpr_rf, tpr_rf, label=f'Random Forest (AUC = {auc_rf:.4f})')
plt.plot([0, 1], [0, 1], 'k--', label='Random Classifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve Comparison')
plt.legend()
plt.show()

In [None]:
# Side-by-side confusion matrices (rows = actual, columns = predicted) for
# the two fitted models.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
cm_panels = [
    ('Logistic Regression Confusion Matrix', y_pred_log),
    ('Random Forest Confusion Matrix', y_pred_rf),
]
for cm_ax, (cm_title, cm_predictions) in zip(axes, cm_panels):
    sns.heatmap(
        confusion_matrix(y_test, cm_predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=cm_ax,
    )
    cm_ax.set_title(cm_title)
    cm_ax.set_xlabel('Predicted')
    cm_ax.set_ylabel('Actual')

plt.tight_layout()
plt.show()