# Loan Approval Prediction

This notebook preprocesses loan application data and trains a machine learning model to predict loan approval.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Read the raw loan application records from the CSV (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer='loan_data.csv')

# Preview the first five rows as a quick sanity check of the load
df.head(n=5)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts)
df.info()

# Count the missing entries in each column
df.isna().sum()

In [None]:
# Encode categorical variables as integer codes
cat_cols = ['person_gender', 'person_education', 'person_home_ownership', 'loan_intent', 'previous_loan_defaults_on_file']

# Fit a separate LabelEncoder per column and keep every fitted instance.
# Re-fitting one shared encoder would overwrite its learned classes on each
# iteration, leaving only the last column's mapping available for decoding
# (inverse_transform) or for encoding new records the same way later.
encoders = {}
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    encoders[col] = encoder

# Show transformed data
df.head()

In [None]:
# Identify outliers using boxplots (one panel per numeric column, 2x2 grid)
box_cols = ['person_income', 'loan_amnt', 'loan_int_rate', 'credit_score']

# Each panel draws a single box, so one colour is enough. Passing `palette`
# without `hue` is deprecated in seaborn 0.13 and warns on every call; taking
# the one-colour "coolwarm" palette and handing it over via `color` avoids that.
box_color = sns.color_palette("coolwarm", 1)[0]

plt.figure(figsize=(15, 10))
for i, col in enumerate(box_cols, 1):
    plt.subplot(2, 2, i)
    sns.boxplot(x=df[col], color=box_color)
    plt.title(f"Boxplot of {col}")

plt.tight_layout()
plt.show()

In [None]:
# Function to cap outliers using IQR
def cap_outliers(df: pd.DataFrame, col: str, factor: float = 1.5) -> pd.DataFrame:
    """Clip one column of *df* to its IQR fences, modifying *df* in place.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1`` and ``Q3`` are the 0.25 and 0.75 quantiles of the column and
    ``IQR = Q3 - Q1``. Values beyond a fence are replaced by that fence;
    values between the fences are left as they are.

    Args:
        df: Frame holding the column to cap. The column is overwritten on
            this very object; no copy is made.
        col: Name of the column in *df* to cap.
        factor: Non-negative multiple of the IQR that sets how far the
            fences sit from the quartiles. Defaults to 1.5.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: If *factor* is negative.
    """
    if factor < 0:
        raise ValueError(f"factor must be non-negative, got {factor!r}")

    q1, q3 = df[col].quantile([0.25, 0.75])
    spread = factor * (q3 - q1)

    # Assigning back into df[col] is what makes the change visible to callers
    # that ignore the return value.
    df[col] = df[col].clip(lower=q1 - spread, upper=q3 + spread)
    return df

# Cap each of the listed columns in turn, rebinding df to the returned frame
# NOTE(review): 'loan_int_rate' is box-plotted earlier in this notebook but is
# not in this list — confirm that leaving it uncapped is intentional.
outlier_cols = [
    'person_income',
    'loan_amnt',
    'credit_score',
]
for col in outlier_cols:
    df = cap_outliers(df, col)

In [None]:
# Standardize the numeric columns with a StandardScaler fitted on them
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split later in this notebook, so held-out rows contribute to the scaling
# statistics — consider fitting on the training split only.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'credit_score',
]
scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# Preview the frame with the scaled columns in place
df.head()

In [None]:
# Separate the target column from the features, then hold out 20% of the rows
# for testing; stratifying on y keeps the class proportions in both splits.
y = df['loan_status']
X = df.drop('loan_status', axis=1)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Fit a 100-tree random forest (fixed seed) on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predict labels for the held-out test split
y_pred = model.predict(X_test)

# Score the predictions: overall accuracy plus the per-class report
classification_rep = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy line followed by the report
print(f'Accuracy: {accuracy:.4f}', classification_rep, sep='\n')

In [None]:
# Persist the fitted classifier to disk so it can be reloaded later
# NOTE(review): only `model` is written here; the fitted scaler and the
# categorical encoding are not saved in this cell — confirm they are persisted
# elsewhere if the model is to be used on new, raw records.
joblib.dump(value=model, filename='loan_approval_model.pkl')