# Loan Approval Prediction: EDA and Model Training

### Part 1: Data Loading & Exploratory Data Analysis (EDA)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Read the loan records into a DataFrame. The path is relative, so it is
# resolved against the current working directory.
dataset_path = '../data/loan_prediction_dataset.csv'
df = pd.read_csv(dataset_path)

#### Initial Inspection

In [None]:
# Preview the leading rows to see the column layout and sample values.
print("First 5 rows of the dataset:", df.head(), sep="\n")

# DataFrame.info() writes its summary to stdout itself, so it is not wrapped
# in print().
print("\nDataset Information:")
df.info()

# Summary statistics as produced by DataFrame.describe().
print("\nDescriptive Statistics:", df.describe(), sep="\n")

In [None]:
# Count the null entries in each column before any imputation is applied.
missing_counts = df.isnull().sum()
print("\nMissing values per column:", missing_counts, sep="\n")

#### Data Visualization

In [None]:
# Class balance of the target: one bar per Loan_Status value.
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is passed
# without `hue` — confirm the installed version before changing the call.
fig, ax = plt.subplots(figsize=(6, 5))
sns.countplot(x='Loan_Status', data=df, palette='viridis', ax=ax)
ax.set_title('Distribution of Loan Approval Status')
plt.show()

In [None]:
# Relationship between a feature and the target: counts per Education level,
# with bars split by Loan_Status.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='Education', hue='Loan_Status', data=df, palette='pastel', ax=ax)
ax.set_title('Education vs. Loan Status')
plt.show()

In [None]:
# Histogram of ApplicantIncome (30 bins) with a kernel density estimate overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['ApplicantIncome'], bins=30, kde=True, color='skyblue', ax=ax)
ax.set_title('Applicant Income Distribution')
plt.show()

### Part 2: Data Preprocessing & Feature Engineering

#### Handle Missing Values

In [None]:
# Numerical columns: fill gaps with the column median, which is less
# sensitive to outliers than the mean.
median_filled_columns = ['LoanAmount', 'Loan_Amount_Term', 'Credit_History']
for column in median_filled_columns:
    df[column] = df[column].fillna(df[column].median())

# Categorical columns: fill gaps with the most frequent value (the first mode).
mode_filled_columns = ['Gender', 'Married', 'Dependents', 'Self_Employed']
for column in mode_filled_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

# Re-count nulls per column to confirm the imputed columns are now complete.
print("Missing values after handling:", df.isnull().sum(), sep="\n")

#### Encode Categorical Variables

In [None]:
# Encode the target first: 'Y' -> 1, 'N' -> 0. Series.map with a dict turns
# any value that is not a key into NaN.
target_codes = {'Y': 1, 'N': 0}
df['Loan_Status'] = df['Loan_Status'].map(target_codes)

# One-hot encode the remaining categorical features. drop_first=True omits
# the first level of each feature.
categorical_features = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

print("Data shape after encoding:", df_encoded.shape)
df_encoded.head()

#### Feature Scaling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Drop Loan_ID as it's not a predictive feature
df_encoded = df_encoded.drop('Loan_ID', axis=1)

X = df_encoded.drop('Loan_Status', axis=1)
y = df_encoded['Loan_Status']

# Fit the scaler on the training rows only. Fitting it on the full dataset
# would leak the held-out rows' mean/variance into training and make the
# test-set evaluation optimistic.
# The arguments below must mirror the train/test split performed in Part 3
# (test_size=0.2, random_state=42, stratify=y) so that the same rows are
# selected as the training set.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions])

# Apply the training-set scaling to every row. The original index is kept so
# that X stays row-aligned with y.
X_scaled = scaler.transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

X.head()

### Part 3: Model Training & Evaluation

#### Split the Data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on y keeps the proportion
# of each Loan_Status class the same in the train and test sets, and the fixed
# random_state makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

#### Train and Evaluate Models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def _fit_and_report(model):
    """Fit `model` on the training split, print test metrics, return test predictions."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))
    return predictions


# Each model is announced, constructed, then fit and scored on the same split.

# --- Logistic Regression ---
print("--- Logistic Regression ---")
lr = LogisticRegression(random_state=42)
lr_preds = _fit_and_report(lr)

# --- Random Forest ---
print("\n--- Random Forest ---")
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_preds = _fit_and_report(rf)

# --- XGBoost ---
print("\n--- XGBoost ---")
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_preds = _fit_and_report(xgb)

### Part 4: Model Selection & Saving

In [None]:
# Based on the results, Logistic Regression and Random Forest perform very well.
# Let's choose Random Forest as it's often more robust.

import os

# Create the 'app' directory if it does not exist yet. exist_ok=True replaces
# the separate existence check, so there is no gap between checking and creating.
os.makedirs('../app', exist_ok=True)

# Save the model, scaler, and column order to the 'app' directory.
# NOTE(review): 'random_force_model.pkl' looks like a typo for 'random_forest';
# it is left unchanged because whatever loads this file may depend on the
# current name — confirm before renaming.
joblib.dump(rf, '../app/random_force_model.pkl')
joblib.dump(scaler, '../app/scaler.pkl')
joblib.dump(X.columns, '../app/model_columns.pkl')

print("Model, scaler, and columns saved successfully in the 'app' directory.")