In [101]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
import joblib
import os
from pathlib import Path

In [102]:
# Read the loan-approval records from the CSV export
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
df = pd.read_csv('E:/TCS project/Automated-Personal-Loan/data/loan_approval_dataset.csv')

# Trim leading/trailing whitespace from the values of every object-dtype column
string_cols = df.select_dtypes(include='object').columns
for text_col in string_cols:
    df[text_col] = df[text_col].str.strip()

# Show the first rows after trimming
print("Cleaned data samples:")
print(df.head())


Cleaned data samples:
   loan_id   no_of_dependents     education  self_employed   income_annum  \
0        1                  2      Graduate             No        9600000   
1        2                  0  Not Graduate            Yes        4100000   
2        3                  3      Graduate             No        9100000   
3        4                  3      Graduate             No        8200000   
4        5                  5  Not Graduate            Yes        9800000   

    loan_amount   loan_term   cibil_score   residential_assets_value  \
0      29900000          12           778                    2400000   
1      12200000           8           417                    2700000   
2      29700000          20           506                    7100000   
3      30700000           8           467                   18200000   
4      24200000          20           382                   12400000   

    commercial_assets_value   luxury_assets_value   bank_asset_value  \
0         

In [103]:
# Data preprocessing
# Discard every row that contains at least one missing value
df = df.dropna()

# Trim whitespace from the column names so they can be referenced by their bare names
df.columns = df.columns.str.strip()

# Integer-encode the categorical columns, keeping one fitted encoder per column so
# the mapping can be reused (and inverted) at prediction time.
# NOTE: this cell overwrites the columns in place — re-running it without reloading
# `df` would fit the encoders on the already-encoded integers.
categorical_cols = ['education', 'self_employed', 'loan_status', 'address']
label_encoders = {}

for col in categorical_cols:
    encoder = LabelEncoder().fit(df[col])
    df[col] = encoder.transform(df[col])
    label_encoders[col] = encoder
    print(f"{col} categories:", encoder.classes_)

education categories: ['Graduate' 'Not Graduate']
self_employed categories: ['No' 'Yes']
loan_status categories: ['Approved' 'Rejected']
address categories: ['america' 'australia' 'dubai' 'india' 'japan']


In [104]:
# Feature matrix: every column except the loan_id column and the target
X = df.drop(['loan_id', 'loan_status'], axis=1)
# Target: the label-encoded loan_status column
y = df['loan_status']

# Hold out a test set. The scaling, training and evaluation cells below use
# X_train / X_test / y_train / y_test, so the split has to be created here for the
# notebook to survive Restart & Run All.
# NOTE(review): test_size=0.2 and random_state=42 are assumptions (42 mirrors the
# seed used for the random forest) — confirm against the split that produced the
# recorded classification report.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [105]:
# Feature scaling
# Expects X_train / X_test from a preceding train/test split of X and y.
# The scaler learns its statistics from the training rows only and the same
# fitted scaler is then applied to both partitions.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [106]:
# Fit a random forest on the scaled training features; the fixed seed keeps
# the fitted forest reproducible between runs
forest_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**forest_params)
rf_model.fit(X_train_scaled, y_train)

In [107]:
# Score the fitted forest on the scaled held-out rows
y_pred = rf_model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(report_text)

Accuracy: 0.9765807962529274

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       536
           1       0.98      0.96      0.97       318

    accuracy                           0.98       854
   macro avg       0.98      0.97      0.97       854
weighted avg       0.98      0.98      0.98       854



In [108]:
# Persist the fitted model and the preprocessing objects so the prediction
# function can reload them later
# NOTE(review): absolute, machine-specific path — consider a configurable models directory
models_path = Path("E:/TCS project/Automated-Personal-Loan/models")
models_path.mkdir(parents=True, exist_ok=True)

artifacts_to_save = [
    (rf_model, "loan_approval_model.pkl"),
    (scaler, "scaler.pkl"),
]
for fitted_obj, file_name in artifacts_to_save:
    joblib.dump(fitted_obj, models_path / file_name)

# Left as the cell's final expression so the written file path is still displayed
joblib.dump(label_encoders, models_path / "label_encoders.pkl")

['E:\\TCS project\\Automated-Personal-Loan\\models\\label_encoders.pkl']

In [109]:
# Prediction function with proper categorical value handling
def predict_loan_status(extracted_data, model=None, scaler=None, label_encoders=None):
    """Predict the loan status for a single applicant.

    Parameters
    ----------
    extracted_data : mapping
        Feature name -> raw value for one applicant. The categorical features
        ('education', 'self_employed', 'address') are given as their string
        labels. The mapping itself is not modified.
    model, scaler, label_encoders : optional
        Already-loaded classifier, scaler and dict of per-column label encoders.
        Any of them left as ``None`` is loaded with joblib from ``models_path``,
        which is what happened on every call before these parameters existed.

    Returns
    -------
    tuple
        ``(status, proba)``: ``status`` is the predicted label decoded with the
        'loan_status' encoder; ``proba`` is the second column of
        ``model.predict_proba`` for this row (with the encoder classes recorded
        in this notebook, ['Approved' 'Rejected'], that is the probability of
        'Rejected').

    Notes
    -----
    * A categorical value that is missing, is not a string, or is not a known
      class (even after stripping whitespace) falls back to the encoder's first
      class and a warning is printed. Previously a missing categorical was
      filled with 0 and then failed inside ``LabelEncoder.transform``, and a
      non-string value failed on ``.strip()``.
    * Missing numeric features are filled with 0.
    """
    # Load saved objects only when the caller did not pass them in.
    # NOTE: joblib.load unpickles the file — only load artifacts you produced yourself.
    if model is None:
        model = joblib.load(models_path / "loan_approval_model.pkl")
    if scaler is None:
        scaler = joblib.load(models_path / "scaler.pkl")
    if label_encoders is None:
        label_encoders = joblib.load(models_path / "label_encoders.pkl")

    # Feature columns, in the order handed to the scaler
    required_cols = ['no_of_dependents', 'education', 'self_employed', 'income_annum',
                    'loan_amount', 'loan_term', 'cibil_score', 'residential_assets_value',
                    'commercial_assets_value', 'luxury_assets_value', 'bank_asset_value',
                    'address']
    # Kept local (rather than reading the notebook-level categorical_cols) so the
    # function does not depend on notebook state and never touches the target column
    categorical_features = ['education', 'self_employed', 'address']

    # Work on a copy so the caller's mapping is left untouched
    input_data = dict(extracted_data)

    # Standardize categorical values so every one of them is a class the encoder knows
    for col in categorical_features:
        classes = label_encoders[col].classes_

        if col not in input_data:
            print(f"Warning: Missing {col} value, using default")
            input_data[col] = classes[0]
            continue

        val = input_data[col]
        # Check both raw and stripped versions; only strings can be stripped
        stripped = val.strip() if isinstance(val, str) else val

        if val in classes:
            continue  # Exact match
        if stripped in classes:
            input_data[col] = stripped
        else:
            print(f"Warning: Unknown {col} value '{val}', using default")
            input_data[col] = classes[0]

    # Convert to a one-row DataFrame
    input_df = pd.DataFrame([input_data])

    # Ensure all required columns are present (only numeric ones can still be missing)
    for col in required_cols:
        if col not in input_df.columns:
            input_df[col] = 0  # Default value

    # Encode categorical variables with the encoders fitted during training
    for col in categorical_features:
        input_df[col] = label_encoders[col].transform(input_df[col])

    # Scale the data
    input_scaled = scaler.transform(input_df[required_cols])

    # Make prediction
    prediction = model.predict(input_scaled)
    proba = model.predict_proba(input_scaled)

    # Convert prediction back to original label
    status = label_encoders['loan_status'].inverse_transform(prediction)

    return status[0], proba[0][1]

In [110]:
# Two hand-written applicant profiles used to smoke-test the prediction function
test_cases = [
    # Profile expected to be rejected (high risk)
    dict(
        no_of_dependents=5,
        education='Not Graduate',
        self_employed='Yes',
        income_annum=200000,
        loan_amount=500000,
        loan_term=10,
        cibil_score=350,
        residential_assets_value=100000,
        commercial_assets_value=0,
        luxury_assets_value=0,
        bank_asset_value=50000,
        address='america',
    ),
    # Profile expected to be approved (low risk)
    dict(
        no_of_dependents=1,
        education='Graduate',
        self_employed='No',
        income_annum=800000,
        loan_amount=200000,
        loan_term=3,
        cibil_score=800,
        residential_assets_value=500000,
        commercial_assets_value=300000,
        luxury_assets_value=200000,
        bank_asset_value=700000,
        address='india',
    ),
]

In [111]:
# Feed each sample applicant through the prediction function and print the outcome.
# A failure in one case is reported and the loop moves on to the next case.
for case_no, applicant in enumerate(test_cases, start=1):
    try:
        status, proba = predict_loan_status(applicant)
        print(f"\nTest Case {case_no} Result:")
        print(f"Status: {status} | Disapproval Probability: {proba:.2%}")
        print("Input Features:")
        for feature_name, feature_value in applicant.items():
            print(f"{feature_name}: {feature_value}")
    except Exception as err:
        print(f"Error processing test case {case_no}: {str(err)}")


Test Case 1 Result:
Status: Rejected | Disapproval Probability: 95.00%
Input Features:
no_of_dependents: 5
education: Not Graduate
self_employed: Yes
income_annum: 200000
loan_amount: 500000
loan_term: 10
cibil_score: 350
residential_assets_value: 100000
commercial_assets_value: 0
luxury_assets_value: 0
bank_asset_value: 50000
address: america

Test Case 2 Result:
Status: Approved | Disapproval Probability: 9.00%
Input Features:
no_of_dependents: 1
education: Graduate
self_employed: No
income_annum: 800000
loan_amount: 200000
loan_term: 3
cibil_score: 800
residential_assets_value: 500000
commercial_assets_value: 300000
luxury_assets_value: 200000
bank_asset_value: 700000
address: india
