# Loan Default Prediction (Australian Credit Approval Dataset)

##### Source: [Statlog (Australian Credit Approval) — UCI Machine Learning Repository](https://archive.ics.uci.edu/dataset/143/statlog+australian+credit+approval)

In [2]:
import importlib
import subprocess
import sys

from IPython.display import clear_output


def install(package):
    """Install ``package`` with pip into the interpreter running this kernel.

    Raises:
        subprocess.CalledProcessError: if the pip command exits non-zero.
    """
    # sys.executable makes pip target the same environment as the kernel.
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# Import names of the modules to check. `ucimlrepo` is included because the
# data-loading cell imports it.
packages = ['pandas', 'numpy', 'matplotlib', 'seaborn', 'tensorflow', 'sklearn', 'sktime', 'prophet', 'ucimlrepo']

# Import name -> pip distribution name, for the cases where they differ.
# The module is `sklearn` but the installable project is `scikit-learn`;
# the PyPI project literally named `sklearn` is a deprecated placeholder.
PIP_NAMES = {'sklearn': 'scikit-learn'}

for package in packages:
    try:
        importlib.import_module(package)
    except ImportError:
        # Only install what is actually missing.
        install(PIP_NAMES.get(package, package))

# Drop pip's (potentially long) log from the cell output.
clear_output(wait=True)

print("All required packages are installed.")

All required packages are installed.


## Load & Explore Dataset

In [1]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under UCI id 143
statlog_australian_credit_approval = fetch_ucirepo(id=143)

# Features and targets, delivered as pandas dataframes
dataset_tables = statlog_australian_credit_approval.data
X, y = dataset_tables.features, dataset_tables.targets

# Dataset-level metadata
print(statlog_australian_credit_approval.metadata)

# Per-variable information
print(statlog_australian_credit_approval.variables)


{'uci_id': 143, 'name': 'Statlog (Australian Credit Approval)', 'repository_url': 'https://archive.ics.uci.edu/dataset/143/statlog+australian+credit+approval', 'data_url': 'https://archive.ics.uci.edu/static/public/143/data.csv', 'abstract': 'This file concerns credit card applications. This database exists elsewhere in the repository (Credit Screening Database) in a slightly different form', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 690, 'num_features': 14, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['A15'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1987, 'last_updated': None, 'dataset_doi': '10.24432/C59012', 'creators': ['Ross Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This file concerns credit card applications.  All attribute names and values have been changed to meaningless symbols to protect 

## Import necessary libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from ucimlrepo import fetch_ucirepo


## Train - Test Split

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting shapes as a sanity check on the split
print(f"Training data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")


Training data shape: (552, 14)
Testing data shape: (138, 14)


## Standardise the features

In [5]:
# Learn the scaling statistics from the training rows only
scaler = StandardScaler().fit(X_train)

# Apply that same fitted scaling to both partitions
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Train a RandomForestClassifier model

In [6]:
# Initialize the RandomForestClassifier (fixed seed for repeatable results)
model = RandomForestClassifier(random_state=42)

# Train the model on the scaled training data.
# The targets arrive as a single-column table, but scikit-learn expects a 1-D
# label array and emits a DataConversionWarning otherwise, so flatten first.
model.fit(X_train_scaled, y_train.to_numpy().ravel())


  return fit_method(estimator, *args, **kwargs)


## Make predictions on the test data

In [7]:
# Run the scaled test rows through the fitted model to obtain predicted labels
y_pred = model.predict(X=X_test_scaled)


## Evaluate the Model

In [8]:
# Share of test rows predicted correctly, shown as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 breakdown
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


Accuracy: 87.68%
              precision    recall  f1-score   support

           0       0.88      0.93      0.91        87
           1       0.87      0.78      0.82        51

    accuracy                           0.88       138
   macro avg       0.88      0.86      0.86       138
weighted avg       0.88      0.88      0.88       138



## Feature Importance

In [9]:
# Importance scores exposed by the fitted model, one per input column
importances = model.feature_importances_

# Label each score with its column name, then list them from highest to lowest
feature_importance = pd.Series(data=importances, index=X.columns)
ranked_importance = feature_importance.sort_values(ascending=False)
print(ranked_importance)


A8     0.302660
A10    0.105033
A7     0.098421
A3     0.080852
A14    0.075885
A2     0.072185
A5     0.069494
A13    0.067729
A9     0.051450
A6     0.025491
A4     0.017410
A11    0.011754
A1     0.011028
A12    0.010607
dtype: float64
