In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
import keras

In [2]:
#loaded the dataset
data_set = pd.read_csv('loan_data.csv')
shuffled_data_set = data_set.sample(frac=1, random_state=42).reset_index(drop=True)
print(shuffled_data_set)


       person_age person_gender person_education  person_income  \
0            32.0          male        Associate        96865.0   
1            24.0          male        Associate        56838.0   
2            22.0        female           Master        37298.0   
3            23.0        female         Bachelor        39944.0   
4            42.0          male      High School        67974.0   
...           ...           ...              ...            ...   
44995        26.0          male      High School        88451.0   
44996        25.0        female      High School        34772.0   
44997        33.0        female        Associate        58317.0   
44998        26.0          male           Master       178602.0   
44999        26.0        female      High School       210894.0   

       person_emp_exp person_home_ownership  loan_amnt        loan_intent  \
0                  10              MORTGAGE     7500.0          EDUCATION   
1                   6                  RE

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import tensorflow as tf
from keras.models import Sequential

# Reload the dataset
dataset = pd.read_csv('loan_data.csv')

# Identify categorical columns
categorical_columns = dataset.select_dtypes(include=['object']).columns

# Apply one-hot encoding to categorical columns
dataset = pd.get_dummies(dataset, columns=categorical_columns, drop_first=True)

# Shuffle the dataset
shuffle_dataset = dataset.sample(frac=1, random_state=42).reset_index(drop=True)

# Split the data into training and test sets
train_set = shuffle_dataset[:int(len(shuffle_dataset) * 0.8)]
test_set = shuffle_dataset[int(len(shuffle_dataset) * 0.8):]

# Extract labels
train_labels = train_set['loan_status']
test_labels = test_set['loan_status']

# Define subsets of features
highly_important_features = [
    "person_income",
    "loan_amnt",
    "loan_int_rate",
    "credit_score",
    "cb_person_cred_hist_length",
    "loan_percent_income"
]

# Feature subsets for comparison
feature_subsets = [
    highly_important_features,  # Only highly important features
    train_set.drop(['loan_status'], axis=1).columns.tolist()  # All features
]

# Initialize results list
results = []

# Loop through feature subsets
for i, subset in enumerate(feature_subsets):
    print(f"Training model with feature subset {i + 1}: {subset}")
    
    # Select features
    train_features = train_set[subset]
    test_features = test_set[subset]
    
    # Normalize the features
    scaler = StandardScaler()
    train_features = scaler.fit_transform(train_features)
    test_features = scaler.transform(test_features)
    
    # Create a simple neural network model
    model = Sequential()
    model.add(tf.keras.layers.Dense(1, activation='sigmoid', input_shape=(train_features.shape[1],)))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
    # Use model checkpointing
    checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=f'best_model_subset_{i + 1}.h5',
        monitor='val_accuracy',
        save_best_only=True
    )
    
    # Train the model
    history = model.fit(
        train_features,
        train_labels,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
        callbacks=[checkpoint_callback]
    )
    
    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(test_features, test_labels)
    predictions = (model.predict(test_features) > 0.5).astype("int32")
    precision = precision_score(test_labels, predictions)
    recall = recall_score(test_labels, predictions)
    f1 = f1_score(test_labels, predictions)
    
    # Append results
    results.append({
        "Feature Subset": f"Subset {i + 1} ({'Highly Important Features' if i == 0 else 'All Features'})",
        "Test Accuracy": f"{test_accuracy * 100:.2f} %",
        "Precision": f"{precision * 100:.2f} %",
        "Recall": f"{recall * 100:.2f} %",
        "F1 Score": f"{f1:.4f}"
    })

# Create a results table
results_table = pd.DataFrame(results)

# Display the results table
print("\nModel Performance Comparison:")
print(results_table)

# Save the results table to a CSV file for further analysis
results_table.to_csv("feature_comparison_results.csv", index=False)

Training model with feature subset 1: ['person_income', 'loan_amnt', 'loan_int_rate', 'credit_score', 'cb_person_cred_hist_length', 'loan_percent_income']


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4384 - loss: 0.9230



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.4385 - loss: 0.9229 - val_accuracy: 0.7518 - val_loss: 0.5823
Epoch 2/10
[1m894/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.7735 - loss: 0.5357



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.7736 - loss: 0.5355 - val_accuracy: 0.8028 - val_loss: 0.4517
Epoch 3/10
[1m891/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 3ms/step - accuracy: 0.8126 - loss: 0.4393



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.8126 - loss: 0.4392 - val_accuracy: 0.8233 - val_loss: 0.4131
Epoch 4/10
[1m879/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8206 - loss: 0.4163



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8207 - loss: 0.4162 - val_accuracy: 0.8269 - val_loss: 0.4002
Epoch 5/10
[1m893/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8313 - loss: 0.4014



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8313 - loss: 0.4014 - val_accuracy: 0.8288 - val_loss: 0.3954
Epoch 6/10
[1m874/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8269 - loss: 0.4005



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8269 - loss: 0.4005 - val_accuracy: 0.8292 - val_loss: 0.3930
Epoch 7/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8285 - loss: 0.3974 - val_accuracy: 0.8290 - val_loss: 0.3917
Epoch 8/10
[1m886/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8273 - loss: 0.3977



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8273 - loss: 0.3977 - val_accuracy: 0.8301 - val_loss: 0.3907
Epoch 9/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8266 - loss: 0.3980 - val_accuracy: 0.8300 - val_loss: 0.3899
Epoch 10/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8246 - loss: 0.4044 - val_accuracy: 0.8294 - val_loss: 0.3892
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8270 - loss: 0.3876
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Training model with feature subset 2: ['person_age', 'person_income', 'person_emp_exp', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score', 'person_gender_male', 'person_education_Bachelor', 'person_education_Doctorate', 'person_education_High School', 'person_education_Master', 'person_home_

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m886/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.5493 - loss: 0.7635



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.5514 - loss: 0.7610 - val_accuracy: 0.8486 - val_loss: 0.4223
Epoch 2/10
[1m888/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8698 - loss: 0.3864



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8700 - loss: 0.3860 - val_accuracy: 0.8921 - val_loss: 0.3130
Epoch 3/10
[1m884/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.8902 - loss: 0.3025



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8902 - loss: 0.3023 - val_accuracy: 0.8944 - val_loss: 0.2717
Epoch 4/10
[1m869/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8963 - loss: 0.2672



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8963 - loss: 0.2671 - val_accuracy: 0.8963 - val_loss: 0.2519
Epoch 5/10
[1m891/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8951 - loss: 0.2499



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8951 - loss: 0.2499 - val_accuracy: 0.8972 - val_loss: 0.2411
Epoch 6/10
[1m895/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8934 - loss: 0.2460



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8934 - loss: 0.2460 - val_accuracy: 0.8975 - val_loss: 0.2350
Epoch 7/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8953 - loss: 0.2384 - val_accuracy: 0.8971 - val_loss: 0.2312
Epoch 8/10
[1m899/900[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - accuracy: 0.8974 - loss: 0.2313



[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8974 - loss: 0.2313 - val_accuracy: 0.8985 - val_loss: 0.2288
Epoch 9/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8989 - loss: 0.2286 - val_accuracy: 0.8976 - val_loss: 0.2274
Epoch 10/10
[1m900/900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8957 - loss: 0.2299 - val_accuracy: 0.8978 - val_loss: 0.2259
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8982 - loss: 0.2226
[1m282/282[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step

Model Performance Comparison:
                         Feature Subset Test Accuracy Precision   Recall  \
0  Subset 1 (Highly Important Features)       82.54 %   68.04 %  39.78 %   
1               Subset 2 (All Features)       89.76 %   78.18 %  74.49 %   

  F1 Score  
0   0.5021  
1   0.7629  
