# Project 1: Home Credit Default Risk (Binary Classification)

## Objective
Predict whether a client will default on a loan using the Home Credit Default Risk dataset.
This notebook covers:
1. Data Loading
2. Exploratory Data Analysis (EDA)
3. Preprocessing
4. Building a Deep ANN (7-8 Layers)
5. Hyperparameter Tuning
6. Training the Best Model
7. Evaluation and Visualization
8. Saving the Model

In [None]:
!pip install -q keras-tuner

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

# Set random seeds for reproducibility.
# Python's built-in `random` module is seeded too: the NumPy and TensorFlow
# seeds do not cover it, so anything drawing from it would otherwise vary
# from run to run.
import random

random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

## 1. Data Loading

In [None]:
# Location of the training CSV; pandas reads it directly over HTTP.
url = "https://raw.githubusercontent.com/rmaso/home-credit-default-risk/master/application_train.csv"

# Load the whole table into memory and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer=url)
print(f"Dataset Shape: {df.shape}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Preview the first rows of the loaded table (rendered as the cell output).
df.head()

In [None]:
# Check target distribution: bar chart of class counts, then class shares.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df, x='TARGET', ax=ax)
ax.set_title('Target Distribution (0: No Default, 1: Default)')
plt.show()

# Relative frequency of each class (fractions that sum to 1).
class_shares = df['TARGET'].value_counts(normalize=True)
print(class_shares)

In [None]:
# Missing Values: percentage of missing entries per column.
pct_missing = df.isna().mean().mul(100)

# Keep only columns that actually have gaps, worst offenders first.
# `missing_values` is reused by the preprocessing step to pick columns to drop.
missing_values = pct_missing[pct_missing.gt(0)].sort_values(ascending=False)

print("Top 10 features with missing values (%):")
print(missing_values.head(10))

## 3. Preprocessing

In [None]:
# Drop columns with too many missing values (>50%)
cols_to_drop = missing_values[missing_values > 50].index
df = df.drop(columns=cols_to_drop)
print(f"Shape after dropping columns: {df.shape}")

# Drop ID column
if 'SK_ID_CURR' in df.columns:
    df = df.drop(columns=['SK_ID_CURR'])

# Separate Target
X = df.drop(columns=['TARGET'])
y = df['TARGET']

# Handle Categorical and Numerical Columns
cat_cols = X.select_dtypes(include=['object']).columns
num_cols = X.select_dtypes(include=['number']).columns

# Impute Missing Values
# Numerical: Median
imputer_num = SimpleImputer(strategy='median')
X[num_cols] = imputer_num.fit_transform(X[num_cols])

# Categorical: Most Frequent
imputer_cat = SimpleImputer(strategy='most_frequent')
X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])

# Encode Categorical Variables
X = pd.get_dummies(X, columns=cat_cols, drop_first=True)

print(f"Final Feature Shape: {X.shape}")

In [None]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 4 & 5. Build ANN & Hyperparameter Tuning

In [None]:
def _add_hidden_block(model, hp, index):
    """Append one tunable hidden block: Dense(relu) -> Dropout -> optional BatchNorm.

    The hyperparameters are registered as ``units_<index>``,
    ``dropout_<index>`` and ``batch_norm_<index>``, in that order.
    """
    model.add(layers.Dense(
        units=hp.Int(f'units_{index}', min_value=32, max_value=256, step=32),
        activation='relu',
    ))
    model.add(layers.Dropout(hp.Float(f'dropout_{index}', 0.0, 0.5, step=0.1)))
    if hp.Boolean(f'batch_norm_{index}'):
        model.add(layers.BatchNormalization())


def build_model(hp):
    """Build and compile a 7- or 8-hidden-layer binary classifier for tuning.

    Args:
        hp: KerasTuner hyperparameter container used to sample the width,
            dropout rate and batch-norm switch of every hidden block, the
            depth (``num_layers``: 7 or 8) and the Adam learning rate (``lr``).

    Returns:
        A compiled ``keras.Sequential`` model with a single sigmoid output,
        binary cross-entropy loss and an accuracy metric.

    The input width is read from the module-level ``X_train_scaled``, so that
    array must exist before this function is called.
    """
    model = keras.Sequential()
    model.add(layers.Input(shape=(X_train_scaled.shape[1],)))

    # Layers 0-6 are always present.
    for index in range(7):
        _add_hidden_block(model, hp, index)

    # Layer 7 (Conditional): only added when the tuner samples a depth of 8.
    if hp.Int('num_layers', 7, 8) >= 8:
        _add_hidden_block(model, hp, 7)

    # Output Layer (Binary Classification)
    model.add(layers.Dense(1, activation='sigmoid'))

    # Compile; the learning rate is sampled on a log scale.
    learning_rate = hp.Float('lr', min_value=1e-4, max_value=1e-2, sampling='log')
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    return model

In [None]:
# Random search over the hyperparameter space defined by build_model.
tuner = kt.RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=5,
    executions_per_trial=1,
    seed=42,  # fix the sampler so the same trials are drawn on every run
    directory='my_dir',
    project_name='home_credit_tuning'
)

# Abort a trial once validation loss has not improved for 5 epochs.
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# 20% of the training rows are held out for validation inside each trial.
tuner.search(X_train_scaled, y_train, epochs=20, validation_split=0.2, callbacks=[stop_early])

In [None]:
# Get Best Hyperparameters: take the top-ranked configuration from the search.
top_trials = tuner.get_best_hyperparameters(num_trials=1)
best_hps = top_trials[0]

# Report the sampled depth and learning rate of the winning configuration.
best_num_layers = best_hps.get('num_layers')
best_lr = best_hps.get('lr')
print(f"Best Number of Layers: {best_num_layers}")
print(f"Best Learning Rate: {best_lr}")

## 6. Train Best Model

In [None]:
# Rebuild a fresh model from the best hyperparameters found by the tuner.
model = tuner.hypermodel.build(best_hps)

# Stop when validation loss stalls for 10 epochs and roll back to the best weights.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Train for up to 100 epochs, holding out 20% of the training rows for validation.
fit_options = dict(epochs=100, validation_split=0.2, callbacks=[early_stopping])
history = model.fit(X_train_scaled, y_train, **fit_options)

## 7. Evaluation & Visualization

In [None]:
# Plot Loss and Accuracy side by side from the recorded training history.
# Each row: (history key, train label, validation label, title, y-axis label)
panels = (
    ('loss', 'Train Loss', 'Val Loss', 'Model Loss', 'Loss'),
    ('accuracy', 'Train Acc', 'Val Acc', 'Model Accuracy', 'Accuracy'),
)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
for ax, (key, train_label, val_label, title, y_label) in zip(axes, panels):
    ax.plot(history.history[key], label=train_label)
    ax.plot(history.history[f'val_{key}'], label=val_label)
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.legend()

plt.show()

In [None]:
# Evaluate on Test Set
from sklearn.metrics import roc_auc_score

test_loss, test_acc = model.evaluate(X_test_scaled, y_test)
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Loss: {test_loss:.4f}")

# Accuracy alone is hard to interpret if the classes are imbalanced (see the
# target distribution printed during EDA): a model that always predicts the
# majority class already reaches the baseline below.
positive_rate = float(np.mean(y_test))
baseline_acc = max(positive_rate, 1.0 - positive_rate)
print(f"Majority-class Baseline Accuracy: {baseline_acc:.4f}")

# ROC AUC scores how well the predicted probabilities rank defaulters above
# non-defaulters, independent of the 0.5 decision threshold.
test_probs = model.predict(X_test_scaled, verbose=0).ravel()
test_auc = roc_auc_score(y_test, test_probs)
print(f"Test ROC AUC: {test_auc:.4f}")

## 8. Save Model

In [None]:
# Persist the trained model to disk, then confirm on stdout.
model.save(filepath='model_1.h5')
print("Model saved as model_1.h5")