In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.metrics import LogLoss


from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


from imblearn.under_sampling import RandomUnderSampler

import sys

class AUPRC(Metric):
    def __init__(self):
        self._name = "auprc"
        self._maximize = True

    def __call__(self, y_true, y_pred):
        #print(y_true[1:10])
        #print(y_pred[1:10])
        #sys.exit()
        
        return average_precision_score(y_true, y_pred[ :,1])

class AUC(Metric):
    def __init__(self):
        self._name = "auc"
        self._maximize = True

    def __call__(self, y_true, y_pred):
        #print(y_true[1:10])
        #print(y_pred[1:10])
        #sys.exit()
        
        return roc_auc_score(y_true, y_pred[ :,1])
    

class Precision(Metric):
    def __init__(self):
        self._name = "precision"
        self._maximize = True

    def __call__(self, y_true, y_pred):
        y_pred_class = y_pred.argmax(axis=1)
        return precision_score(y_true, y_pred_class)
    
class Recall(Metric):
    def __init__(self):
        self._name = "recall"
        self._maximize = True

    def __call__(self, y_true, y_pred):
        y_pred_class = y_pred.argmax(axis=1)
        return recall_score(y_true, y_pred_class)

class AsymmetricHuberLoss(nn.Module):
    def __init__(self, delta=1.0, false_positive_weight=0.5, false_negative_weight=1):
        super(AsymmetricHuberLoss, self).__init__()
        self.delta = delta
        self.false_positive_weight = false_positive_weight
        self.false_negative_weight = false_negative_weight

    def forward(self, y_pred_logits, y_true):
        y_pred = torch.nn.functional.softmax(y_pred_logits, dim=1)  # Apply softmax activation to convert logits to probabilities
        y_true_one_hot = torch.nn.functional.one_hot(y_true, num_classes=y_pred_logits.shape[1])  # Convert y_true to one-hot encoding
        y_true_one_hot = y_true_one_hot.type_as(y_pred)  # Convert to the same dtype as y_pred
        error = y_true_one_hot - y_pred
        abs_error = torch.abs(error)

        # Different weights for false positive and false negative errors
        weight = torch.where(error > 0, self.false_positive_weight, self.false_negative_weight)

        # Calculate the asymmetric Huber loss
        huber_loss = torch.where(abs_error <= self.delta,
                                 0.5 * weight * (error ** 2),
                                 weight * self.delta * (abs_error - 0.5 * self.delta))
        return torch.mean(huber_loss)

class FocalLoss(nn.Module):
    def __init__(self, alpha=0.5, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        # Calculate the cross-entropy loss
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')

        # Get the probability of the true class
        prob_true_class = torch.gather(F.softmax(inputs, dim=1), 1, targets.view(-1, 1))

        # Calculate the modulating factor
        modulating_factor = (1 - prob_true_class) ** self.gamma

        # Calculate the class weights based on the targets
        class_weights = torch.where(targets.view(-1, 1) == 1, self.alpha, 1 - self.alpha)

        # Calculate the focal loss
        focal_loss = class_weights * modulating_factor * ce_loss

        return focal_loss.mean()

def visualize_masks(model, x):
    """
    Visualize the masks for a specific example.

    Parameters:
    model: The trained TabNet model.
    x: A single example from the input features (X).
    """
    # Run the model in explain mode to get the masks
    M_explain, _ = model.explain(torch.from_numpy(x.astype(np.float32)).unsqueeze(0))

    # Extract the masks
    masks = M_explain.squeeze()

    # Ensure the masks array has the correct shape
    if masks.ndim == 1:
        masks = masks[:, np.newaxis]

    masks = masks.T

    # Plot the masks as a heatmap
    plt.figure(figsize=(10, model.n_steps))
    sns.heatmap(masks, cmap='viridis', linewidths=0.1, linecolor='white', cbar=False, yticklabels=False)
    plt.xlabel('Step')
    plt.ylabel('Feature')
    plt.title('TabNet Mask Visualization')
    plt.show()


def preprocess_data(filename):
    data = pd.read_csv(filename)
    
    def separate_features(dataframe):
        categorical_features = []
        continuous_features = []

        for column in dataframe.columns:
            if dataframe[column].dtype == 'object' or dataframe[column].dtype.name == 'category':
                categorical_features.append(column)
            elif dataframe[column].dtype == 'int64' or dataframe[column].dtype == 'float64':
                continuous_features.append(column)
            else:
                print(f'Unhandled data type in column "{column}": {dataframe[column].dtype}')

        return categorical_features, continuous_features




    data = data[data['loan_status'].isin(['Fully Paid','Charged Off','Defaulted'])]
    leakage_features = [
        'out_prncp',
        'out_prncp_inv',
        'total_pymnt',
        'total_pymnt_inv',
        'total_rec_prncp',
        'total_rec_int',
        'total_rec_late_fee',
        'recoveries',
        'collection_recovery_fee',
        'last_pymnt_d',
        'last_pymnt_amnt',
        'next_pymnt_d',
        'last_credit_pull_d',
        'debt_settlement_flag',
        'debt_settlement_flag_date',
        'settlement_status',
        'settlement_date',
        'settlement_amount',
        'settlement_percentage',
        'settlement_term',
        'last_fico_range_high',
        'last_fico_range_low',
        'id',
        'url',
        'emp_title',
        'title'
    ]

    # Drop the leakage features
    data = data.drop(leakage_features, axis=1)

    # Calculate the percentage of missing values for each feature
    missing_values = data.isnull().sum() / len(data) * 100

    # Identify columns with more than 90% missing values
    columns_to_drop = missing_values[missing_values > 90].index

    # Drop the identified columns
    data = data.drop(columns_to_drop, axis=1)

    categorical_features, continuous_features = separate_features(data)
    categorical_features.remove('loan_status')

   
    data[continuous_features] = data[continuous_features].fillna(-1)
    
    
    # Fill categorical NAs with "missing" and convert all elements to strings
    data[categorical_features] = data[categorical_features].fillna('missing').astype(str)

    data.isna().sum().sum()
    
    
    for feature in categorical_features:
        label_encoder = LabelEncoder()
        data[feature] = label_encoder.fit_transform(data[feature])
    


    # Preprocess loan status labels with custom encoding
    custom_encoding = {
        'Fully Paid': 0,
        'Charged Off': 1,
        'Default': 1,
    
    }
    data['loan_status'] = data['loan_status'].replace(custom_encoding)
    
    return data

#a=torch.cuda.FloatTensor()
display(torch.cuda.is_available())
#sys.exit()

filename = 'accepted_2007_to_2018Q4.csv'
df = preprocess_data(filename)
X = df.drop(columns=['loan_status']).values
y = df['loan_status'].values


pretrainer_model_path = 'tabnet_pretrainer.pth.zip'


# Instantiate the TabNetPretrainer
pretrainer = TabNetPretrainer()
pretrainer.load_model(pretrainer_model_path) 



#Split the dataset
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

#Undersample the training set
undersampler = RandomUnderSampler(random_state=42)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)


# Adjust hyperparameters according to the TabNet paper
tabnet_params = dict(
    n_d=64,
    n_a=64,
    n_steps=5,
    gamma=1.6,
    n_independent=2,
    n_shared=2,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
    mask_type='entmax',
    scheduler_params=dict(T_0=200, T_mult=1, eta_min=1e-6, last_epoch=-1, verbose=False),
    scheduler_fn=torch.optim.lr_scheduler.CosineAnnealingWarmRestarts,
    seed=42,
    verbose=1,
)


#loss_function = FocalLoss()

# Instantiate the TabNetClassifier with the pretrained weights
tabnet_model = TabNetClassifier(**tabnet_params)

tabnet_model.fit(
    X_train_resampled,
    y_train_resampled,
    eval_set=[(X_val, y_val)],
    eval_metric=['logloss'],
    max_epochs=300,
    batch_size=16384,
    virtual_batch_size=512,
    num_workers=0,
    drop_last=False,
    from_unsupervised=pretrainer,
    patience=12,
    #loss_fn=loss_function
)



# Make predictions on the test set
y_test_pred = tabnet_model.predict(X_test)
y_test_pred_proba = tabnet_model.predict_proba(X_test)[:, 1]

y_val_pred = tabnet_model.predict(X_val)

# Calculate the test accuracy
val_accuracy =accuracy_score(y_val, y_val_pred)
test_accuracy =accuracy_score(y_test, y_test_pred)
test_precision = precision_score(y_test, y_test_pred)
test_recall = recall_score(y_test, y_test_pred)
test_f1_score = f1_score(y_test, y_test_pred)
test_auc_roc = roc_auc_score(y_test, y_test_pred_proba)

print(f"Validation accuracy: {val_accuracy}")
print(f"Test accuracy: {test_accuracy}")
print(f"Test precision: {test_precision}")
print(f"Test recall: {test_recall}")
print(f"Test F1-score: {test_f1_score}")
print(f"Test AUC-ROC score: {test_auc_roc}")

True

  data = pd.read_csv(filename)


epoch 0  | loss: 0.91007 | val_0_logloss: 1.07119 |  0:00:09s
epoch 1  | loss: 0.6326  | val_0_logloss: 0.96487 |  0:00:17s
epoch 2  | loss: 0.62639 | val_0_logloss: 0.78798 |  0:00:25s
epoch 3  | loss: 0.62288 | val_0_logloss: 0.81573 |  0:00:32s
epoch 4  | loss: 0.62072 | val_0_logloss: 0.7553  |  0:00:40s
epoch 5  | loss: 0.61788 | val_0_logloss: 0.74457 |  0:00:47s
epoch 6  | loss: 0.616   | val_0_logloss: 0.72267 |  0:00:55s
epoch 7  | loss: 0.61525 | val_0_logloss: 0.68948 |  0:01:03s
epoch 8  | loss: 0.615   | val_0_logloss: 0.64758 |  0:01:10s
epoch 9  | loss: 0.61309 | val_0_logloss: 0.65777 |  0:01:18s
epoch 10 | loss: 0.61259 | val_0_logloss: 0.64992 |  0:01:26s
epoch 11 | loss: 0.6117  | val_0_logloss: 0.62041 |  0:01:34s
epoch 12 | loss: 0.61657 | val_0_logloss: 0.60872 |  0:01:42s
epoch 13 | loss: 0.61587 | val_0_logloss: 0.61207 |  0:01:49s
epoch 14 | loss: 0.6134  | val_0_logloss: 0.59547 |  0:01:57s
epoch 15 | loss: 0.61207 | val_0_logloss: 0.63081 |  0:02:05s
epoch 16



Validation accuracy: 0.6750823230333529
Test accuracy: 0.6736216931413578
Test precision: 0.334475528548409
Test recall: 0.64151400059577
Test F1-score: 0.43969884514770624
Test AUC-ROC score: 0.7233147377384213


In [2]:
# Save the model
model_path = 'tabnet_model.pth'
tabnet_model.save_model(model_path)


Successfully saved model at tabnet_model.pth.zip


'tabnet_model.pth.zip'