In [1]:
import pandas as pd
import pyro
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [34]:
raw_path = "../../data/raw/"
loan_default_df = pd.read_csv(raw_path + "Loan_default.csv")

In [32]:
loan_default_df.head()
loan_default_df = loan_default_df.drop('LoanID',axis=1)

In [51]:
loan_default = loan_default_df.drop(["LoanID"], axis=1)
categorical_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 
                       'HasDependents', 'LoanPurpose', 'HasCoSigner']
loan_default_df_num = loan_default.drop(categorical_columns, axis=1)

numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
loan_default_df_cat = loan_default.drop(numerical_columns, axis=1)

num_stats =[loan_default_df_num.max(),
            loan_default_df_num.min(),
            loan_default_df_num.mean(),
            loan_default_df_num.std(),
            loan_default_df_num.skew().values,
            loan_default_df_num.kurtosis()         
            ]
num_stats


[array([ 6.97854369e-04, -3.80513285e-04, -1.82724685e-03,  4.68818634e-03,
        -2.14168362e-03, -2.78024759e-04,  4.60789092e-03, -2.17779617e-03,
        -1.49896337e-03,  2.39637696e+00]),
 Age              -1.198431
 Income           -1.198361
 LoanAmount       -1.203680
 CreditScore      -1.200302
 MonthsEmployed   -1.199632
 NumCreditLines   -1.357671
 InterestRate     -1.197167
 LoanTerm         -1.299895
 DTIRatio         -1.199675
 Default           3.742652
 dtype: float64,
 Age                  18.0
 Income            15000.0
 LoanAmount         5000.0
 CreditScore         300.0
 MonthsEmployed        0.0
 NumCreditLines        1.0
 InterestRate          2.0
 LoanTerm             12.0
 DTIRatio              0.1
 Default               0.0
 dtype: float64,
 Age                   69.0
 Income            149999.0
 LoanAmount        249999.0
 CreditScore          849.0
 MonthsEmployed       119.0
 NumCreditLines         4.0
 InterestRate          25.0
 LoanTerm              6

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

In [4]:
# Convert categorical variables to numeric
categorical_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 
                       'HasDependents', 'LoanPurpose', 'HasCoSigner']
label_encoders = {col: LabelEncoder() for col in categorical_columns}

for col in categorical_columns:
    loan_default_df[col] = label_encoders[col].fit_transform(loan_default_df[col])

# Normalize numerical features
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
scaler = StandardScaler()
loan_default_df[numerical_columns] = scaler.fit_transform(loan_default_df[numerical_columns])

# Split the data into training and test sets
X = loan_default_df.drop(columns=['LoanID', 'Default'])
y = loan_default_df['Default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define the logistic regression model
def model(X, y=None):
    # Priors for weights and bias
    w_prior = dist.Normal(torch.zeros(X.shape[1]), torch.ones(X.shape[1])).to_event(1)
    b_prior = dist.Normal(0., 1.)
    
    # Sample weights and bias
    weights = pyro.sample('weights', w_prior)
    bias = pyro.sample('bias', b_prior)
    
    # Compute the logits
    logits = (X @ weights) + bias
    
    # Likelihood
    with pyro.plate('data', X.shape[0]):
        y = pyro.sample('obs', dist.Bernoulli(logits=logits), obs=y)

# Define the guide (variational distribution)
def guide(X, y=None):
    # Variational parameters for weights
    w_loc = pyro.param('w_loc', torch.randn(X.shape[1]))
    w_scale = pyro.param('w_scale', torch.ones(X.shape[1]), constraint=dist.constraints.positive)
    b_loc = pyro.param('b_loc', torch.tensor(0.))
    b_scale = pyro.param('b_scale', torch.tensor(1.), constraint=dist.constraints.positive)
    
    # Sample weights and bias
    weights = pyro.sample('weights', dist.Normal(w_loc, w_scale).to_event(1))
    bias = pyro.sample('bias', dist.Normal(b_loc, b_scale))

# Define the optimizer
optimizer = Adam({'lr': 0.01})

# Define the SVI object
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

# Training the model
num_iterations = 5000
for step in range(num_iterations):
    loss = svi.step(X_train_tensor, y_train_tensor)
    if step % 500 == 0:
        print(f'Step {step} : Loss = {loss}')

# Function to predict
def predict(X):
    w_loc = pyro.param('w_loc')
    b_loc = pyro.param('b_loc')
    logits = X @ w_loc + b_loc
    return torch.sigmoid(logits).round()

# Evaluate the model
y_pred_train = predict(X_train_tensor)
train_accuracy = (y_pred_train == y_train_tensor).float().mean().item()

y_pred_test = predict(X_test_tensor)
test_accuracy = (y_pred_test == y_test_tensor).float().mean().item()

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Step 0 : Loss = 822149.9795074463
Step 500 : Loss = 82473.85537350178
Step 1000 : Loss = 91284.19500887394
Step 1500 : Loss = 68283.68096578121
Step 2000 : Loss = 66059.10210263729
Step 2500 : Loss = 66148.44097816944
Step 3000 : Loss = 65548.52555477619
Step 3500 : Loss = 65914.88224542141
Step 4000 : Loss = 65515.303657889366
Step 4500 : Loss = 65062.091701865196
Train Accuracy: 0.8850531578063965
Test Accuracy: 0.8860387802124023
