In [11]:
import pandas as pd
import pyro
import torch

In [17]:
raw_path = "../../data/raw/"
loan_default_df = pd.read_csv(raw_path + "Loan_default.csv")

0         56
1         69
2         46
3         32
4         60
          ..
255342    19
255343    32
255344    56
255345    42
255346    62
Name: Age, Length: 255347, dtype: int64

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import pyro
import pyro.distributions as dist
from pyro.infer import SVI, Trace_ELBO
from pyro.optim import Adam

In [19]:
# Convert categorical variables to numeric
categorical_columns = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 
                       'HasDependents', 'LoanPurpose', 'HasCoSigner']
label_encoders = {col: LabelEncoder() for col in categorical_columns}

for col in categorical_columns:
    loan_default_df[col] = label_encoders[col].fit_transform(loan_default_df[col])

# Normalize numerical features
numerical_columns = ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 
                     'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
scaler = StandardScaler()
loan_default_df[numerical_columns] = scaler.fit_transform(loan_default_df[numerical_columns])

# Split the data into training and test sets
X = loan_default_df.drop(columns=['LoanID', 'Default'])
y = loan_default_df['Default']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Define the logistic regression model
def model(X, y=None):
    # Priors for weights and bias
    w_prior = dist.Normal(torch.zeros(X.shape[1]), torch.ones(X.shape[1])).to_event(1)
    b_prior = dist.Normal(0., 1.)
    
    # Sample weights and bias
    weights = pyro.sample('weights', w_prior)
    bias = pyro.sample('bias', b_prior)
    
    # Compute the logits
    logits = (X @ weights) + bias
    
    # Likelihood
    with pyro.plate('data', X.shape[0]):
        y = pyro.sample('obs', dist.Bernoulli(logits=logits), obs=y)

# Define the guide (variational distribution)
def guide(X, y=None):
    # Variational parameters for weights
    w_loc = pyro.param('w_loc', torch.randn(X.shape[1]))
    w_scale = pyro.param('w_scale', torch.ones(X.shape[1]), constraint=dist.constraints.positive)
    b_loc = pyro.param('b_loc', torch.tensor(0.))
    b_scale = pyro.param('b_scale', torch.tensor(1.), constraint=dist.constraints.positive)
    
    # Sample weights and bias
    weights = pyro.sample('weights', dist.Normal(w_loc, w_scale).to_event(1))
    bias = pyro.sample('bias', dist.Normal(b_loc, b_scale))

# Define the optimizer
optimizer = Adam({'lr': 0.01})

# Define the SVI object
svi = SVI(model, guide, optimizer, loss=Trace_ELBO())

# Training the model
num_iterations = 5000
for step in range(num_iterations):
    loss = svi.step(X_train_tensor, y_train_tensor)
    if step % 500 == 0:
        print(f'Step {step} : Loss = {loss}')

# Function to predict
def predict(X):
    w_loc = pyro.param('w_loc')
    b_loc = pyro.param('b_loc')
    logits = X @ w_loc + b_loc
    return torch.sigmoid(logits).round()

# Evaluate the model
y_pred_train = predict(X_train_tensor)
train_accuracy = (y_pred_train == y_train_tensor).float().mean().item()

y_pred_test = predict(X_test_tensor)
test_accuracy = (y_pred_test == y_test_tensor).float().mean().item()

print(f'Train Accuracy: {train_accuracy}')
print(f'Test Accuracy: {test_accuracy}')

Step 0 : Loss = 712894.465089798
Step 500 : Loss = 92338.48615282774
Step 1000 : Loss = 97768.59743851423
Step 1500 : Loss = 68180.95592796803
Step 2000 : Loss = 67633.76522356272
Step 2500 : Loss = 67249.34474694729
Step 3000 : Loss = 65970.93062913418
Step 3500 : Loss = 66445.28516495228
Step 4000 : Loss = 65433.48547422886
Step 4500 : Loss = 65468.01813709736
Train Accuracy: 0.8848915696144104
Test Accuracy: 0.8860583305358887
