# 1) Importing necessary libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# 2) Importing and preparing the data for training

In [2]:
# import dsataset
loan_data = pd.read_csv('preprocessed_loans50k.csv')

# Convert 'status' to numerical labels
status_mapping = {
    'Safe': 0,
    'Risky': 1
}

loan_data['status'] = loan_data['status'].map(status_mapping)

# Separate features and labels
X = loan_data.drop('status', axis=1)
y = loan_data['status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert data to PyTorch tensors
X_train = torch.tensor(X_train.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train = torch.tensor(y_train.values, dtype=torch.long)
y_test = torch.tensor(y_test.values, dtype=torch.long)

# Create DataLoader for training and testing data
train_dataset = data.TensorDataset(X_train, y_train)
test_dataset = data.TensorDataset(X_test, y_test)

train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)


# import dsataset
loan_data = pd.read_csv('preprocessed_loans50k.csv')

from imblearn.over_sampling import RandomOverSampler

# Convert 'status' to numerical labels
status_mapping = {
    'Safe': 0,
    'Risky': 1
}

loan_data['status'] = loan_data['status'].map(status_mapping)

# Separate features and labels
X = loan_data.drop('status', axis=1)
y = loan_data['status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply oversampling to the training set
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train, y_train)

# Convert data to PyTorch tensors
X_train_resampled = torch.tensor(X_train_resampled.values, dtype=torch.float32)
X_test = torch.tensor(X_test.values, dtype=torch.float32)
y_train_resampled = torch.tensor(y_train_resampled.values, dtype=torch.float32)
y_test = torch.tensor(y_test.values, dtype=torch.float32)

# Create DataLoader for resampled training and testing data
train_dataset = data.TensorDataset(X_train_resampled, y_train_resampled)
test_dataset = data.TensorDataset(X_test, y_test)

train_loader = data.DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = data.DataLoader(test_dataset, batch_size=64, shuffle=False)

# 3) Defining Neural Network Model

In [3]:
class LoanModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LoanModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

# 4) Training the Model

In [4]:
def train_model(model, train_loader, test_loader, num_epochs, learning_rate, target_accuracy, device):
    model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    writer = SummaryWriter()

    total_steps = len(train_loader)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        writer.add_scalar('Loss/train', average_loss, epoch + 1)

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        writer.add_scalar('Accuracy/test', accuracy, epoch + 1)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%')

        if accuracy >= target_accuracy:
            print(f"Reached target accuracy of {target_accuracy:.2f}%.")
            break

    writer.flush()
    writer.close()

# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model
input_dim = X_train.shape[1]
hidden_dim = 1024 
output_dim = len(status_mapping)
model = LoanModel(input_dim, hidden_dim, output_dim)

# Train the model with stopping criterion
num_epochs = 100
learning_rate = 0.001
target_accuracy = 95.0  # Stop training when the accuracy reaches 80%
train_model(model, train_loader, test_loader, num_epochs, learning_rate, target_accuracy, device)

Epoch [1/100], Loss: 545.2132, Accuracy: 89.84%
Epoch [2/100], Loss: 372.1509, Accuracy: 88.48%
Epoch [3/100], Loss: 269.9355, Accuracy: 86.44%
Epoch [4/100], Loss: 214.8713, Accuracy: 85.63%
Epoch [5/100], Loss: 191.1319, Accuracy: 89.62%
Epoch [6/100], Loss: 142.4723, Accuracy: 87.58%
Epoch [7/100], Loss: 110.8655, Accuracy: 90.60%
Epoch [8/100], Loss: 66.0129, Accuracy: 72.54%
Epoch [9/100], Loss: 78.4111, Accuracy: 91.39%
Epoch [10/100], Loss: 44.2733, Accuracy: 91.72%
Epoch [11/100], Loss: 37.4266, Accuracy: 89.47%
Epoch [12/100], Loss: 25.1888, Accuracy: 89.37%
Epoch [13/100], Loss: 16.6816, Accuracy: 84.75%
Epoch [14/100], Loss: 12.5628, Accuracy: 84.47%
Epoch [15/100], Loss: 10.4785, Accuracy: 88.04%
Epoch [16/100], Loss: 5.6283, Accuracy: 88.40%
Epoch [17/100], Loss: 3.1774, Accuracy: 88.22%
Epoch [18/100], Loss: 1.7070, Accuracy: 89.49%
Epoch [19/100], Loss: 0.9751, Accuracy: 87.56%
Epoch [20/100], Loss: 0.6508, Accuracy: 90.80%
Epoch [21/100], Loss: 0.4790, Accuracy: 90.84%


class LoanModel(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(LoanModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)  # Apply sigmoid activation for binary classification
        return out


def train_model(model, train_loader, test_loader, num_epochs, learning_rate, target_accuracy, device):
    model.to(device)
    criterion = nn.BCELoss()  # Use BCELoss for binary classification
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    writer = SummaryWriter()

    total_steps = len(train_loader)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.float())  # Use BCELoss for binary classification
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)
        writer.add_scalar('Loss/train', average_loss, epoch + 1)

        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
                outputs = model(inputs)
                predicted = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        writer.add_scalar('Accuracy/test', accuracy, epoch + 1)

        print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%')

        if accuracy >= target_accuracy:
            print(f"Reached target accuracy of {target_accuracy:.2f}%.")
            break

    writer.flush()
    writer.close()

# Set the device to GPU if available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the model
input_dim = X_train.shape[1]
hidden_dim = 1024
model = LoanModel(input_dim, hidden_dim)


# Train the model with stopping criterion
num_epochs = 100
learning_rate = 0.1
target_accuracy = 95.0  # Stop training when the accuracy reaches 80%
train_model(model, train_loader, test_loader, num_epochs, learning_rate, target_accuracy, device)

### 5) Create Confusion Matrix

In [6]:
from sklearn.metrics import confusion_matrix, classification_report

def evaluate_model(model, test_loader, device):
    model.eval()
    y_true = []
    y_pred = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device).view(-1, 1)
            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()  # Convert probabilities to binary predictions
            y_pred.extend(predicted.cpu().numpy())
            y_true.extend(labels.cpu().numpy())

    # Convert lists to numpy arrays
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    # Create the confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    print("Confusion Matrix:")
    print(cm)

    # Print classification report with zero_division parameter
    report = classification_report(y_true, y_pred, target_names=['Safe', 'Risky'], zero_division=1)
    print("\nClassification Report:")
    print(report)

# Evaluate the trained model and print the confusion matrix
evaluate_model(model, test_loader, device)


ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets