In [3]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
from sklearn.impute import SimpleImputer

# Define the number of rows in the dataset
n_rows = 10000

# Create a list to store the data
data = []

# Loop through each row
for i in range(n_rows):
    # Generate random data for each feature
    account_number = f"ACC{i:06d}"
    customer_name = f"Customer {i}"
    loan_amount = np.random.uniform(1000, 100000)
    outstanding_balance = np.random.uniform(0, loan_amount)
    payment_due_date = datetime.now() + timedelta(days=np.random.randint(1, 30))
    last_payment_date = datetime.now() - timedelta(days=np.random.randint(1, 90))
    last_payment_amount = np.random.uniform(0, outstanding_balance)
    days_past_due = (payment_due_date - last_payment_date).days
    collection_status = random.choice(["Not Started", "In Progress", "Completed"])
    risk_category = random.choice(["Low", "Medium", "High"])
    
    # Append the data to the list
    data.append([
        account_number,
        customer_name,
        loan_amount,
        outstanding_balance,
        payment_due_date,
        last_payment_date,
        last_payment_amount,
        days_past_due,
        collection_status,
        risk_category
    ])

# Create a Pandas DataFrame from the list
columns = [
    "Account Number",
    "Customer Name",
    "Loan Amount",
    "Outstanding Balance",
    "Payment Due Date",
    "Last Payment Date",
    "Last Payment Amount",
    "Days Past Due",
    "Collection Status",
    "Risk Category"
]
df = pd.DataFrame(data, columns=columns)

# Handle missing values
imputer = SimpleImputer(strategy='mean')
imputed_data = imputer.fit_transform(df[['Loan Amount', 'Outstanding Balance', 'Last Payment Amount']])
df[['Loan Amount', 'Outstanding Balance', 'Last Payment Amount']] = imputed_data
df = df.replace([np.inf, -np.inf], np.nan).dropna()

# Print the first few rows of the DataFrame
print(df.head())
df.to_csv("revenue_loss_collections.csv", index=False)

import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
import pyvis
from pyvis.network import Network

# Load the dataset
df = pd.read_csv("revenue_loss_collections.csv")

# Define node labels
node_labels = df["Account Number"].tolist()

# Define edge labels
edge_labels = df["Collection Status"].tolist()

# Convert the dataset to a PyTorch Geometric Data object
node_features = df.drop(["Account Number", "Customer Name", "Days Past Due"], axis=1).apply(pd.to_numeric, errors='coerce').fillna(0)
node_features_std = node_features.std()
node_features_std[node_features_std == 0] = 1  # avoid division by zero
node_features = (node_features - node_features.mean()) / node_features_std

x = torch.tensor(node_features.values, dtype=torch.float)
y = torch.tensor(df["Days Past Due"].values, dtype=torch.float)

# Create a graph with a limited number of edges (e.g., 10 neighbors for each node)
num_nodes = x.size(0)
edge_index = torch.tensor(np.array([[i, (i+1)%num_nodes] for i in range(num_nodes)] + [[(i+1)%num_nodes, i] for i in range(num_nodes)]).T, dtype=torch.long)
edge_attr = torch.tensor(np.ones(edge_index.size(1)), dtype=torch.float).unsqueeze(-1) if edge_index.size(1) > 0 else torch.tensor([])

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

# Create a PyVis Network object
net = Network(height="1000px", width="1000px", directed=True)

# Add nodes to the network
for i, node in enumerate(node_labels):
    net.add_node(i, label=node)

# Add edges to the network
for i, edge in enumerate(edge_labels):
    if i < len(node_labels) - 1:  # check if there's a next node
        net.add_edge(i, i+1, label=edge)

# Visualize the network
net.write_html("revenue_loss_collections_graph.html")

import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
from torch.nn import Linear, ReLU, ModuleList
from torch.optim import Adam
from torch_geometric.utils import to_networkx
import networkx as nx
import matplotlib.pyplot as plt

# Load the CSV file
df = pd.read_csv("revenue_loss_collections.csv")

# Define node features and ensure they are numeric, handle NaNs
node_features = df.drop(["Account Number", "Customer Name", "Days Past Due"], axis=1).apply(pd.to_numeric, errors='coerce').fillna(0)
node_features_std = node_features.std()
node_features_std[node_features_std == 0] = 1  # avoid division by zero
node_features = (node_features - node_features.mean()) / node_features_std

# Convert the dataset to a PyTorch Geometric Data object
x = torch.tensor(node_features.values, dtype=torch.float)
y = torch.tensor(df["Days Past Due"].values, dtype=torch.float)

# Create a graph with a limited number of edges (e.g., 10 neighbors for each node)
num_nodes = x.size(0)
edge_index = torch.tensor(np.array([[i, (i+1)%num_nodes] for i in range(num_nodes)] + [[(i+1)%num_nodes, i] for i in range(num_nodes)]).T, dtype=torch.long)
edge_attr = torch.tensor(np.ones(edge_index.size(1)), dtype=torch.float).unsqueeze(-1) if edge_index.size(1) > 0 else torch.tensor([])

data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

# Define the GAT model
class GATModel(torch.nn.Module):
    def __init__(self, num_features, num_layers, hidden_dim, output_dim):
        super(GATModel, self).__init__()
        self.layers = ModuleList([GATConv(num_features if i == 0 else hidden_dim * 8, hidden_dim, heads=8, dropout=0.1, concat=True) for i in range(num_layers)])
        self.fc = Linear(hidden_dim * 8, output_dim)

    def forward(self, x, edge_index, edge_attr=None):
        for layer in self.layers:
            x = layer(x, edge_index)
        x = global_mean_pool(x, torch.zeros(x.size(0), dtype=torch.long))  # Using global mean pooling
        x = self.fc(x)
        return x

# Define the model, optimizer, and loss function
model = GATModel(num_features=x.size(1), num_layers=2, hidden_dim=64, output_dim=1)
optimizer = Adam(model.parameters(), lr=0.01)
loss_fn = torch.nn.MSELoss()

# Training loop
for epoch in range(100):
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, data.edge_attr)
    loss = loss_fn(out, data.y.unsqueeze(-1))  # Unsqueeze to match the output shape
    if torch.isnan(loss):
        print("Loss is NaN!")
    else:
        loss.backward()
        optimizer.step()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
out = model(data.x, data.edge_index, data.edge_attr)
final_loss = torch.sqrt(loss_fn(out, data.y.unsqueeze(-1)))
if torch.isnan(final_loss):
    print("RMSE is NaN!")
else:
    print(f"RMSE: {final_loss.item()}")
# Use the GAT model to identify influential accounts
# influential_accounts = []
# for node in data.x:
#     node_features = node.detach().numpy()
#     batch_edge_index = torch.tensor([[0, 0]]).t().contiguous()
#     output = model(torch.tensor(node_features).unsqueeze(0), batch_edge_index, data.edge_attr)
#     node_importance = output.squeeze(0)  # assuming the model returns a single value
#     influential_accounts.append((node_importance.item(), node_features))
# influential_accounts.sort(key=lambda x: x[0], reverse=True)
# print("Influential Accounts:")
# print(influential_accounts[:10])

  Account Number Customer Name   Loan Amount  Outstanding Balance  \
0      ACC000000    Customer 0  83465.085778         43987.863113   
1      ACC000001    Customer 1  22859.011741          7956.892317   
2      ACC000002    Customer 2  63919.954710         32928.695729   
3      ACC000003    Customer 3  63444.032993         41101.246994   
4      ACC000004    Customer 4   6744.309321          5891.119959   

            Payment Due Date          Last Payment Date  Last Payment Amount  \
0 2024-05-24 23:21:36.028017 2024-03-01 23:21:36.028017         24762.183624   
1 2024-05-22 23:21:36.028017 2024-02-05 23:21:36.028017          3529.070238   
2 2024-05-27 23:21:36.028017 2024-03-27 23:21:36.028017         21696.471422   
3 2024-05-26 23:21:36.028017 2024-03-15 23:21:36.028017         15771.781237   
4 2024-05-31 23:21:36.028017 2024-03-20 23:21:36.028017          5674.177697   

   Days Past Due Collection Status Risk Category  
0             84       Not Started          High  
1 

  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Loss: 4304.8994140625
Epoch 2, Loss: 4224.0615234375
Epoch 3, Loss: 3873.23291015625
Epoch 4, Loss: 3154.871826171875
Epoch 5, Loss: 2011.63818359375
Epoch 6, Loss: 871.5529174804688
Epoch 7, Loss: 1464.4219970703125
Epoch 8, Loss: 1596.2896728515625
Epoch 9, Loss: 954.2247924804688
Epoch 10, Loss: 739.0643310546875
Epoch 11, Loss: 920.901611328125
Epoch 12, Loss: 1118.1016845703125
Epoch 13, Loss: 1160.4493408203125
Epoch 14, Loss: 1061.7039794921875
Epoch 15, Loss: 889.2374267578125
Epoch 16, Loss: 754.59326171875
Epoch 17, Loss: 757.4945678710938
Epoch 18, Loss: 891.507080078125
Epoch 19, Loss: 942.760498046875
Epoch 20, Loss: 864.1041259765625
Epoch 21, Loss: 763.7025756835938
Epoch 22, Loss: 739.4429931640625
Epoch 23, Loss: 786.35107421875
Epoch 24, Loss: 838.4910278320312
Epoch 25, Loss: 852.1063232421875
Epoch 26, Loss: 819.8682250976562
Epoch 27, Loss: 767.0443115234375
Epoch 28, Loss: 736.3953247070312
Epoch 29, Loss: 751.4221801757812
Epoch 30, Loss: 791.04760742187

In [1]:
import torch
print(torch.__version__)

2.3.0+cpu


In [4]:
# Use the GAT model to identify influential accounts
influential_accounts = []
for node in data.x:
    node_features = node.detach().numpy()
    batch_edge_index = torch.tensor([[0, 0]]).t().contiguous()
    output = model(torch.tensor(node_features).unsqueeze(0), batch_edge_index, data.edge_attr)
    node_importance = output.squeeze(0)  # assuming the model returns a single value
    influential_accounts.append((node_importance.item(), node_features))
influential_accounts.sort(key=lambda x: x[0], reverse=True)
print("Influential Accounts:")
print(influential_accounts[:10])

Influential Accounts:
[(105.57073211669922, array([-1.7460368, -1.1453136,  0.       ,  0.       , -0.8597123,
        0.       ,  0.       ], dtype=float32)), (105.54959869384766, array([-1.746008 , -1.1451861,  0.       ,  0.       , -0.858823 ,
        0.       ,  0.       ], dtype=float32)), (105.49858093261719, array([-1.7419858, -1.1456848,  0.       ,  0.       , -0.8596252,
        0.       ,  0.       ], dtype=float32)), (105.4508056640625, array([-1.7397029, -1.1455997,  0.       ,  0.       , -0.8595572,
        0.       ,  0.       ], dtype=float32)), (105.29486083984375, array([-1.7385013, -1.1419141,  0.       ,  0.       , -0.8571353,
        0.       ,  0.       ], dtype=float32)), (105.25997924804688, array([-1.7352294 , -1.1434637 ,  0.        ,  0.        , -0.85680145,
        0.        ,  0.        ], dtype=float32)), (105.2046127319336, array([-1.7320831 , -1.1425558 ,  0.        ,  0.        , -0.85814816,
        0.        ,  0.        ], dtype=float32)), (105.1