In [5]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Define the possible values for each feature
customer_types = ["Corporate", "Institutional", "Government"]
industries = ["Finance", "Healthcare", "Technology", "Energy", "Manufacturing"]
locations = ["USA", "Europe", "Asia", "New York", "London", "Tokyo"]
relationship_managers = ["John Smith", "Jane Doe", "RM-001", "RM-002", "RM-003"]

product_types = ["Loan", "Deposit", "Trade Finance", "Cash Management", "Risk Management"]
product_categories = ["Cash Management", "Risk Management", "Trade Services", "Lending", "Investments"]
product_subcategories = ["Credit Facilities", "Foreign Exchange", "Supply Chain Finance", "Term Loans", "Commercial Cards"]
product_features = ["Interest Rate", "Tenor", "Currency", "Collateral", "Covenants"]

transaction_types = ["Loan Disbursement", "Deposit", "Trade Settlement", "Credit Facility", "FX Trade"]
currencies = ["USD", "EUR", "GBP", "JPY", "CHF"]

risk_ratings = ["Low", "Medium", "High"]
regulatory_requirements = ["KYC", "AML", "CCAR", "Basel III", "Dodd-Frank"]

# Define the synthetic data generation rules
def generate_customer_id():
    return f"CUS-{random.randint(1, 1000):03d}"

def generate_customer_type():
    return random.choice(customer_types)

def generate_industry():
    return random.choice(industries)

def generate_company_size():
    return random.randint(100, 10000)

def generate_location():
    return random.choice(locations)

def generate_relationship_manager():
    return random.choice(relationship_managers)

def generate_product_id():
    return f"PROD-{random.randint(1, 1000):03d}"

def generate_product_type():
    return random.choice(product_types)

def generate_product_category():
    return random.choice(product_categories)

def generate_product_subcategory():
    return random.choice(product_subcategories)

def generate_product_features():
    return random.choice(product_features)

def generate_transaction_id():
    return f"TRAN-{random.randint(1, 1000):03d}"

def generate_transaction_date():
    return datetime.now() - timedelta(days=random.randint(1, 365))

def generate_transaction_type():
    return random.choice(transaction_types)

def generate_transaction_amount():
    return random.uniform(1000.0, 100000.0)

def generate_transaction_currency():
    return random.choice(currencies)

def generate_transaction_frequency():
    return random.randint(1, 10)

def generate_transaction_value():
    return random.uniform(10000.0, 100000.0)

def generate_product_adoption():
    return random.randint(1, 5)

def generate_product_usage():
    return random.randint(1, 10)

def generate_customer_engagement():
    return random.randint(1, 10)

def generate_credit_score():
    return random.uniform(600.0, 800.0)

def generate_risk_rating():
    return random.choice(risk_ratings)

def generate_default_probability():
    return random.uniform(0.01, 0.10)

def generate_exposure():
    return random.uniform(10000.0, 100000.0)

def generate_market_data():
    return random.uniform(100.0, 1000.0)

def generate_economic_indicators():
    return random.uniform(2.0, 4.0)

def generate_regulatory_requirements():
    return random.choice(regulatory_requirements)

# Generate the synthetic data
data = []
for i in range(1000):
    customer_id = generate_customer_id()
    customer_type = generate_customer_type()
    industry = generate_industry()
    company_size = generate_company_size()
    location = generate_location()
    relationship_manager = generate_relationship_manager()
    
    product_id = generate_product_id()
    product_type = generate_product_type()
    product_category = generate_product_category()
    product_subcategory = generate_product_subcategory()
    product_features = generate_product_features()
    
    transaction_id = generate_transaction_id()
    transaction_date = generate_transaction_date()
    transaction_type = generate_transaction_type()
    transaction_amount = generate_transaction_amount()
    transaction_currency = generate_transaction_currency()
    
    transaction_frequency = generate_transaction_frequency()
    transaction_value = generate_transaction_value()
    product_adoption = generate_product_adoption()
    product_usage = generate_product_usage()
    customer_engagement = generate_customer_engagement()
    
    credit_score = generate_credit_score()
    risk_rating = generate_risk_rating()
    default_probability = generate_default_probability()
    exposure = generate_exposure()
    
    market_data = generate_market_data()
    economic_indicators = generate_economic_indicators()
    regulatory_requirements = generate_regulatory_requirements()
    
    data.append({
        "Customer ID": customer_id,
        "Customer Type": customer_type,
        "Industry": industry,
        "Company Size": company_size,
        "Location": location,
        "Relationship Manager": relationship_manager,
        
        "Product ID": product_id,
        "Product Type": product_type,
        "Product Category": product_category,
        "Product Subcategory": product_subcategory,
        "Product Features": product_features,
        
        "Transaction ID": transaction_id,
        "Transaction Date": transaction_date,
        "Transaction Type": transaction_type,
        "Transaction Amount": transaction_amount,
        "Transaction Currency": transaction_currency,
        
        "Transaction Frequency": transaction_frequency,
        "Transaction Value": transaction_value,
        "Product Adoption": product_adoption,
        "Product Usage": product_usage,
        "Customer Engagement": customer_engagement,
        
        "Credit Score": credit_score,
        "Risk Rating": risk_rating,
        "Default Probability": default_probability,
        "Exposure": exposure,
        
        "Market Data": market_data,
        "Economic Indicators": economic_indicators,
        "Regulatory Requirements": regulatory_requirements
    })

# Create a Pandas DataFrame from the synthetic data
df = pd.DataFrame(data)

# Save the synthetic data to a CSV file
df.to_csv("wholesale_banking_synthetic_data.csv", index=False)

print(df.head())

  Customer ID  Customer Type       Industry  Company Size Location  \
0     CUS-067  Institutional  Manufacturing          6828   London   
1     CUS-685     Government     Technology          8392      USA   
2     CUS-999     Government  Manufacturing          2297   London   
3     CUS-504      Corporate  Manufacturing          3287      USA   
4     CUS-004     Government  Manufacturing           391     Asia   

  Relationship Manager Product ID     Product Type Product Category  \
0               RM-001   PROD-772             Loan  Cash Management   
1               RM-003   PROD-772  Risk Management   Trade Services   
2               RM-003   PROD-465          Deposit  Cash Management   
3           John Smith   PROD-149  Risk Management          Lending   
4               RM-003   PROD-622             Loan   Trade Services   

    Product Subcategory  ... Product Adoption Product Usage  \
0      Foreign Exchange  ...                2             3   
1      Foreign Exchange  .

In [3]:
# import torch
# import torch.nn.functional as F
# from torch_geometric.nn import MessagePassing
# from torch_geometric.datasets import Planetoid
# import torch_geometric.utils
# from torch_geometric.data import DataLoader
# from sklearn.cluster import KMeans
# import matplotlib.pyplot as plt

# # Define the GATv2 Layer
# class GATv2Conv(MessagePassing):
#     def __init__(self, in_channels, out_channels, heads=1, concat=True, negative_slope=0.2, dropout=0, bias=True, **kwargs):
#         super(GATv2Conv, self).__init__(aggr='add', **kwargs)  # "Add" aggregation.
#         self.heads = heads
#         self.out_channels = out_channels
#         self.lin = torch.nn.Linear(in_channels, out_channels * heads, bias=False)
#         self.att = torch.nn.Parameter(torch.Tensor(1, heads, 2 * out_channels))
#         self.negative_slope = negative_slope
#         self.dropout = dropout
#         self.concat = concat

#         if bias:
#             self.bias = torch.nn.Parameter(torch.Tensor(out_channels))
#         else:
#             self.register_parameter('bias', None)

#         self.reset_parameters()

#     def reset_parameters(self):
#         torch.nn.init.xavier_uniform_(self.lin.weight)
#         torch.nn.init.xavier_uniform_(self.att)
#         if self.bias is not None:
#             torch.nn.init.zeros_(self.bias)

#     def forward(self, x, edge_index):
#         x = self.lin(x).view(-1, self.heads, self.out_channels)
#         alpha = self._prepare_attentional_mechanism_input(x, edge_index)
#         alpha = torch_geometric.utils.softmax(alpha, edge_index[0], num_nodes=x.size(0))
#         alpha = F.dropout(alpha, p=self.dropout, training=self.training)
#         return self.propagate(edge_index, x=x, alpha=alpha)

#     def _prepare_attentional_mechanism_input(self, x, edge_index):
#         x_i = x[edge_index[0]]
#         x_j = x[edge_index[1]]
#         alpha = torch.cat([x_i, x_j], dim=-1)
#         alpha = F.leaky_relu((alpha * self.att).sum(dim=-1), self.negative_slope)
#         return alpha

#     def message(self, x_j, alpha):
#         return alpha.unsqueeze(-1) * x_j

#     def aggregate(self, inputs, index, dim_size=None):
#         return torch_geometric.utils.scatter(inputs, index, dim=self.node_dim, dim_size=dim_size, reduce=self.aggr)

#     def update(self, aggr_out):
#         if self.concat:
#             aggr_out = aggr_out.view(-1, self.heads * self.out_channels)
#         else:
#             aggr_out = aggr_out.mean(dim=1)

#         if self.bias is not None:
#             aggr_out = aggr_out + self.bias

#         return aggr_out

# # Define the GATv2 Model
# class GATv2Model(torch.nn.Module):
#     def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
#         super(GATv2Model, self).__init__()
#         self.convs = torch.nn.ModuleList()
#         self.convs.append(GATv2Conv(in_channels, hidden_channels, heads=1))
#         for _ in range(num_layers - 2):
#             self.convs.append(GATv2Conv(hidden_channels, hidden_channels, heads=1))
#         self.convs.append(GATv2Conv(hidden_channels, out_channels, heads=1))

#     def forward(self, x, edge_index):
#         for conv in self.convs[:-1]:
#             x = conv(x, edge_index)
#             x = F.relu(x)
#             x = F.dropout(x, p=0.6, training=self.training)
#         x = self.convs[-1](x, edge_index)
#         return x

# # Load data
# dataset = Planetoid(root='/tmp/Cora', name='Cora')
# data = dataset[0]

# # Ensure all indices in edge_index are within the valid range
# num_nodes = data.x.size(0)
# mask = (data.edge_index[0] < num_nodes) & (data.edge_index[1] < num_nodes)
# data.edge_index = data.edge_index[:, mask]

# # Check for any invalid edge indices after filtering
# if data.edge_index.max() >= num_nodes:
#     raise ValueError("There are still invalid edge indices in 'edge_index'.")

# # Initialize the model
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model = GATv2Model(dataset.num_features, 8, dataset.num_classes, num_layers=3).to(device)
# data = data.to(device)
# optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# # Training function
# def train():
#     model.train()
#     optimizer.zero_grad()
#     out = model(data.x, data.edge_index)
#     loss = F.cross_entropy(out[data.train_mask], data.y[data.train_mask])
#     loss.backward()
#     optimizer.step()
#     return loss.item()

# # Testing function
# def test():
#     model.eval()
#     logits, accs = model(data.x, data.edge_index), []
#     for _, mask in data('train_mask', 'val_mask', 'test_mask'):
#         pred = logits[mask].max(1)[1]
#         acc = pred.eq(data.y[mask]).sum().item() / mask.sum().item()
#         accs.append(acc)
#     return accs

# # Train the model
# for epoch in range(1, 201):
#     loss = train()
#     train_acc, val_acc, test_acc = test()
#     print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

# # Get embeddings from the trained model
# model.eval()
# embeddings = model(data.x, data.edge_index).detach().cpu().numpy()

# # Apply K-means clustering
# n_clusters = 5
# kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(embeddings)
# clusters = kmeans.labels_

# # Map clusters back to customer IDs (assuming customer_ids is available)
# customer_ids = range(len(embeddings))  # Replace this with actual customer IDs
# customer_cluster_map = {customer_ids[i]: clusters[i] for i in range(len(customer_ids))}

# # Display customer segments
# for cluster_id in range(n_clusters):
#     print(f"Customers in segment {cluster_id}:")
#     members = [cid for cid, clust in customer_cluster_map.items() if clust == cluster_id]
#     print(members)

# # Optional: Visualize the clusters
# plt.figure(figsize=(10, 7))
# for cluster_id in range(n_clusters):
#     cluster_points = embeddings[clusters == cluster_id]
#     plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster_id}')
# plt.legend()
# plt.show()

IndexError: Found indices in 'edge_index' that are larger than 0 (got 2707). Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 1) in your node feature matrix and try again.

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch_geometric.data import Data
from torch_geometric.nn import GATv2Conv
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
from torch_geometric.utils import train_test_split_edges

# Load the synthetic dataset
df = pd.read_csv("wholesale_banking_synthetic_data.csv")

# Initialize LabelEncoders for categorical features
label_encoders = {}
for column in ['Customer Type', 'Industry', 'Location', 'Relationship Manager', 'Product Type', 'Product Category', 'Product Subcategory', 'Product Features', 'Transaction Type', 'Transaction Currency', 'Risk Rating', 'Regulatory Requirements']:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Normalize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df.drop(columns=['Customer ID', 'Product ID', 'Transaction ID', 'Transaction Date']))

# Create node features
node_features = torch.tensor(scaled_features, dtype=torch.float)

# Create labels (for demonstration, using 'Risk Rating' as the target)
labels = torch.tensor(df['Risk Rating'].values, dtype=torch.long)

# Create a more meaningful edge index (e.g., based on 'Transaction ID')
unique_transactions = df['Transaction ID'].unique()
edge_index = []
for transaction in unique_transactions:
    involved_nodes = df[df['Transaction ID'] == transaction].index.tolist()
    for i in range(len(involved_nodes)):
        for j in range(i + 1, len(involved_nodes)):
            edge_index.append([involved_nodes[i], involved_nodes[j]])
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

# Create train, validation, and test masks
train_mask, test_mask = train_test_split(np.arange(df.shape[0]), test_size=0.2, random_state=42)
train_mask, val_mask = train_test_split(train_mask, test_size=0.1, random_state=42)

train_mask = torch.tensor(train_mask, dtype=torch.long)
val_mask = torch.tensor(val_mask, dtype=torch.long)
test_mask = torch.tensor(test_mask, dtype=torch.long)

# Create a PyTorch Geometric Data object
data = Data(x=node_features, edge_index=edge_index, y=labels)
data.train_mask = train_mask
data.val_mask = val_mask
data.test_mask = test_mask
class GATv2Net(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_heads=8):
        super(GATv2Net, self).__init__()
        self.conv1 = GATv2Conv(in_channels, hidden_channels, heads=num_heads, dropout=0.6)
        self.conv2 = GATv2Conv(hidden_channels * num_heads, out_channels, heads=1, concat=False, dropout=0.6)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index)
        return F.log_softmax(x, dim=1)

# Initialize the model, optimizer, and loss function
model = GATv2Net(in_channels=node_features.shape[1], hidden_channels=32, out_channels=labels.max().item() + 1, num_heads=8)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Training and validation function
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def test():
    model.eval()
    logits, accs = model(data), []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        pred = logits[mask].max(1)[1]
        acc = pred.eq(data.y[mask]).sum().item() / mask.size(0)
        accs.append(acc)
    return accs

# Train the model
best_val_acc = test_acc = 0
for epoch in range(1, 201):
    loss = train()
    train_acc, val_acc, tmp_test_acc = test()
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        test_acc = tmp_test_acc
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_acc:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

print(f'Final Test Accuracy: {test_acc:.4f}')


Epoch: 001, Loss: 1.6179, Train: 0.4000, Val: 0.4750, Test: 0.4050
Epoch: 002, Loss: 1.5712, Train: 0.4611, Val: 0.5375, Test: 0.4450
Epoch: 003, Loss: 1.4549, Train: 0.5250, Val: 0.5875, Test: 0.4950
Epoch: 004, Loss: 1.4775, Train: 0.5625, Val: 0.5625, Test: 0.4950
Epoch: 005, Loss: 1.2778, Train: 0.6014, Val: 0.5625, Test: 0.4950
Epoch: 006, Loss: 1.2775, Train: 0.6431, Val: 0.5750, Test: 0.4950
Epoch: 007, Loss: 1.2551, Train: 0.6625, Val: 0.6000, Test: 0.6150
Epoch: 008, Loss: 1.1929, Train: 0.6861, Val: 0.6000, Test: 0.6150
Epoch: 009, Loss: 1.2008, Train: 0.6972, Val: 0.6125, Test: 0.6300
Epoch: 010, Loss: 1.1025, Train: 0.7014, Val: 0.6000, Test: 0.6300
Epoch: 011, Loss: 1.1623, Train: 0.7014, Val: 0.6375, Test: 0.6650
Epoch: 012, Loss: 1.1160, Train: 0.7139, Val: 0.6375, Test: 0.6650
Epoch: 013, Loss: 1.0759, Train: 0.7181, Val: 0.6375, Test: 0.6650
Epoch: 014, Loss: 1.1087, Train: 0.7222, Val: 0.6500, Test: 0.6900
Epoch: 015, Loss: 1.0430, Train: 0.7292, Val: 0.6375, Test: 0.