In [10]:
import pandas as pd

# Load the insurance data
insurance_data = pd.read_csv("C:\\Users\\AksharaVenkatesh\\OneDrive - ConceptVines\\High Peak\\insurance_data_optim.csv")

# Display the first few rows of the dataset
print(insurance_data.head())
# Display the columns in the dataset
print(insurance_data.columns)


   Policyholder_ID First_Name Last_Name Date_of_Birth    Address  \
0                1        Bob       Doe    00-01-1900  33 Oak St   
1                2       Jane     Brown    00-01-1900  56 Oak St   
2                3      Alice  Williams    00-01-1900  73 Elm St   
3                4        Bob     Brown    00-01-1900  84 Oak St   
4                5      Alice   Johnson    00-01-1900  23 Oak St   

           City State    Zip           Phone                       Email  ...  \
0      New York    IL  81757  (911) 519-4115         Bob.Doe@example.com  ...   
1   Los Angeles    IL  42285  (670) 416-1363      Jane.Brown@example.com  ...   
2       Chicago    PA  60413  (624) 459-7280  Alice.Williams@example.com  ...   
3  Philadelphia    TX  17796  (453) 376-6301       Bob.Brown@example.com  ...   
4      New York    PA  16751  (571) 370-9500   Alice.Johnson@example.com  ...   

   Claim_Type  Claim_Date Claim_Amount Adjuster_ID  Adjuster_Name  Payment_ID  \
0    Accident  27-07-20

In [11]:
import pandas as pd
import networkx as nx
from datetime import datetime
from torch_geometric.utils import from_networkx
from pyvis.network import Network

# Load the insurance data
insurance_data = pd.read_csv('C:\\Users\\AksharaVenkatesh\\OneDrive - ConceptVines\\High Peak\\insurance_data_optim.csv')

# Initialize a directed graph
G = nx.DiGraph()

# Display the columns in the dataset to verify the available columns
print(insurance_data.columns)

# Extract relevant columns
policyholders = insurance_data[['Policyholder_ID', 'First_Name', 'Last_Name', 'Date_of_Birth', 'Address', 'City', 'State', 'Zip', 'Phone', 'Email']].drop_duplicates()
policies = insurance_data[['Policy_ID', 'Policyholder_ID', 'Policy_Type', 'Effective_Date', 'Expiration_Date', 'Premium', 'Coverage_Amount']].drop_duplicates()
claims = insurance_data[['Claim_ID', 'Policy_ID', 'Claim_Date', 'Claim_Type', 'Claim_Amount', 'Adjuster_ID']].drop_duplicates()

# Add policyholder nodes
for _, row in policyholders.iterrows():
    G.add_node(str(row['Policyholder_ID']), type='policyholder', features=row.to_dict())

# Add policy nodes and edges to policyholders
for _, row in policies.iterrows():
    G.add_node(str(row['Policy_ID']), type='policy', features=row.to_dict())
    G.add_edge(str(row['Policyholder_ID']), str(row['Policy_ID']), relation='holds')

# Add claim nodes and edges to policies
for _, row in claims.iterrows():
    G.add_node(str(row['Claim_ID']), type='claim', features=row.to_dict())
    G.add_edge(str(row['Policy_ID']), str(row['Claim_ID']), relation='files')

# Calculate age from Date_of_Birth and add it to policyholder features
def calculate_age(birthdate):
    today = datetime.today()
    return today.year - birthdate.year - ((today.month, today.day) < (birthdate.month, birthdate.day))

# Function to parse the date safely
def safe_parse_date(date_str):
    try:
        return datetime.strptime(date_str, '%d-%m-%Y')
    except ValueError:
        return None  # Return None if date is invalid

for _, row in policyholders.iterrows():
    birthdate = safe_parse_date(row['Date_of_Birth'])
    if birthdate is not None:
        age = calculate_age(birthdate)
        G.nodes[str(row['Policyholder_ID'])]['features']['age'] = age
    else:
        G.nodes[str(row['Policyholder_ID'])]['features']['age'] = None  # Handle invalid date

# Verify the graph construction
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Convert the NetworkX graph to a PyTorch Geometric Data object
data = from_networkx(G)

# Use the original NetworkX graph directly for visualization
G_nx = G

# Create a Pyvis network
net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white", cdn_resources='remote')

# Add nodes and edges to the pyvis network
for node, node_attrs in G_nx.nodes(data=True):
    net.add_node(node, label=str(node), title=str(node_attrs['features']), **node_attrs)

for source, target, edge_attrs in G_nx.edges(data=True):
    net.add_edge(source, target, title=str(edge_attrs))

# Generate the HTML file
net.show("insurance_graph.html")


Index(['Policyholder_ID', 'First_Name', 'Last_Name', 'Date_of_Birth',
       'Address', 'City', 'State', 'Zip', 'Phone', 'Email', 'Policy_ID',
       'Policy_Type', 'Effective_Date', 'Expiration_Date', 'Premium',
       'Coverage_Amount', 'Claim_ID', 'Claim_Type', 'Claim_Date',
       'Claim_Amount', 'Adjuster_ID', 'Adjuster_Name', 'Payment_ID',
       'Payment_Date', 'Payment_Amount', 'Payment_Method', 'Longevity'],
      dtype='object')
Number of nodes: 601
Number of edges: 1000
insurance_graph.html


In [3]:
import pandas as pd

# Load the insurance data
insurance_data = pd.read_csv('C:\\Users\\AksharaVenkatesh\\OneDrive - ConceptVines\\High Peak\\insurance_data_optim.csv')
print(insurance_data.head())


   Policyholder_ID First_Name Last_Name Date_of_Birth    Address  \
0                1        Bob       Doe    00-01-1900  33 Oak St   
1                2       Jane     Brown    00-01-1900  56 Oak St   
2                3      Alice  Williams    00-01-1900  73 Elm St   
3                4        Bob     Brown    00-01-1900  84 Oak St   
4                5      Alice   Johnson    00-01-1900  23 Oak St   

           City State    Zip           Phone                       Email  ...  \
0      New York    IL  81757  (911) 519-4115         Bob.Doe@example.com  ...   
1   Los Angeles    IL  42285  (670) 416-1363      Jane.Brown@example.com  ...   
2       Chicago    PA  60413  (624) 459-7280  Alice.Williams@example.com  ...   
3  Philadelphia    TX  17796  (453) 376-6301       Bob.Brown@example.com  ...   
4      New York    PA  16751  (571) 370-9500   Alice.Johnson@example.com  ...   

   Claim_Type  Claim_Date Claim_Amount Adjuster_ID  Adjuster_Name  Payment_ID  \
0    Accident  27-07-20

In [9]:
import pandas as pd
import networkx as nx
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import coo_matrix

# Load the insurance data
insurance_data = pd.read_csv('insurance_data_optim.csv')
print(insurance_data.columns)

# Preprocessing functions
def preprocess_policyholders(data):
    columns = ['Policyholder_ID', 'First_Name', 'Last_Name', 'Date_of_Birth', 'City', 'State', 'Zip', 'Phone', 'Email']
    available_columns = [col for col in columns if col in data.columns]
    policyholders = data[available_columns].drop_duplicates()
    return policyholders

def preprocess_policies(data):
    columns = ['Policy_ID', 'Policyholder_ID', 'Policy_Type', 'Effective_Date', 'Expiration_Date', 'Premium', 'Coverage_Amount', 'Deductible']
    available_columns = [col for col in columns if col in data.columns]
    policies = data[available_columns].drop_duplicates()
    return policies

def preprocess_claims(data):
    columns = ['Claim_ID', 'Policy_ID', 'Claim_Date', 'Claim_Type', 'Claim_Amount', 'Adjuster_ID']
    available_columns = [col for col in columns if col in data.columns]
    claims = data[available_columns].drop_duplicates()
    return claims

def preprocess_payments(data):
    columns = ['Payment_ID', 'Policy_ID', 'Payment_Date', 'Payment_Amount', 'Payment_Method']
    available_columns = [col for col in columns if col in data.columns]
    payments = data[available_columns].drop_duplicates()
    return payments

# Preprocess the data
policyholders = preprocess_policyholders(insurance_data)
policies = preprocess_policies(insurance_data)
claims = preprocess_claims(insurance_data)
payments = preprocess_payments(insurance_data)

# Construct the graph
G = nx.DiGraph()

# Add policyholders nodes
for index, row in policyholders.iterrows():
    G.add_node(row['Policyholder_ID'], type='policyholder', features=row.to_dict())

# Add policies nodes and edges
for index, row in policies.iterrows():
    G.add_node(row['Policy_ID'], type='policy', features=row.to_dict())
    G.add_edge(row['Policyholder_ID'], row['Policy_ID'], relation='holds')

# Add claims nodes and edges
for index, row in claims.iterrows():
    G.add_node(row['Claim_ID'], type='claim', features=row.to_dict())
    G.add_edge(row['Policy_ID'], row['Claim_ID'], relation='files')
    G.add_edge(row['Claim_ID'], row['Adjuster_ID'], relation='assigned')

# Add payments nodes and edges
for index, row in payments.iterrows():
    G.add_node(row['Payment_ID'], type='payment', features=row.to_dict())
    G.add_edge(row['Policy_ID'], row['Payment_ID'], relation='pays')

# Normalize numeric features (example, depending on your feature structure)
def normalize_features(features):
    numeric_features = {k: v for k, v in features.items() if isinstance(v, (int, float))}
    if numeric_features:
        numeric_features_series = pd.Series(numeric_features)
        normalized_features = {k: (v - numeric_features_series.mean()) / (numeric_features_series.std() + 1e-6) for k, v in numeric_features.items()} # Added small constant
        features.update(normalized_features)
    return features

for node in G.nodes():
    if 'features' in G.nodes[node]:
        G.nodes[node]['features'] = normalize_features(G.nodes[node]['features'])

print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

# Reindex nodes to ensure indices are within a manageable range
mapping = {node: idx for idx, node in enumerate(G.nodes())}
G = nx.relabel_nodes(G, mapping)

# Extract numeric node features and adjacency matrix
node_features = []
node_labels = []
for node, data in G.nodes(data=True):
    if 'features' in data:
        feature_vector = [v for v in data['features'].values() if isinstance(v, (int, float))]
        node_features.append(feature_vector)
        if 'Policy_Type' in data['features']:
            node_labels.append(data['features']['Policy_Type'])
        else:
            node_labels.append('')
    else:
        node_features.append([0.0] * 5)  # Use a default value or an appropriate vector size
        node_labels.append('')

# Pad feature vectors to ensure consistent length
max_length = max(len(f) for f in node_features)
node_features = [f + [0.0] * (max_length - len(f)) for f in node_features]

# Check for NaN or infinite values
node_features = pd.DataFrame(node_features).fillna(0).replace([float('inf'), float('-inf')], 0).values
node_features = torch.tensor(node_features, dtype=torch.float32)

# Encode labels
label_encoder = LabelEncoder()
node_labels = torch.tensor(label_encoder.fit_transform(node_labels), dtype=torch.long)

# Create sparse adjacency matrix
edges = list(G.edges())
edge_index = torch.tensor(edges, dtype=torch.int32).t().contiguous()
adj_matrix = torch.sparse_coo_tensor(edge_index, torch.ones(len(edges)), (len(G.nodes()), len(G.nodes()))).to_dense()

# Example train/test split
train_mask = torch.rand(len(node_labels)) < 0.8
test_mask = ~train_mask

print(f"Node features shape: {node_features.shape}")
print(f"Adjacency matrix shape: {adj_matrix.shape}")
print(f"Number of training samples: {train_mask.sum().item()}")
print(f"Number of test samples: {test_mask.sum().item()}")

class GraphAttentionLayer(nn.Module):
    def __init__(self, in_features, out_features, dropout, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dropout = dropout
        self.alpha = alpha
        self.concat = concat

        self.W = nn.Parameter(torch.zeros(size=(in_features, out_features)))
        nn.init.xavier_uniform_(self.W.data, gain=1.414)
        self.a = nn.Parameter(torch.zeros(size=(2 * out_features, 1)))
        nn.init.xavier_uniform_(self.a.data, gain=1.414)

        self.leakyrelu = nn.LeakyReLU(self.alpha)

    def forward(self, h, adj):
        Wh = torch.matmul(h, self.W)  # h.shape: (N, in_features), Wh.shape: (N, out_features)
        e = self._prepare_attentional_mechanism_input(Wh)

        zero_vec = -9e15 * torch.ones_like(e)
        attention = torch.where(adj > 0, e, zero_vec)
        attention = F.softmax(attention, dim=1)
        attention = F.dropout(attention, self.dropout, training=self.training)
        h_prime = torch.matmul(attention, Wh)

        if self.concat:
            return F.elu(h_prime)
        else:
            return h_prime

    def _prepare_attentional_mechanism_input(self, Wh):
        N = Wh.size()[0]  # number of nodes

        Wh_repeated_in_chunks = Wh.repeat_interleave(N, dim=0)
        Wh_repeated_alternating = Wh.repeat(N, 1)
        all_combinations_matrix = torch.cat([Wh_repeated_in_chunks, Wh_repeated_alternating], dim=1)

        e = self.leakyrelu(torch.matmul(all_combinations_matrix, self.a).squeeze(1))
        return e.view(N, N)

class GAT(nn.Module):
    def __init__(self, n_feat, n_hid, n_class, dropout, alpha, n_heads):
        super(GAT, self).__init__()
        self.dropout = dropout

        self.attentions = [GraphAttentionLayer(n_feat, n_hid, dropout=dropout, alpha=alpha, concat=True) for _ in range(n_heads)]
        for i, attention in enumerate(self.attentions):
            self.add_module('attention_{}'.format(i), attention)

        self.out_att = GraphAttentionLayer(n_hid * n_heads, n_class, dropout=dropout, alpha=alpha, concat=False)

    def forward(self, x, adj):
        x = F.dropout(x, self.dropout, training=self.training)
        x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.out_att(x, adj)
        return F.log_softmax(x, dim=1)

# Set hyperparameters
n_feat = node_features.shape[1]
n_hid = 8
n_class = len(label_encoder.classes_)
dropout = 0.6
alpha = 0.2
n_heads = 8
lr = 0.001 # Reduced learning rate
weight_decay = 5e-4
epochs = 50

# Initialize model, loss function, and optimizer
model = GAT(n_feat, n_hid, n_class, dropout, alpha, n_heads)
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()

# Train the model
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    output = model(node_features, adj_matrix)
    loss = loss_fn(output[train_mask], node_labels[train_mask])
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
with torch.no_grad():
    output = model(node_features, adj_matrix)
    preds = output[test_mask].max(1)[1].type_as(node_labels)
    accuracy = preds.eq(node_labels[test_mask]).double().mean()
    print(f"Test accuracy: {accuracy.item()}")

# Generate recommendations
def generate_recommendations(model, features, adj, mask):
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        recommendations = output[mask].max(1)[1].type_as(node_labels)
        return recommendations

# Generate recommendations for all nodes
recommendations = generate_recommendations(model, node_features, adj_matrix, torch.ones_like(train_mask, dtype=torch.bool))

# Print recommendations for each policyholder
for node, recommendation in zip(G.nodes(), recommendations):
    if G.nodes[node].get('type') == 'policyholder':
        print(f"Policyholder {node}: Recommended policy type {label_encoder.inverse_transform([recommendation.item()])[0]}")


Index(['Policyholder_ID', 'First_Name', 'Last_Name', 'Date_of_Birth',
       'Address', 'City', 'State', 'Zip', 'Phone', 'Email', 'Policy_ID',
       'Policy_Type', 'Effective_Date', 'Expiration_Date', 'Premium',
       'Coverage_Amount', 'Claim_ID', 'Claim_Type', 'Claim_Date',
       'Claim_Amount', 'Adjuster_ID', 'Adjuster_Name', 'Payment_ID',
       'Payment_Date', 'Payment_Amount', 'Payment_Method', 'Longevity'],
      dtype='object')
Number of nodes: 1300
Number of edges: 1399
Node features shape: torch.Size([1300, 6])
Adjacency matrix shape: torch.Size([1300, 1300])
Number of training samples: 1051
Number of test samples: 249
Epoch 1/50, Loss: 0.0
Epoch 2/50, Loss: 0.0
Epoch 3/50, Loss: 0.0
Epoch 4/50, Loss: 0.0
Epoch 5/50, Loss: 0.0
Epoch 6/50, Loss: 0.0
Epoch 7/50, Loss: 0.0
Epoch 8/50, Loss: 0.0
Epoch 9/50, Loss: 0.0
Epoch 10/50, Loss: 0.0
Epoch 11/50, Loss: 0.0
Epoch 12/50, Loss: 0.0
Epoch 13/50, Loss: 0.0
Epoch 14/50, Loss: 0.0
Epoch 15/50, Loss: 0.0
Epoch 16/50, Loss: 0.0
E