In [1]:
import warnings
warnings.filterwarnings("ignore")

from torch_geometric.nn import GCN
from torch.optim import Adam
import torch
from torch_geometric.data import Data
from sklearn.model_selection import train_test_split

from utils_data import *
from utils_classification import *
from utils_training import GraphTrainer
from utils_running import GraphRunner
from utils_evaluation import evaluate_tensor
from utils_plot import plot_confusion_matrix, plot_roc

In [2]:
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = "mps"

In [7]:
data = PopulationData().load_raw('data/raw/train.csv')
data.encode_connection_int()
data.encode_normalized_age()
data.encode_normalized_constitution()
data.encode_normalized_behavior()
# data.encode_standardized_age()
# data.encode_standardized_constitution()
# data.encode_standardized_behavior()
#data.encode_connection_lists()
data.encode_graph_nx()
data.encode_degree()
data.encode_degree_centrality()
data.encode_clustering_coefficient()
data.encode_normalized_degree()
# data.encode_normalized_degree_centrality()
# data.encode_normalized_clustering_coefficient()
# data.encode_standardized_degree()
# data.encode_standardized_degree_centrality()
# data.encode_standardized_clustering_coefficient()
# data.encode_connected_index_patient()
data.encode_distance_to_index_patient()
data.encode_normalized_distance_to_index_patient()
# data.encode_standardized_distance_to_index_patient()
data.encode_sum_neighbor_age()
data.encode_sum_neighbor_constitution()
data.encode_sum_neighbor_behavior()
data.encode_sum_neighbor_degree()
# data.encode_sum_neighbor_degree_centrality()
# data.encode_sum_neighbor_clustering_coefficient()
data.encode_normalized_sum_neighbor_age()
data.encode_normalized_sum_neighbor_constitution()
data.encode_normalized_sum_neighbor_behavior()
data.encode_normalized_sum_neighbor_degree()
# data.encode_normalized_sum_neighbor_degree_centrality()
# data.encode_normalized_sum_neighbor_clustering_coefficient()
# data.encode_standardized_sum_neighbor_age()
# data.encode_standardized_sum_neighbor_constitution()
# data.encode_standardized_sum_neighbor_behavior()
# data.encode_standardized_sum_neighbor_degree()
# data.encode_standardized_sum_neighbor_degree_centrality()
# data.encode_standardized_sum_neighbor_clustering_coefficient()
# data.encode_mean_neighbor_age()
# data.encode_mean_neighbor_constitution()
# data.encode_mean_neighbor_behavior()
# data.encode_mean_neighbor_degree()
# data.encode_mean_neighbor_degree_centrality()
# data.encode_mean_neighbor_clustering_coefficient()
# data.encode_normalized_mean_neighbor_age()
# data.encode_normalized_mean_neighbor_constitution()
# data.encode_normalized_mean_neighbor_behavior()
# data.encode_normalized_mean_neighbor_degree()
# data.encode_normalized_mean_neighbor_degree_centrality()
# data.encode_normalized_mean_neighbor_clustering_coefficient()
data.encode_test_train()

<utils_data.PopulationData.PopulationData at 0x1f208e3d890>

In [8]:
print(data.data_df.columns)

Index(['Population', 'Index_Patient', 'Infected', 'Age', 'Constitution',
       'Behaviour', 'Connections', 'Normalized_Age', 'Normalized_Constitution',
       'Normalized_Behaviour', 'Degree', 'Degree_Centrality',
       'Clustering_Coefficient', 'Normalized_Degree',
       'Distance_to_Index_Patient', 'Normalized_Distance_to_Index_Patient',
       'Sum_Neighbor_Age', 'Sum_Neighbor_Constitution',
       'Sum_Neighbor_Behaviour', 'Sum_Neighbor_Degree',
       'Normalized_Sum_Neighbor_Age', 'Normalized_Sum_Neighbor_Constitution',
       'Normalized_Sum_Neighbor_Behaviour', 'Normalized_Sum_Neighbor_Degree',
       'Train', 'Test'],
      dtype='object')


In [9]:
# print the data
print(data.data_df.head())

       Population  Index_Patient  Infected  Age  Constitution  Behaviour   
ID                                                                         
598886         A0              0         1   49      0.347675          2  \
565531         A0              0         1   80      0.324719          1   
367210         A0              0         1   71      0.583583          1   
524059         A0              0         1   60      0.439978          2   
552292         A0              0         1   49      0.553221          1   

                                              Connections  Normalized_Age   
ID                                                                          
598886  [565531, 367210, 524059, 267616, 500874, 55478...        0.387097  \
565531  [598886, 552292, 334896, 391502, 286958, 41721...        0.637097   
367210  [598886, 79928, 395880, 71423, 26551, 548578, ...        0.564516   
524059  [598886, 485148, 630714, 538180, 135800, 57967...        0.475806   
55229

In [10]:
features = ['Normalized_Age',
			'Normalized_Behaviour',
			'Normalized_Constitution',
			'Normalized_Degree',
			'Normalized_Distance_to_Index_Patient',
			'Normalized_Sum_Neighbor_Age',
			'Normalized_Sum_Neighbor_Behaviour',
			'Normalized_Sum_Neighbor_Constitution',
			'Normalized_Sum_Neighbor_Degree']
graph = data.get_graph_torch(features=features)

In [11]:
print(type(graph))
print("Graph Data Attributes:")
print(f"x: {graph.x}")
print(f"edge_index: {graph.edge_index}")
print(f"y: {graph.y}")
print(f"train_mask: {graph.train_mask}")
print(f"test_mask: {graph.test_mask}")

<class 'torch_geometric.data.data.Data'>
Graph Data Attributes:
x: tensor([[0.3871, 1.0000, 0.3477,  ..., 0.4737, 0.3088, 0.4211],
        [0.6371, 0.5000, 0.3247,  ..., 0.4737, 0.3397, 0.2947],
        [0.5645, 0.5000, 0.5836,  ..., 0.4211, 0.2898, 0.3158],
        ...,
        [0.6371, 0.5000, 0.3684,  ..., 0.1053, 0.0522, 0.0211],
        [0.5806, 0.5000, 0.7672,  ..., 0.0526, 0.0274, 0.0421],
        [0.2419, 1.0000, 0.5173,  ..., 0.0526, 0.0669, 0.0211]])
edge_index: tensor([[598886, 598886, 598886,  ..., 186351, 387064, 628340],
        [565531, 367210, 524059,  ..., 123974, 397442, 585987]])
y: tensor([1, 1, 1,  ..., 1, 1, 0])
train_mask: tensor([ True,  True,  True,  ..., False, False,  True])
test_mask: tensor([False, False, False,  ..., False,  True, False])


In [None]:
#train_loader, test_loader = data.get_graph_train_test_loaders(graph, batch_size=32)

In [12]:
# Define model parameters
in_channels = len(features)  # Number of input features (Age, Constitution, Behaviour, Population)
hidden_channels = 24 # Number of hidden features
out_channels = 1  # Number of output classes (Infected/Not Infected)

In [13]:
# Initialize the model
model = GCN(in_channels, hidden_channels, out_channels)
criteria = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.01)

In [14]:
trainer = GraphTrainer(model, criteria, optimizer)
trainer.train(graph, num_epochs=100, device='cuda')

100%|██████████| 100/100 [00:07<00:00, 13.57it/s]


Best model found at epoch 99 with evaluation loss: 0.6893


In [17]:
trainer.test(graph, device='cuda')

Test loss: 0.6915


In [18]:
runner = GraphRunner(model)
test_probabilities = runner.predict_proba(graph, device=device)
test_predictions = runner.predict(graph, device=device)

# Apply the test mask
test_predictions = test_predictions[graph.test_mask]
test_probabilities = test_probabilities[graph.test_mask]
test_labels = graph.y[graph.test_mask].to(device)

In [19]:
print("Test Predictions:", test_predictions)
print("Test Probabilities:", test_probabilities)
print("Test Labels:", test_labels)

Test Predictions: tensor([0, 0, 0,  ..., 0, 0, 0], device='cuda:0')
Test Probabilities: tensor([[ 5.8655,  5.8401, -4.2379,  ..., -3.7233, -5.7654, -4.7766],
        [ 5.8541,  5.7881, -4.2978,  ..., -3.7140, -5.7211, -4.6660],
        [ 7.3485,  7.2472, -5.3165,  ..., -4.6389, -7.1782, -5.8490],
        ...,
        [ 6.8508,  6.7182, -4.9684,  ..., -4.3168, -6.6517, -5.4072],
        [ 5.6934,  5.6871, -4.0163,  ..., -3.6131, -5.5403, -4.6561],
        [ 5.4597,  5.3540, -3.9633,  ..., -3.4767, -5.3008, -4.3208]],
       device='cuda:0')
Test Labels: tensor([1, 1, 1,  ..., 1, 1, 1], device='cuda:0')


In [20]:
accuracy, precision, recall, f1, auc_roc, confusion_matrix = evaluate_tensor(test_predictions, test_probabilities, test_labels, device=device)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1: {f1:.4f}')
print(f'AUC-ROC: {auc_roc:.4f}')

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
plot_confusion_matrix(confusion_matrix, ['Healthy', 'Infected'])
plot_roc(test_probabilities.cpu().numpy(), graph.y[graph.test_mask].cpu().numpy())