In [None]:
# Install required packages.
# Run this cell before the rest of the notebook: the later
# `from torch_geometric.data import Data` import needs the packages installed here.
import os
import torch
# Export the installed torch version as an environment variable so the shell
# commands below can expand ${TORCH} in the wheel-index URL.
os.environ['TORCH'] = torch.__version__
print(torch.__version__)

# `-f` points pip at the data.pyg.org index page named after this torch version.
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the head of its GitHub repository (no pinned release).
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [22]:
import pandas as pd
import numpy as np
import io
from google.colab import files

In [2]:
# Ask the user to upload the dataset; `uploaded` maps file names to raw bytes.
uploaded = files.upload()

# Colab can store the upload under a different name when the target already
# exists (e.g. "Saving heart.csv to heart (14).csv"), so do not rely solely on
# the literal key: prefer 'heart.csv', otherwise accept a single uploaded file.
if 'heart.csv' in uploaded:
    csv_name = 'heart.csv'
elif len(uploaded) == 1:
    csv_name = next(iter(uploaded))
else:
    raise KeyError(
        f"Expected an upload named 'heart.csv', got: {sorted(uploaded)}"
    )

# Parse the uploaded bytes as CSV into the raw patient table.
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

Saving heart.csv to heart (14).csv


In [3]:
# Give every row a consecutive integer ID (0 .. number of rows - 1); these IDs
# are later used as node indices when the edges are built.
df['ID'] = np.arange(df.shape[0])
df.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,ID
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,0
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,3
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,4


In [4]:
# Order the rows by ID so that row position and node ID agree.
sorted_df = df.sort_values("ID")
sorted_df.head(5)

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output,ID
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1,0
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1,3
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1,4


In [5]:
# Keep only the columns used to build the node feature matrix.
node_features = sorted_df[
    [
        "age", "sex", "trtbps", "chol", "thalachh", "oldpeak",
        "fbs", "restecg", "exng", "slp", "caa", "thall",
    ]
]
node_features.head(5)

Unnamed: 0,age,sex,trtbps,chol,thalachh,oldpeak,fbs,restecg,exng,slp,caa,thall
0,63,1,145,233,150,2.3,1,0,0,0,0,1
1,37,1,130,250,187,3.5,0,1,0,0,0,2
2,41,0,130,204,172,1.4,0,0,0,2,0,2
3,56,1,120,236,178,0.8,0,1,0,2,0,2
4,57,0,120,354,163,0.6,0,1,1,2,0,2


In [6]:
# One-hot encode the 'fbs' column and give the indicator columns readable names.
one_hot_fbs = pd.get_dummies(node_features['fbs']).rename(
    columns={0: 'No_fbs', 1: 'Yes_fbs'}
)
one_hot_fbs.head(5)

Unnamed: 0,No_fbs,Yes_fbs
0,0,1
1,1,0
2,1,0
3,1,0
4,1,0


In [7]:
# One-hot encode the 'restecg' column and give the indicator columns readable names.
one_hot_restecg = pd.get_dummies(node_features['restecg']).rename(
    columns={0: 'normal', 1: 'abnormal_1', 2: 'abnormal_2'}
)
one_hot_restecg.head(5)

Unnamed: 0,normal,abnormal_1,abnormal_2
0,1,0,0
1,0,1,0
2,1,0,0
3,0,1,0
4,0,1,0


In [8]:
# One-hot encode the 'exng' column and give the indicator columns readable names.
one_hot_exng = pd.get_dummies(node_features['exng']).rename(
    columns={0: 'No_angina', 1: 'Yes_angina'}
)
one_hot_exng.head(5)

Unnamed: 0,No_angina,Yes_angina
0,1,0
1,1,0
2,1,0
3,1,0
4,0,1


In [9]:
# One-hot encode the 'slp' column and give the indicator columns readable names.
one_hot_slp = pd.get_dummies(node_features['slp']).rename(
    columns={0: 'ST_unsloping', 1: 'ST_flat', 2: 'ST_downsloping'}
)
one_hot_slp.head(5)

Unnamed: 0,ST_unsloping,ST_flat,ST_downsloping
0,1,0,0
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [10]:
# One-hot encode the 'caa' column; raw value k becomes the column 'caa_<k+1>'.
one_hot_caa = pd.get_dummies(node_features['caa']).rename(
    columns={0: 'caa_1', 1: 'caa_2', 2: 'caa_3', 3: 'caa_4', 4: 'caa_5'}
)
one_hot_caa.head(5)

Unnamed: 0,caa_1,caa_2,caa_3,caa_4,caa_5
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,1,0,0,0,0
4,1,0,0,0,0


In [11]:
# One-hot encode the 'thall' column and give the indicator columns readable names.
one_hot_thall = pd.get_dummies(node_features['thall']).rename(
    columns={
        0: 'thall_null',
        1: 'thall_fixed_defect',
        2: 'thall_normal',
        3: 'thall_reversable defect',
    }
)
one_hot_thall.head(5)

Unnamed: 0,thall_null,thall_fixed_defect,thall_normal,thall_reversable defect
0,0,1,0,0
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,0,1,0


In [12]:
# Append all one-hot blocks to the selected features, then remove the raw
# categorical columns they replace. 'thalachh' is removed as well because it is
# used as the regression label, not as an input feature.
encoded_df = pd.concat(
    [
        node_features,
        one_hot_fbs,
        one_hot_restecg,
        one_hot_exng,
        one_hot_slp,
        one_hot_caa,
        one_hot_thall,
    ],
    axis=1,
).drop(columns=['thalachh', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall'])
encoded_df.head(5)

Unnamed: 0,age,sex,trtbps,chol,oldpeak,No_fbs,Yes_fbs,normal,abnormal_1,abnormal_2,...,ST_downsloping,caa_1,caa_2,caa_3,caa_4,caa_5,thall_null,thall_fixed_defect,thall_normal,thall_reversable defect
0,63,1,145,233,2.3,0,1,1,0,0,...,0,1,0,0,0,0,0,1,0,0
1,37,1,130,250,3.5,1,0,0,1,0,...,0,1,0,0,0,0,0,0,1,0
2,41,0,130,204,1.4,1,0,1,0,0,...,1,1,0,0,0,0,0,0,1,0
3,56,1,120,236,0.8,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0
4,57,0,120,354,0.6,1,0,0,1,0,...,1,1,0,0,0,0,0,0,1,0


In [13]:
# Convert to numpy: the node feature matrix, one row per node.
# The dtype is requested explicitly so the matrix is always numeric: when the
# one-hot columns are boolean (the pd.get_dummies default in pandas >= 2.0),
# a frame mixing bool and numeric columns would otherwise become an object array.
encoded_df_np = encoded_df.to_numpy(dtype=np.float64)
encoded_df_np.shape

(303, 24)

In [14]:
# Labels: the 'thalachh' column, kept as a one-column DataFrame (double
# brackets) and taken from sorted_df so the rows line up with the node features.
labels = sorted_df.loc[:, ["thalachh"]]
labels.head(5)

Unnamed: 0,thalachh
0,150
1,187
2,172
3,178
4,163


In [15]:
# Turn the label frame into a plain NumPy array.
labels_np = np.asarray(labels)
labels_np.shape  # [num_nodes, 1] --> node regression

(303, 1)

In [16]:
# How many rows fall into each 'output' class (these classes define the edges below).
df.loc[:, "output"].value_counts()

1    165
0    138
Name: output, dtype: int64

In [17]:
import itertools

# Connect every pair of patients that share the same 'output' value, so each
# class forms one fully connected group of nodes.
categories = df["output"].unique()
edge_blocks = []

for category in categories:
    patient_ids = df.loc[df["output"] == category, "ID"].to_numpy()
    # Each unordered pair (i, j) is listed exactly once, as source -> target.
    # NOTE(review): edges are stored in one direction only. PyTorch Geometric
    # treats edge_index as directed, so add the reversed pairs as well if an
    # undirected graph is intended -- confirm.
    pairs = np.array(list(itertools.combinations(patient_ids, 2)), dtype=np.int64)
    # A class with fewer than two patients produces no pairs; the reshape keeps
    # that block two-column and integer so stacking never changes shape or dtype.
    edge_blocks.append(pairs.reshape(-1, 2))

# One row per edge: [source_id, target_id].
all_edges = np.vstack(edge_blocks) if edge_blocks else np.empty((0, 2), dtype=np.int64)

# Convert to PyTorch Geometric format
edge_index = all_edges.transpose()
edge_index  # [2, num_edges]

array([[  0,   0,   0, ..., 300, 300, 301],
       [  1,   2,   3, ..., 301, 302, 302]])

In [19]:
import torch
from torch_geometric.data import Data

# Data works on torch tensors, so convert the NumPy arrays explicitly instead
# of passing them through as-is:
#   x          -> float32 node feature matrix
#   edge_index -> int64 ("long") indices; made contiguous because the array
#                 built above is a transposed view
#   y          -> float32 regression targets, one row per node
data = Data(
    x=torch.from_numpy(np.asarray(encoded_df_np, dtype=np.float32)),
    edge_index=torch.from_numpy(np.ascontiguousarray(edge_index, dtype=np.int64)),
    y=torch.from_numpy(np.asarray(labels_np, dtype=np.float32)),
)

ModuleNotFoundError: ignored