In [1]:
import pandas as pd
import numpy as np
from time import time

from torch_geometric.data import HeteroData

In [2]:
# file paths
hcc_mappings_fp = "/workspaces/graphs/graphs-eda/data/metadata/2022 Midyear_Final ICD-10-CM Mappings.csv"
ip_data_fp = "/workspaces/graphs/graphs-eda/data/patient-data/df_preprocessed.csv"
op_data_fp = "/workspaces/graphs/graphs-eda/data/patient-data/df_outpatient.csv"

# other constants
visit_cols = [
    'visit_amount', 'AIDS/HIV', 'COPD', 'Cancer',
    'Cerebrovascular Disease', 'Chronic Kidney Disease',
    'Congestive Heart Failure', 'Connective Tissue Disease-Rheumatic Disease',
    'Coronary Artery Disease', 'Dementia', 'Diabetes Mellitus',
    'Diabetes with complications', 'Diabetes without complications',
    'Essential Hypertension', 'Heart Failure', 'Malnutrition',
    'Mental Disease/Disorders (i.e. depression, anxiety, personality disorders, etc)',
    'Metastatic Carcinoma', 'Mild Liver Disease',
    'Moderate or Severe Liver Disease', 'Myocardial Infarction', 'Obese ',
    'Paraplegia and Hemiplegia', 'Peptic Ulcer Disease',
    'Periphral Vascular Disease', 'Renal Disease', 'length_of_stay',
    'admission_interval', 'unplanned_flag', 'last_90_days',
    'last_180_days', 'ed_op_visits', 'disease_count', 'cci_score',
    'lace_score', 'drg_weight', 'drg_los', 'diagnosis_count_90',
    'diagnosis_count_180', 'unique_diagnosis_count_90',
    'unique_diagnosis_count_180', 'op_diagnosis_count_90',
    'op_diagnosis_count_180', 'op_unique_diagnosis_count_90',
    'op_unique_diagnosis_count_180'
]

In [3]:
class HeterogenousGraph():
    """
    This class contains methods, attributes and functions to 
    create the following from IP and OP data -
    1. Graph object
    2. Node features
    3. Edge features
    4. Node labels
    5. Utility functions for data exploration
    """

    def __init__(self, ip_data_fp, op_data_fp, hcc_mappings_fp):
        """Upon instantiation prepares IP and OP data for
        processing, creates nodes and edges and stores the 
        information in pandas.Dataframe objects, and prepares
        ICD-HCC-mapping table.

        Args:
            ip_data_fp (str): IP data file path
            op_data_fp (str): OP data file path
            hcc_mappings_fp (str): ICD-HCC mappings file path
        """
        self.__hcc_mappings_columns__ = ['Diagnosis Code','Description','CMS-HCC Model Category V24']
        self.__ip_data_columns__ = ['empi','visit_id','visit_start_date','primary_diagnosis']
        self.__op_data_columns__ = ['empi','visit_id','last_date_of_service','primary_diagnosis']
        self.ip_data_fp = ip_data_fp
        self.op_data_fp = op_data_fp
        self.hcc_mappings_fp = hcc_mappings_fp
        self._prepare_patient_data()
        self._prepare_combined_data()
        self._create_disease_edge_table()
        self._create_node_table()
        self._prepare_hcc_mappings()
    

    def _prepare_patient_data(self):
        """
        Reads and filters only the required columns from inpatient and outpatient data
        """
        start = time()
        # reading the inpatient and outpatient data 
        ip_data = pd.read_csv(self.ip_data_fp)
        op_data = pd.read_csv(self.op_data_fp)

        ip_data = ip_data.loc[:,self.__ip_data_columns__]
        op_data = op_data.loc[:,self.__op_data_columns__]

        ip_data.columns = ['empi','vid','vdt','pd']
        op_data.columns = ['empi','vid','vdt','pd']

        self.ip_data = ip_data.copy()
        self.op_data = op_data[op_data.pd.isin(ip_data.pd.unique())].copy()
        del ip_data, op_data
        stop = time()
        print(f"IP and OP data prepared [Time taken: {round(stop-start, 3)} seconds]")


    def _prepare_combined_data(self):
        start = time()
        data = pd.concat([self.ip_data, self.op_data], axis=0).sort_values(['empi','vdt'])
        self.data = data[data.empi.isin(self.ip_data.empi.unique())].reset_index(drop=True).copy()
        del data
        stop = time()
        print(f"IP data combined with OP data [Time taken: {round(stop-start, 3)} seconds]")


    def _create_disease_edge_table(self):
        """creates edge-table from data

        Args:
            data (_type_): _description_

        Returns:
            _type_: _description_
        """
        start = time()
        data = self.data.sort_values(['empi','vdt']).reset_index(drop=True).copy()
        data["pd_nxt"] = data.pd.shift(-1)
        index_drop = data.groupby('empi').tail(1).index
        data = data.drop(index_drop)
        data = data[data.pd!=data.pd_nxt].reset_index(drop=True)

        data['edge'] = data.pd + ", " + data.pd_nxt
        data['cnt'] = 1

        edge_data = data.groupby(['pd','pd_nxt'], as_index=False).agg({'cnt':'count'})
        edge_data.columns = ['source','target','weight']

        self.edge_data = edge_data.copy()
        del edge_data
        stop = time()
        print(f"Edge Table Created [Time taken: {round(stop-start, 3)} seconds]")


    def _create_node_table(self):
        """Creates node table from data

        Args:
            data (_type_): _description_

        Returns:
            _type_: _description_
        """
        start = time()
        node_data = pd.DataFrame(self.data.pd.unique(), columns=['pd'])
        node_data.index = node_data.pd
        node_data.pd = 1
        self.node_data = node_data.copy()
        del node_data
        stop = time()
        print(f"Node Table Created [Time taken: {round(stop-start, 3)} seconds]")


    def _prepare_hcc_mappings(self):
        """
        Reads, cleans and prepares the HCC metadata file
        """
        start = time()
        # reading the risk adjustment metadata file
        hcc_mappings = pd.read_csv(self.hcc_mappings_fp)

        # cleaning the risk adjustment metadata file
        hcc_mappings = hcc_mappings.iloc[2:-7,:]
        hcc_mappings.replace(r'\n',' ', regex=True, inplace=True)
        hcc_mappings.reset_index(drop=True, inplace=True)

        # creating the cleaned risk adjustment metadata dataframe
        hcc_mappings_ = pd.DataFrame(hcc_mappings.iloc[1:,:])
        hcc_mappings_.columns = hcc_mappings.iloc[0,:].tolist()

        # filtering only the required columns from patient data and metadata
        hcc_mappings_ = hcc_mappings_.loc[:,self.__hcc_mappings_columns__]
        hcc_mappings_.columns = ['pd','dscr','hcc']
        hcc_mappings_.loc[:,"hcc"] = hcc_mappings_.hcc.fillna(0).astype('int')

        self.hcc_mappings = hcc_mappings_.copy()
        del hcc_mappings_
        stop = time()
        print(f"ICD to HCC Mappings Prepared [Time taken: {round(stop-start, 3)} seconds]")


In [4]:
obj = HeterogenousGraph(ip_data_fp, op_data_fp, hcc_mappings_fp)

IP and OP data prepared [Time taken: 0.906 seconds]
IP data combined with OP data [Time taken: 0.161 seconds]
Edge Table Created [Time taken: 0.271 seconds]
Node Table Created [Time taken: 0.016 seconds]
ICD to HCC Mappings Prepared [Time taken: 0.151 seconds]


In [5]:
ip_data = pd.read_csv(ip_data_fp)
op_data = pd.read_csv(op_data_fp)

In [6]:
patient_node_features = pd.concat(
    [
        ip_data[['empi','age_in_years']].groupby('empi').tail(1).reset_index(drop=True),
        pd.get_dummies(ip_data.empi.unique())], 
    axis=1
)

disease_node_features = pd.get_dummies(ip_data.primary_diagnosis.unique())

# visit_features = obj.ip_data[visit_cols]

In [7]:
disease_name_index_mapping = pd.DataFrame({'pd':disease_node_features.columns, 'id':disease_node_features.index})
empi_name_index_mapping = pd.DataFrame({'empi':obj.data.empi.unique(), 'id':np.arange(obj.data.empi.nunique())})

In [8]:
edge_info = obj.edge_data.copy()

In [9]:
edge_info = edge_info.merge(disease_name_index_mapping, left_on='source', right_on='pd', how='left')
edge_info.rename({'id':'source_id'}, axis=1, inplace=True)
edge_info = edge_info.merge(disease_name_index_mapping, left_on='target', right_on='pd', how='left')
edge_info.rename({'id':'target_id'}, axis=1, inplace=True)


In [10]:
edge_info.drop(['source','target','pd_x','pd_y'], axis=1, inplace=True)
edge_info.rename({'source_id':'source','target_id':'target'}, axis=1, inplace=True)
edge_info = edge_info[['source','target','weight']]
edge_info

Unnamed: 0,source,target,weight
0,0,564,1
1,0,831,1
2,0,1318,1
3,0,1327,1
4,0,1487,1
...,...,...,...
51990,2269,1624,1
51991,2269,2220,10
51992,2270,2114,1
51993,2271,1108,1


In [11]:
disease_name_index_mapping

Unnamed: 0,pd,id
0,A020,0
1,A021,1
2,A039,2
3,A040,3
4,A041,4
...,...,...
2268,Z955,2268
2269,Z95810,2269
2270,Z9851,2270
2271,Z9989,2271


In [12]:
empi_name_index_mapping

Unnamed: 0,empi,id
0,M0000040556,0
1,M0003299846,1
2,M0005129001,2
3,M0014789603,3
4,M0023249300,4
...,...,...
10380,M9992892324,10380
10381,M9993733479,10381
10382,M9993964586,10382
10383,M9994944874,10383


In [13]:
data_id = obj.data.groupby(['empi','pd']).head(1).\
                merge(disease_name_index_mapping, how='left').\
                    rename({'id':'pd_id'}, axis=1).\
                        merge(empi_name_index_mapping, how='left').\
                            rename({'id':'empi_id'}, axis=1)

In [14]:
# data['patient'].y computation
patient_data_hcc = obj.data.merge(obj.hcc_mappings[['pd','hcc']], how='left').fillna(0)
for hcc in set(obj.hcc_mappings.hcc.unique()) - set(patient_data_hcc.hcc.unique()):
    patient_data_hcc.loc[patient_data_hcc.shape[0]] = ['temp', 'temp', 'temp', 'temp', hcc]
data_y = pd.concat([patient_data_hcc.empi, pd.get_dummies(patient_data_hcc.hcc)], axis=1).groupby(['empi']).aggregate('sum')
data_y = data_y.drop('temp', axis=0).drop(0.0, axis=1)
data_y = (data_y>0)*1


In [50]:
import torch

data = HeteroData()

data['patient'].x = torch.from_numpy(patient_node_features.drop('empi', axis=1).to_numpy()) # [10385, 10386]
data['patient'].y = torch.from_numpy(data_y.to_numpy())
data['disease'].x = torch.from_numpy(disease_node_features.to_numpy().astype(float)) # [2273, 2273]
# data['visit'].x = visit_features.to_numpy() # [20541, 45]

data['pd','progresses_to','pd'].edge_index  = torch.from_numpy(edge_info[['source','target']].T.to_numpy().astype(float))# [2, 51995],
data['pd','resides_in','patient'].edge_index  = torch.from_numpy(data_id[['pd_id','empi_id']].T.to_numpy().astype(float)) # (2, 74469)

# data['pd','progresses_to','pd'].edge_attr = torch.from_numpy(edge_info.weight.to_numpy().reshape(edge_info.shape[0],1).astype(float))
# data['pd','resides_in','patient'].edge_attr = torch.from_numpy(np.ones(data_id.shape[0]).reshape(data_id.shape[0], 1).astype(float))

In [51]:
data

HeteroData(
  [1mpatient[0m={
    x=[10385, 10386],
    y=[10385, 86]
  },
  [1mdisease[0m={ x=[2273, 2273] },
  [1m(pd, progresses_to, pd)[0m={ edge_index=[2, 51995] },
  [1m(pd, resides_in, patient)[0m={ edge_index=[2, 74469] }
)

In [42]:
data.metadata()

(['patient', 'disease'],
 [('pd', 'progresses_to', 'pd'), ('pd', 'resides_in', 'patient')])

In [57]:
data['patient']

{'x': tensor([[0.9867, 0.0133, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.9886, 0.0000, 0.0114,  ..., 0.0000, 0.0000, 0.0000],
        [0.9880, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.9851, 0.0000, 0.0000,  ..., 0.0149, 0.0000, 0.0000],
        [0.9855, 0.0000, 0.0000,  ..., 0.0000, 0.0145, 0.0000],
        [0.9855, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0145]],
       dtype=torch.float64), 'y': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])}

In [52]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)
data = T.AddSelfLoops()(data)
data = T.NormalizeFeatures()(data)




In [53]:
import torch_geometric.transforms as T
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero


# dataset = OGB_MAG(root='./data', preprocess='metapath2vec', transform=T.ToUndirected())
# data = dataset[0]


class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


model = GNN(hidden_channels=64, out_channels=86)
model = to_hetero(model, data.metadata(), aggr='sum')

In [55]:
with torch.no_grad():  # Initialize lazy modules.
    # out = model(data.x_dict, data.edge_index_dict)
    out = model(data.x_dict, data.edge_index_dict)

AssertionError: 

In [21]:
import torch_geometric.transforms as T

data = T.ToUndirected()(data)
data = T.AddSelfLoops()(data)
data = T.NormalizeFeatures()(data)



In [22]:
model = HeteroGNN(...)

output = model(data.x_dict, data.edge_index_dict, data.edge_attr_dict)

NameError: name 'HeteroGNN' is not defined

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

Unnamed: 0_level_0,1.0,2.0,6.0,8.0,9.0,10.0,11.0,12.0,17.0,18.0,...,162.0,166.0,167.0,169.0,170.0,173.0,176.0,186.0,188.0,189.0
empi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M0000040556,0,0,0,0,1,0,1,1,0,1,...,0,0,1,1,0,0,1,0,0,0
M0003299846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0005129001,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0014789603,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
M0023249300,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M9992892324,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9993733479,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9993964586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9994944874,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [62]:
(data_y>0)*1

Unnamed: 0_level_0,1.0,2.0,6.0,8.0,9.0,10.0,11.0,12.0,17.0,18.0,...,162.0,166.0,167.0,169.0,170.0,173.0,176.0,186.0,188.0,189.0
empi,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
M0000040556,0,0,0,0,1,0,1,1,0,1,...,0,0,1,1,0,0,1,0,0,0
M0003299846,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0005129001,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M0014789603,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
M0023249300,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
M9992892324,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9993733479,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9993964586,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
M9994944874,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
### TRUE NOTEBOOK
#install torch-scatter
#install torch-sparse
#install torch-cluster


import numpy as np
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.nn import Sequential, Linear
from torch.nn import ReLU
from torch_geometric.datasets import OGB_MAG
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.loader import NeighborLoader, HGTLoader

In [44]:
#-------------------------Register HeteroData

# Random data just to show how store values of nodes work
authors = torch.rand((10,8))
papers = torch.rand((20,4))
authors_y = torch.rand(10)

# Random data just to show how store values of edges work
write_from = torch.tensor(np.random.choice(10, 50, replace = True))
write_to = torch.tensor(np.random.choice(20, 50, replace=True))
write = torch.concat((write_from, write_to)).reshape(-1,50).long()

# Random dat justo to show how store values of edges work
cite_from = torch.tensor(np.random.choice(20, 15, replace=True))
cite_to = torch.tensor(np.random.choice(20, 15, replace=True))
cite = torch.concat((cite_from, cite_to)).reshape(-1,15).long()

In [49]:
write_from

tensor([3, 2, 4, 0, 2, 0, 0, 1, 2, 6, 5, 8, 8, 6, 7, 2, 4, 2, 8, 8, 5, 4, 0, 6,
        8, 7, 9, 9, 8, 3, 8, 2, 0, 2, 3, 8, 1, 5, 2, 5, 5, 6, 4, 1, 5, 6, 5, 8,
        2, 0])

In [45]:
# Pattern to declare all as one dictionary as argument of class HeteroData
data = HeteroData({'author': {'x':authors, 'y':authors_y}, 'paper':{'x':papers}},
                 author__write__paper={'edge_index':write}, paper__cite__paper={'edge_index': cite})

In [50]:
data

HeteroData(
  [1mauthor[0m={
    x=[10, 8],
    y=[10]
  },
  [1mpaper[0m={ x=[20, 4] },
  [1m(author, write, paper)[0m={ edge_index=[2, 50] },
  [1m(paper, cite, paper)[0m={ edge_index=[2, 47] },
  [1m(paper, rev_write, author)[0m={ edge_index=[2, 50] }
)

In [47]:
data = T.ToUndirected()(data)
data = T.AddSelfLoops()(data)
data = T.NormalizeFeatures()(data)

In [None]:
# Transforms from many types of nodes and edges to just one type of each
homogeneus_data = data.to_homogeneous()

# If you want to store the data
data.to_dict()

#-------------------------Example of model with HeteroData
transform = T.RandomNodeSplit()
data = transform(data)

#---------------------Model 1 
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1,-1), hidden_channels)
        self.conv2 = SAGEConv((-1,-1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

model = GNN(hidden_channels=64, out_channels=2)
model= to_hetero(model, data.metadata(), aggr='sum')

##---------------------Model 2
model = Sequential('x, edge_index', [
    (SAGEConv((-1,1),64), 'x, edge_index ->x'),
    ReLU(inplace = True),
    (SAGEConv((-1,1),64), 'x, edge_index ->x'),
    ReLU(inplace = True),
    (Linear(-1,2), 'x -> x'),
])

model = to_hetero(model, data.metadata(), aggr='sum')

#-------------------------Train Data

dataset = OGB_MAG(root='.data', preprocess='metapath2vec', transform=T.ToUndirected())
data = dataset[0]

data.metadata()

train_input_nodes = ('paper', data['paper'].train_mask)
train_loader = NeighborLoader(data, num_neighbors=[10] *2, shuffle=True, input_nodes=train_input_nodes)

for t in train_loader:
    print(t)
    break