#### Code to calculate distance matrices between samples in the holdout dataset.
The distance values are used to create the patient network and calculate the percolation threshold.

In [None]:
# %pip install gower

In [None]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist
import gower

### Real-real distance matrices

In [None]:
# Load the one-hot-encoded (OHE), normalized real data.
# "path_to_data" is the folder holding all SynTwin input and output; replace it with your own path.
# holdout_ohe.csv (produced in step3) is stored in the Encoded_data subfolder.
breast_survival = pd.read_csv("path_to_data/Encoded_data/holdout_ohe.csv")

# Keep the patient identifiers aside before the column is removed from the table.
real_ids = breast_survival['PatientID']

# Remove the identifier and the 'Unnamed: 0' column from the loaded table, then
# derive `real` as a separate frame that also omits SurvivalMonths and VitalStatus.
breast_survival = breast_survival.drop(['PatientID', 'Unnamed: 0'], axis=1)
real = breast_survival.drop(['SurvivalMonths', 'VitalStatus'], axis=1)

print(real.shape)
real.head()

In [None]:
# Pairwise Euclidean distance between all rows of `real` (each row against every row).
euclidean_matrix = cdist(real, real, metric='euclidean')
# Label both axes of the square matrix with the patient identifiers.
euclidean_df = pd.DataFrame(euclidean_matrix, index=real_ids, columns=real_ids)
print(euclidean_df.shape)
# A bare `euclidean_df.head()` is only rendered when it is the last expression of a
# notebook cell; other statements follow here, so the preview is printed explicitly.
print(euclidean_df.head())
# Create a subfolder Distance_matrices beforehand; the distance matrix pickles are saved in it.
euclidean_df.to_pickle("path_to_data/Distance_matrices/euclidean_real_real.pkl")
# NOTE(review): `euclidean_matrix` stays bound after this cell, so the underlying array
# may remain in memory -- delete it as well if nothing later needs it.
del euclidean_df

In [None]:
# Pairwise cosine distance between all rows of `real` (each row against every row).
cosine_matrix = cdist(real, real, metric='cosine')
# Label both axes of the square matrix with the patient identifiers.
cosine_df = pd.DataFrame(cosine_matrix, index=real_ids, columns=real_ids)
# Set every entry below 3e-16 to exactly 0 (this also covers negative entries).
# NOTE(review): presumably this removes floating-point round-off on identical rows -- confirm.
cosine_df = cosine_df.mask(cosine_df < 3e-16, 0)
print(cosine_df.shape)
# A bare `cosine_df.head()` is only rendered when it is the last expression of a
# notebook cell; other statements follow here, so the preview is printed explicitly.
print(cosine_df.head())
# The Distance_matrices subfolder must already exist.
cosine_df.to_pickle("path_to_data/Distance_matrices/cosine_real_real.pkl")
# NOTE(review): `cosine_matrix` stays bound after this cell, so the unmasked array
# may remain in memory -- delete it as well if nothing later needs it.
del cosine_df

In [None]:
# Pairwise Manhattan (scipy 'cityblock') distance between all rows of `real`.
manhattan_matrix = cdist(real, real, metric='cityblock')
# Label both axes of the square matrix with the patient identifiers.
manhattan_df = pd.DataFrame(manhattan_matrix, index=real_ids, columns=real_ids)
print(manhattan_df.shape)
# A bare `manhattan_df.head()` is only rendered when it is the last expression of a
# notebook cell; other statements follow here, so the preview is printed explicitly.
print(manhattan_df.head())
# The Distance_matrices subfolder must already exist.
manhattan_df.to_pickle("path_to_data/Distance_matrices/manhattan_real_real.pkl")
# NOTE(review): `manhattan_matrix` stays bound after this cell, so the underlying array
# may remain in memory -- delete it as well if nothing later needs it.
del manhattan_df

### Real-real Gower distance matrix


In [None]:
# Load the label-encoded real data.
# NOTE(review): cat_cols presumably names the categorical columns; it is not
# referenced by any of the cells visible in this notebook.
cat_cols = ['Race', 'ICDO3', 'Laterality', 'PrimarySite-labeled', 'DiagnosticConfirmation', 'ICCCSite']

# holdout_le.csv (produced in step3) is stored in the Encoded_data subfolder.
breast_survival = pd.read_csv("path_to_data/Encoded_data/holdout_le.csv")

# Patient identifiers, kept aside for labelling.
real_ids = breast_survival['PatientID']

# `real` is a separate frame without SurvivalMonths, VitalStatus, the identifier
# and the 'Unnamed: 0' column; `breast_survival` itself is left untouched.
real = breast_survival.drop(
    ['SurvivalMonths', 'VitalStatus', 'PatientID', 'Unnamed: 0'],
    axis=1,
)

print(real.shape)
real.head()

In [None]:
# Summary statistics (pandas `describe`) of the label-encoded feature table `real`.
# NOTE(review): presumably used to read off the min/max behind the num_ranges
# values passed to cdist_gower in a later cell -- confirm.
real.describe()

In [None]:
# Calculate the Gower distance between all rows of `real`.
# cat_features sets which features are categorical: one flag per column, True = categorical.
# NOTE(review): the mask has six True entries, the same count as cat_cols; presumably
# they mark those columns -- confirm against the column order of holdout_le.csv.
categorical_mask = [
    False, False, False, True, True, False,
    True, False, True, True, True, False,
]
gower_matrix = gower.gower_matrix(real.iloc[:, 0:], cat_features=categorical_mask)

# Label both axes of the square matrix with the patient identifiers.
gower_df = pd.DataFrame(gower_matrix, index=real_ids, columns=real_ids)
print(gower_df.shape)
gower_df.head()

In [None]:
# Another way to calculate the Gower distance, using the project's cdist_gower helper.
from step4b_cdist_gower import cdist_gower

# cat_features sets which features are categorical: one flag per column, True = categorical.
categorical_mask = [
    False, False, False, True, True, False,
    True, False, True, True, True, False,
]
# num_ranges is the range (max - min) of each numerical feature in the dataset, in the
# order [Age, YearDx, Sex, Grade, SeqNum, CombinedSummaryStage].
numeric_ranges = [14, 5, 1, 8, 7, 8]

gower_matrix = cdist_gower(
    real.iloc[:, 0:],
    cat_features=categorical_mask,
    num_max=None,
    num_ranges=numeric_ranges,
)

# Label both axes of the square matrix with the patient identifiers.
gower_df = pd.DataFrame(gower_matrix, index=real_ids, columns=real_ids)
print(gower_df.shape)
gower_df.head()

In [None]:
# Save the Gower distance matrix (whichever `gower_df` was computed last) into the
# Distance_matrices subfolder, then drop the frame.
gower_pickle_path = "path_to_data/Distance_matrices/gower_real_real.pkl"
gower_df.to_pickle(gower_pickle_path)
del gower_df