1. Split into train and test data
2. Train model on train data normally
3. Take test data and duplicate into test prime 
4. Drop first visit from test prime data
5. Get predicted delta from test prime data. Compare to delta from test data. We know the difference (epsilon) because we dropped actual visits. What percent of time is test delta < test prime delta? 
6. Restrict it only to patients with lot of visits. Is this better?

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pickle

def clean_plot():
    ax = plt.subplot(111)    
    ax.spines["top"].set_visible(False)    
    ax.spines["bottom"].set_visible(False)    
    ax.spines["right"].set_visible(False)    
    ax.spines["left"].set_visible(False)    
    
    ax.get_xaxis().tick_bottom()    
    ax.get_yaxis().tick_left()   
    plt.grid()

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
#           'figure.figsize': (10,6),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'x-large',
         'xtick.labelsize':'x-large',
         'ytick.labelsize':'x-large'}
pylab.rcParams.update(params)

In [2]:
import os, sys
project_root = os.path.abspath(os.path.dirname(os.getcwd()))
sys.path.append(project_root)

In [3]:
import sys
import torch

sys.path.append('.')
from data.load import chf
from data.data_utils import parse_data
from data.synthetic_data import load_piecewise_synthetic_data


sys.path.append('./model')
from models import Sublign
from run_experiments import get_hyperparameters


In [4]:
def make_test_prime(test_data_dict_raw, drop_first_T=1.):
    # drop first year
    test_data_dict = copy.deepcopy(test_data_dict_raw)
    eps_lst        = list()
    
    X = test_data_dict['obs_t_collect']
    Y = test_data_dict['Y_collect']
    M = test_data_dict['mask_collect']
    
    N_patients = X.shape[0]
    N_visits   = X.shape[1]
    
    for i in range(N_patients):
        eps_i = X[i,1,0] - X[i,0,0]
        
        first_visit = X[i,1,0]
        # move all visits down (essentially destroying the first visit)
        for j in range(N_visits-gap):
            
            X[i,j,0] = X[i,j+gap,0] - first_visit
            Y[i,j,:] = Y[i,j+gap,:]
            M[i,j,:] = M[i,j+gap,:]
        
        for g in range(1,gap+1):
            X[i,N_visits-g,0] = int(-1000)
            Y[i,N_visits-g,:] = int(-1000)
            M[i,N_visits-g,:] = 0.
        
        eps_lst.append(eps_i)
    return test_data_dict, eps_lst

In [5]:
import pandas as pd 
data= pd.read_csv('C:/Users/nss_1/clustering-interval-censored/model/data/result_4_long_format.csv',sep=';')

  interactivity=interactivity, compiler=compiler, result=result)


In [6]:
selected_columns = ["PATNO","AGE_AT_VISIT", "FINAL_SEX_ENCODED", "MCATOT","NP1RTOT" ,"NP2PTOT", "NP3TOT", "SDMTOTAL"]
filtered_df = data[data["COHORT"].isin(["PD", "Prodromal"])][selected_columns]

In [7]:
filtered_df['subtype'] = data['COHORT'].apply(lambda x: 1 if x in ['PD', 'Prodromal'] else 0)

In [7]:
print(type(filtered_df))


<class 'pandas.core.frame.DataFrame'>


In [45]:
print(filtered_df.head())
print(filtered_df.dtypes)

    PATNO  AGE_AT_VISIT  FINAL_SEX_ENCODED  MCATOT  NP1RTOT  NP2PTOT  NP3TOT  \
9    3001          65.4                1.0     NaN      0.0      2.0    18.0   
10   3001          65.6                1.0     NaN      0.0      3.0    23.0   
11   3001          65.9                1.0     NaN      1.0      3.0    19.0   
12   3001          66.2                1.0    30.0      4.0      3.0    20.0   
13   3001          66.7                1.0     NaN      1.0      4.0    29.0   

    SDMTOTAL  subtype  
9        NaN        1  
10       NaN        1  
11       NaN        1  
12      36.0        1  
13       NaN        1  
PATNO                  int64
AGE_AT_VISIT         float64
FINAL_SEX_ENCODED    float64
MCATOT               float64
NP1RTOT              float64
NP2PTOT              float64
NP3TOT               float64
SDMTOTAL             float64
subtype                int64
dtype: object


In [8]:

filtered_df = filtered_df.sort_values(['PATNO', 'AGE_AT_VISIT']).reset_index(drop=True)




In [9]:
filtered_df['obs_time'] = pd.Series(filtered_df.groupby('PATNO').cumcount().values + 1, index=filtered_df.index)


In [10]:
filtered_df.describe

<bound method NDFrame.describe of         PATNO  AGE_AT_VISIT  FINAL_SEX_ENCODED  MCATOT  NP1RTOT  NP2PTOT  \
0        3001          65.4                1.0     NaN      0.0      2.0   
1        3001          65.6                1.0     NaN      0.0      3.0   
2        3001          65.9                1.0     NaN      1.0      3.0   
3        3001          66.2                1.0    30.0      4.0      3.0   
4        3001          66.7                1.0     NaN      1.0      4.0   
...       ...           ...                ...     ...      ...      ...   
10252  320651          56.3                NaN     NaN      1.0      5.0   
10253  324862          57.4                NaN     NaN      1.0      4.0   
10254  325051          65.0                NaN     NaN      1.0      6.0   
10255  325566          51.8                NaN     NaN      1.0      2.0   
10256  329289          62.6                NaN     NaN      3.0      8.0   

       NP3TOT  SDMTOTAL  subtype  obs_time  
0       

In [11]:
cols_to_rescale = ["MCATOT", "NP1RTOT", "NP2PTOT", "NP3TOT", "SDMTOTAL"]

for col in cols_to_rescale:
    max_val = filtered_df[col].max()
    if pd.notnull(max_val) and max_val != 0:
        filtered_df[col] = filtered_df[col] / max_val


In [12]:
from run_experiments import get_hyperparameters_ppmi
b_vae, C, d_s, d_h, d_rnn, reg_type, lr = get_hyperparameters_ppmi()

In [11]:
filtered_df['PATNO'].nunique

<bound method IndexOpsMixin.nunique of 0          3001
1          3001
2          3001
3          3001
4          3001
          ...  
10252    320651
10253    324862
10254    325051
10255    325566
10256    329289
Name: PATNO, Length: 10257, dtype: int64>

In [13]:
filtered_df = filtered_df[[
    "MCATOT", "NP1RTOT", "NP2PTOT", "NP3TOT", "SDMTOTAL",  # features
    "subtype",                  # -4
    "AGE_AT_VISIT",            # time
    "PATNO",                   # patient ID
    "obs_time"                 # relative time
]]

In [14]:
data       = filtered_df
max_visits = 9
shuffle    = True
num_output_dims = data.shape[1] - 4

In [15]:
train_data_loader, train_data_dict, _, _, test_data_loader, test_data_dict, valid_pid, test_pid, unique_pid = parse_data(
            data.values, max_visits=max_visits, test_per=0.2, valid_per=0.2, shuffle=shuffle, device='cpu')

Max visits: 9


In [25]:
filtered_df.head(9)

Unnamed: 0,MCATOT,NP1RTOT,NP2PTOT,NP3TOT,SDMTOTAL,subtype,AGE_AT_VISIT,PATNO,obs_time
0,,0.0,0.044444,0.225,,1,65.4,3001,1
1,,0.0,0.066667,0.2875,,1,65.6,3001,2
2,,0.055556,0.066667,0.2375,,1,65.9,3001,3
3,1.0,0.222222,0.066667,0.25,0.367347,1,66.2,3001,4
4,,0.055556,0.088889,0.3625,,1,66.7,3001,5
5,1.0,0.0,0.044444,0.4875,0.428571,1,67.3,3001,6
6,,0.0,0.111111,0.3125,,1,67.7,3001,7
7,0.966667,0.055556,0.133333,0.425,0.489796,1,68.3,3001,8
8,,0.055556,0.244444,0.525,,1,68.8,3001,9


In [14]:
Y, S, X, M, T = [i for i in train_data_loader][0]

In [26]:
Y[0]

tensor([[-1.0000e+03,  0.0000e+00,  8.8889e-02,  3.0000e-01, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03],
        [-1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03, -1.0000e+03]])

In [24]:
S[0]

tensor([1.])

In [20]:
X[0]

tensor([[    1.],
        [-1000.],
        [-1000.],
        [-1000.],
        [-1000.],
        [-1000.],
        [-1000.],
        [-1000.],
        [-1000.]])

In [21]:
M[0]

tensor([[0., 1., 1., 1., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])

In [15]:
data_loader, collect_dict, unique_pid = parse_data(
            data.values, max_visits=max_visits, device="cpu")


Max visits: 9


In [107]:
b_vae

0.01

In [111]:
import os
os.makedirs('runs', exist_ok=True)


In [16]:
b_vae, C, d_s, d_h, d_rnn, reg_type, lr = 0.01, 0.0, 10, 10, 20, 'l1', 0.1
epochs = 50
device = 'cpu'
max_delta = 5.
learn_time = True

In [33]:
train_data_dict['obs_t_collect']


array([[[    1.],
        [    2.],
        [    3.],
        ...,
        [    7.],
        [    8.],
        [    9.]],

       [[    1.],
        [-1000.],
        [-1000.],
        ...,
        [-1000.],
        [-1000.],
        [-1000.]],

       [[    1.],
        [    2.],
        [    3.],
        ...,
        [-1000.],
        [-1000.],
        [-1000.]],

       ...,

       [[    1.],
        [    2.],
        [    3.],
        ...,
        [    7.],
        [    8.],
        [    9.]],

       [[    1.],
        [    2.],
        [    3.],
        ...,
        [-1000.],
        [-1000.],
        [-1000.]],

       [[    1.],
        [    2.],
        [    3.],
        ...,
        [    7.],
        [-1000.],
        [-1000.]]], dtype=float32)

In [28]:
assert not torch.isnan(X).any(), "Input X contains NaN values"
assert not torch.isnan(Y).any(), "Input Y contains NaN values"


In [None]:


model = Sublign(d_s, d_h, d_rnn, C=C, dim_biomarkers=num_output_dims,
                sigmoid=True, reg_type=reg_type, auto_delta=True,
                max_delta=max_delta, learn_time=learn_time)

# Fit model
model.fit(
    train_data_loader,
    test_data_loader,
    epochs,
    lr=lr,
    verbose=True,
    fname='C:/Users/nss_1/clustering-interval-censored/model/runs/ppmi.pt',
    eval_freq=25
)


# Evaluate model
results = model.score(train_data_dict, test_data_dict)
print('PPMI Test ARI: %.3f' % results['ari'])

# Extract results
subtypes = model.get_subtypes_datadict(collect_dict)
labels = model.get_labels(collect_dict)
deltas = model.get_deltas(collect_dict)

# Move to numpy
subtypes = subtypes.cpu().detach().numpy()
labels = labels.cpu().detach().numpy()
deltas = deltas.cpu().detach().numpy()


# Save
pickle.dump((labels, deltas, subtypes), open('C:/Users/nss_1/clustering-interval-censored/model/runs/ppmi_icml.pk', 'wb'))


No saved model found at fname, starting fresh training.
Max sequence length: 9
All-zero sequences: tensor([False, False, False,  ..., False, False, False])
Sequence lengths: [3, 6, 1, 2, 9, 7, 6, 4, 9, 8, 1, 5, 1, 7, 1, 2, 1, 1, 1, 7, 7, 8, 4, 7, 3, 2, 2, 2, 4, 1, 1, 9, 7, 1, 1, 3, 9, 9, 5, 3, 4, 1, 1, 1, 2, 1, 9, 1, 1, 2, 1, 9, 1, 1, 1, 3, 9, 1, 4, 2, 1, 9, 2, 1, 1, 3, 7, 1, 5, 1, 4, 1, 1, 7, 2, 1, 1, 9, 7, 2, 6, 8, 1, 5, 6, 6, 3, 9, 6, 3, 5, 2, 1, 2, 4, 7, 1, 4, 1, 5, 1, 4, 1, 2, 3, 7, 1, 9, 3, 3, 4, 1, 2, 4, 7, 1, 1, 4, 7, 7, 1, 4, 1, 2, 1, 1, 5, 9, 9, 1, 9, 4, 1, 7, 9, 4, 1, 6, 3, 1, 2, 5, 1, 1, 7, 1, 2, 1, 4, 7, 1, 9, 3, 7, 5, 4, 1, 2, 2, 1, 7, 2, 4, 6, 1, 1, 3, 1, 9, 1, 9, 5, 3, 7, 9, 1, 1, 4, 3, 8, 5, 1, 3, 1, 9, 3, 4, 2, 5, 6, 1, 2, 4, 3, 2, 1, 2, 2, 2, 5, 5, 8, 2, 4, 4, 6, 5, 5, 2, 1, 9, 4, 3, 1, 7, 3, 5, 3, 5, 7, 9, 1, 1, 3, 9, 5, 4, 1, 5, 1, 7, 9, 1, 1, 8, 1, 1, 4, 4, 7, 4, 2, 6, 3, 5, 5, 4, 6, 5, 3, 3, 1, 6, 9, 5, 4, 1, 7, 5, 7, 1, 1, 4, 1, 2, 9, 1, 5, 5, 5, 2, 2, 1, 1, 8, 

In [None]:
%debug