In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class HybridModel(nn.Module):
    """Diffusion imputer followed by a classification head for categorical features.

    The imputer output is split along its last axis into categorical and
    numerical columns. Numerical columns are passed through unchanged; each
    categorical feature's slice is encoded by its own two-layer MLP, the
    encodings are concatenated, run through a shared stack of linear layers and
    finally mapped to per-feature class logits.

    Args:
        diffusion_imputer: Object exposing ``eval_with_grad(data[, mask])`` that
            returns a 3-tuple whose first element is the imputed tensor.
        categorical_features_info: One dict per categorical feature with keys
            ``'indices'`` (columns in the imputed data), ``'indices_target'``
            (columns in the target data), ``'embedding_dim'``, ``'hidden_dim'``
            and ``'num_classes'``. All features must share the same
            ``'num_classes'`` because their logits are stacked into one tensor.
        num_layers: Number of shared linear layers after the first one.
        device: Stored on the instance; kept for backward compatibility.
    """

    def __init__(self, diffusion_imputer, categorical_features_info, num_layers, device="cuda"):
        super().__init__()
        self.diffusion_imputer = diffusion_imputer
        self.categorical_features_info = categorical_features_info
        self.device = device

        # Flatten indices for categorical features
        self.categorical_indices = [idx for info in categorical_features_info for idx in info['indices']]
        self.target_categorical_indices = [idx for info in categorical_features_info for idx in info['indices_target']]

        # Attribute names and creation order are unchanged so existing
        # checkpoints and seeded initialisation keep working.
        self.first_linear_layers = nn.ModuleList([
            nn.Linear(info['embedding_dim'], info['hidden_dim'])
            for info in categorical_features_info
        ])
        self.second_linear_layers = nn.ModuleList([
            nn.Linear(info['hidden_dim'], info['hidden_dim'])
            for info in categorical_features_info
        ])
        num_total_features_and_hidden_dims = sum(info['hidden_dim'] for info in categorical_features_info)

        self.first_layer_ff = nn.Linear(num_total_features_and_hidden_dims, num_total_features_and_hidden_dims)
        self.classification_layers = nn.ModuleList([
            nn.Linear(num_total_features_and_hidden_dims, num_total_features_and_hidden_dims) for _ in range(num_layers)
        ])
        self.output_layers = nn.ModuleList([
            nn.Linear(num_total_features_and_hidden_dims, info['num_classes']) for info in categorical_features_info
        ])

    @staticmethod
    def _split_features(tensor, categorical_indices, num_features):
        """Return ``(categorical, numerical)`` column groups of a 3-D tensor's last axis."""
        categorical_lookup = set(categorical_indices)
        numerical_indices = [i for i in range(num_features) if i not in categorical_lookup]
        return tensor[:, :, categorical_indices], tensor[:, :, numerical_indices]

    def _classify(self, categorical_inputs):
        """Map categorical columns to logits with axes (dim0, dim1, feature, class)."""
        start_idx = 0
        encoded_features = []
        for first_layer, second_layer, info in zip(
            self.first_linear_layers, self.second_linear_layers, self.categorical_features_info
        ):
            # Each feature owns a contiguous run of `embedding_dim` columns.
            end_idx = start_idx + info['embedding_dim']
            hidden = F.relu(first_layer(categorical_inputs[:, :, start_idx:end_idx]))
            encoded_features.append(F.relu(second_layer(hidden)))
            start_idx = end_idx

        shared = F.relu(self.first_layer_ff(torch.cat(encoded_features, dim=2)))
        for layer in self.classification_layers:
            shared = F.relu(layer(shared))

        # One head per feature, stacked on a new axis 2.
        return torch.stack([output_layer(shared) for output_layer in self.output_layers], dim=2)

    def forward(self, data, target_data):
        """Impute ``data`` and classify its categorical columns.

        Args:
            data: 3-D tensor passed to the imputer; its last axis indexes features.
            target_data: 3-D tensor of ground truth, split with ``'indices_target'``.

        Returns:
            Tuple ``(imputed_numerical, categorical_logits, target_numerical,
            target_categorical)``. ``categorical_logits`` is 4-D with the class
            axis last.
        """
        imputed_samples, _, _ = self.diffusion_imputer.eval_with_grad(data)

        target_data_categorical, target_data_numerical = self._split_features(
            target_data, self.target_categorical_indices, target_data.shape[2]
        )
        imputation_results_categorical, imputation_results_numerical = self._split_features(
            imputed_samples, self.categorical_indices, data.shape[2]
        )
        final_outputs_categorical = self._classify(imputation_results_categorical)

        return imputation_results_numerical, final_outputs_categorical, target_data_numerical, target_data_categorical

    def loss_func(self, outputs, targets):
        """Return MSE on numerical columns plus summed cross-entropy over categorical features.

        Args:
            outputs: ``(imputed_numerical, categorical_logits)`` as returned by ``forward``.
            targets: ``(target_numerical, target_categorical)``; the categorical
                targets hold class indices and are cast to ``long``.
        """
        imputation_results_numerical, final_outputs_categorical = outputs
        target_numerical, target_categorical = targets

        loss_numerical = F.mse_loss(imputation_results_numerical, target_numerical)

        # Merge the two leading axes; reshape (unlike view) also accepts
        # non-contiguous inputs.
        logits = final_outputs_categorical.reshape(
            -1, final_outputs_categorical.shape[2], final_outputs_categorical.shape[3]
        )
        # Labels follow the logits' device so the loss works wherever the model runs.
        labels = target_categorical.reshape(-1, target_categorical.shape[2]).long().to(logits.device)

        loss_categorical = sum(
            F.cross_entropy(logits[:, i, :], labels[:, i]) for i in range(logits.shape[1])
        )
        return loss_numerical + loss_categorical

    def eval(self, data=None, imputation_mask=None):
        """Switch to evaluation mode, or run inference when ``data`` is given.

        Called without arguments this behaves like ``nn.Module.eval()`` and
        returns ``self``. Called with ``data`` and ``imputation_mask`` it runs
        imputation and classification under ``torch.no_grad()``; it does not
        change the training flag itself.

        Args:
            data: 3-D input tensor, or ``None`` for the standard mode switch.
            imputation_mask: Tensor broadcastable to ``data``; imputed values are
                used where it is non-zero and the original ``data`` elsewhere.

        Returns:
            ``self`` when ``data`` is ``None``; otherwise the numerical columns
            concatenated with one predicted class index per categorical feature
            along the last axis.

        Raises:
            TypeError: If ``data`` is given without ``imputation_mask``.
        """
        if data is None:
            return super().eval()
        if imputation_mask is None:
            raise TypeError("eval() requires 'imputation_mask' when 'data' is given")

        with torch.no_grad():
            imputed_samples, _, _ = self.diffusion_imputer.eval_with_grad(data, imputation_mask)
            imputation_results = torch.where(imputation_mask != 0, imputed_samples, data)

            imputation_results_categorical, imputation_results_numerical = self._split_features(
                imputation_results, self.categorical_indices, data.shape[2]
            )
            logits = self._classify(imputation_results_categorical)

            # The class axis is the last one; reducing over axis 2 would pick a
            # feature index instead of a class index.
            predicted_classes = torch.argmax(logits, dim=-1)

            return torch.cat(
                [imputation_results_numerical, predicted_classes.to(imputation_results_numerical.dtype)],
                dim=2,
            )


In [29]:
import numpy as np

# Toy dataset: 2 patients x 5 time steps x 3 features, with easily recognisable values.
data = dict(
    # Patient 1 holds 1..15, patient 2 holds 16..30 (row-major).
    outputs=np.arange(1, 31).reshape(2, 5, 3),
    # Same pattern shifted down by one: 0..29.
    prev_outputs=np.arange(30).reshape(2, 5, 3),
    # Both patients have full sequences.
    sequence_lengths=np.array([5, 5]),
    # All entries are active.
    active_entries=np.ones((2, 5, 3)),
    # Time step k (counted across both patients, 0..9) is the row [k, k, k].
    current_treatments=np.repeat(np.arange(10), 3).reshape(2, 5, 3),
    # Rows alternate [0, 1, 0], [1, 0, 1], ... continuing across the patient boundary.
    prev_treatments=((np.arange(10)[:, None] + np.arange(3)) % 2).reshape(2, 5, 3),
    # One static row per patient.
    static_features=np.array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
)

# Assumed scaling parameters used to compute the unscaled outputs.
scaling_params = dict(
    output_means=np.array([1, 2, 3]),
    output_stds=np.array([0.5, 0.5, 0.5]),
)

# Projection horizon
projection_horizon = 2

In [30]:

def explode_trajectories(data, projection_horizon, scaling_params):
    """Expand every patient trajectory into zero-padded prefix trajectories.

    For each patient ``i`` and each ``t`` in
    ``range(projection_horizon, sequence_lengths[i])`` one output row is created
    that holds the first ``t + 1`` time steps of the patient's arrays; the
    remaining time steps stay zero.

    Args:
        data: Dict with 3-D arrays ``'outputs'``, ``'prev_outputs'``,
            ``'active_entries'``, ``'current_treatments'``, ``'prev_treatments'``
            (first axis patients, second axis time), the 1-D
            ``'sequence_lengths'`` and the 2-D ``'static_features'``.
        projection_horizon: Smallest ``t`` for which a prefix is emitted.
        scaling_params: Dict with ``'output_stds'`` and ``'output_means'`` used
            to compute ``'unscaled_outputs'``.

    Returns:
        Dict of float arrays with one row per emitted prefix. Padded positions
        of ``'unscaled_outputs'`` equal ``output_means`` because the scaling is
        applied to the zero padding as well.
    """
    temporal_sources = {
        'outputs': data['outputs'],
        'prev_outputs': data['prev_outputs'],
        'active_entries': data['active_entries'],
        'current_treatments': data['current_treatments'],
        'prev_treatments': data['prev_treatments'],
    }
    sequence_lengths = data['sequence_lengths']
    static_features = data['static_features']

    num_patients, max_seq_length, _ = temporal_sources['outputs'].shape
    # Upper bound on the number of prefixes; trimmed to the real count below.
    capacity = num_patients * max_seq_length

    padded = {
        name: np.zeros((capacity, max_seq_length, source.shape[-1]))
        for name, source in temporal_sources.items()
    }
    padded_static = np.zeros((capacity, static_features.shape[-1]))
    prefix_lengths = np.zeros(capacity)

    row = 0
    for patient in range(num_patients):
        for last_step in range(projection_horizon, int(sequence_lengths[patient])):
            stop = last_step + 1
            for name, source in temporal_sources.items():
                padded[name][row, :stop, :] = source[patient, :stop, :]
            prefix_lengths[row] = stop
            padded_static[row] = static_features[patient]
            row += 1

    # Keep only the rows that were actually filled.
    exploded_outputs = padded['outputs'][:row, :, :]
    return {
        'prev_treatments': padded['prev_treatments'][:row, :, :],
        'current_treatments': padded['current_treatments'][:row, :, :],
        'static_features': padded_static[:row, :],
        'prev_outputs': padded['prev_outputs'][:row, :, :],
        'outputs': exploded_outputs,
        'unscaled_outputs': exploded_outputs * scaling_params['output_stds'] + scaling_params['output_means'],
        'sequence_lengths': prefix_lengths[:row],
        'active_entries': padded['active_entries'][:row, :, :],
    }



In [35]:
# Dump every array of the toy dataset (name, shape, contents).
for key, value in data.items():
    print(key, value.shape, value)

outputs (2, 5, 3) [[[ 1  2  3]
  [ 4  5  6]
  [ 7  8  9]
  [10 11 12]
  [13 14 15]]

 [[16 17 18]
  [19 20 21]
  [22 23 24]
  [25 26 27]
  [28 29 30]]]
prev_outputs (2, 5, 3) [[[ 0  1  2]
  [ 3  4  5]
  [ 6  7  8]
  [ 9 10 11]
  [12 13 14]]

 [[15 16 17]
  [18 19 20]
  [21 22 23]
  [24 25 26]
  [27 28 29]]]
sequence_lengths (2,) [5 5]
active_entries (2, 5, 3) [[[1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]]

 [[1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]
  [1. 1. 1.]]]
current_treatments (2, 5, 3) [[[0 0 0]
  [1 1 1]
  [2 2 2]
  [3 3 3]
  [4 4 4]]

 [[5 5 5]
  [6 6 6]
  [7 7 7]
  [8 8 8]
  [9 9 9]]]
prev_treatments (2, 5, 3) [[[0 1 0]
  [1 0 1]
  [0 1 0]
  [1 0 1]
  [0 1 0]]

 [[1 0 1]
  [0 1 0]
  [1 0 1]
  [0 1 0]
  [1 0 1]]]
static_features (2, 3) [[0.1 0.2 0.3]
 [0.4 0.5 0.6]]


In [31]:
# Explode the toy trajectories into zero-padded prefix trajectories.
new_data = explode_trajectories(data, projection_horizon, scaling_params)

# Collect (shape, array) per key so the notebook displays both for each array.
output_results = {}
for key, value in new_data.items():
    output_results[key] = value.shape, value

# Bare expression: shown as the cell output.
output_results

{'prev_treatments': ((6, 5, 3),
  array([[[0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.],
          [0., 0., 0.],
          [0., 0., 0.]],
  
         [[0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.],
          [0., 0., 0.]],
  
         [[0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.]],
  
         [[1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.],
          [0., 0., 0.],
          [0., 0., 0.]],
  
         [[1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.],
          [0., 0., 0.]],
  
         [[1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.],
          [0., 1., 0.],
          [1., 0., 1.]]])),
 'current_treatments': ((6, 5, 3),
  array([[[0., 0., 0.],
          [1., 1., 1.],
          [2., 2., 2.],
          [0., 0., 0.],
          [0., 0., 0.]],
  
         [[0., 0., 0.],
          [1., 1., 1.],
          [2., 2.

In [40]:
from copy import deepcopy

def process_sequential_test(data, projection_horizon, encoder_r=None, save_encoder_r=False, output_scaling=None):
    """
    Pre-process test dataset for multiple-step-ahead prediction: takes the last n-steps according to the projection horizon

    For every patient the last ``projection_horizon`` steps of its sequence are
    cut out as the prediction window; the steps before it
    (``fact_length = sequence_length - projection_horizon``) are the observed part.
    Each sequence is expected to be longer than ``projection_horizon``
    (``fact_length - 1`` is used as an index).

    Args:
        data: Dict with ``'sequence_lengths'``, 3-D arrays ``'outputs'``,
            ``'current_treatments'``, ``'prev_treatments'``,
            ``'current_covariates'`` and the pass-through entries
            ``'patient_types'``, ``'patient_ids_all_trajectories'`` and
            ``'patient_current_t'``. The input dict is not modified.
        projection_horizon: Number of trailing steps per patient to extract.
        encoder_r: Optional array indexed as ``encoder_r[patient, step]``; the
            entry at the last observed step becomes ``'init_state'``.
        save_encoder_r: Accepted for backward compatibility; it has no effect
            on the returned data.
        output_scaling: Optional dict with ``'output_stds'`` and
            ``'output_means'``. When omitted, the module-level
            ``scaling_params`` is used, as in earlier versions.

    Returns:
        Dict of arrays restricted to the prediction window.
    """
    # Only fall back to the notebook-level global when no scaling was passed in.
    scaling = scaling_params if output_scaling is None else output_scaling

    sequence_lengths = data['sequence_lengths']
    outputs = data['outputs']
    current_treatments = data['current_treatments']
    previous_treatments = data['prev_treatments'][:, 1:, :]  # Without zero_init_treatment
    current_covariates = data['current_covariates']

    num_patient_points, max_seq_length, _ = outputs.shape

    if encoder_r is not None:
        seq2seq_state_inits = np.zeros((num_patient_points, encoder_r.shape[-1]))
    seq2seq_active_encoder_r = np.zeros((num_patient_points, max_seq_length - projection_horizon))
    seq2seq_previous_treatments = np.zeros((num_patient_points, projection_horizon, previous_treatments.shape[-1]))
    seq2seq_current_treatments = np.zeros((num_patient_points, projection_horizon, current_treatments.shape[-1]))
    seq2seq_current_covariates = np.zeros((num_patient_points, projection_horizon, current_covariates.shape[-1]))
    seq2seq_outputs = np.zeros((num_patient_points, projection_horizon, outputs.shape[-1]))
    seq2seq_active_entries = np.zeros((num_patient_points, projection_horizon, 1))
    seq2seq_sequence_lengths = np.zeros(num_patient_points)

    for i in range(num_patient_points):
        fact_length = int(sequence_lengths[i]) - projection_horizon
        window = slice(fact_length, fact_length + projection_horizon)

        if encoder_r is not None:
            seq2seq_state_inits[i] = encoder_r[i, fact_length - 1]
        seq2seq_active_encoder_r[i, :fact_length] = 1.0

        seq2seq_active_entries[i] = np.ones(shape=(projection_horizon, 1))
        # previous_treatments lost its first step above, hence the window shifted by one.
        seq2seq_previous_treatments[i] = previous_treatments[i, fact_length - 1:fact_length + projection_horizon - 1, :]
        seq2seq_current_treatments[i] = current_treatments[i, window, :]
        seq2seq_outputs[i] = outputs[i, window, :]
        seq2seq_sequence_lengths[i] = projection_horizon
        # The covariates of the last observed step are repeated over the whole window.
        seq2seq_current_covariates[i] = np.repeat([current_covariates[i, fact_length - 1]], projection_horizon, axis=0)

    # Package outputs
    seq2seq_data = {
        'active_encoder_r': seq2seq_active_encoder_r,
        'prev_treatments': seq2seq_previous_treatments,
        'current_treatments': seq2seq_current_treatments,
        'current_covariates': seq2seq_current_covariates,
        # First covariate column is exposed as prev_outputs, the remaining ones as static features.
        'prev_outputs': seq2seq_current_covariates[:, :, :1],
        'static_features': seq2seq_current_covariates[:, 0, 1:],
        'outputs': seq2seq_outputs,
        'sequence_lengths': seq2seq_sequence_lengths,
        'active_entries': seq2seq_active_entries,
        'unscaled_outputs': seq2seq_outputs * scaling['output_stds'] + scaling['output_means'],
        'patient_types': data['patient_types'],
        'patient_ids_all_trajectories': data['patient_ids_all_trajectories'],
        'patient_current_t': data['patient_current_t']
    }
    if encoder_r is not None:
        seq2seq_data['init_state'] = seq2seq_state_inits

    return seq2seq_data

# Extend the toy `data` dict in place with the extra keys process_sequential_test reads.
data['prev_treatments'] = np.array([
    [[0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0]],  # Patient 1
    [[1, 0, 1], [0, 1, 0], [1, 0, 1], [0, 1, 0], [1, 0, 1]]  # Patient 2
])

# Dummy current_covariates for the example (same values as current_treatments).
data['current_covariates'] = np.array([
    [[0, 0, 0], [1, 1, 1], [2, 2, 2], [3, 3, 3], [4, 4, 4]],  # Patient 1
    [[5, 5, 5], [6, 6, 6], [7, 7, 7], [8, 8, 8], [9, 9, 9]]  # Patient 2
])

# Pass-through keys the function copies into its result unchanged.
data['patient_types'] = np.array([0, 1])
data['patient_ids_all_trajectories'] = np.array([101, 102])
data['patient_current_t'] = np.array([0, 0])


# Run the function (no encoder representations, so no 'init_state' in the result).
processed_data = process_sequential_test(data, projection_horizon)

# Show only the shapes of the returned arrays.

processed_data_shapes = {k: v.shape for k, v in processed_data.items()}
processed_data_shapes

{'active_encoder_r': (2, 3),
 'prev_treatments': (2, 2, 3),
 'current_treatments': (2, 2, 3),
 'current_covariates': (2, 2, 3),
 'prev_outputs': (2, 2, 1),
 'static_features': (2, 2),
 'outputs': (2, 2, 3),
 'sequence_lengths': (2,),
 'active_entries': (2, 2, 1),
 'unscaled_outputs': (2, 2, 3),
 'patient_types': (2,),
 'patient_ids_all_trajectories': (2,),
 'patient_current_t': (2,)}

In [45]:
data

{'outputs': array([[[ 1,  2,  3],
         [ 4,  5,  6],
         [ 7,  8,  9],
         [10, 11, 12],
         [13, 14, 15]],
 
        [[16, 17, 18],
         [19, 20, 21],
         [22, 23, 24],
         [25, 26, 27],
         [28, 29, 30]]]),
 'prev_outputs': array([[[ 0,  1,  2],
         [ 3,  4,  5],
         [ 6,  7,  8],
         [ 9, 10, 11],
         [12, 13, 14]],
 
        [[15, 16, 17],
         [18, 19, 20],
         [21, 22, 23],
         [24, 25, 26],
         [27, 28, 29]]]),
 'sequence_lengths': array([5, 5]),
 'active_entries': array([[[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]],
 
        [[1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.],
         [1., 1., 1.]]]),
 'current_treatments': array([[[0, 0, 0],
         [1, 1, 1],
         [2, 2, 2],
         [3, 3, 3],
         [4, 4, 4]],
 
        [[5, 5, 5],
         [6, 6, 6],
         [7, 7, 7],
         [8, 8, 8],
      

In [42]:
processed_data

{'active_encoder_r': array([[1., 1., 1.],
        [1., 1., 1.]]),
 'prev_treatments': array([[[1., 0., 1.],
         [0., 1., 0.]],
 
        [[0., 1., 0.],
         [1., 0., 1.]]]),
 'current_treatments': array([[[3., 3., 3.],
         [4., 4., 4.]],
 
        [[8., 8., 8.],
         [9., 9., 9.]]]),
 'current_covariates': array([[[2., 2., 2.],
         [2., 2., 2.]],
 
        [[7., 7., 7.],
         [7., 7., 7.]]]),
 'prev_outputs': array([[[2.],
         [2.]],
 
        [[7.],
         [7.]]]),
 'static_features': array([[2., 2.],
        [7., 7.]]),
 'outputs': array([[[10., 11., 12.],
         [13., 14., 15.]],
 
        [[25., 26., 27.],
         [28., 29., 30.]]]),
 'sequence_lengths': array([2., 2.]),
 'active_entries': array([[[1.],
         [1.]],
 
        [[1.],
         [1.]]]),
 'unscaled_outputs': array([[[ 6. ,  7.5,  9. ],
         [ 7.5,  9. , 10.5]],
 
        [[13.5, 15. , 16.5],
         [15. , 16.5, 18. ]]]),
 'patient_types': array([0, 1]),
 'patient_ids_all_tr

# Eval

In [None]:
# diffusion_imputer = torch.load("diffusion_imputer_cancer_jul5.pt")
# data_embedder = torch.load("data_embedder_cancer_jul5.pt")

In [None]:
# num_batch_test = 82
# data_loaders = get_dataloader(num_batch_test, "cuda")
# data_loader_test = get_dataloader(num_batch_test, "cuda")[2]

In [None]:
# Normalisation statistics of the 'cancer_volume' column; later cells use them to map
# model outputs back to the original scale (value * std + mean).
# `stds` and `means` are defined outside this part of the notebook.
training_standard_deviation = stds['cancer_volume']
training_mean = means['cancer_volume']

# Bare expressions: displayed as the cell output.
training_standard_deviation
training_mean

63.3630417049946

9.79698436907723

In [None]:
# Choose the dataset to evaluate; the commented lines are the alternative choices.
test_data = test_data_counterfactuals_tensor
# test_data = test_data_factuals_tensor
# test_data = validation_data_tensor

# Displayed as the cell output.
test_data.shape

torch.Size([204568, 60, 3])

In [None]:
test_data_counterfactuals_sequence_lengths

array([ 1.,  1.,  1., ..., 37., 37., 37.])

In [None]:
# Divide the test data into chunks so that all rows of a chunk have the same sequence length.
# Entry k of test_data_counterfactuals_sequence_lengths is the sequence length of row k of test_data.
# So if entries 0, 5 and 19 are equal, rows 0, 5 and 19 of test_data have the same length and
# belong to the same chunk.

# Find the unique sequence lengths and how many rows have each of them.
unique_sequence_lengths, counts = np.unique(test_data_counterfactuals_sequence_lengths, return_counts=True)
# unique_sequence_lengths, counts = np.unique(test_data_factuals_sequence_lengths, return_counts=True)
# unique_sequence_lengths, counts = np.unique(validation_data_sequence_lengths, return_counts=True)

unique_sequence_lengths
counts

# For every unique sequence length, find the row indices that have it.
indices = [np.where(test_data_counterfactuals_sequence_lengths == i)[0] for i in unique_sequence_lengths]
# indices = [np.where(test_data_factuals_sequence_lengths == i)[0] for i in unique_sequence_lengths]
# indices = [np.where(validation_data_sequence_lengths == i)[0] for i in unique_sequence_lengths]

# Divide the test data into one chunk per unique sequence length.
test_data_chunks = [test_data[i] for i in indices]
len(test_data_chunks)
test_data_chunks[0].shape
test_data_chunks[10].shape

# Cut the time axis of each chunk to `length + 1` steps (a length of 1 keeps 2 steps).
# NOTE(review): presumably the extra step is the one to be imputed - confirm.
test_data_chunks = [chunk[:, :int(length)+1, :] for chunk, length in zip(test_data_chunks, unique_sequence_lengths)]

test_data_chunks[0].shape
test_data_chunks[10].shape



array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52.,
       53., 54., 55., 56., 57., 58., 59.])

array([4000, 3944, 3920, 3888, 3864, 3852, 3852, 3848, 3840, 3828, 3828,
       3824, 3820, 3820, 3812, 3808, 3800, 3800, 3796, 3780, 3780, 3772,
       3764, 3744, 3716, 3692, 3684, 3660, 3636, 3612, 3600, 3588, 3552,
       3532, 3492, 3456, 3420, 3372, 3352, 3320, 3272, 3240, 3200, 3164,
       3132, 3088, 3056, 3028, 2984, 2952, 2916, 2888, 2844, 2796, 2752,
       2716, 2680, 2624, 2568])

59

torch.Size([4000, 60, 3])

torch.Size([3828, 60, 3])

torch.Size([4000, 2, 3])

torch.Size([3828, 12, 3])

In [None]:
# Copy every chunk to the GPU.
test_data_chunks = [chunk.to("cuda") for chunk in test_data_chunks]

# Run the data embedder on every chunk.
# `data_embedder` must be loaded first; its loading cell is commented out in this notebook.
test_data_chunks_embedded = [data_embedder(chunk) for chunk in test_data_chunks]

# Create one imputation mask per embedded chunk; later cells treat non-zero mask entries
# as the positions to impute/evaluate and zero entries as given.
imputation_masks = [diffusion_imputer.get_mask(chunk,
                                               strategy='selected_features_last_n_time') for chunk in test_data_chunks_embedded]
imputation_masks = [mask.to("cuda") for mask in imputation_masks]
imputation_masks[0]


NameError: name 'data_embedder' is not defined

In [None]:
imputation_masks[16].shape

torch.Size([3800, 18, 3])

In [None]:
# Split every embedded chunk into two complementary views using its mask:
# given_points keeps the values where the mask is 0 and evaluated_points keeps the values
# where the mask is non-zero; all other positions are replaced by NaN.
given_points = [torch.where(mask == 0, chunk, torch.tensor(float('nan')).to("cuda")) for chunk, mask in zip(test_data_chunks_embedded, imputation_masks)]
evaluated_points = [torch.where(mask != 0, chunk, torch.tensor(float('nan')).to("cuda")) for chunk, mask in zip(test_data_chunks_embedded, imputation_masks)]

# Inspect the last chunk (index 58 of 59).
given_points[58]
evaluated_points[58]

tensor([[[ 0.1664, -2.5844, -0.1376],
         [-0.1161, -2.5844, -0.1397],
         [-0.1161, -2.5844, -0.1419],
         ...,
         [-0.1161, -1.9149, -0.1546],
         [ 0.1664, -1.9149, -0.1546],
         [ 0.1664, -2.5844,     nan]],

        [[ 0.1664, -2.5844, -0.1376],
         [-0.1161, -2.5844, -0.1397],
         [-0.1161, -2.5844, -0.1419],
         ...,
         [-0.1161, -1.9149, -0.1546],
         [ 0.1664, -1.9149, -0.1546],
         [ 0.1664, -1.9149,     nan]],

        [[ 0.1664, -2.5844, -0.1376],
         [-0.1161, -2.5844, -0.1397],
         [-0.1161, -2.5844, -0.1419],
         ...,
         [-0.1161, -1.9149, -0.1546],
         [ 0.1664, -1.9149, -0.1546],
         [-0.1161, -2.5844,     nan]],

        ...,

        [[ 0.1664, -2.5844, -0.1469],
         [-0.1161, -2.5844, -0.1472],
         [ 0.1664, -2.5844, -0.1471],
         ...,
         [-0.1161, -2.5844, -0.1546],
         [-0.1161, -1.9149, -0.1546],
         [ 0.1664, -2.5844,     nan]],

        [[

tensor([[[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        ...,

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[

In [None]:
# Draw `sample_number` imputations for a fixed slice of `sample_count` rows of every chunk.
sample_number = 20
sample_count = 100
# sample_count = test_data_chunks_embedded[0].shape[0]
# `samples` is filled draw by draw and, within a draw, chunk by chunk, so entry
# i * len(test_data_chunks_embedded) + j belongs to draw i and chunk j.
samples = []
for i in range(sample_number):
    for j in range(len(test_data_chunks_embedded)):#range(58,59):
        # Rows 2000 .. 2000 + sample_count - 1 of chunk j are evaluated; this offset is hard-coded.
        # NOTE(review): later cells index the result as [0] = imputation results, [1] = input data,
        # [2] = mask - confirm against diffusion_imputer.eval.
        samples.append(diffusion_imputer.eval(
            test_data_chunks_embedded[j][2000:(2000+sample_count)], imputation_masks[j][2000:(2000+sample_count)],
            mean = training_mean, std = training_standard_deviation, scale = 1
            ),
            )
        # Fraction of (draw, chunk) pairs finished before this one, so the first line prints 0.00%.
        progress = (i * len(test_data_chunks_embedded) + j) / (sample_number * len(test_data_chunks_embedded))      
        print(f"Progress: {progress * 100:.2f}%")
        print(j)

max difference =  74.056884765625
data at max difference =  3.7393510341644287
imputed at max difference =  77.79623413085938
mae =  0.8374786376953125
rmse =  40.83018692680027
Progress: 0.00%
0
max difference =  14.039884567260742
data at max difference =  1.941130518913269
imputed at max difference =  15.9810152053833
mae =  0.3028331398963928
rmse =  8.947113700534986
Progress: 0.08%
1
max difference =  4.248103141784668
data at max difference =  5.211277484893799
imputed at max difference =  0.9631745219230652
mae =  0.1619843691587448
rmse =  3.366079910941746
Progress: 0.17%
2
max difference =  5.0010480880737305
data at max difference =  4.876647472381592
imputed at max difference =  -0.12440063059329987
mae =  0.24792127311229706
rmse =  4.737360415251358
Progress: 0.25%
3
max difference =  46.03645706176758
data at max difference =  4.2342400550842285
imputed at max difference =  50.27069854736328
mae =  0.5802479982376099
rmse =  25.469065790591035
Progress: 0.34%
4
max diff

In [None]:
len(samples)

1180

In [None]:
# Save the list of sampled imputations so the evaluation cells can be rerun without resampling.
torch.save(samples, "samples_cancer_jul18.pt")

In [None]:
# Reload the sampled imputations saved earlier.
# NOTE: torch.load unpickles the file, so only load files from a trusted source.
samples = torch.load("samples_cancer_jul18.pt")

In [None]:
sample_number = 20

In [None]:
# imputation_results = [samples[i][0] for i in range(len(samples))] # = imputation results, input data, mask, mae
# imputation_results = torch.stack(imputation_results, dim = 0)

In [None]:
target_column = 2
sample_count = 100

In [None]:
samples[0][0][:, -1, target_column]
samples[1][0][:, -1, target_column]

tensor([-1.4966e-01, -1.4914e-01, -1.5102e-01, -1.4979e-01, -1.6304e-01,
        -1.5139e-01, -1.5176e-01, -1.5261e-01, -1.4264e-01, -1.4423e-01,
        -1.4835e-01, -1.4299e-01, -1.5450e-01, -1.5416e-01, -1.5417e-01,
        -1.5443e-01,  1.6441e+00,  2.3323e+00,  6.3973e-01,  7.8841e-01,
        -1.5220e-01, -1.5172e-01, -1.5158e-01, -1.5215e-01,  2.7613e-01,
         4.5262e-01,  3.9072e-01,  2.9732e-01, -1.4774e-01, -1.4628e-01,
        -1.4643e-01, -1.4770e-01, -1.5400e-01, -1.5291e-01, -1.5387e-01,
        -1.5385e-01,  4.5932e-01,  6.5887e-01,  9.9221e-01,  5.4589e-01,
         3.8584e-02,  1.0498e-02,  7.3444e-03, -2.7701e-02,  5.0006e+00,
         1.7736e+00,  7.7796e+01,  2.0759e+00, -1.0107e-01, -1.1607e-01,
        -1.1746e-01, -1.0900e-01, -1.4332e-01, -1.4690e-01, -1.4138e-01,
        -1.4555e-01, -1.5475e-01, -1.5344e-01, -1.5400e-01, -1.5459e-01,
        -1.5602e-01, -1.5288e-01, -1.5384e-01, -1.5398e-01, -1.5491e-01,
        -1.5442e-01, -1.5564e-01, -1.5584e-01, -1.5

tensor([-1.5306e-01, -1.5309e-01, -1.5356e-01, -1.5313e-01,  1.8465e-01,
         4.0122e-01,  1.7183e-01,  1.6584e-01, -1.4953e-01, -1.4745e-01,
        -1.4868e-01, -1.4831e-01, -1.5351e-01, -1.5296e-01, -1.5402e-01,
        -1.5352e-01,  6.1852e-01,  6.8694e-01,  2.4686e-01,  5.7783e-01,
         1.5740e-02,  1.2217e-02,  2.9980e-03, -2.7584e-02,  1.5981e+01,
         5.9380e+00,  9.5963e-01,  1.1508e+01, -1.0969e-01, -1.0823e-01,
        -1.1730e-01, -1.1252e-01, -1.4156e-01, -1.4307e-01, -1.4203e-01,
        -1.4296e-01, -1.5262e-01, -1.5228e-01, -1.5351e-01, -1.5373e-01,
        -1.5347e-01, -1.5314e-01, -1.5391e-01, -1.5324e-01, -1.5466e-01,
        -1.5353e-01, -1.5379e-01, -1.5479e-01, -1.5154e-01, -1.5201e-01,
        -1.5204e-01, -1.5289e-01, -1.3660e-01, -1.3292e-01, -1.3490e-01,
        -1.3426e-01, -1.5384e-01, -1.5397e-01, -1.5414e-01, -1.5454e-01,
        -6.9597e-02, -1.3041e-01, -6.5243e-02, -5.1264e-02, -1.3840e-01,
        -1.4255e-01, -1.3649e-01, -1.3897e-01,  4.4

In [None]:
# Take the target column at the last time step from every sample's imputation result.
# Each entry of `samples` is indexed as: [0] imputation results, [1] input data, [2] mask, [3] mae.
imputation_results = [samples[i][0][:, -1, target_column] for i in range(len(samples))] # = imputation results, input data, mask, mae
imputation_results = torch.cat(imputation_results, dim = 0)
# Undo the normalisation (value * std + mean).
imputation_results_denormed = imputation_results * training_standard_deviation + training_mean
# NOTE(review): if `samples` is ordered draw -> chunk (as the sampling loop appends it) and each
# entry contributes `sample_count` rows, the concatenated vector is ordered draw -> chunk -> row.
# This reshape labels the axes (draw, row, chunk) without a transpose, so positions along the last
# two axes may not correspond to (row, chunk). Element-wise metrics stay consistent as long as the
# ground truth is reshaped identically; a per-chunk analysis would instead need
# reshape(sample_number, -1, sample_count).transpose(1, 2) - confirm.
imputation_results_denormed = imputation_results_denormed.reshape(sample_number, sample_count, -1)
# imputation_results_denormed
# Optional clipping of values above 1150 (disabled):
# imputation_results_denormed[imputation_results_denormed > 1150] = 1150
imputation_results_denormed.shape

torch.Size([20, 100, 59])

In [None]:
# imputation_results = [samples[i][0] for i in range(len(samples))] # = imputation results, input data, mask, mae
# imputation_results = torch.stack(imputation_results, dim = 0)

In [None]:
samples[1179][1].shape

torch.Size([100, 60, 3])

In [None]:
len(samples)

1180

In [None]:
# Take the target column at the last time step from every sample's input data (element [1]).
# Each entry of `samples` is indexed as: [0] imputation results, [1] input data, [2] mask, [3] mae.
denormed_data = [samples[i][1][:, -1, target_column] for i in range(len(samples))] # = imputation results, input data, mask, mae
denormed_data = torch.cat(denormed_data, dim = 0)
# Undo the normalisation (value * std + mean).
denormed_data = denormed_data * training_standard_deviation + training_mean
# NOTE(review): this uses the same reshape as the imputation results, so both tensors stay aligned
# element by element; whether the last two axes really mean (row, chunk) depends on the order in
# which `samples` was filled - confirm before any per-chunk analysis.
denormed_data = denormed_data.reshape(sample_number, sample_count, -1)
# The input data is the same for every draw, so only the first draw is kept.
denormed_data = denormed_data[0]
denormed_data.shape

torch.Size([100, 59])

In [None]:
# imputation_results_denormed[0]
# imputation_results_denormed[1]
# denormed_data = imputation_results_denormed[1]
# imputation_results_denormed = imputation_results_denormed[0]


In [None]:
# imputation_mask = [samples[i][2][:, -1, 2] for i in range(len(samples))]
# imputation_mask = torch.cat(imputation_mask, dim = 0)
# imputation_mask

In [None]:
# denormed_data = test_data_chunks[16] * training_standard_deviation + training_mean
# denormed_data = denormed_data.cpu().detach()

# given_points = given_points.cpu().detach()
# eval_points = eval_points.cpu().detach()

In [None]:
# qlist = [0.05, 0.25, 0.50, 0.75, 0.95]
# #qlist = [0.5]
# quantiles_imp = []
# for q in qlist:
#     quantiles = torch.quantile(imputation_results_denormed, q, dim=0, interpolation="linear")
#     quantiles_imp.append(quantiles)

# means = torch.mean(imputation_results_denormed, dim=0)
# quantiles_imp.append(means)

# quantiles_imp = torch.stack(quantiles_imp, dim=0).cpu()


In [None]:
# Summarise the draws (axis 0): five quantiles followed by the mean.
qlist = [0.05, 0.25, 0.50, 0.75, 0.95]
#qlist = [0.5]
quantiles_imp = []
for q in qlist:
    quantiles = torch.quantile(imputation_results_denormed, q, dim=0, interpolation="linear")
    quantiles_imp.append(quantiles)

# NOTE: this rebinds the notebook-level name `means` (used earlier for the training statistics).
means = torch.mean(imputation_results_denormed, dim=0)
quantiles_imp.append(means)

# Resulting layout along axis 0: indices 0-4 are the quantiles in `qlist` order (index 2 is the
# median), index 5 is the mean.
quantiles_imp = torch.stack(quantiles_imp, dim=0).cpu()


In [None]:
quantiles_imp.shape

torch.Size([6, 100, 59])

In [None]:
# mae = torch.mean(torch.abs(denormed_data[imputation_masks[58].cpu() != 0] - torch.Tensor(quantiles_imp[2][imputation_masks[58].cpu() != 0]))).item()
# print(mae/1150 * 100)

In [None]:
denormed_data.shape

torch.Size([100, 59])

In [None]:
# Mean absolute error between the de-normalised data and entry 2 of
# quantiles_imp (the 0.50 quantile in qlist), printed as a percentage of 1150
# (presumably the upper bound of the target variable — confirm).
abs_error = torch.abs(denormed_data.cpu() - torch.Tensor(quantiles_imp[2].cpu()))
mae = torch.mean(abs_error).item()
print(mae/1150 * 100)

0.13610567217287808


In [None]:
# rmse = torch.sqrt(torch.mean((denormed_data[imputation_masks[16].cpu() != 0] - 
#                               torch.Tensor(quantiles_imp[2][imputation_masks[16].cpu() != 0]))**2)).item()
# print(rmse/1150 * 100)

In [None]:
# Root-mean-square error between the de-normalised data and entry 2 of
# quantiles_imp (the 0.50 quantile in qlist), printed as a percentage of 1150
# (presumably the upper bound of the target variable — confirm).
squared_error = (denormed_data.cpu() - torch.Tensor(quantiles_imp[2].cpu()))**2
rmse = torch.sqrt(torch.mean(squared_error)).item()
print(rmse/1150 * 100)

1.1497809368631116


In [None]:
def quantile_loss(target, forecast, q: float, eval_points) -> float:
    """Return twice the summed pinball (quantile) loss over the evaluated points.

    Args:
        target: Tensor of reference values.
        forecast: Tensor of predicted values for quantile level ``q``;
            combined element-wise with ``target``.
        q: Quantile level.
        eval_points: Multiplicative weight per element; entries equal to 0
            remove that element from the sum.

    Returns:
        The value of ``2 * sum(|(forecast - target) * eval_points * (1[target <= forecast] - q)|)``
        as produced by ``torch.sum``.
    """
    # 1.0 where the forecast is at or above the target, 0.0 elsewhere.
    indicator = (target <= forecast) * 1.0
    weighted_error = (forecast - target) * eval_points * (indicator - q)
    return 2 * torch.sum(torch.abs(weighted_error))


def calc_denominator(target, eval_points):
    """Return the sum of absolute target values weighted by ``eval_points``.

    Elements whose ``eval_points`` entry is 0 contribute nothing to the sum.
    """
    masked_target = target * eval_points
    return masked_target.abs().sum()


def calc_quantile_CRPS(target, forecast, eval_points, mean_scaler, scaler):
    """Approximate the CRPS as the average normalised quantile loss.

    Both ``target`` and ``forecast`` are first mapped back to the data scale
    with ``x * scaler + mean_scaler``. For each quantile level in
    0.05, 0.10, ..., 0.95 the forecast quantile is taken over dim 1 of
    ``forecast``, scored with ``quantile_loss`` and divided by
    ``calc_denominator(target, eval_points)``.

    Args:
        target: Normalised reference values.
        forecast: Normalised forecasts; dim 1 is reduced by the quantile
            (assumed to be the sample axis — TODO confirm against the caller).
        eval_points: Weights selecting the elements that are scored.
        mean_scaler: Offset added when undoing the normalisation.
        scaler: Factor applied when undoing the normalisation.

    Returns:
        The accumulated normalised loss, as a Python float, divided by the
        number of quantile levels.
    """
    # Back to the original data scale.
    target = target * scaler + mean_scaler
    forecast = forecast * scaler + mean_scaler

    quantiles = np.arange(0.05, 1.0, 0.05)
    denom = calc_denominator(target, eval_points)

    CRPS = 0
    for level in quantiles:
        # The quantile is evaluated one leading-index slice at a time and the
        # slices are stitched back together along dim 0.
        per_item = [
            torch.quantile(forecast[j : j + 1], level, dim=1)
            for j in range(len(forecast))
        ]
        q_pred = torch.cat(per_item, 0)
        CRPS += quantile_loss(target, q_pred, level, eval_points) / denom
    return CRPS.item() / len(quantiles)


In [None]:
sample_number = 100

In [None]:
# Quantile-CRPS over the first `sample_number` entries of `samples`.
# Each entry of `samples` is indexed as [0] imputation results, [1] input
# data, [2] mask.
# NOTE(review): this cell last failed with the IndexError shown in its output.
# The sampling loop in this notebook appends one entry per sequence-length
# chunk, so consecutive entries of `samples` may be different chunks rather
# than repeated draws of one batch — confirm that the "same for all samples"
# assumptions below actually hold for the current contents of `samples`.
all_target = samples[0][1]  # input_data same for all samples (B, L, K)
all_generated_samples = torch.stack([samples[i][0] for i in range(sample_number)]).permute(1,0,2,3)  # (B, sample_num, L, K)
all_evalpoint = samples[0][2]  # mask same for all samples (B, L, K)
CRPS = calc_quantile_CRPS(all_target, all_generated_samples, all_evalpoint, training_mean, training_standard_deviation)
print(CRPS)

IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr"

In [None]:
quantiles_imp.shape

torch.Size([6, 1000, 60, 3])

In [None]:
# Grid of plots: one row per visualised sample, one column per feature.
# Rows of quantiles_imp follow qlist plus the appended mean:
#   0 -> 0.05, 2 -> 0.50 (median), 4 -> 0.95, 5 -> mean.
# NOTE(review): this cell indexes denormed_data / eval_points / given_points
# as [sample, time, feature] and quantiles_imp[q] the same way. It last failed
# with "too many indices for tensor of dimension 2", i.e. at least one of them
# was 2-D at the time — confirm the inputs are the full (sample, time, feature)
# tensors before re-running.
L = test_data.shape[1]
K = test_data.shape[2]

dataind = 40  # Number of samples to visualize
start_time = 40  # First time step drawn on the x-axis
n_skip = 0  # Number of columns to skip

plt.rcParams["font.size"] = 16
# squeeze=False keeps `axes` two-dimensional, so axes[row][col] also works
# when there is a single row or a single column.
fig, axes = plt.subplots(nrows=dataind, ncols=K - n_skip, figsize=(24.0, 6 * dataind), squeeze=False)

# x positions shared by every panel.
time_axis = np.arange(start_time, L)

for i in range(dataind):
    for k in range(n_skip, K):  # Start from n_skip instead of 0
        # Points that were evaluated (eval_points != 0).
        df = pd.DataFrame({
            "x": time_axis,
            "val": denormed_data[i, start_time:, k],
            "y": eval_points[i, start_time:, k]
        })
        df = df[df.y != 0]
        # Points that were given as conditioning (currently built but not drawn).
        df2 = pd.DataFrame({
            "x": time_axis,
            "val": denormed_data[i, start_time:, k],
            "y": given_points[i, start_time:, k]
        })
        df2 = df2[df2.y != 0]
        # Absolute time indices of the evaluated points.
        indices = df.x.astype(int).to_numpy()
        row = i
        col = k - n_skip  # Adjust column index for skipped columns
        ax = axes[row][col]

        # Median line with the 0.05-0.95 band.
        ax.plot(time_axis, quantiles_imp[2][i, start_time:, k], color='g', linestyle='solid', label='median')
        ax.fill_between(time_axis, quantiles_imp[0][i, start_time:, k], quantiles_imp[4][i, start_time:, k], color='g', alpha=0.3)
        # Reference values at the evaluated points.
        ax.plot(df.x, df.val, color='b', marker='o', linestyle='None')
        # Row 5 of quantiles_imp is the mean, so it is labelled as such.
        ax.plot(df.x, quantiles_imp[5][i, indices, k], color='r', linestyle='None', label='mean', marker='x')
        ax.plot(df.x, quantiles_imp[0][i, indices, k], color='r', linestyle='None', marker=1)
        ax.plot(df.x, quantiles_imp[4][i, indices, k], color='r', linestyle='None', marker=1)

        if col == 0:  # Only label the first of the remaining columns
            ax.set_ylabel('Value')
        if row == dataind - 1:  # Only label the last row
            ax.set_xlabel('Time')

# Optional: Adjust the layout for better spacing
plt.tight_layout()
plt.show()


IndexError: too many indices for tensor of dimension 2

Error in callback <function flush_figures at 0x2b955f7ad990> (for post_execute), with arguments args (),kwargs {}:


KeyboardInterrupt: 

# Eval old

In [None]:
# num_batch_test = 82
# data_loaders = get_dataloader(num_batch_test, "cuda")
# data_loader_test = get_dataloader(num_batch_test, "cuda")[2]

In [None]:
test_data = test_data_counterfactuals_tensor
# test_data = test_data_factuals_tensor
# test_data = validation_data_tensor

test_data.shape

torch.Size([204568, 60, 3])

In [None]:
test_data_counterfactuals_sequence_lengths

array([ 1.,  1.,  1., ..., 37., 37., 37.])

In [None]:
# Split the test data into chunks whose rows all share one sequence length.
# Entry r of test_data_counterfactuals_sequence_lengths is the sequence length
# of row r of test_data, so rows with equal entries (e.g. rows 0, 5 and 19)
# end up in the same chunk. When switching test_data to one of the commented
# alternatives, switch the lengths array below to the matching one as well.

# Distinct sequence lengths and the number of rows having each of them
unique_sequence_lengths, counts = np.unique(test_data_counterfactuals_sequence_lengths, return_counts=True)
# unique_sequence_lengths, counts = np.unique(test_data_factuals_sequence_lengths, return_counts=True)
# unique_sequence_lengths, counts = np.unique(validation_data_sequence_lengths, return_counts=True)

unique_sequence_lengths
counts

# Row positions belonging to each distinct sequence length
indices = [
    np.nonzero(test_data_counterfactuals_sequence_lengths == seq_len)[0]
    for seq_len in unique_sequence_lengths
]
# indices = [np.where(test_data_factuals_sequence_lengths == i)[0] for i in unique_sequence_lengths]
# indices = [np.where(validation_data_sequence_lengths == i)[0] for i in unique_sequence_lengths]

# One chunk of rows per distinct sequence length
test_data_chunks = [test_data[row_ids] for row_ids in indices]
len(test_data_chunks)
test_data_chunks[0].shape
test_data_chunks[10].shape

# Trim dim 1 of every chunk to its sequence length plus one time step
test_data_chunks = [
    chunk[:, : int(seq_len) + 1, :]
    for chunk, seq_len in zip(test_data_chunks, unique_sequence_lengths)
]

test_data_chunks[0].shape
test_data_chunks[10].shape



array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12., 13.,
       14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24., 25., 26.,
       27., 28., 29., 30., 31., 32., 33., 34., 35., 36., 37., 38., 39.,
       40., 41., 42., 43., 44., 45., 46., 47., 48., 49., 50., 51., 52.,
       53., 54., 55., 56., 57., 58., 59.])

array([4000, 3944, 3920, 3888, 3864, 3852, 3852, 3848, 3840, 3828, 3828,
       3824, 3820, 3820, 3812, 3808, 3800, 3800, 3796, 3780, 3780, 3772,
       3764, 3744, 3716, 3692, 3684, 3660, 3636, 3612, 3600, 3588, 3552,
       3532, 3492, 3456, 3420, 3372, 3352, 3320, 3272, 3240, 3200, 3164,
       3132, 3088, 3056, 3028, 2984, 2952, 2916, 2888, 2844, 2796, 2752,
       2716, 2680, 2624, 2568])

59

torch.Size([4000, 60, 3])

torch.Size([3828, 60, 3])

torch.Size([4000, 2, 3])

torch.Size([3828, 12, 3])

In [None]:
# Move every chunk to the GPU
test_data_chunks = [chunk.to("cuda") for chunk in test_data_chunks]

# Run each chunk through the embedder
test_data_chunks_embedded = list(map(data_embedder, test_data_chunks))

# Build one imputation mask per embedded chunk. The imputer is configured
# first: features_to_impute = [2] and last_n_time = 1 (the mask displayed
# below is 1 only at feature index 2 of the final time step).
diffusion_imputer.features_to_impute = [2]
diffusion_imputer.last_n_time = 1
imputation_masks = [
    diffusion_imputer.get_mask(
        embedded, strategy='selected_features_last_n_time'
    ).to("cuda")
    for embedded in test_data_chunks_embedded
]
imputation_masks[0]

tensor([[[0., 0., 0.],
         [0., 0., 1.]],

        [[0., 0., 0.],
         [0., 0., 1.]],

        [[0., 0., 0.],
         [0., 0., 1.]],

        ...,

        [[0., 0., 0.],
         [0., 0., 1.]],

        [[0., 0., 0.],
         [0., 0., 1.]],

        [[0., 0., 0.],
         [0., 0., 1.]]], device='cuda:0')

In [None]:
imputation_masks[16].shape

torch.Size([3800, 18, 3])

In [None]:
# Split every embedded chunk by its mask: given_points keeps the values where
# the mask is 0, evaluated_points keeps the values where it is non-zero; the
# remaining positions are filled with NaN in each case.
nan_fill = torch.tensor(float('nan')).to("cuda")
chunk_mask_pairs = list(zip(test_data_chunks_embedded, imputation_masks))
given_points = [torch.where(mask == 0, chunk, nan_fill) for chunk, mask in chunk_mask_pairs]
evaluated_points = [torch.where(mask != 0, chunk, nan_fill) for chunk, mask in chunk_mask_pairs]

given_points[58]
evaluated_points[58]

tensor([[[-1.1806, -0.1769, -0.1376],
         [-0.9084, -0.1769, -0.1397],
         [-0.9084, -0.1769, -0.1419],
         ...,
         [-0.9084, -0.1769, -0.1546],
         [-1.1806, -0.1769, -0.1546],
         [-1.1806, -0.1769,     nan]],

        [[-1.1806, -0.1769, -0.1376],
         [-0.9084, -0.1769, -0.1397],
         [-0.9084, -0.1769, -0.1419],
         ...,
         [-0.9084, -0.1769, -0.1546],
         [-1.1806, -0.1769, -0.1546],
         [-1.1806, -0.1769,     nan]],

        [[-1.1806, -0.1769, -0.1376],
         [-0.9084, -0.1769, -0.1397],
         [-0.9084, -0.1769, -0.1419],
         ...,
         [-0.9084, -0.1769, -0.1546],
         [-1.1806, -0.1769, -0.1546],
         [-0.9084, -0.1769,     nan]],

        ...,

        [[-1.1806, -0.1769, -0.1469],
         [-0.9084, -0.1769, -0.1472],
         [-1.1806, -0.1769, -0.1471],
         ...,
         [-0.9084, -0.1769, -0.1546],
         [-0.9084, -0.1769, -0.1546],
         [-1.1806, -0.1769,     nan]],

        [[

tensor([[[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        ...,

        [[    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         ...,
         [    nan,     nan,     nan],
         [    nan,     nan,     nan],
         [    nan,     nan, -0.1546]],

        [[

In [None]:
# Draw `sample_number` imputations for rows [shift, shift + sample_count) of
# every sequence-length chunk. Results are appended draw by draw and, within a
# draw, chunk by chunk, so samples[i * num_chunks + j] belongs to draw i and
# chunk j.
sample_number = 1
sample_count = 10
shift = 0
# sample_count = test_data_chunks_embedded[0].shape[0]
samples = []
num_chunks = len(test_data_chunks_embedded)
total_runs = sample_number * num_chunks
for i in range(sample_number):
    for j in range(num_chunks):#range(58,59):
        rows = slice(shift, shift + sample_count)
        samples.append(diffusion_imputer.eval(
            test_data_chunks_embedded[j][rows], imputation_masks[j][rows],
            mean = training_mean, std = training_standard_deviation, scale = 0
            ),
            )
        # Count the run that has just finished, so the last run reports 100%.
        progress = (i * num_chunks + j + 1) / total_runs
        print(f"Progress: {progress * 100:.2f}%")
        print(j)

max difference =  2.3893959522247314
data at max difference =  2.678281545639038
imputed at max difference =  0.28888556361198425
mae =  0.5629335045814514
rmse =  5.780492367951767
Progress: 0.00%
0
max difference =  0.3838134706020355
data at max difference =  0.40379798412323
imputed at max difference =  0.01998450793325901
mae =  0.09787647426128387
rmse =  0.9511500648830248
Progress: 1.69%
1
max difference =  0.4813027083873749
data at max difference =  0.0189066119492054
imputed at max difference =  0.5002093315124512
mae =  0.09478287398815155
rmse =  1.0939372518788213
Progress: 3.39%
2
max difference =  0.40381354093551636
data at max difference =  0.3912235200405121
imputed at max difference =  -0.012590025551617146
mae =  0.1017293855547905
rmse =  0.9900554159413214
Progress: 5.08%
3
max difference =  0.1932147741317749
data at max difference =  0.015995968133211136
imputed at max difference =  0.20921073853969574
mae =  0.046834547072649
rmse =  0.46471757474152936
Progre

In [None]:
# Updated evaluation function


def evaluate_model(dataloader, imputer, training_mean, training_std, scale=1):
    """Run the imputer on every batch of ``dataloader`` and concatenate the results.

    Each batch is moved to ``imputer.device``, a mask is requested from
    ``imputer.get_mask`` with the ``'selected_features_last_n_time'`` strategy,
    and ``imputer.eval`` is called on the batch and mask. The batch count and
    every batch's sequence length (``shape[1]``) are printed along the way.

    Args:
        dataloader: Sized iterable yielding objects with ``.to`` and ``.shape``
            (presumably tensors — the batch is used directly, not unpacked).
        imputer: Object exposing ``device``, ``get_mask`` and ``eval``.
        training_mean: Forwarded to ``imputer.eval`` as ``mean``.
        training_std: Forwarded to ``imputer.eval`` as ``std``.
        scale: Forwarded to ``imputer.eval`` as ``scale``.

    Returns:
        ``torch.cat`` of the per-batch ``imputer.eval`` results along dim 0.

    NOTE(review): ``torch.cat`` needs every per-batch result to be a tensor
    with matching non-leading dimensions. Elsewhere in this notebook the
    result of ``diffusion_imputer.eval`` is indexed like a tuple and batches
    differ in sequence length — confirm this function fits the imputer in use.
    """
    print(f"Total batches: {len(dataloader)}")

    device = imputer.device
    collected = []
    for batch_number, data_batch in enumerate(dataloader, start=1):
        data_batch = data_batch.to(device)
        print(f"Batch {batch_number} sequence length: {data_batch.shape[1]}")

        # Mask for the current batch, placed on the same device as the data.
        batch_mask = imputer.get_mask(
            data_batch, strategy='selected_features_last_n_time'
        ).to(device)

        collected.append(
            imputer.eval(
                data_batch,
                batch_mask,
                mean=training_mean,
                std=training_std,
                scale=scale,
            )
        )

    return torch.cat(collected, dim=0)

# Main function to run the evaluation multiple times with accurate progress tracking


In [None]:
# Data loaders for the training and validation sets (the test loader is
# currently disabled). Earlier variants are kept below for reference.
# train_loader = torch.utils.data.DataLoader(training_data_tensor, batch_size=20, shuffle=True)
# val_loader = torch.utils.data.DataLoader(validation_data_tensor, batch_size=20, shuffle=True)
# test_loader = torch.utils.data.DataLoader(test_data_counterfactuals_tensor, batch_size=20, shuffle=True)

# train_loader = create_dataloader(
#     training_data_tensor, training_data_sequence_lengths, batch_size=20)
# val_loader = create_dataloader(
#     validation_data_tensor, validation_data_sequence_lengths, batch_size=20)
# test_loader = create_dataloader(test_data_counterfactuals_tensor, test_data_counterfactuals_sequence_lengths)

# Settings shared by both balanced loaders.
balanced_loader_options = {"batch_size": 300, "balance_factor": 1.0}

train_loader = create_balanced_dataloader(
    training_data_tensor, training_data_sequence_lengths, **balanced_loader_options)

val_loader = create_balanced_dataloader(
    validation_data_tensor, validation_data_sequence_lengths, **balanced_loader_options)

# test_loader = create_balanced_dataloader(
#     test_data_counterfactuals_tensor, test_data_counterfactuals_sequence_lengths, batch_size=20, balance_factor=1.0)


# Number of batches in each loader
print(len(train_loader))
print(len(val_loader))
# print(len(test_loader))

1236
103


In [None]:
target_column = 2
# sample_count = 300
# sample_number = 20

In [None]:
samples[0][0][:, -1, target_column]
samples[1][0][:, -1, target_column]

tensor([ 2.6365,  0.2889,  0.0402,  2.5317, -0.1480, -0.1490, -0.1452, -0.1326,
        -0.1541, -0.1538], device='cuda:0')

tensor([ 0.0200,  0.3465,  0.4109, -0.1116, -0.1438, -0.1371, -0.1375, -0.1494,
        -0.1537, -0.1538], device='cuda:0')

In [None]:
# Imputed target-column values at the last time step, first `sample_count`
# rows of every stored evaluation result. Each entry of `samples` is laid out
# as (imputation results, input data, mask, mae), so index 0 is the imputation.
imputation_results = torch.cat(
    [sample[0][:sample_count, -1, target_column] for sample in samples],
    dim=0,
)
# imputation_results.shape

# Undo the training normalisation and fold to (sample_number, sample_count, -1).
# NOTE(review): if `samples` was filled with an outer loop over draws and an
# inner loop over chunks, the concatenated vector is ordered (draw, chunk, row).
# This reshape keeps dim 0 = draw, but the last two axes are then not
# (row, chunk). That is harmless for element-wise metrics against data that is
# reshaped the same way — confirm before slicing along those axes.
imputation_results_denormed = (
    imputation_results * training_standard_deviation + training_mean
).reshape(sample_number, sample_count, -1)
# imputation_results_denormed
# Optional clamp: cap every imputed value above 1150 at 1150
# imputation_results_denormed[imputation_results_denormed > 1150] = 1150
imputation_results_denormed.shape

torch.Size([1, 10, 59])

In [None]:
# Reference values of the target column at the last time step, first
# `sample_count` rows of every stored evaluation result. Each entry of
# `samples` is laid out as (imputation results, input data, mask, mae), so
# index 1 is the input data.
denormed_data = torch.cat(
    [sample[1][:sample_count, -1, target_column] for sample in samples],
    dim=0,
)
# Undo the training normalisation.
denormed_data = denormed_data * training_standard_deviation + training_mean
# Fold to (sample_number, sample_count, -1) and keep the first slice of dim 0.
# NOTE(review): if `samples` was filled with an outer loop over draws and an
# inner loop over chunks, the flat vector is ordered (draw, chunk, row), so the
# two remaining axes are not (row, chunk) — confirm before slicing along them.
denormed_data = denormed_data.reshape(sample_number, sample_count, -1)[0]
denormed_data.shape

torch.Size([10, 59])

In [None]:
# choose_time_more_than = 58
# denormed_data = denormed_data[:, choose_time_more_than:]
# imputation_results_denormed = imputation_results_denormed[:, :, choose_time_more_than:]

In [None]:
qlist = [0.05, 0.25, 0.50, 0.75, 0.95]
#qlist = [0.5]

# One quantile tensor per level in qlist, reduced over dim 0 of the
# de-normalised imputations.
quantiles_imp = [
    torch.quantile(imputation_results_denormed, level, dim=0, interpolation="linear")
    for level in qlist
]

# The mean over dim 0 is appended as one extra entry after the quantiles
# (index len(qlist)).
means = torch.mean(imputation_results_denormed, dim=0)
quantiles_imp.append(means)

# Stack everything along a new leading axis and move the result to the CPU.
quantiles_imp = torch.stack(quantiles_imp, dim=0).cpu()


In [None]:
quantiles_imp.shape

torch.Size([6, 10, 59])

In [None]:
denormed_data.shape

torch.Size([10, 59])

In [None]:
# Mean absolute error between the de-normalised data and entry 2 of
# quantiles_imp (the 0.50 quantile in qlist), printed as a percentage of 1150
# (presumably the upper bound of the target variable — confirm).
abs_error = torch.abs(denormed_data.cpu() - torch.Tensor(quantiles_imp[2].cpu()))
mae = torch.mean(abs_error).item()
print(mae/1150 * 100)

0.10002058485279912


In [None]:
# Root-mean-square error between the de-normalised data and entry 2 of
# quantiles_imp (the 0.50 quantile in qlist), printed as a percentage of 1150
# (presumably the upper bound of the target variable — confirm).
squared_error = (denormed_data.cpu() - torch.Tensor(quantiles_imp[2].cpu()))**2
rmse = torch.sqrt(torch.mean(squared_error)).item()
print(rmse/1150 * 100)

0.7916258107060972


In [None]:
# import torch
# import time


# def run_multiple_evaluations(
#         dataloader, imputer, training_mean, training_std, sample_number, data_embedder = None,
#         old_sample=[],
#         min_sequence_len=2, max_sequence_len=None, scale=1, num_gpus=1,
#         verbose=True, show_max_diff=False, show_rmse=False):
#     final_samples = old_sample
#     max_seq_len = 0
#     total_batches = len(dataloader) * sample_number
#     completed_batches = 0
#     sample_time = []
#     average_sample_time = 0

#     # Limit the number of GPUs to available GPUs and the specified number
#     num_gpus = min(num_gpus, torch.cuda.device_count())
#     if num_gpus > 1:
#         # Wrap the imputer with DataParallel
#         imputer = torch.nn.DataParallel(
#             imputer, device_ids=list(range(num_gpus)))

#         if data_embedder is not None:
#             data_embedder = torch.nn.DataParallel(
#                 data_embedder, device_ids=list(range(num_gpus)))

#     print(f"Using {num_gpus} GPUs for evaluation.")


#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     imputer = imputer.to(device)
#     if data_embedder is not None:
#         data_embedder = data_embedder.to(device)

#     for i in range(sample_number):
#         sample_start = time.time()
#         print("-------------------------------------------------")
#         print(f"Running sample {i + 1}/{sample_number}")
#         all_samples = []

#         for batch_idx, data_batch in enumerate(dataloader):
#             # Get the data from the batch (collate_fn returns a tuple)
#             data_batch = data_batch.to(device)
#             if data_embedder is not None:
#                 data_batch = data_embedder(data_batch)
#             seq_length = data_batch.shape[1]
#             if seq_length < min_sequence_len:
#                 completed_batches += 1
#                 continue
#             if max_sequence_len is not None and seq_length > max_sequence_len:
#                 completed_batches += 1
#                 continue

#             print(f"sequence length: {seq_length}")

#             # Generate imputation masks for the current batch
#             if num_gpus > 1:
#                 imputation_masks = imputer.module.get_mask(
#                     data_batch, strategy='selected_features_last_n_time').to(device)
#             else:
#                 imputation_masks = imputer.get_mask(
#                     data_batch, strategy='selected_features_last_n_time').to(device)

#             if num_gpus > 1:
#                 imputed_samples = imputer.module.eval(
#                     data_batch,
#                     imputation_masks,
#                     mean=training_mean,
#                     std=training_std,
#                     scale=scale,
#                     verbose=verbose,
#                     show_max_diff=show_max_diff,
#                     show_rmse=show_rmse
#                 )
#             else:
#                 imputed_samples = imputer.eval(
#                     data_batch,
#                     imputation_masks,
#                     mean=training_mean,
#                     std=training_std,
#                     scale=scale,
#                     verbose=verbose,
#                     show_max_diff=show_max_diff,
#                     show_rmse=show_rmse
#                 )

#             all_samples.append(imputed_samples)

#             completed_batches += 1
#             progress = completed_batches / total_batches
#             print(f"Overall Progress: {progress * 100:.2f}%")
#             print(f"Time to finish (est.): {average_sample_time * (sample_number - i - 1) / 60:.2f} min")

#         sample_end = time.time()
#         sample_time.append(sample_end - sample_start)
#         average_sample_time = sum(sample_time) / len(sample_time)
#         final_samples.append(all_samples)

#         rmse, rmse_median = calculate_rmse(
#             final_samples, training_mean, training_std)
#         print(f"RMSE: {rmse:.3f} | RMSE (Median): {rmse_median:.3f}")

#     return final_samples

In [None]:
# import torch.multiprocessing as mp
# import time


# def run_multiple_evaluations_on_gpu(gpu_id, dataloader, imputer, training_mean, training_std, sample_number, old_sample,
#                                     min_sequence_len=2, max_sequence_len=None, scale=1, verbose=True, show_max_diff=False, show_rmse=False, result_queue=None):
#     imputer.device = torch.device(f'cuda:{gpu_id}')
#     sample_time = []

#     for i in range(sample_number):
#         sample_start = time.time()
#         print("-------------------------------------------------")
#         print(f"Running sample {i + 1}/{sample_number} on GPU {gpu_id}")
#         all_samples = []

#         for batch_idx, data_batch in enumerate(dataloader):
#             data_batch = data_batch.to(imputer.device)
#             seq_length = data_batch.shape[1]
#             if seq_length < min_sequence_len or (max_sequence_len is not None and seq_length > max_sequence_len):
#                 continue

#             print(f"sequence length: {seq_length}")

#             imputation_masks = imputer.get_mask(
#                 data_batch, strategy='selected_features_last_n_time').to(imputer.device)

#             imputed_samples = imputer.eval(data_batch, imputation_masks, mean=training_mean, std=training_std,
#                                            scale=scale, verbose=verbose, show_max_diff=show_max_diff, show_rmse=show_rmse)

#             all_samples.append(imputed_samples)

#         sample_end = time.time()
#         sample_time.append(sample_end - sample_start)
#         average_sample_time = sum(sample_time) / len(sample_time)

#         if result_queue is not None:
#             result_queue.put((gpu_id, all_samples, average_sample_time))


# def wrapper_run_multiple_evaluations(dataloader, imputer, training_mean, training_std, sample_number, old_sample=[],
#                                      min_sequence_len=2, max_sequence_len=None, scale=1, verbose=True,
#                                      show_max_diff=False, show_rmse=False, num_gpus=1):
#     mp.set_start_method('spawn', force=True)  # Set the start method globally
#     manager = mp.Manager()
#     result_queue = manager.Queue()  # Use mp.Manager to create the queue
#     processes = []
#     samples_per_gpu = sample_number // num_gpus

#     for gpu_id in range(num_gpus):
#         p = mp.Process(target=run_multiple_evaluations_on_gpu, args=(gpu_id, dataloader, imputer, training_mean, training_std,
#                                                                      samples_per_gpu, old_sample, min_sequence_len, max_sequence_len,
#                                                                      scale, verbose, show_max_diff, show_rmse, result_queue))
#         p.start()
#         processes.append(p)

#     final_samples = []
#     total_samples = samples_per_gpu * num_gpus
#     completed_samples = 0
#     total_time = 0

#     while completed_samples < total_samples:
#         gpu_id, samples, sample_time = result_queue.get()
#         final_samples.extend(samples)
#         completed_samples += 1
#         total_time += sample_time
#         remaining_samples = total_samples - completed_samples
#         average_time_per_sample = total_time / completed_samples
#         estimated_time_remaining = remaining_samples * average_time_per_sample

#         rmse, rmse_median = calculate_rmse(
#             final_samples, training_mean, training_std)
#         print(f"Progress: {completed_samples}/{total_samples} samples completed.")
#         print(f"Time remaining (est.): {estimated_time_remaining / 60:.2f} minutes.")
#         print(f"Current RMSE: {rmse:.3f} | RMSE (Median): {rmse_median:.3f}")

#     for p in processes:
#         p.join()

#     return final_samples

# # Example usage (ensure to define calculate_rmse and other necessary functions):
# # final_samples = wrapper_run_multiple_evaluations(dataloader, imputer, training_mean, training_std, sample_number=20, num_gpus=4)

# Old train

In [None]:
# %matplotlib inline
# from IPython.display import display, clear_output
# import matplotlib.pyplot as plt
# import time
# import statistics
# from itertools import chain
# import numpy as np
# from collections import deque

# def train(model, data_loader, data_loader_validation, epochs, lr, loss_func, batch_embedder,
#           windowed_mode=False, window_mode="uniform", window_start_mode="random", min_window=50, max_window=100, neg_bin_p=0.95, train_on_all_every=4,
#           annealing_mode = False, annealing_window=5, annealing_multiplier=1.25, annealing_ratio = 0.5, annealing_minimum = 1e-6,
#           device="cuda", verbose=False, plot_every=10,
#           validation_frequency=1, validation_prp=10, moving_avg_window=10):

#     batch_embedder = batch_embedder.to(device)
#     model = model.to(device)

#     optimizer = torch.optim.Adam(
#         chain(batch_embedder.parameters(), model.parameters()),
#         lr=lr
#     )

#     model.train()
#     batch_embedder.train()
#     loss_list = []
#     initial_value = 1.0  # Initial value for equal probability
#     window_losses = torch.ones(max_window - min_window + 1, device=device) * initial_value  # Track losses for each window length
#     window_counts = torch.zeros(max_window - min_window + 1, device=device)  # Track counts for each window length
#     loss_deques = [deque(maxlen=moving_avg_window) for _ in range(max_window - min_window + 1)]  # Deques for moving average
#     if windowed_mode and window_mode == "biased_loss":
#         fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(10, 12))
#     else:
#         fig, ax1 = plt.subplots(1, 1, figsize=(10, 6))
#     epoch_loss_list = []
#     val_loss = 0

#     for epoch in range(epochs):
#         # Annealing for the learning rate
#         if annealing_mode and epoch > annealing_window:
#             if len(epoch_loss_list) > 0 and epoch_loss_list[-1] >= annealing_multiplier * (statistics.mean(epoch_loss_list[-annealing_window:])):
#                 for g in optimizer.param_groups:
#                     if g['lr'] * annealing_ratio < annealing_minimum:
#                         g['lr'] = annealing_minimum
#                     else:
#                         g['lr'] *= annealing_ratio

#         start = time.time()
#         for i, batch in enumerate(data_loader):

#             batch = batch.to(device)
#             batch = batch_embedder(batch)

#             batch_length = batch.shape[1]

#             # Windowed mode logic
#             if windowed_mode:
#                 if batch_length < min_window:
#                     continue
#                 if window_start_mode == "random":
#                     cut_start = torch.randint(0, batch_length - window_length + 1, (1,)).item()
#                 elif window_start_mode == "fixed":
#                     cut_start = 0
#                 if window_mode == "uniform":
#                     while True:
#                         window_length = torch.randint(min_window, batch_length + 1, (1,)).item()
#                         cut_end = cut_start + window_length
#                         if min_window <= (cut_end - cut_start) <= batch_length:
#                             break
#                     batch = batch[:, cut_start:cut_end, :]

#                 elif window_mode == "negative_binomial":
#                     total_count = 1
#                     probs = neg_bin_p
#                     distribution = torch.distributions.NegativeBinomial(total_count=total_count, probs=probs)
#                     while True:
#                         window_length = distribution.sample().item() + min_window
#                         cut_end = cut_start + window_length
#                         if min_window <= window_length <= batch_length:
#                             break
#                     batch = batch[:, cut_start:cut_end, :]

#                 elif window_mode == "biased_loss":
#                     if torch.min(window_counts) < 2:
#                         # Use uniform distribution until each length has been used at least twice
#                         window_probs = torch.ones_like(window_losses) / len(window_losses)
#                     elif torch.sum(window_counts) % train_on_all_every == 0:
#                         window_probs = torch.ones_like(window_losses) / len(window_losses)
#                     else:
#                         # Update probabilities based on moving average of losses
#                         avg_losses = torch.tensor([np.mean(loss_deque) if len(loss_deque) > 0 else initial_value for loss_deque in loss_deques], device=device)
#                         window_probs = avg_losses / avg_losses.sum()
#                     while True:
#                         window_length = torch.multinomial(window_probs, 1).item() + min_window
#                         #check if the window length does work with the batch length
#                         if window_length > batch_length:
#                             continue
#                         cut_end = cut_start + window_length
#                         if min_window <= window_length <= batch_length:
#                             break
#                     batch = batch[:, cut_start:cut_end, :]
#                     window_counts[window_length - min_window] += 1  # Update window counts

#             optimizer.zero_grad()
#             predicted_noise, noise, noise_mask = model(batch)
#             loss = loss_func(predicted_noise, noise, noise_mask)
#             loss.backward()
#             # # Gradient clipping
#             max_grad_norm = 1.0
#             torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
#             torch.nn.utils.clip_grad_norm_(batch_embedder.parameters(), max_grad_norm)
#             optimizer.step()
#             loss_list.append(loss.item())

#             epoch_loss = sum(loss_list[-len(data_loader):]) / len(data_loader)
#             epoch_loss_list.append(epoch_loss)

#             # Update window losses and moving average deque
#             if windowed_mode and window_mode == "biased_loss":
#                 window_idx = window_length - min_window
#                 window_losses[window_idx] += loss.item()
#                 loss_deques[window_idx].append(loss.item())

#             # Dynamic plot update
#             if i % plot_every == 0:
#                 ax1.clear()
#                 ax1.set_ylim(0, 1)
#                 ax1.plot(loss_list)
#                 if len(loss_list) > 100:
#                     ax1.plot(np.convolve(loss_list, np.ones((100,))/100, mode='valid'))
#                     ax1.text(len(loss_list) - 1, np.convolve(loss_list, np.ones((100,))/100, mode='valid')[-1],
#                             str(round(np.convolve(loss_list, np.ones((100,))/100, mode='valid')[-1], 3)))
#                 if len(epoch_loss_list) > 0:
#                     ax1.text(0.1, 0.9, f"Epoch: {epoch} | Learning rate: {optimizer.param_groups[0]['lr']:.2e}")
#                 # ax1.text(0.1, 0.8, f"Learning rate: {optimizer.param_groups[0]['lr']:.4e}")
#                 ax1.text(0.1, 0.8, f"Loss: {epoch_loss_list[-1]:.3e} | Validation loss: {val_loss:.3e}")
#                 ax1.text(0.1, 0.7, f"Time per step: {((time.time() - start) / (i + 1)):.2f} s | Time per epoch: {((time.time() - start) / (i + 1) * len(data_loader)):.2f} s")
#                 ax1.text(0.1, 0.6, f"Time till finish (est.): {((time.time() - start) / (i + 1) * len(data_loader) * (epochs - epoch)) / 60:.2f} min")
#                 if windowed_mode and window_mode == "biased_loss":
#                     ax2.clear()
#                     ax2.bar(range(min_window, max_window + 1), window_counts.cpu().numpy())
#                     ax2.set_ylabel("Counts")
#                     ax2.set_title("Counts of Each Window Length Used")

#                     moving_avg_losses = torch.tensor([np.mean(loss_deque) if len(loss_deque) > 0 else initial_value for loss_deque in loss_deques], device=device).cpu().numpy()
#                     ax3.clear()
#                     ax3.bar(range(min_window, max_window + 1), moving_avg_losses)
#                     ax3.set_xlabel("Window Length")
#                     ax3.set_ylabel("Moving Average Loss")
#                     ax3.set_title("Moving Average Loss for Each Window Length")

#                 display(fig)
#                 clear_output(wait=True)

#         end = time.time()

#         # Validation
#         if epoch % validation_frequency == 0:
#             loss_list_validation = []
#             for i, batch in enumerate(data_loader_validation):
#                 batch = batch.to(device)
#                 batch = batch_embedder(batch)
#                 if i % validation_prp == 0:
#                     predicted_noise, noise, noise_mask = model(batch)
#                     loss = loss_func(predicted_noise, noise, noise_mask)
#                     loss_list_validation.append(loss.item())

#             val_loss = np.mean(loss_list_validation)

#         if verbose:
#             print(f"Epoch {epoch} completed in {end - start} seconds, Loss: {epoch_loss}")
#             print(f"Validation Loss: {val_loss}")


#     return model, loss_list