# 1- Implementing the Preparation tasks

In [1]:
# Importing required libraries
import dask.dataframe as dd
import pandas as pd
from tqdm import tqdm
import ast
from sklearn.model_selection import train_test_split
from datetime import timedelta
from dask.diagnostics import ProgressBar
import seaborn as sns
import matplotlib.pyplot as plt

import preparation as pp

In [2]:
# Relax pandas' display limits so wide/long frames and long cell values are
# shown in full when a DataFrame is rendered in this notebook.
for _option, _value in (
    ('display.max_columns', 1000),
    ('display.max_rows', 1000),
    ('display.max_colwidth', None),  # None = do not truncate cell contents
):
    pd.set_option(_option, _value)


### 1.1- filtered: contains each group's records up to the occurrence of the first outcome, or the entire group if no outcome is present.
### 1.2- filtered_with_events: contains the filtered records with an event_id added, expanded so that each record holds only one event.

The `filtered` data frame is intended for the matrix method.

The `filtered_with_events` data frame (event IDs added, events expanded to one per record) is intended for the sliding-window method.


In [3]:
# --- Load -------------------------------------------------------------------
# Build a single Dask DataFrame from every HDF5 file matching the glob below,
# reading the table stored under the 'df_subjects' key in each file.
print("Reading data from multiple h5 files...")
ddf = dd.read_hdf(
    'data/df_subjects_h5/*.h5',
    key='df_subjects',
)

# --- Step 1: truncate at the first outcome ----------------------------------
# Delegated to preparation.getRecordsUntilOutcome; per the notebook text it
# keeps records up to the first outcome, or the whole group if none occurs.
print('Extracting Records up to the occurrence of the first outcome or the entire group if no there is no outcome presents were filtered')
filtered = pp.getRecordsUntilOutcome(ddf)
print('data frame with first outcome was filtered successfully')

# --- Step 2: attach event ids and expand events -----------------------------
# Delegated to preparation.preprocess_events, applied to the filtered frame.
print("Adding events in the filtered dataset using the 'preprocess_events' function...")
filtered_with_events = pp.preprocess_events(filtered)
print('Event IDs have been added, and the records have been expanded, as each record now contains only one event.')

Reading data from multiple h5 files...
Extracting Records up to the occurrence of the first outcome or the entire group if no there is no outcome presents were filtered
Processing the dataset using the 'process_dataset' function...
[                                        ] | 0% Completed | 108.03 ms

We're assuming that the indices of each dataframes are 
 aligned. This assumption is not generally safe.


[########################################] | 100% Completed | 72.91 s
Stacked Data Shape: (31989618, 68)
Number of Partitions: 32
Applying the filtering function to the dataset...


  Before: .apply(func)
  After:  .apply(func, meta={'x': 'f8', 'y': 'f8'}) for dataframe result
  or:     .apply(func, meta=('x', 'f8'))            for series result
  filtered = grouped_data.apply(filter_records)
You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=(None, 'object'))



data frame with first outcome was filtered successfully
Adding events in the filtered dataset using the 'preprocess_events' function...
Event IDs have been added, and the records have been expanded, as each record now contains only one event.


#### 1.3- Get the unique subject ids and split them into training_wo_validation, training_w_validation, validation, test

In [4]:
# Split the subjects once, apply that split to both prepared frames
# (`filtered` and `filtered_with_events`) and persist every piece as parquet.
#
# `ddf` (the frame loaded from the h5 files) is deliberately NOT rebound in
# this cell: the subject split is computed from it, so overwriting it would
# make a second execution of this cell split on a different frame.

# Name of the column identifying a subject.
# NOTE(review): `id` shadows the Python builtin; the name is kept unchanged in
# case other cells read it -- consider renaming across the whole notebook.
id ='subject_id'


def _save_splits(out_dir, frames_by_subdir):
    """Write each Dask DataFrame to ``out_dir + subdir`` as parquet, without its index.

    Parameters
    ----------
    out_dir : str
        Directory prefix; expected to end with a path separator.
    frames_by_subdir : dict
        Maps a sub-directory name (e.g. ``"train/"``) to the frame to write.
        Frames are written in the dict's insertion order.
    """
    for subdir, frame in frames_by_subdir.items():
        frame.to_parquet(out_dir + subdir, write_index=False)


print ('splitting the subjects into train, test, and validation sets and return them as the list...' )
train, train_w_val, test, validation = pp.split_subject_ids(ddf, id)

##################################################################################################
print(' Creating the train, test, and validation based on the subject train, test, and validation for the filtered data frame...')
ddf_name = 'filtered'
(
    train_filtered_data,
    train_filtered_data_w_val,
    test_filtered_data,
    validation_filtered_data,
) = pp.split_datasets(filtered, ddf_name, id, train, train_w_val, test, validation)

################################################################################################
print ('Saving the filtered and train test and validation Dask DataFrame as a parquet file...')
ddf_path = "data/df_subjects_first_outcome/matrix_pp/"
_save_splits(ddf_path, {
    "train/": train_filtered_data,
    "train_w_val/": train_filtered_data_w_val,
    "test/": test_filtered_data,
    "validation/": validation_filtered_data,
    "filtered/": filtered,
})

################################################################################################
print ('Creating the train, test, and validation based on the subject train, test, and validation for the filtered_with_events data frame...')
ddf_name = 'filtered_with_events'
(
    train_filtered_with_events_data,
    train_filtered_with_events_data_w_val,
    test_filtered_with_events_data,
    validation_filtered_with_events_data,
) = pp.split_datasets(filtered_with_events, ddf_name, id, train, train_w_val, test, validation)

###################################################################################################
print ('Saving the filtered and the splitter Dask DataFrame as a parquet file...')
ddf_path = "data/df_subjects_first_outcome/sliding_pp/"
_save_splits(ddf_path, {
    "train/": train_filtered_with_events_data,
    "train_w_val/": train_filtered_with_events_data_w_val,
    "test/": test_filtered_with_events_data,
    "validation/": validation_filtered_with_events_data,
    "filtered_with_events/": filtered_with_events,
})

#################################################################################################

print("finished the jobs successfully")

splitting the subjects into train, test, and validation sets and return them as the list...
Computing unique subject_ids...
Group the subject and Unique subject_ids as a list was computed successfully!
The number of individuals in the training set: 216197
The number of individuals in training data set with deduction of validation is: 194577
The number of individuals in the test data set is: 24022
The number of individuals in the validation data set  is: 21620
 Creating the train, test, and validation based on the subject train, test, and validation for the filtered data frame...
Computing train, test, and validation dataframes for : filtered
The number of records in the training set: 27581683
The number of individuals in the training set: 216197
The number of records in the train_data_w_val set: 24805864
The number of individuals in the train_data_w_val set: 194577
The number of records in the test_data set: 3045651
The number of individuals in the test_data set: 24022
The number of re