# Start from simple

Zero padding requires:
- 0 only represent the position holder (missing value in our case);
- The none zero values (observed) usually have at least 10% out of all observations;

Convert all 36 weeks into 5 different collection periods and taking the within period mean of each covariates (both time varying and not-varying)
- Largely reduce the number of padded zeros;
- Solves the issues that there are multiple collections in the same week;
- Maybe no need to do the label smoothing since only 4 or 5 labels per patient;

In [211]:
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import sklearn as sk

# from torch.autograd import Variable 

from itertools import islice
from sklearn import metrics
from sklearn import preprocessing

from collections import Counter,defaultdict, OrderedDict

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')

# 1. Read-in & clean data.

In [3]:
# data directory
# alpha_dir     = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/alpha_diversity/alpha_diversity.csv'
# cst_dir       = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/community_state_types/cst_valencia.csv'
meta_dir      = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/metadata_imputed1.csv'
# krdlong_dir   = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/pairwise_distance/krd_distance_long.csv'
# krdwide_dir   = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/pairwise_distance/krd_distance_wide.csv'
phylotype_dir = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/phylotypes/phylotype_relabd.1e0.csv'
taxonomy_dir  = '/Users/mli171/Desktop/JHU/3Summer2022_JHU/DREAM/training_data_2022-05-27/taxonomy/taxonomy_relabd.family.csv'

## Meta data

In [4]:
meta_data = pd.DataFrame(pd.read_csv(meta_dir, delimiter=','))
meta_data = meta_data[['participant_id', 'project', 'delivery_wk', 'collect_wk', 'age_imp', 'race_imp']]

print(meta_data.shape)

for i in range(1,meta_data.shape[1]):
    if meta_data.iloc[:,i].dtypes == object:
        meta_data.iloc[:,i] = meta_data.iloc[:,i].astype('category').cat.codes + 1
        meta_data.iloc[:,i] = meta_data.iloc[:,i].astype('float64')

# create new variable collection period
meta_data['collect_period'] = 1
meta_data['collect_period'][(meta_data['collect_wk']>=9)  & (meta_data['collect_wk']<=16)] = 2
meta_data['collect_period'][(meta_data['collect_wk']>=17) & (meta_data['collect_wk']<=24)] = 3
meta_data['collect_period'][(meta_data['collect_wk']>=25) & (meta_data['collect_wk']<=32)] = 4
meta_data['collect_period'][(meta_data['collect_wk']>32)]                                  = 5

collect_period = meta_data['collect_period']
participant_id = meta_data['participant_id']

# create class label
meta_data['was_preterm'] = 1*(meta_data['delivery_wk'] < 37)
meta_data['was_early_preterm'] = 1*(meta_data['delivery_wk'] < 32)

# number of patient
unique, counts = np.unique(meta_data['participant_id'], return_counts=True)
print(len(unique))

(3578, 6)
1268


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_data['collect_period'][(meta_data['collect_wk']>=9)  & (meta_data['collect_wk']<=16)] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_data['collect_period'][(meta_data['collect_wk']>=17) & (meta_data['collect_wk']<=24)] = 3
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_data['collect_period'][(meta_data['collect_wk']>=25) & (meta_data['collect_wk']<=32)] = 4
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the d

- Filtered out observations with collect_wk<=32;
- Average within each collection period;

In [5]:
# Filtered out observations with "collect_wk<=32" == "collect_period<=4" 
meta_data = meta_data[meta_data['collect_period']<=4]
# Average within each collection period
meta_data = meta_data.groupby(['participant_id', 'collect_period'], as_index = False).mean()
print(meta_data.shape)

(1582, 9)


## Taxonomy OTU (RA) data (Family level)
- Filtered out observations with collect_wk<=32;
- Average within each collection period;

In [6]:
taxonomy_data = pd.DataFrame(pd.read_csv(taxonomy_dir, delimiter=','))
taxonomy_data = pd.concat([participant_id, collect_period, taxonomy_data], axis=1)

# Filtered out observations with "collect_wk<=32" == "collect_period<=4" 
taxonomy_data = taxonomy_data[taxonomy_data['collect_period']<=4]
# Average within each collection period
taxonomy_data = taxonomy_data.groupby(['participant_id', 'collect_period'], as_index = False).mean()
print(taxonomy_data.shape)

# # delete the temporary filter-used and ID columns
# taxonomy_data = taxonomy_data.drop(["participant_id", "collect_period"], axis = 1)

(1582, 527)


## Phylotype data
- Filtered out observations with collect_wk<=32;
- Average within each collection period;

In [7]:
phylotype_data = pd.DataFrame(pd.read_csv(phylotype_dir, delimiter=','))
phylotype_data = pd.concat([participant_id, collect_period, phylotype_data], axis=1)

# Filtered out observations with "collect_wk<=32" == "collect_period<=4" 
phylotype_data = phylotype_data[phylotype_data['collect_period']<=4]
# Average within each collection period
phylotype_data = phylotype_data.groupby(['participant_id', 'collect_period'], as_index = False).mean()
print(phylotype_data.shape)

# # delete the temporary filter-used and ID columns
# phylotype_data = phylotype_data.drop(["participant_id", "collect_period"], axis = 1)

(1582, 1846)


## Other Datasets (...)
- Filtered out observations with collect_wk<=32;
- Average within each collection period;

In [8]:
# 

## Dimension Summary

In [9]:
uniquenames, counts = np.unique(meta_data["participant_id"], return_counts=True)
subjects = list(uniquenames)
seq_max_len = max(counts)

print("# of subjects = ", len(subjects))
print("# of samples  = ", meta_data.shape[0])
print("# of taxonnomy features = ", len(list(taxonomy_data)))
print("# of phylotype features = ", len(list(phylotype_data)))

# of subjects =  1214
# of samples  =  1582
# of taxonnomy features =  527
# of phylotype features =  1846


# 2. Data sets splitter

Since we have multiple datasets, we will use Index of subjects to guide training, validation and testing set spliter. Also note, Since there are different number of records for each patient, the dimension of train and testing data sets are not follow the proportion 0.8, but the patients will follow.

In [10]:
def dataset_splitID(meta_data, subjects, prop, myseed):
    
    if myseed != None:
        random.seed(myseed)
        
    numsubjects = len(subjects)

    subjects_shuffle = random.sample(subjects, numsubjects)
    
    train_subjects = subjects_shuffle[0:(int(len(subjects)*prop[0])+1)] 
    valid_subjects = subjects_shuffle[(int(len(subjects)*prop[0])+2):(int(len(subjects)*(prop[0]+prop[1]))+1)]
    test_subjects = subjects_shuffle[(int(len(subjects)*(prop[0]+prop[1]))+2):numsubjects]
    
    splitID_train = meta_data['participant_id'].isin(train_subjects)
    splitID_valid = meta_data['participant_id'].isin(valid_subjects)
    splitID_test = meta_data['participant_id'].isin(test_subjects)
    
    return splitID_train, splitID_valid, splitID_test

In [11]:
# set myseed=None to have complete random state
splitID_train, splitID_valid, splitID_test = dataset_splitID(meta_data, subjects, prop = [0.7, 0.2, 0.1], myseed=0)

# apply to each data sets
meta_data_train = meta_data[splitID_train]
meta_data_valid = meta_data[splitID_valid]
meta_data_test  = meta_data[splitID_test]

print(meta_data_train.shape)
print(meta_data_valid.shape)
print(meta_data_test.shape)

taxonomy_data_train = taxonomy_data[splitID_train]
taxonomy_data_valid = taxonomy_data[splitID_valid]
taxonomy_data_test  = taxonomy_data[splitID_test]

print(taxonomy_data_train.shape)
print(taxonomy_data_valid.shape)
print(taxonomy_data_test.shape)

phylotype_data_train = phylotype_data[splitID_train]
phylotype_data_valid = phylotype_data[splitID_valid]
phylotype_data_test  = phylotype_data[splitID_test]

print(phylotype_data_train.shape)
print(phylotype_data_valid.shape)
print(phylotype_data_test.shape)

# other data sets

(1098, 9)
(326, 9)
(156, 9)
(1098, 527)
(326, 527)
(156, 527)
(1098, 1846)
(326, 1846)
(156, 1846)


# 3. Data Reshaper

## Reshape Input

In [12]:
def Data_Reshaper_Input(data, seq_length):
    
    numsubjects = len(np.unique(data['participant_id']))
    myvary = list(data.columns.values)[2:data.shape[1]]
    num_covariates = len(myvary)
    
    myinput = np.zeros((numsubjects, seq_length, num_covariates), dtype=np.float32)
    for i in range(num_covariates):
        data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=myvary[i])
        data_wide = data_wide.sort_index(axis=1)
        data_wide = data_wide.fillna(0)
        tmpindex = data_wide._get_numeric_data().columns.values - 1
        tmpindex = tmpindex.tolist()
        # time varying variables need to impute all and no records are denoted as 0
        for j in range(numsubjects):
                myinput[j,tmpindex,i] = data_wide.iloc[[j]]
    return myinput

**Warning**: *Longer running time*

In [13]:
taxonomytrain_input = Data_Reshaper_Input(data=taxonomy_data_train, seq_length=4)
print(taxonomytrain_input.shape)
taxonomyvalid_input = Data_Reshaper_Input(data=taxonomy_data_valid, seq_length=4)
print(taxonomyvalid_input.shape)
taxonomytest_input = Data_Reshaper_Input(data=taxonomy_data_test, seq_length=4)
print(taxonomytest_input.shape)

phylotypetrain_input = Data_Reshaper_Input(data=phylotype_data_train, seq_length=4)
print(phylotypetrain_input.shape)
phylotypevalid_input = Data_Reshaper_Input(data=phylotype_data_valid, seq_length=4)
print(phylotypevalid_input.shape)
phylotypetest_input = Data_Reshaper_Input(data=phylotype_data_test, seq_length=4)
print(phylotypetest_input.shape)

(850, 4, 525)
(242, 4, 525)
(120, 4, 525)
(850, 4, 1844)
(242, 4, 1844)
(120, 4, 1844)


## Reshape output

### 1). Data_Reshaper_Output_ManytoMany_0

- reshape patients class labels from long to wide form;
- output array formulation, **one** columns;
- Label smoothing;
    - was_preterm: 0.5, 0.67, 0.83, 1;
    - not was_preterm: 0.5, 0.33, 0.17, 0;
    - no missing values;

In [56]:
def Data_Reshaper_Output_ManytoMany_0(data, seq_length, classlabel):

    num_samples = len(np.unique(data['participant_id']))
    
    data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=classlabel)
    data_wide = data_wide.sort_index(axis=1)
    
    myoutput = np.zeros((num_samples, seq_length, 1), dtype=np.float32)
    for i in range(num_samples):
        tmp = data_wide.iloc[i,:]
        
        if np.nanmax(tmp) == 1:
            # label linear smoonthing from 0.5 to 1
            # fill all position 1 to have final labels equal to 1
            myoutput[i,:,0].fill(1)
            myoutput[i,:,0] = np.linspace(start=0.5, stop=1, num=seq_length)
        else:
            # label linear smoonthing from 0.5 to 0
            # fill all position 0 to have final labels equal to 0 
            #     but array alrady initialize as 0
            myoutput[i,:,0] = np.linspace(start=0.5, stop=0, num=seq_length)
            
    return myoutput

In [185]:
mytrain_output_0 = Data_Reshaper_Output_ManytoMany_0(data=meta_data_train, seq_length=4, classlabel="was_preterm")
print(mytrain_output_0.shape)
print(mytrain_output_0[2])

myvalid_output_0 = Data_Reshaper_Output_ManytoMany_0(data=meta_data_valid, seq_length=4, classlabel="was_preterm")
print(myvalid_output_0.shape)
print(myvalid_output_0[4])

mytest_output_0 = Data_Reshaper_Output_ManytoMany_0(data=meta_data_test, seq_length=4, classlabel="was_preterm")
print(mytest_output_0.shape)
print(mytest_output_0[4])

(850, 4, 1)
[[0.5       ]
 [0.33333334]
 [0.16666667]
 [0.        ]]
(242, 4, 1)
[[0.5      ]
 [0.6666667]
 [0.8333333]
 [1.       ]]
(120, 4, 1)
[[0.5      ]
 [0.6666667]
 [0.8333333]
 [1.       ]]


In [58]:
meta_data_train

Unnamed: 0,participant_id,collect_period,project,delivery_wk,collect_wk,age_imp,race_imp,was_preterm,was_early_preterm
0,A00003,4,1.0,40.0,30.000000,32.0,2.0,0.0,0.0
1,A00004,4,1.0,40.0,28.666667,25.0,5.0,0.0,0.0
2,A00005,4,1.0,41.0,27.500000,31.0,5.0,0.0,0.0
3,A00006,4,1.0,41.0,31.000000,28.0,5.0,0.0,0.0
4,A00008,3,1.0,35.0,17.000000,38.0,5.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
1577,J00111,3,10.0,40.0,17.000000,27.0,5.0,0.0,0.0
1578,J00112,3,10.0,39.0,19.000000,27.0,5.0,0.0,0.0
1579,J00113,2,10.0,41.0,16.000000,32.0,5.0,0.0,0.0
1580,J00115,3,10.0,42.0,18.000000,35.0,5.0,0.0,0.0


### 2). Data_Reshaper_Output_ManytoMany_1

- reshape patients class labels from long to wide form
- output array formulation, **two** columns, 
    - (0,0) indicating missing values;
    - (1,0) indicating preterm classes;
    - (0,1) indicating not preterm classes;

In [59]:
def Data_Reshaper_Output_ManytoMany_1(data, seq_length, classlabel):
    
    num_samples = len(np.unique(data['participant_id']))

    data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=classlabel)
    data_wide = data_wide.sort_index(axis=1)

    myoutput = np.zeros((num_samples, seq_length, 2), dtype=np.float32)
    myoutput[:,data_wide.columns.values-1,0] = data_wide
    myoutput[:,data_wide.columns.values-1,1] = 1 - data_wide
    myoutput[np.isnan(myoutput)] = 0
    
    return myoutput

In [184]:
mytrain_output_1 = Data_Reshaper_Output_ManytoMany_1(data=meta_data_train, seq_length=4, classlabel="was_preterm")
print(mytrain_output_1.shape)
print(mytrain_output_1[2])

myvalid_output_1 = Data_Reshaper_Output_ManytoMany_1(data=meta_data_valid, seq_length=4, classlabel="was_preterm")
print(myvalid_output_1.shape)
print(myvalid_output_1[4])

mytest_output_1 = Data_Reshaper_Output_ManytoMany_1(data=meta_data_test, seq_length=4, classlabel="was_preterm")
print(mytest_output_1.shape)
print(mytest_output_1[4])

(850, 4, 2)
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [0. 1.]]
(242, 4, 2)
[[0. 0.]
 [0. 0.]
 [1. 0.]
 [0. 0.]]
(120, 4, 2)
[[0. 0.]
 [0. 0.]
 [0. 0.]
 [1. 0.]]


### 3). Data_Reshaper_Output_ManytoMany

- reshape patients class labels from long to wide form
- output array formulation, **one** column, 
    - 0 indicating missing values;
    - 1 indicating preterm classes;
    - 2 indicating not preterm classes;

In [61]:
def Data_Reshaper_Output_ManytoMany_2(data, seq_length, classlabel):
    
    num_samples = len(np.unique(data['participant_id']))
    data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=classlabel)
    data_wide = data_wide.sort_index(axis=1)
    
    myoutput = np.zeros((num_samples, seq_length, 1), dtype=np.float32)
    myoutput[:,data_wide.columns.values-1,0] = data_wide + 1
    myoutput[np.isnan(myoutput)] = 0
    
    return myoutput

In [186]:
mytrain_output_2 = Data_Reshaper_Output_ManytoMany_2(data=meta_data_train, seq_length=4, classlabel="was_preterm")
print(mytrain_output_2.shape)
print(mytrain_output_2[2])

myvalid_output_2 = Data_Reshaper_Output_ManytoMany_2(data=meta_data_valid, seq_length=4, classlabel="was_preterm")
print(myvalid_output_2.shape)
print(myvalid_output_2[4])

mytest_output_2 = Data_Reshaper_Output_ManytoMany_2(data=meta_data_test, seq_length=4, classlabel="was_preterm")
print(mytest_output_2.shape)
print(mytest_output_2[4])

(850, 4, 1)
[[0.]
 [0.]
 [0.]
 [1.]]
(242, 4, 1)
[[0.]
 [0.]
 [2.]
 [0.]]
(120, 4, 1)
[[0.]
 [0.]
 [0.]
 [2.]]


### 4). Data_Reshaper_Output_ManytoOne_1

- reshape patients class labels from long to wide form
- output array formulation, each subject has **one** label, 
    - Then No missing values;
    - 0 indicating preterm classes;
    - 1 indicating not preterm classes;

In [63]:
def Data_Reshaper_Output_ManytoOne_1(data, classlabel):
    num_samples = len(np.unique(data['participant_id']))
    data_wide = data.pivot_table(index=['participant_id'], columns='collect_period', values=classlabel)
    data_wide = data_wide.sort_index(axis=1)

    myoutput = np.zeros((num_samples, 1, 1), dtype=np.float32)
    myoutput[:,0,0] = data_wide.max(axis=1)
    return myoutput

In [187]:
mytrain_output_3 = Data_Reshaper_Output_ManytoOne_1(data=meta_data_train, classlabel="was_preterm")
print(mytrain_output_3.shape)
print(mytrain_output_3[2])

myvalid_output_3 = Data_Reshaper_Output_ManytoOne_1(data=meta_data_valid, classlabel="was_preterm")
print(myvalid_output_3.shape)
print(myvalid_output_3[4])

mytest_output_3 = Data_Reshaper_Output_ManytoOne_1(data=meta_data_test, classlabel="was_preterm")
print(mytest_output_3.shape)
print(mytest_output_3[4])

(850, 1, 1)
[[0.]]
(242, 1, 1)
[[1.]]
(120, 1, 1)
[[1.]]
