# Import urine spectra with concentration

### Install project packages

In [1]:
%%bash
# Editable install of the project located one directory above the notebook.
# NOTE(review): `pip` is resolved through the shell PATH here, which may not be the kernel's environment — confirm.
pip install -e ../.

Obtaining file:///data/ar1220/MscProjectNMR
Installing collected packages: MscProjectNMR
  Attempting uninstall: MscProjectNMR
    Found existing installation: MscProjectNMR 0
    Uninstalling MscProjectNMR-0:
      Successfully uninstalled MscProjectNMR-0
  Running setup.py develop for MscProjectNMR
Successfully installed MscProjectNMR-0


### Install required python modules

In [None]:
%%bash
# Install the dependencies listed in ../requirements.txt.
pip install -r ../requirements.txt

### Import functions

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split, cross_validate

import joblib
import os

from tfrecords import write_tfrecords_concentrations, write_tfrecords_concentrations_single

In [2]:
# Display the installed TensorFlow version (last expression of the cell).
tf.__version__

'2.4.2'

# I. Import data

## I.1 Large dataset with independent metabolites (10 000 samples)

### I.1.a load the data

In [2]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_large = '../data/concentration_data/Large_sample/Spectra_Mixt1.txt'
filename_concentrations_large = '../data/concentration_data/Large_sample/Concentrations_Mix1.txt'

data_spectrum_large = np.loadtxt(filename_spectrum_large, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_large = np.loadtxt(
    filename_concentrations_large,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_large.shape[1] + 1),
)

In [3]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_large = pd.DataFrame(data_spectrum_large).transpose()
df_concentrations_large = pd.DataFrame(data_concentrations_large).transpose()

### I.1.b Normalize the input

#### Define minimum and maximum value of spectrum

In [4]:
# Fixed lower/upper bounds used to rescale every spectrum in this notebook.
min_val, max_val = -50, 20000

In [5]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_large = df_spectrum_large.sub(min_val).div(max_val - min_val)

### I.1.c. Standardise the output

In [6]:
#Import mean concentration data and metabolites
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(filename_mean_concentrations, delimiter='\t', dtype=float, usecols=1, skiprows=1)
sd_concentrations = np.loadtxt(filename_mean_concentrations, delimiter='\t', dtype=float, usecols=2, skiprows=1)

stand_df_concentrations_large = (df_concentrations_large - mean_concentrations)/sd_concentrations

In [7]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_large.shape, stand_df_concentrations_large.shape, sep='\n')

(10000, 10000)
(10000, 48)


### I.1.d.  Shuffle the data

In [8]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_large, stand_df_concentrations_large = shuffle(
    norm_df_spectrum_large,
    stand_df_concentrations_large,
    random_state=42,
)

### I.1.e.  Split data into train and validation datasets

In [9]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_large, X_test_large, y_train_large, y_test_large = train_test_split(
    norm_df_spectrum_large,
    stand_df_concentrations_large,
    test_size=0.2,
    random_state=42,
)

### I.1.f.  Convert into tf.data

In [16]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_large = tf.data.Dataset.from_tensor_slices(
    (X_train_large, y_train_large)
)
val_dataset_large = tf.data.Dataset.from_tensor_slices(
    (X_test_large, y_test_large)
)

print(train_dataset_large.element_spec, val_dataset_large.element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


In [15]:
# One dataset per concentration column: the full spectra paired with
# that single column as the target.
train_dataset_large_single = [
    tf.data.Dataset.from_tensor_slices((X_train_large, y_train_large.iloc[:, idx]))
    for idx in range(48)
]
val_dataset_large_single = [
    tf.data.Dataset.from_tensor_slices((X_test_large, y_test_large.iloc[:, idx]))
    for idx in range(48)
]

print(train_dataset_large_single[0].element_spec, val_dataset_large_single[0].element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))


### I.1.g.  Write tf.Record

In [28]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Large_sample/train', train_dataset_large, 32),
    ('../data/tfrecords/Concentrations_data/Large_sample/validation', val_dataset_large, 8),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

In [18]:
for i in range(48):
    if not os.path.exists('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/train'.format(i)):
        os.makedirs('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/train'.format(i))
    if not os.path.exists('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/validation'.format(i)):
        os.makedirs('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/validation'.format(i))

In [19]:
for i in range(48):
    write_tfrecords_concentrations_single('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/train'.format(i),
                               dataset=train_dataset_large_single[i], number=32)
    write_tfrecords_concentrations('../data/tfrecords/Concentrations_data/Large_sample_single/metabolite_{}/validation'.format(i),
                               dataset=val_dataset_large_single[i], number=8)

## I.2 Large dataset with correlated metabolites (10 000 samples)

### I.2.a load the data

In [20]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_large_corr = '../data/concentration_data/Large_correlated/Spectra_Mixt1.txt'
filename_concentrations_large_corr = '../data/concentration_data/Large_correlated/Concentrations_Mix1.txt'

data_spectrum_large_corr = np.loadtxt(filename_spectrum_large_corr, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_large_corr = np.loadtxt(
    filename_concentrations_large_corr,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_large_corr.shape[1] + 1),
)

In [21]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_large_corr = pd.DataFrame(data_spectrum_large_corr).transpose()
df_concentrations_large_corr = pd.DataFrame(data_concentrations_large_corr).transpose()

### I.2.b Normalize the input

In [22]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_large_corr = df_spectrum_large_corr.sub(min_val).div(max_val - min_val)

### I.2.c Standardise the output

In [23]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_large_corr = df_concentrations_large_corr.sub(mean_concentrations).div(sd_concentrations)

In [24]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_large_corr.shape, stand_df_concentrations_large_corr.shape, sep='\n')

(10000, 10000)
(10000, 48)


### I.2.d Shuffle the data

In [25]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_large_corr, stand_df_concentrations_large_corr = shuffle(
    norm_df_spectrum_large_corr,
    stand_df_concentrations_large_corr,
    random_state=42,
)

### I.2.e Split data into train and validation datasets

In [26]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_large_corr, X_test_large_corr, y_train_large_corr, y_test_large_corr = train_test_split(
    norm_df_spectrum_large_corr,
    stand_df_concentrations_large_corr,
    test_size=0.2,
    random_state=42,
)

### I.2.f Convert into tf.data

In [17]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_large_corr = tf.data.Dataset.from_tensor_slices(
    (X_train_large_corr, y_train_large_corr)
)
val_dataset_large_corr = tf.data.Dataset.from_tensor_slices(
    (X_test_large_corr, y_test_large_corr)
)

print(train_dataset_large_corr.element_spec, val_dataset_large_corr.element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


In [27]:
# One dataset per concentration column: the full spectra paired with
# that single column as the target.
train_dataset_large_corr_single = [
    tf.data.Dataset.from_tensor_slices((X_train_large_corr, y_train_large_corr.iloc[:, idx]))
    for idx in range(48)
]
val_dataset_large_corr_single = [
    tf.data.Dataset.from_tensor_slices((X_test_large_corr, y_test_large_corr.iloc[:, idx]))
    for idx in range(48)
]

print(train_dataset_large_corr_single[0].element_spec, val_dataset_large_corr_single[0].element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))


### I.2.g Write tf.Record

In [18]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Large_correlated/train', train_dataset_large_corr, 32),
    ('../data/tfrecords/Concentrations_data/Large_correlated/validation', val_dataset_large_corr, 8),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

In [28]:
import os

# Make sure the per-metabolite output folders exist before records are written.
# exist_ok=True replaces the separate os.path.exists() check: the cell is safe
# to re-run and there is no gap between checking and creating.
for i in range(48):
    for split in ('train', 'validation'):
        os.makedirs(
            '../data/tfrecords/Concentrations_data/Large_corr_single/metabolite_{}/{}'.format(i, split),
            exist_ok=True,
        )

In [29]:
for i in range(48):
    write_tfrecords_concentrations_single('../data/tfrecords/Concentrations_data/Large_corr_single/metabolite_{}/train'.format(i),
                               dataset=train_dataset_large_corr_single[i], number=32)
    write_tfrecords_concentrations('../data/tfrecords/Concentrations_data/Large_corr_single/metabolite_{}/validation'.format(i),
                               dataset=val_dataset_large_corr_single[i], number=8)

## I.3 Small dataset with independent metabolites (1000 samples)

### I.3.a load the data

In [195]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_small = '../data/concentration_data/Small_sample/Spectra_Mixt1.txt'
filename_concentrations_small = '../data/concentration_data/Small_sample/Concentrations_Mix1.txt'

data_spectrum_small = np.loadtxt(filename_spectrum_small, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_small = np.loadtxt(
    filename_concentrations_small,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_small.shape[1] + 1),
)

In [196]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_small = pd.DataFrame(data_spectrum_small).transpose()
df_concentrations_small = pd.DataFrame(data_concentrations_small).transpose()

### I.3.b Normalize the input

In [197]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_small = df_spectrum_small.sub(min_val).div(max_val - min_val)

### I.3.c Standardise the output

In [198]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_small = df_concentrations_small.sub(mean_concentrations).div(sd_concentrations)

In [199]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_small.shape, stand_df_concentrations_small.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.3.d Shuffle the data

In [200]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_small, stand_df_concentrations_small = shuffle(
    norm_df_spectrum_small,
    stand_df_concentrations_small,
    random_state=42,
)

### I.3.e Split data into train and validation datasets

In [201]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(
    norm_df_spectrum_small,
    stand_df_concentrations_small,
    test_size=0.2,
    random_state=42,
)

### I.3.f Convert into tf.data

In [202]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_small = tf.data.Dataset.from_tensor_slices(
    (X_train_small, y_train_small)
)
val_dataset_small = tf.data.Dataset.from_tensor_slices(
    (X_test_small, y_test_small)
)

print(train_dataset_small.element_spec, val_dataset_small.element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.3.g Write tf.Record

In [204]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Small_sample/train', train_dataset_small, 8),
    ('../data/tfrecords/Concentrations_data/Small_sample/validation', val_dataset_small, 2),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

## I.4 Small dataset with correlated metabolites (1000 samples)

### I.4.a load the data

In [25]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_small_corr = '../data/concentration_data/Small_correlated/Spectra_Mixt1.txt'
filename_concentrations_small_corr = '../data/concentration_data/Small_correlated/Concentrations_Mix1.txt'

data_spectrum_small_corr = np.loadtxt(filename_spectrum_small_corr, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_small_corr = np.loadtxt(
    filename_concentrations_small_corr,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_small_corr.shape[1] + 1),
)

In [26]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_small_corr = pd.DataFrame(data_spectrum_small_corr).transpose()
df_concentrations_small_corr = pd.DataFrame(data_concentrations_small_corr).transpose()

### I.4.b Normalize the input

In [27]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_small_corr = df_spectrum_small_corr.sub(min_val).div(max_val - min_val)

### I.4.c Standardise the output

In [28]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_small_corr = df_concentrations_small_corr.sub(mean_concentrations).div(sd_concentrations)

In [29]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_small_corr.shape, stand_df_concentrations_small_corr.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.4.d Shuffle the data

In [30]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_small_corr, stand_df_concentrations_small_corr = shuffle(
    norm_df_spectrum_small_corr,
    stand_df_concentrations_small_corr,
    random_state=42,
)

### I.4.e Split data into train and validation datasets

In [31]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_small_corr, X_test_small_corr, y_train_small_corr, y_test_small_corr = train_test_split(
    norm_df_spectrum_small_corr,
    stand_df_concentrations_small_corr,
    test_size=0.2,
    random_state=42,
)

### I.4.f Convert into tf.data

In [57]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_small_corr = tf.data.Dataset.from_tensor_slices(
    (X_train_small_corr, y_train_small_corr)
)
val_dataset_small_corr = tf.data.Dataset.from_tensor_slices(
    (X_test_small_corr, y_test_small_corr)
)

print(train_dataset_small_corr.element_spec, val_dataset_small_corr.element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.4.g Write tf.Record

In [58]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Small_correlated/train', train_dataset_small_corr, 8),
    ('../data/tfrecords/Concentrations_data/Small_correlated/validation', val_dataset_small_corr, 2),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

## I.5 Extra small dataset with independent metabolites (100 samples)

### I.5.a load the data

In [32]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_xsmall = '../data/concentration_data/Extra_small_sample/Spectra_Mixt1.txt'
filename_concentrations_xsmall = '../data/concentration_data/Extra_small_sample/Concentrations_Mix1.txt'

data_spectrum_xsmall = np.loadtxt(filename_spectrum_xsmall, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_xsmall = np.loadtxt(
    filename_concentrations_xsmall,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_xsmall.shape[1] + 1),
)

In [33]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_xsmall = pd.DataFrame(data_spectrum_xsmall).transpose()
df_concentrations_xsmall = pd.DataFrame(data_concentrations_xsmall).transpose()

### I.5.b Normalize the input

In [34]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_xsmall = df_spectrum_xsmall.sub(min_val).div(max_val - min_val)

### I.5.c Standardise the output

In [35]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_xsmall = df_concentrations_xsmall.sub(mean_concentrations).div(sd_concentrations)

In [36]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_xsmall.shape, stand_df_concentrations_xsmall.shape, sep='\n')

(100, 10000)
(100, 48)


### I.5.d Shuffle the data

In [37]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_xsmall, stand_df_concentrations_xsmall = shuffle(
    norm_df_spectrum_xsmall,
    stand_df_concentrations_xsmall,
    random_state=42,
)

### I.5.e Split data into train and validation datasets

In [38]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_xsmall, X_test_xsmall, y_train_xsmall, y_test_xsmall = train_test_split(
    norm_df_spectrum_xsmall,
    stand_df_concentrations_xsmall,
    test_size=0.2,
    random_state=42,
)

### I.5.f Convert into tf.data

In [163]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_xsmall = tf.data.Dataset.from_tensor_slices((X_train_xsmall, y_train_xsmall))
val_dataset_xsmall = tf.data.Dataset.from_tensor_slices((X_test_xsmall, y_test_xsmall))

print(train_dataset_xsmall.element_spec)
# The dataset name was missing here (`print(.element_spec)` is a SyntaxError);
# print the validation spec, as the other sections do.
print(val_dataset_xsmall.element_spec)

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.5.g Write tf.Record

In [165]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Extra_small_sample/train', train_dataset_xsmall, 4),
    ('../data/tfrecords/Concentrations_data/Extra_small_sample/validation', val_dataset_xsmall, 1),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

## I.6 Extra small dataset with correlated metabolites (100 samples)

### I.6.a load the data

In [39]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_xsmall_corr = '../data/concentration_data/Extra_small_correlated/Spectra_Mixt1.txt'
filename_concentrations_xsmall_corr = '../data/concentration_data/Extra_small_correlated/Concentrations_Mix1.txt'

data_spectrum_xsmall_corr = np.loadtxt(filename_spectrum_xsmall_corr, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_xsmall_corr = np.loadtxt(
    filename_concentrations_xsmall_corr,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_xsmall_corr.shape[1] + 1),
)

In [40]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_xsmall_corr = pd.DataFrame(data_spectrum_xsmall_corr).transpose()
df_concentrations_xsmall_corr = pd.DataFrame(data_concentrations_xsmall_corr).transpose()

### I.6.b Normalize the input

In [41]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_xsmall_corr = df_spectrum_xsmall_corr.sub(min_val).div(max_val - min_val)

### I.6.c Standardise the output

In [42]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_xsmall_corr = df_concentrations_xsmall_corr.sub(mean_concentrations).div(sd_concentrations)

In [43]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_xsmall_corr.shape, stand_df_concentrations_xsmall_corr.shape, sep='\n')

(100, 10000)
(100, 48)


### I.6.d Shuffle the data

In [44]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_xsmall_corr, stand_df_concentrations_xsmall_corr = shuffle(
    norm_df_spectrum_xsmall_corr,
    stand_df_concentrations_xsmall_corr,
    random_state=42,
)

### I.6.e Split data into train and validation datasets

In [45]:
# Hold out 20% of the rows (test_size=0.2).
# random_state pins the partition so that re-running yields the same split.
X_train_xsmall_corr, X_test_xsmall_corr, y_train_xsmall_corr, y_test_xsmall_corr = train_test_split(
    norm_df_spectrum_xsmall_corr,
    stand_df_concentrations_xsmall_corr,
    test_size=0.2,
    random_state=42,
)

### I.6.f Convert into tf.data

In [174]:
# Build (spectrum, concentrations) datasets for the two splits.
train_dataset_xsmall_corr = tf.data.Dataset.from_tensor_slices(
    (X_train_xsmall_corr, y_train_xsmall_corr)
)
val_dataset_xsmall_corr = tf.data.Dataset.from_tensor_slices(
    (X_test_xsmall_corr, y_test_xsmall_corr)
)

print(train_dataset_xsmall_corr.element_spec, val_dataset_xsmall_corr.element_spec, sep='\n')

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))
(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.6.g Write tf.Record

In [175]:
# Write both splits out through the project's TFRecord helper.
for _destination, _dataset, _number in (
    ('../data/tfrecords/Concentrations_data/Extra_small_correlated/train', train_dataset_xsmall_corr, 4),
    ('../data/tfrecords/Concentrations_data/Extra_small_correlated/validation', val_dataset_xsmall_corr, 1),
):
    write_tfrecords_concentrations(_destination, dataset=_dataset, number=_number)

## I.7 Test dataset with independent metabolites (1000 samples)

### I.7.a load the data

In [10]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_test = '../data/concentration_data/Test_independent/Spectra_Mixt1.txt'
filename_concentrations_test = '../data/concentration_data/Test_independent/Concentrations_Mix1.txt'

data_spectrum_test = np.loadtxt(filename_spectrum_test, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_test = np.loadtxt(
    filename_concentrations_test,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_test.shape[1] + 1),
)

In [11]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_test = pd.DataFrame(data_spectrum_test).transpose()
df_concentrations_test = pd.DataFrame(data_concentrations_test).transpose()

### I.7.b Normalize the input

In [12]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_test = df_spectrum_test.sub(min_val).div(max_val - min_val)

### I.7.c Standardise the output

In [13]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_test = df_concentrations_test.sub(mean_concentrations).div(sd_concentrations)

In [14]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_test.shape, stand_df_concentrations_test.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.7.d Shuffle the data

In [15]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_test, stand_df_concentrations_test = shuffle(
    norm_df_spectrum_test,
    stand_df_concentrations_test,
    random_state=42,
)

### I.7.e Convert into tf.data

In [197]:
# Build the (spectrum, concentrations) dataset for the test set.
test_dataset = tf.data.Dataset.from_tensor_slices(
    (norm_df_spectrum_test, stand_df_concentrations_test)
)

print(test_dataset.element_spec)

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.7.f Write tf.Record

In [67]:
# Write the test set out through the project's TFRecord helper.
write_tfrecords_concentrations(
    '../data/tfrecords/Concentrations_data/Test_independent',
    dataset=test_dataset,
    number=10,
)

## I.8 Test dataset with correlated metabolites (1000 samples)

### I.8.a load the data

In [16]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_test_corr = '../data/concentration_data/Test_correlated/Spectra_Mixt1.txt'
filename_concentrations_test_corr = '../data/concentration_data/Test_correlated/Concentrations_Mix1.txt'

data_spectrum_test_corr = np.loadtxt(filename_spectrum_test_corr, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_test_corr = np.loadtxt(
    filename_concentrations_test_corr,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_test_corr.shape[1] + 1),
)

In [17]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_test_corr = pd.DataFrame(data_spectrum_test_corr).transpose()
df_concentrations_test_corr = pd.DataFrame(data_concentrations_test_corr).transpose()

### I.8.b Normalize the input

In [18]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_test_corr = df_spectrum_test_corr.sub(min_val).div(max_val - min_val)

### I.8.c Standardise the output

In [19]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_test_corr = df_concentrations_test_corr.sub(mean_concentrations).div(sd_concentrations)

In [20]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_test_corr.shape, stand_df_concentrations_test_corr.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.8.d Shuffle the data

In [21]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_test_corr, stand_df_concentrations_test_corr = shuffle(
    norm_df_spectrum_test_corr,
    stand_df_concentrations_test_corr,
    random_state=42,
)

### I.8.e Convert into tf.data

In [204]:
# Build the (spectrum, concentrations) dataset for the correlated test set.
test_corr_dataset = tf.data.Dataset.from_tensor_slices(
    (norm_df_spectrum_test_corr, stand_df_concentrations_test_corr)
)

print(test_corr_dataset.element_spec)

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.8.f Write tf.Record

In [75]:
# Write the correlated test set out through the project's TFRecord helper.
write_tfrecords_concentrations(
    '../data/tfrecords/Concentrations_data/Test_correlated',
    dataset=test_corr_dataset,
    number=10,
)

## I.9 Test abnormal urine dataset with independent metabolites (1000 samples)

### I.9.a load the data

In [22]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_abn_test = '../data/concentration_data/Test_abnormal/Spectra_Mix2.txt'
filename_concentrations_abn_test = '../data/concentration_data/Test_abnormal/Concentrations_Mix2.txt'

data_spectrum_abn_test = np.loadtxt(filename_spectrum_abn_test, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_abn_test = np.loadtxt(
    filename_concentrations_abn_test,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_abn_test.shape[1] + 1),
)

In [23]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_abn_test = pd.DataFrame(data_spectrum_abn_test).transpose()
df_concentrations_abn_test = pd.DataFrame(data_concentrations_abn_test).transpose()

### I.9.b Normalize the input

In [24]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_abn_test = df_spectrum_abn_test.sub(min_val).div(max_val - min_val)

### I.9.c Standardise the output

In [25]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_abn_test = df_concentrations_abn_test.sub(mean_concentrations).div(sd_concentrations)

In [26]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_abn_test.shape, stand_df_concentrations_abn_test.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.9.d Shuffle the data

In [27]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_abn_test, stand_df_concentrations_abn_test = shuffle(
    norm_df_spectrum_abn_test,
    stand_df_concentrations_abn_test,
    random_state=42,
)

### I.9.e Convert into tf.data

In [130]:
# Build the (spectrum, concentrations) dataset for the abnormal test set.
abn_test_dataset = tf.data.Dataset.from_tensor_slices(
    (norm_df_spectrum_abn_test, stand_df_concentrations_abn_test)
)

print(abn_test_dataset.element_spec)

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.9.f Write tf.Record

In [132]:
# Write the abnormal test set out through the project's TFRecord helper.
write_tfrecords_concentrations(
    '../data/tfrecords/Concentrations_data/Test_abnormal',
    dataset=abn_test_dataset,
    number=10,
)

## I.10 Test abnormal urine dataset with correlated metabolites (1000 samples)

### I.10.a load the data

In [28]:
# Locations of the raw spectra and of the matching concentration table.
filename_spectrum_abn_test_corr = '../data/concentration_data/Test_abnormal_corr/Spectra_Mix2.txt'
filename_concentrations_abn_test_corr = '../data/concentration_data/Test_abnormal_corr/Concentrations_Mix2.txt'

data_spectrum_abn_test_corr = np.loadtxt(filename_spectrum_abn_test_corr, dtype=float)

# Tab-separated table: column 0 is skipped and one column is read per
# column of the spectra matrix.
data_concentrations_abn_test_corr = np.loadtxt(
    filename_concentrations_abn_test_corr,
    dtype=float,
    delimiter='\t',
    usecols=range(1, data_spectrum_abn_test_corr.shape[1] + 1),
)

In [29]:
#Convert into dataframes
# Wrap both arrays in DataFrames and transpose them.
df_spectrum_abn_test_corr = pd.DataFrame(data_spectrum_abn_test_corr).transpose()
df_concentrations_abn_test_corr = pd.DataFrame(data_concentrations_abn_test_corr).transpose()

### I.10.b Normalize the input

In [30]:
# Rescale with the fixed bounds: (x - min_val) / (max_val - min_val).
norm_df_spectrum_abn_test_corr = df_spectrum_abn_test_corr.sub(min_val).div(max_val - min_val)

### I.10.c Standardise the output

In [31]:
#Import mean concentration data and metabolites
# Reference statistics file (tab-separated, first line skipped):
# column 1 is read as the mean, column 2 as the standard deviation.
filename_mean_concentrations = '../data/concentration_data/normal_urine.txt'
mean_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=1
)
sd_concentrations = np.loadtxt(
    filename_mean_concentrations, dtype=float, delimiter='\t', skiprows=1, usecols=2
)

# Column-wise standardisation of the concentrations.
stand_df_concentrations_abn_test_corr = df_concentrations_abn_test_corr.sub(mean_concentrations).div(sd_concentrations)

In [32]:
# Sanity check: shapes of the normalised inputs and the standardised targets.
print(norm_df_spectrum_abn_test_corr.shape, stand_df_concentrations_abn_test_corr.shape, sep='\n')

(1000, 10000)
(1000, 48)


### I.10.d Shuffle the data

In [33]:
# Shuffle inputs and targets together so that rows stay paired.
# A fixed random_state makes the resulting order reproducible across runs.
norm_df_spectrum_abn_test_corr, stand_df_concentrations_abn_test_corr = shuffle(
    norm_df_spectrum_abn_test_corr,
    stand_df_concentrations_abn_test_corr,
    random_state=42,
)

### I.10.e Convert into tf.data

In [139]:
# Build the (spectrum, concentrations) dataset for the correlated abnormal test set.
abn_test_corr_dataset = tf.data.Dataset.from_tensor_slices(
    (norm_df_spectrum_abn_test_corr, stand_df_concentrations_abn_test_corr)
)

print(abn_test_corr_dataset.element_spec)

(TensorSpec(shape=(10000,), dtype=tf.float64, name=None), TensorSpec(shape=(48,), dtype=tf.float64, name=None))


### I.10.f Write tf.Record

In [140]:
# Write the correlated abnormal test set out through the project's TFRecord helper.
write_tfrecords_concentrations(
    '../data/tfrecords/Concentrations_data/Test_abnormal_corr',
    dataset=abn_test_corr_dataset,
    number=10,
)