In [19]:
# Read the UIC mapping file and take 2 of its columns, the sample ID (#SampleID) and Phase, mapping them into a dictionary
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the UIC mapping table and keep it as "dataset_prep_1".
# The file is decoded as latin1.
dataset_prep_1 = pd.read_csv(
    "E:\\Recurrent Neural Network Workflow\\UICmapping.csv",
    encoding='latin1',
)
print(dataset_prep_1)

# Select the sample-ID column and the phase column by their header names.
UIC_ID_col = dataset_prep_1["#SampleID"]
UIC_Phase_col = dataset_prep_1["Phase"]

# Build the lookup "mapping_dict": each sample ID is paired with the phase found on the same row.
# If a sample ID occurs more than once, the phase from its last row wins.
mapping_dict = {
    sample_id: phase
    for sample_id, phase in zip(UIC_ID_col, UIC_Phase_col)
}

# Show both columns and the resulting dictionary, one after another.
print(UIC_ID_col, UIC_Phase_col, mapping_dict, sep="\n")

    #SampleID BarcodeSequence Sample_Code  Year    Year_ID    True_ID  \
0       UIC01    TTACCACATCTA         1.1  2016  Swimmer.1  Swimmer.1   
1       UIC02    TGGCATGTTGGT         1.2  2016  Swimmer.2  Swimmer.2   
2       UIC03    GCGGAGCACGTC         1.3  2016  Swimmer.3  Swimmer.3   
3       UIC04    GGTCCCGAAATT         1.4  2016  Swimmer.4  Swimmer.4   
4       UIC05    AAGTGCTTGGTA         1.5  2016  Swimmer.5  Swimmer.5   
..        ...             ...         ...   ...        ...        ...   
127     UIC95    CTTGTTGTTCTG       4.1.2  2017   Swimmer4  Swimmer.6   
128     UIC96    CGGAAACTCCAT       4.2.1  2017   Swimmer4  Swimmer.6   
129     UIC97    AACGACACGCTT       4.2.2  2017   Swimmer4  Swimmer.6   
130     UIC98    AACACGGTTTGA       4.3.1  2017   Swimmer4  Swimmer.6   
131     UIC99    CTAGGTCCGACT       4.3.2  2017   Swimmer4  Swimmer.6   

       Phase       Status      Ht      Wt  ...    VitaminC  VitaminD  \
0    Phase.1    In.Season  172.00  88.112  ...     

In [29]:
# Relabel the sample columns of the BIOM table with the phase of each sample, order the
# columns by phase, and export the result as a new CSV file.
# Uses `mapping_dict` (sample ID -> phase) built from the UIC mapping file earlier in the notebook.

# Read file 2 and store it as dataset_prep_2; this frame is not modified below
dataset_prep_2 = pd.read_csv("E:\\Recurrent Neural Network Workflow\\BIOMtable.csv",encoding = 'latin1')

# Rename every sample-ID column to its phase, then sort the columns by label.
# rename() and sort_index() each return a new frame, so dataset_prep_2 stays intact and
# re-running this cell always starts from the same input (no inplace mutation).
# Command used: dataframe.rename(columns=...).sort_index(axis=1)
df_copy = dataset_prep_2.rename(columns=mapping_dict).sort_index(axis=1)

# Sorting by label does not keep the microbiome identifier column (read in as 'Unnamed: 0')
# in front, so move it back to the start of the dataframe.
# pop() removes the column and returns it; insert() puts it back at the requested position.
# Command used: dataframe.pop(column_to_move), dataframe.insert(position, name, values)
column_to_move = 'Unnamed: 0'
new_position = 0
column_index = df_copy.pop(column_to_move)
df_copy.insert(new_position, column_to_move, column_index)

# Drop every column whose label still contains 'UIC': those sample IDs had no entry in
# mapping_dict, so they were not renamed to a phase.
string_to_exclude = 'UIC'
df_filtered = df_copy.drop(columns=[col for col in df_copy.columns if string_to_exclude in col])
print(df_filtered)

# Export the result to a new csv, not including the index.
# The file is written to the folder the input files come from, which is the exact path the
# later training cell loads "New_mapping_file.csv" from. Writing only the bare file name
# would place it in the current working directory, which is not guaranteed to be that folder.
# Note: phase labels repeat across columns; pandas renames repeated headers when the file is
# read back (Phase.1, Phase.1.1, ...).
# Command used: dataframe.to_csv(path, index = False)
df_filtered.to_csv("E:\\Recurrent Neural Network Workflow\\New_mapping_file.csv", index=False)

      Unnamed: 0  Phase.1  Phase.1  Phase.1  Phase.1  Phase.1  Phase.1  \
0           1624        0        0        0        0        0        0   
1           1992        5        1        6        6        1        4   
2           4760        0        0        0        0        0        0   
3           7366        6        0        0        0        0        0   
4           9710        0        0        0        0        0        0   
...          ...      ...      ...      ...      ...      ...      ...   
13042    3600504        6        5      411        9      272      779   
13043    4447950        0        0        3        2        0        0   
13044    4453501        0       14        0        6        0        0   
13045     509621        0        0        0        0        0        0   
13046     782953        6        4        1        3        1      116   

       Phase.1  Phase.1  Phase.1  ...  Phase.6  Phase.6  Phase.6  Phase.6  \
0            0        0        0  

In [31]:
# Attributes of the data:
# 1. Uneven number of columns per timeframe
# 2. Training data has multiple columns for each timeframe
# 3. Phases 1, 2 and 3 are in-season, phase 4 is competition, and phases 5 and 6 are off-season

In [35]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam


# Fix the NumPy and TensorFlow random seeds (both 123) so results can be reproduced
np.random.seed(123)
tf.random.set_seed(123)

# Load the phase-labelled table and work on a copy, leaving the loaded frame untouched.
# NOTE(review): presumably this is the file exported by the earlier cell - confirm that the
# export really lands in this folder.
dataset_train = pd.read_csv(
    "E:\\Recurrent Neural Network Workflow\\New_mapping_file.csv"
)
df = dataset_train.copy(deep=True)
print(df)

# Create an unfitted MinMaxScaler and an empty Sequential model for the steps that follow
scaler, model = MinMaxScaler(), Sequential()

      Unnamed: 0  Phase.1  Phase.1.1  Phase.1.2  Phase.1.3  Phase.1.4  \
0           1624        0          0          0          0          0   
1           1992        5          1          6          6          1   
2           4760        0          0          0          0          0   
3           7366        6          0          0          0          0   
4           9710        0          0          0          0          0   
...          ...      ...        ...        ...        ...        ...   
13042    3600504        6          5        411          9        272   
13043    4447950        0          0          3          2          0   
13044    4453501        0         14          0          6          0   
13045     509621        0          0          0          0          0   
13046     782953        6          4          1          3          1   

       Phase.1.5  Phase.1.6  Phase.1.7  Phase.1.8  ...  Phase.6.4  Phase.6.5  \
0              0          0          0     