In [10]:
# Read the UIC mapping file and build a dictionary that maps each sample ID (#SampleID column) to its Phase
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the UIC mapping table into "dataset_prep_1" (pd.read_csv, latin1-encoded file).
dataset_prep_1 = pd.read_csv("E:\\Recurrent Neural Network Workflow\\UICmapping.csv",encoding = 'latin1')

# Pull out the sample-ID column and the phase column by name.
UIC_ID_col = dataset_prep_1.loc[:, "#SampleID"]
UIC_Phase_col = dataset_prep_1.loc[:, "Phase"]

# Pair the two columns row by row: sample ID -> phase.
# If a sample ID occurs more than once, the last row's phase is the one kept.
mapping_dict = {
    sample_id: phase
    for sample_id, phase in zip(UIC_ID_col, UIC_Phase_col)
}

# Show both columns and the resulting dictionary for a visual check.
for item_to_check in (UIC_ID_col, UIC_Phase_col, mapping_dict):
    print(item_to_check)

In [11]:
# Relabel the sample columns of the BIOM table with the phase each sample belongs to
# (using mapping_dict), order the columns by phase, drop samples that have no phase,
# drop taxa whose total count is 1 or less, and write the result to a new CSV file.

# Read the second file and store it as dataset_prep_2
dataset_prep_2 = pd.read_csv("E:\\Recurrent Neural Network Workflow\\BIOMtable.csv",encoding = 'latin1')

# Only relabel samples that actually have a phase. A sample listed in the mapping
# file with a missing (NaN) phase would otherwise become a NaN column label, which
# breaks both the column sort and the 'UIC' substring filter below. Such samples keep
# their UIC name and are removed together with the other unmapped columns.
phase_by_sample = {
    sample_id: phase
    for sample_id, phase in mapping_dict.items()
    if pd.notna(phase)
}

# rename() returns a new frame, so dataset_prep_2 is left untouched and this cell can
# be re-run safely.
df_copy = dataset_prep_2.rename(columns=phase_by_sample)

# Sort the columns by label so that samples of the same phase sit next to each other
df_copy = df_copy.sort_index(axis=1)

# Sorting also moved the column holding the microbiome names; pop it out and
# re-insert it as the first column.
column_to_move = 'Unnamed: 0'
new_position = 0
column_index = df_copy.pop(column_to_move)
df_copy.insert(new_position, column_to_move, column_index)

# Any column whose label still contains 'UIC' was not renamed, i.e. it has no phase
# mapped to it: drop it. str() keeps the test safe for non-string labels.
string_to_exclude = 'UIC'
unmapped_columns = [col for col in df_copy.columns if string_to_exclude in str(col)]
df_filtered = df_copy.drop(columns=unmapped_columns)

# Total count of every taxon (row) across the remaining samples.
# numeric_only=True skips the text column with the microbiome names; without it
# pandas >= 2.0 raises a TypeError when it tries to add strings to numbers.
taxa_sums = df_filtered.sum(axis=1, numeric_only=True)

# Remove singletons: keep only the rows whose total count is greater than 1
df_filtered = df_filtered[taxa_sums > 1]

# Export to a new csv without the index column.
# NOTE(review): this path is relative to the current working directory, while the
# training cell reads "E:\\Recurrent Neural Network Workflow\\New_mapping_file.csv";
# the two only coincide when the notebook is run from that folder - confirm.
df_filtered.to_csv('New_mapping_file.csv', index=False)

# Print out the dataframe:
# print(df_filtered)

In [12]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam

# Build per-phase totals from the filtered table, turn them into sliding windows,
# and train / evaluate an LSTM that predicts the next row from the previous rows.

# Set seed for numpy and tensorflow for reproduction
np.random.seed(123)
tf.random.set_seed(123)

# NOTE(review): the previous cell writes 'New_mapping_file.csv' relative to the
# working directory, whereas this reads an absolute path; they are the same file only
# when the notebook runs from that folder - confirm.
dataset_train = pd.read_csv("E:\\Recurrent Neural Network Workflow\\New_mapping_file.csv")
df = dataset_train.copy()

# Positional column range of every phase in the file (column 0 is skipped).
# NOTE(review): these ranges are hard-coded for one particular input file; they must
# be updated whenever the number of samples per phase changes.
phase_column_slices = {
    'Phase 1 Total': slice(1, 10),
    'Phase 2 Total': slice(10, 50),
    'Phase 3 Total': slice(50, 91),
    'Phase 4 Total': slice(91, 105),
    'Phase 5 Total': slice(105, 119),
    'Phase 6 Total': slice(119, 133),
}

# The totals are computed from the untouched dataset_train rather than from df:
# df grows by one column per total, so on a file with fewer than 133 columns a later
# slice taken from df would silently include the totals that were just appended.
for total_name, column_range in phase_column_slices.items():
    df[total_name] = dataset_train.iloc[:, column_range].sum(axis=1)

# Getting all phase totals into one new dataframe
new_df = df[list(phase_column_slices)].copy()

# Fraction of the data used for training, and the seed used for every random draw
train_fraction = 0.7
split_seed = 123

# Random 70% preview of the rows (kept for inspection; the model itself is trained on
# the window split further down).
sample_size = int(train_fraction * len(new_df))
sampled_data = new_df.sample(n=sample_size, random_state=split_seed)
print(sampled_data.head())

# Convert data into numpy
data = new_df.values
num_otus, num_phases = data.shape

time_steps = 2
num_features = num_phases

# At least two windows are needed so that both the training and the held-out part
# contain a sample.
if num_otus - time_steps < 2:
    raise ValueError(
        f"Need at least {time_steps + 2} rows to build a train/test split, got {num_otus}"
    )

# Prepare data for RNN: each input is `time_steps` consecutive rows and the target is
# the row that follows them.
# NOTE(review): the windows slide over consecutive rows of new_df, so the LSTM's
# "time" axis is row order rather than phase - confirm this is the intended setup.
window_starts = range(num_otus - time_steps)
x = np.array([data[start:start + time_steps] for start in window_starts])
y = np.array([data[start + time_steps] for start in window_starts])

# Hold out 30% of the windows so the model is scored on data it was not fitted on.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=train_fraction, random_state=split_seed
)

# Define the RNN model
model = Sequential()
model.add(LSTM(units=64, input_shape=(time_steps, num_features)))
model.add(Dense(units=num_features))

# Compile the model
model.compile(loss="mean_squared_error", optimizer="adam", metrics=["mean_absolute_error"])

# Train on the training windows only; the held-out windows are reported every epoch
history = model.fit(
    x_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(x_test, y_test),
)

# Error on the held-out windows
test_loss, test_mae = model.evaluate(x_test, y_test, verbose=0)
print(f"Held-out MSE: {test_loss:.4f}, held-out MAE: {test_mae:.4f}")

# Make predictions for every window (training and held-out), in row order
predictions = model.predict(x)

# Print the predictions
print(predictions)
