In [None]:
import pandas as pd
import tensorflow as tf
import tensorflowjs as tfjs
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [None]:
def get_compiled_model():
    """
    Build and compile the binary classifier shared by every gene in this notebook.

    The network is a Sequential stack, in order: LayerNormalization, Conv1D
    (30 filters, kernel size 1, tanh), MaxPooling1D (pool size 1), LSTM (30 units,
    tanh), Dense (30 units, tanh) and a single-unit sigmoid Dense output. The
    declared per-sample input shape is (1, 30).

    Returns:
        The compiled, untrained model (Adam optimizer, binary cross-entropy loss,
        accuracy metric).
    """
    layers = tf.keras.layers

    model = tf.keras.Sequential()
    # The first layer declares the per-sample input shape for the whole stack.
    model.add(layers.LayerNormalization(input_shape=(1, 30)))
    model.add(layers.Conv1D(30, 1, activation='tanh', input_shape=(1, 30)))
    model.add(layers.MaxPooling1D(pool_size=1))
    model.add(layers.LSTM(30, activation='tanh'))
    model.add(layers.Dense(30, activation='tanh'))
    # One sigmoid unit: the output is a single value in (0, 1) per sample.
    model.add(layers.Dense(1, activation='sigmoid'))

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

def get_gene_data(gene_csv_df, gene):
    """
    Given the gene csv dataframe and the gene as a string (i.e. "n_gene", "s_gene", etc.), this function returns the training and test X and y sets.

    Rows are kept only when `sample_id` is not one of the excluded values and
    `{gene}_result` is "AMPLIFIED" or "NON-AMPLIFIED". X is every column except
    the identifier columns, `{gene}_result` and `{gene}_delta_cycle1` ..
    `{gene}_delta_cycle10`; y is `{gene}_result` encoded as 1 (AMPLIFIED) or
    0 (NON-AMPLIFIED).

    Parameters:
        gene_csv_df: pandas DataFrame holding the gene's rows, including the
            `sample_id`, `plate_id`, `well_position`, `well` and
            `{gene}_result` columns.
        gene: column prefix of the gene, e.g. "n_gene".

    Returns:
        X_train, X_test, y_train, y_test from a 50/50 split with random_state=42.

    NOTE: This function does not work with Rp-Cy5 due to Rp-Cy5 only having AMPLIFIED or REPEAT as result classes.
    """
    undesirables = "['neg', 'PC', '', 0, 'unknown']"
    binary = "['AMPLIFIED', 'NON-AMPLIFIED']"
    result_col = f"{gene}_result"

    # Filter once, on the requested gene's own result column, so X and y are
    # guaranteed to come from exactly the same rows whichever gene is passed.
    filtered = gene_csv_df.query(f'sample_id != {undesirables}').query(f'{result_col} == {binary}')

    non_feature_cols = ["sample_id", "plate_id", result_col, "well_position", "well"]
    non_feature_cols += [f"{gene}_delta_cycle{cycle}" for cycle in range(1, 11)]

    X = filtered.drop(columns=non_feature_cols)
    # Only the two `binary` labels survive the filter, so the mapping is total.
    y = filtered[result_col].map({"AMPLIFIED": 1, "NON-AMPLIFIED": 0})

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    return X_train, X_test, y_train, y_test

In [None]:
# Load the N gene readings, discard every row with a missing value, then build
# the train/test split.
n_gene = pd.read_csv("n_gene.csv").dropna()

X_train, X_test, y_train, y_test = get_gene_data(gene_csv_df=n_gene, gene="n_gene")

In [None]:
# Reshape the N gene splits for the network: features become (samples, 1, 30)
# and labels become a (samples, 1) column. The sample count is inferred with -1
# so this cell keeps working when the number of rows in the CSV changes.
X_train_shape = X_train.values.reshape(-1, 1, 30)
y_train_shape = y_train.values.reshape(-1, 1)
X_test_shape = X_test.values.reshape(-1, 1, 30)
y_test_shape = y_test.values.reshape(-1, 1)

In [None]:
# Build a fresh model and train it on the N gene training split.
# Class 1 carries 25x the weight of class 0 (presumably to offset class
# imbalance; TODO confirm against the label counts).
class_weights = {0: 1, 1: 25}
n_gene_model = get_compiled_model()
n_gene_model.fit(
    X_train_shape,
    y_train_shape,
    batch_size=1000,
    epochs=200,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
# Evaluate the N gene model on the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases,
# so threshold the sigmoid output at 0.5 directly (what it did for one output unit).
y_preds = (n_gene_model.predict(X_test_shape) > 0.5).astype("int32")
print(f"Balanced Accuracy Score for N Gene model: {balanced_accuracy_score(y_test_shape, y_preds)}")

In [None]:
# Save the trained N gene model twice: as a Keras HDF5 file (relative path, so
# it lands in the current working directory) and through the TensorFlow.js
# converter.
# NOTE: "[path to save model]" is a placeholder; replace it with a real output
# directory before running this cell.
n_gene_model.save("n_gene_model.h5")
tfjs.converters.save_keras_model(n_gene_model, "[path to save model]")

In [None]:
# Load the S gene readings, discard every row with a missing value, then build
# the train/test split.
s_gene = pd.read_csv("s_gene.csv").dropna()

X_train, X_test, y_train, y_test = get_gene_data(gene_csv_df=s_gene, gene="s_gene")

In [None]:
# Reshape the S gene splits for the network: features become (samples, 1, 30)
# and labels become a (samples, 1) column. The sample count is inferred with -1
# so this cell keeps working when the number of rows in the CSV changes.
X_train_shape = X_train.values.reshape(-1, 1, 30)
y_train_shape = y_train.values.reshape(-1, 1)
X_test_shape = X_test.values.reshape(-1, 1, 30)
y_test_shape = y_test.values.reshape(-1, 1)

In [None]:
# Build a fresh model and train it on the S gene training split.
# Class 1 carries 26x the weight of class 0 (presumably to offset class
# imbalance; TODO confirm against the label counts).
class_weights = {0: 1, 1: 26}
s_gene_model = get_compiled_model()
s_gene_model.fit(
    X_train_shape,
    y_train_shape,
    batch_size=1000,
    epochs=200,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
# Evaluate the S gene model on the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases,
# so threshold the sigmoid output at 0.5 directly (what it did for one output unit).
y_preds = (s_gene_model.predict(X_test_shape) > 0.5).astype("int32")
print(f"Balanced Accuracy Score for S Gene model: {balanced_accuracy_score(y_test_shape, y_preds)}")

In [None]:
# Save the trained S gene model twice: as a Keras HDF5 file (relative path, so
# it lands in the current working directory) and through the TensorFlow.js
# converter.
# NOTE: "[path to save model]" is a placeholder; replace it with a real output
# directory before running this cell.
s_gene_model.save("s_gene_model.h5")
tfjs.converters.save_keras_model(s_gene_model, "[path to save model]")

In [None]:
# Load the ORF1ab readings, discard every row with a missing value, then build
# the train/test split.
orf1ab = pd.read_csv("orf1ab.csv").dropna()

X_train, X_test, y_train, y_test = get_gene_data(gene_csv_df=orf1ab, gene="orf1ab")

In [None]:
# Reshape the ORF1ab splits for the network: features become (samples, 1, 30)
# and labels become a (samples, 1) column. The sample count is inferred with -1
# so this cell keeps working when the number of rows in the CSV changes.
X_train_shape = X_train.values.reshape(-1, 1, 30)
y_train_shape = y_train.values.reshape(-1, 1)
X_test_shape = X_test.values.reshape(-1, 1, 30)
y_test_shape = y_test.values.reshape(-1, 1)

In [None]:
# Build a fresh model and train it on the ORF1ab training split.
# Class 1 carries 25x the weight of class 0 (presumably to offset class
# imbalance; TODO confirm against the label counts).
class_weights = {0: 1, 1: 25}
orf1ab_model = get_compiled_model()
orf1ab_model.fit(
    X_train_shape,
    y_train_shape,
    batch_size=1000,
    epochs=200,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
# Evaluate the ORF1ab model on the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases,
# so threshold the sigmoid output at 0.5 directly (what it did for one output unit).
y_preds = (orf1ab_model.predict(X_test_shape) > 0.5).astype("int32")
print(f"Balanced Accuracy Score for ORF1ab model: {balanced_accuracy_score(y_test_shape, y_preds)}")

In [None]:
# Save the trained ORF1ab model twice: as a Keras HDF5 file (relative path, so
# it lands in the current working directory) and through the TensorFlow.js
# converter.
# NOTE: "[path to save model]" is a placeholder; replace it with a real output
# directory before running this cell.
orf1ab_model.save("orf1ab_model.h5")
tfjs.converters.save_keras_model(orf1ab_model, "[path to save model]")

In [None]:
# Load the MS2 readings, discard every row with a missing value, then build
# the train/test split.
ms2 = pd.read_csv("ms2.csv").dropna()

X_train, X_test, y_train, y_test = get_gene_data(gene_csv_df=ms2, gene="ms2")

In [None]:
# Reshape the MS2 splits for the network: features become (samples, 1, 30)
# and labels become a (samples, 1) column. The sample count is inferred with -1,
# which also covers the train and test halves differing in size by one row.
X_train_shape = X_train.values.reshape(-1, 1, 30)
y_train_shape = y_train.values.reshape(-1, 1)
X_test_shape = X_test.values.reshape(-1, 1, 30)
y_test_shape = y_test.values.reshape(-1, 1)

In [None]:
# Build a fresh model and train it on the MS2 training split.
# Here class 0 carries 7x the weight of class 1, the reverse of the other gene
# models (presumably class 0 is the minority for MS2; TODO confirm).
class_weights = {0: 7, 1: 1}
ms2_model = get_compiled_model()
ms2_model.fit(
    X_train_shape,
    y_train_shape,
    batch_size=1000,
    epochs=1000,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
# Evaluate the MS2 model on the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases,
# so threshold the sigmoid output at 0.5 directly (what it did for one output unit).
y_preds = (ms2_model.predict(X_test_shape) > 0.5).astype("int32")
print(f"Balanced Accuracy Score for MS2 model: {balanced_accuracy_score(y_test_shape, y_preds)}")

In [None]:
# Save the trained MS2 model as a Keras HDF5 file (relative path, so it lands
# in the current working directory).
# NOTE(review): unlike the other gene models in this notebook, no TensorFlow.js
# export is written for MS2; confirm that this is intentional.
ms2_model.save("ms2_model.h5")

In [None]:
# Load the Rp-Cy5 readings and build the train/test split inline: this target's
# result classes are AMPLIFIED / REPEAT, so get_gene_data (which expects
# AMPLIFIED / NON-AMPLIFIED) is not used here.
rp_cy5 = pd.read_csv("rp_cy5.csv").dropna()
undesirables = "['neg', 'PC', '', 0, 'unknown']"
binary = "['AMPLIFIED', 'REPEAT']"

# Features: the filtered rows minus the identifier columns, the label column and
# the rp_cy5_delta_cycle1 .. rp_cy5_delta_cycle10 columns.
X = (
    rp_cy5
    .query(f'sample_id != {undesirables}')
    .query(f'rp_cy5_result == {binary}')
    .drop(
        ["sample_id", "plate_id", "rp_cy5_result", "well_position", "well"]
        + [f"rp_cy5_delta_cycle{cycle}" for cycle in range(1, 11)],
        axis=1,
    )
)

# Labels: same row filter, with AMPLIFIED encoded as 1 and REPEAT as 0.
y = (
    rp_cy5
    .query(f'sample_id != {undesirables}')
    .query(f'rp_cy5_result == {binary}')
    ['rp_cy5_result']
    .replace({"AMPLIFIED": 1, "REPEAT": 0})
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [None]:
# Reshape the Rp-Cy5 splits for the network: features become (samples, 1, 30)
# and labels become a (samples, 1) column. The sample count is inferred with -1
# so this cell keeps working when the number of rows in the CSV changes.
X_train_shape = X_train.values.reshape(-1, 1, 30)
y_train_shape = y_train.values.reshape(-1, 1)
X_test_shape = X_test.values.reshape(-1, 1, 30)
y_test_shape = y_test.values.reshape(-1, 1)

In [None]:
# Build a fresh model and train it on the Rp-Cy5 training split.
# Class 0 (REPEAT) carries 252x the weight of class 1 (AMPLIFIED), presumably
# because REPEAT rows are rare; TODO confirm against the label counts.
class_weights = {0: 252, 1: 1}
rp_cy5_model = get_compiled_model()
rp_cy5_model.fit(
    X_train_shape,
    y_train_shape,
    batch_size=1000,
    epochs=100,
    shuffle=True,
    class_weight=class_weights,
)

In [None]:
# Evaluate the Rp-Cy5 model on the held-out split.
# `Sequential.predict_classes` no longer exists in current TensorFlow releases,
# so threshold the sigmoid output at 0.5 directly (what it did for one output unit).
y_preds = (rp_cy5_model.predict(X_test_shape) > 0.5).astype("int32")
print(f"Balanced Accuracy Score for Rp-Cy5 model: {balanced_accuracy_score(y_test_shape, y_preds)}")

In [None]:
# Save the trained Rp-Cy5 model twice: as a Keras HDF5 file (relative path, so
# it lands in the current working directory) and through the TensorFlow.js
# converter.
# NOTE: "[path to save model]" is a placeholder; replace it with a real output
# directory before running this cell.
rp_cy5_model.save("rp_cy5_model.h5")
tfjs.converters.save_keras_model(rp_cy5_model, "[path to save model]")