In [1]:
# Mount Google Drive into the Colab VM. The data and model paths used in the
# rest of this notebook are relative ("drive/MyDrive/..."), so they resolve
# against this mount point only while the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import tensorflow as tf
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

In [3]:
def load_cv_data(index, base_dir="drive/MyDrive/DreamWalker/pre_train/Cla"):
  """Load the train/validation arrays for one cross-validation fold.

  Args:
    index: Fold identifier; it is interpolated into the archive names
      (``kmer_train_{index}.npz``, ``lineage_train_{index}.npz``,
      ``kmer_val_{index}.npz``, ``lineage_val_{index}.npz``).
    base_dir: Directory that holds the fold archives. Defaults to the
      Drive folder this notebook has always read from.

  Returns:
    Tuple ``(x_train, y_train, x_val, y_val)``, each being the ``"data"``
    entry of the corresponding ``.npz`` archive.
  """
  def _read(stem):
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the context manager closes it once the array has been materialised.
    with np.load(f"{base_dir}/{stem}_{index}.npz") as archive:
      return archive["data"]

  return (
      _read("kmer_train"),
      _read("lineage_train"),
      _read("kmer_val"),
      _read("lineage_val"),
  )

In [4]:
# Vocabulary array for the classifier; len(vocab) is used further down as the
# size of the softmax output layer (vocab_size).
# NOTE(review): the file name reads "vocal", not "vocab" — presumably that is
# how the file is stored on Drive; confirm before "fixing" the path.
vocab = np.load("drive/MyDrive/DreamWalker/model_weights/LineageTV_vocal.npy")

In [10]:
def create_ann(vocab_size, length, input_dim=1024, units=1024, dropout=0.2,
               learning_rate=0.0001):
    """Build and compile the GRU sequence classifier.

    A fixed-size input vector is repeated ``length`` times, run through two
    stacked GRU layers that return full sequences, and projected at every
    step onto a softmax over ``vocab_size`` classes.

    Args:
        vocab_size: Number of output classes per sequence position.
        length: Number of sequence positions to predict (repeat count).
        input_dim: Width of the input feature vector (default 1024).
        units: Hidden size of each GRU layer (default 1024).
        dropout: Input dropout rate applied inside each GRU (default 0.2).
        learning_rate: Adam learning rate (default 0.0001).

    Returns:
        A compiled ``tf.keras`` model trained with
        ``sparse_categorical_crossentropy`` and reporting ``accuracy``.
    """
    encoder_inputs = tf.keras.layers.Input(shape=(input_dim,))
    # Turn the single vector into a `length`-step sequence for the GRUs.
    x = tf.keras.layers.RepeatVector(length, name="RepeatVector")(encoder_inputs)
    x = tf.keras.layers.GRU(units, return_sequences=True, dropout=dropout, name="GRU0")(x)
    x = tf.keras.layers.GRU(units, return_sequences=True, dropout=dropout, name="GRU1")(x)
    # Dense acts on the last axis, i.e. one class distribution per step.
    x = tf.keras.layers.Dense(vocab_size, activation="softmax")(x)
    model = tf.keras.models.Model(encoder_inputs, x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss='sparse_categorical_crossentropy',
        metrics=["accuracy"],
    )
    return model

In [12]:
# Cross-validation: train a fresh model on every fold and record the
# per-position accuracy on that fold's validation split.
EPOCHS = 20
N_FOLDS = 5
ann_accuracy_records = {}
vocab_size = len(vocab)
for j in range(N_FOLDS):
    # Release the previous fold's Keras state so memory does not accumulate
    # across the models built in this loop.
    tf.keras.backend.clear_session()
    # train
    x_train, y_train, x_val, y_val = load_cv_data(j)
    length = y_train.shape[1]
    model = create_ann(vocab_size, length)
    model.fit(x_train, y_train, batch_size=32, epochs=EPOCHS, verbose=0, validation_data=(x_val, y_val))
    # val
    ann_pred = model.predict(x_val, verbose=0)
    # Most probable class index at every sequence position.
    pred_argmax = np.argmax(ann_pred, axis=2)
    # accuracy_score takes (y_true, y_pred); one score per sequence position.
    accuracy_record = [
        accuracy_score(y_val[:, i], pred_argmax[:, i]) for i in range(length)
    ]
    print(accuracy_record)
    ann_accuracy_records[j] = accuracy_record

[0.9977203647416414, 0.9936676798378926, 0.9934143870314083, 0.9875886524822695, 0.9736575481256332, 0.9516210739614995, 0.7160587639311043, 0.9924012158054711, 0.9954407294832827, 0.9929078014184397, 0.9878419452887538, 0.9766970618034447, 0.9690982776089159, 0.9825227963525835, 0.9984802431610942, 0.99822695035461, 0.993920972644377, 0.9962006079027356, 0.9962006079027356]
[0.9992401215805471, 0.9962006079027356, 0.9944275582573455, 0.9878419452887538, 0.9761904761904762, 0.950354609929078, 0.7001013171225937, 0.9924012158054711, 0.9951874366767984, 0.9934143870314083, 0.9918946301925026, 0.9842958459979737, 0.9782168186423505, 0.9858156028368794, 0.99822695035461, 0.99822695035461, 0.9951874366767984, 0.995694022289767, 0.9967071935157041]
[0.997467071935157, 0.9959473150962512, 0.9941742654508612, 0.9911347517730497, 0.975177304964539, 0.9521276595744681, 0.7302431610942249, 0.9924012158054711, 0.9941742654508612, 0.9946808510638298, 0.9893617021276596, 0.9850557244174265, 0.980496

In [17]:
# Persist the cross-validation accuracies: after transposing, each row is one
# fold and each column one sequence position. The row index is not written.
pd.DataFrame(ann_accuracy_records).T.to_csv(
    "drive/MyDrive/DreamWalker/pre_train/cla_cv_accuracy.csv",
    index=False,
)

In [14]:
# Final model: fit on the training split, then score each sequence position
# on the test split.
# Each .npz archive is opened in a `with` block so its file handle is closed
# as soon as the "data" array has been read.
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/kmer_train.npz") as archive:
    x_train = archive["data"]
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/lineage_train.npz") as archive:
    y_train = archive["data"]
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/kmer_test.npz") as archive:
    x_test = archive["data"]
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/lineage_test.npz") as archive:
    y_test = archive["data"]
# train (EPOCHS is the constant defined in the cross-validation cell, so both
# runs share one training budget)
model = create_ann(vocab_size, y_train.shape[1])
model.fit(x_train, y_train, batch_size=32, epochs=EPOCHS, verbose=0)
# test
ann_pred = model.predict(x_test, verbose=0)
# Most probable class index at every sequence position.
pred_argmax = np.argmax(ann_pred, axis=2)
# accuracy_score takes (y_true, y_pred); one score per sequence position.
accuracy_record = [
    accuracy_score(y_test[:, i], pred_argmax[:, i])
    for i in range(y_test.shape[1])
]
print(accuracy_record)

[0.9995440729483283, 0.9977203647416414, 0.9969604863221885, 0.9930091185410335, 0.9797872340425532, 0.9617021276595744, 0.7480243161094224, 0.9954407294832827, 0.9972644376899696, 0.9972644376899696, 0.9936170212765958, 0.9864741641337386, 0.9828267477203647, 0.9875379939209726, 0.997872340425532, 0.9990881458966565, 0.9954407294832827, 0.9965045592705167, 0.9954407294832827]


In [15]:
# Fit on the complete data set. `model` is not re-created here, so this
# continues training the model fitted in the previous cell.
# The archives are opened in `with` blocks so their file handles are closed
# once the "data" arrays have been read.
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/kmer_whole.npz") as archive:
    x = archive["data"]
with np.load("drive/MyDrive/DreamWalker/pre_train/Cla/lineage_whole.npz") as archive:
    y = archive["data"]
# EPOCHS is the constant defined in the cross-validation cell.
model.fit(x, y, batch_size=32, epochs=EPOCHS, verbose=0)

<keras.src.callbacks.History at 0x7842cbb6e980>

In [16]:
# Whole-model export is left disabled; the next cell saves the weights layer by layer instead.
# model.save("drive/MyDrive/DreamWalker/model_weights/classifier.keras")

In [18]:
# Export the trained weights, one compressed .npz per layer, each holding the
# layer's weight arrays under the key "weights".
path = "drive/MyDrive/DreamWalker/model_weights/ClassifierWeights"
for i, layer in enumerate(model.layers):
    weights = layer.get_weights()
    payload = weights
    if len({np.shape(w) for w in weights}) > 1:
        # The arrays have different shapes (e.g. kernel vs. bias). Handing
        # such a ragged list to NumPy for implicit conversion is deprecated
        # and raises ValueError on NumPy >= 1.24, so build the 1-D object
        # array explicitly. The stored layout is unchanged: an object array
        # with one entry per weight, readable with
        # np.load(..., allow_pickle=True)["weights"].
        payload = np.empty(len(weights), dtype=object)
        for slot, array in enumerate(weights):
            payload[slot] = array
    np.savez_compressed(f'{path}/layer_{i}_weights', weights=payload)

  val = np.asanyarray(val)
