In [1]:
# Fetch the DreamWalker repository; later cells read data and model weights
# from /content/DreamWalker/... paths inside this checkout.
# NOTE: the saved log below reports ~863 MiB received. git refuses to clone into
# an existing non-empty directory, so re-running this cell in the same session fails.
!git clone https://github.com/AvonYangXX1/DreamWalker.git

Cloning into 'DreamWalker'...
remote: Enumerating objects: 250, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 250 (delta 2), reused 36 (delta 1), pack-reused 211[K
Receiving objects: 100% (250/250), 863.48 MiB | 21.31 MiB/s, done.
Resolving deltas: 100% (14/14), done.
Updating files: 100% (110/110), done.


In [2]:
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os

## Generator

In [3]:
# Generator
def build_generator(seq_length, depth, latent_dim):
    """Assemble the generator network.

    Architecture, in order:
      * input of shape ``(latent_dim,)``;
      * five 256-unit ReLU dense layers, the first four each followed by
        batch normalisation;
      * a linear dense projection to ``seq_length * depth`` values, reshaped
        to ``(seq_length, depth)``;
      * a final dense layer of width ``depth`` with softmax activation.

    Args:
        seq_length: number of positions in each generated sequence.
        depth: number of classes per position (width of the softmax output).
        latent_dim: size of the input noise vector.

    Returns:
        An uncompiled ``tf.keras`` functional ``Model``.
    """
    latent_in = layers.Input(shape=(latent_dim,), name="Input0")

    # Dense0/Norm0 .. Dense3/Norm3: identical Dense + BatchNorm pairs.
    hidden = latent_in
    for idx in range(4):
        hidden = layers.Dense(256, activation='relu', name=f"Dense{idx}")(hidden)
        hidden = layers.BatchNormalization(name=f"Norm{idx}")(hidden)
    # The last hidden layer has no normalisation after it.
    hidden = layers.Dense(256, activation='relu', name="Dense4")(hidden)

    flat = layers.Dense(seq_length*depth, activation='linear', name="DenseResize")(hidden)
    grid = layers.Reshape((seq_length, depth), name="Reshape")(flat)
    # x = layers.RepeatVector(seq_length, name="RepeatVector")(x)
    # x = layers.LSTM(256, return_sequences=True, name="GRU0")(x)
    probs = layers.Dense(depth, activation="softmax", name="Output")(grid)

    return tf.keras.models.Model(inputs=latent_in, outputs=probs)

In [4]:
# Discriminator
def build_discriminator(seq_length, depth):
    """Build the sequence discriminator (real vs. generated).

    Layers: Conv1D(32 filters, kernel 5) -> Flatten -> Dense(512, ReLU) ->
    Dropout(0.3) -> Dense(256, ReLU) -> Dense(1, sigmoid).

    Args:
        seq_length: number of positions in each input sequence.
        depth: size of the per-position vector (one-hot width).

    Returns:
        An uncompiled ``tf.keras.Sequential`` model named ``"discriminator"``
        taking inputs of shape ``(seq_length, depth)`` and emitting one
        sigmoid value per sample.
    """
    model = tf.keras.Sequential(name="discriminator")
    # Declare the input shape explicitly. Previously `seq_length` and `depth`
    # were ignored, so the model stayed unbuilt until its first call and a
    # shape mismatch with the generator output could not be caught here.
    model.add(layers.Input(shape=(seq_length, depth)))
    model.add(layers.Conv1D(32, 5, name="Conv1D"))
    model.add(layers.Flatten(name="Flatten"))
    model.add(layers.Dense(512, activation='relu', name="Dense0"))
    model.add(layers.Dropout(0.3, name="Dropout"))
    model.add(layers.Dense(256, activation='relu', name="Dense1"))
    model.add(layers.Dense(1, activation='sigmoid', name="Output"))
    return model

In [5]:
# GAN
def compile_gan(generator, discriminator):
    """Compile the discriminator and build the stacked generator->discriminator model.

    Side effects: ``discriminator`` is compiled in place (binary cross-entropy,
    Adam, false-positive and false-negative count metrics) and its
    ``trainable`` flag is left set to ``False``.

    Args:
        generator: Keras model mapping a ``(latent_dim,)`` input to sequences.
        discriminator: Keras model scoring sequences with a single sigmoid output.

    Returns:
        The compiled stacked model (binary cross-entropy, Adam with learning
        rate 1e-4) mapping latent noise to the discriminator's score.
    """
    discriminator.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=[tf.keras.metrics.FalsePositives(),
                                   tf.keras.metrics.FalseNegatives()])
    # Freeze the discriminator after compiling it on its own and before the
    # stacked model is compiled.
    discriminator.trainable = False
    # Take the latent size from the generator itself. This used to read a
    # notebook-level `latent_dim` global that is not a parameter of this
    # function and is only assigned in a later cell.
    latent_dim = generator.input_shape[-1]
    gan_input0 = layers.Input(shape=(latent_dim,))
    gan_output = discriminator(generator(gan_input0))
    gan = tf.keras.Model(gan_input0, gan_output)
    gan.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-4))
    return gan

## PreTrain

In [16]:
# `path` must already be defined when this cell runs.
# NOTE(review): the only assignment to `path` visible in this notebook is in the
# later configuration cell and points at the gan_train_data directory; confirm
# which directory actually holds enc_uniprot.npz.
# np.load on an .npz returns a lazy archive object that keeps the file open;
# read the array inside a context manager so the handle is closed afterwards.
with np.load(f"{path}/enc_uniprot.npz") as archive:
    pretrain_sequences = archive['data']

In [7]:
# NOTE(review): hidden-state cell. The saved output below is a 2-D integer array,
# but the only assignment to `pep_decoder` visible in this notebook (a later cell)
# binds a Keras StringLookup layer. On a fresh kernel this cell raises NameError.
print(pep_decoder)

[[21  6  2 ...  0  0  0]
 [21  4 15 ...  0  0  0]
 [21  4 15 ...  0  0  0]
 ...
 [21 11 17 ...  0  0  0]
 [21 18  5 ...  0  0  0]
 [21 12  5 ...  0  0  0]]


In [18]:
# Split the pre-training array into `num_group` contiguous row slices and save
# each slice as group_<i>.npy (np.save appends the .npy suffix).
num_group = 20
out_dir = "/content/DreamWalker/data/processed_data/GAN/gan_train_data"
os.makedirs(out_dir, exist_ok=True)  # np.save does not create missing directories

# Use the real row count instead of the hard-coded 1078141, so the split stays
# correct if the dataset changes.
total = pretrain_sequences.shape[0]
for i in range(num_group):
    # Integer arithmetic gives floor(total * i / num_group) exactly; the float
    # form int(size * i) can lose a row to rounding at a slice boundary.
    start = total * i // num_group
    end = total * (i + 1) // num_group
    np.save(f"{out_dir}/group_{i}", pretrain_sequences[start:end])

In [37]:
def train_gan(generator, discriminator, gan, path, epochs, batch_size, latent_dim, demo_noise, depth=43):
    """Alternate discriminator and generator updates over every file in ``path``.

    For each epoch, every file in ``path`` is loaded with ``np.load``, one-hot
    encoded and consumed in mini-batches. Per batch, the discriminator is
    trained on the real batch (label 1) and on a generated batch (label 0),
    then the stacked ``gan`` model is trained on the same noise with label 1.
    One summary line is printed per file.

    Args:
        generator: Keras model mapping noise of shape ``(batch, latent_dim)``
            to sequences.
        discriminator: compiled Keras model. ``train_on_batch`` is expected to
            return ``[loss, false_positives, false_negatives]`` (indices 1 and
            2 are reported), as produced when compiled by ``compile_gan``.
        gan: compiled stacked model whose ``train_on_batch`` returns a scalar loss.
        path: directory whose files are all loaded as training arrays.
        epochs: number of passes over the directory.
        batch_size: number of real sequences per training step.
        latent_dim: size of the noise vector fed to the generator.
        demo_noise: currently unused; only referenced by the commented-out
            demo block below.
        depth: one-hot depth for the integer-encoded sequences (default 43,
            the value that was previously hard-coded).

    Returns:
        None. Models are updated in place.
    """
    for epoch in range(epochs):
        # os.listdir returns entries in arbitrary order; sort so the training
        # order is reproducible between runs.
        files = sorted(os.listdir(path))
        for file in files:
            # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
            # objects, which can execute code. Only use on trusted files.
            seq = np.load(f"{path}/{file}", allow_pickle=True)
            seq = tf.one_hot(seq.squeeze(), depth=depth)
            num_samples = seq.shape[0]
            if not num_samples:
                # An empty file would make the summary below divide by zero
                # and index into an integer accumulator.
                continue

            total_d_loss = 0
            total_g_loss = 0
            # Count the batches actually trained. int(n / batch_size) ignores
            # the trailing partial batch, which is still trained on below.
            num_batches = 0

            for i in range(0, num_samples, batch_size):
                real_sequences = seq[i:i + batch_size]
                current_batch_size = real_sequences.shape[0]

                # Generate fake sequences from uniform noise in [-1, 1)
                noise = (np.random.rand(current_batch_size, latent_dim)-0.5)*2
                generated_sequences = generator.predict(noise, verbose=0)

                # Labels for real and fake data
                real_labels = np.ones((current_batch_size, 1))
                fake_labels = np.zeros((current_batch_size, 1))

                # Train discriminator
                discriminator.trainable = True
                d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
                d_loss_fake = discriminator.train_on_batch(generated_sequences, fake_labels)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                discriminator.trainable = False

                # Train generator: the stacked model is asked to label fakes as real
                g_loss = gan.train_on_batch(noise, np.ones((current_batch_size, 1)))

                total_d_loss += d_loss
                total_g_loss += g_loss
                num_batches += 1

            # demo_seq = generator(demo_noise)
            # demo_seq = tf.math.argmax(demo_seq, axis=2)
            # demo_seq = pep_decoder(demo_seq).numpy().astype('str')
            # demo_seq = ["".join(chars) for chars in demo_seq]
            # print(demo_seq[0])
            print(f"Epoch {epoch+1}/{epochs}; FP {total_d_loss[1]/num_samples:.4f}; FN {total_d_loss[2]/num_samples:.4f}; G_Loss {total_g_loss/num_batches:.4f}")
            # generator.save(f"drive/MyDrive/MIT687/Generator.keras")
            # discriminator.save(f"drive/MyDrive/MIT687/Discriminator.keras")

In [38]:
# def train_gan(generator, discriminator, gan, path, epochs, batch_size, latent_dim, demo_noise):

#     for epoch in range(epochs):
#       # Load pre-trained sequences
#         pretrain_sequences = np.load(f"{path}/enc_uniprot.npz")['data']
#         seq  = tf.one_hot(pretrain_sequences.squeeze(), depth=43)
#         total_d_loss = 0
#         total_g_loss = 0
#         num_batches = int(np.ceil(pretrain_sequences.shape[0] / batch_size))

#         for i in range(0, seq.shape[0], batch_size):
#                 real_sequences = seq[i:i + batch_size]
#                 current_batch_size = real_sequences.shape[0]

#                   # Generate Fake sequence
#                 noise = (np.random.rand(current_batch_size, latent_dim)-0.5)*2
#                 generated_sequences = generator.predict(noise, verbose=0)

#                 # Labels for real and fake data
#                 real_labels = np.ones((current_batch_size, 1))
#                 fake_labels = np.zeros((current_batch_size, 1))

#                 # Train discriminator
#                 discriminator.trainable = True
#                 d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
#                 d_loss_fake = discriminator.train_on_batch(generated_sequences, fake_labels)
#                 d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
#                 discriminator.trainable = False

#                 # Train generator
#                 g_loss = gan.train_on_batch(noise, np.ones((current_batch_size, 1)))

#                 total_d_loss += d_loss
#                 total_g_loss += g_loss
#                 # print(f"Epoch {epoch+1}/{epochs}; {file}; Batch {i}/{num_batches}; FP {d_loss[1]/current_batch_size:.4f}; FN {d_loss[2]/current_batch_size:.4f}; G_loss {g_loss:.4f}")

#         demo_seq = generator(demo_noise)
#         demo_seq = tf.math.argmax(demo_seq, axis=2)
#         demo_seq = pep_decoder(demo_seq).numpy().astype('str')
#         demo_seq = ["".join(chars) for chars in demo_seq]
#         print(demo_seq[0])
#         print(f"Epoch {epoch+1}/{epochs}; FP {total_d_loss[1]/seq.shape[0]:.4f}; FN {total_d_loss[2]/seq.shape[0]:.4f}; G_Loss {total_g_loss/num_batches:.4f}")
#         del seq

In [39]:
# Model dimensions: noise vector size, sequence length, one-hot depth.
latent_dim, seq_length, depth = 32, 40, 43

# Directory containing the per-group training arrays.
path = "/content/DreamWalker/data/processed_data/GAN/gan_train_data"

# Seed NumPy's global RNG, then draw one fixed noise vector in [-1, 1).
# Both `noise` and `demo_noise` refer to the same array.
np.random.seed(8701)
noise = 2 * (np.random.rand(1, latent_dim) - 0.5)
demo_noise = noise

In [40]:
# Instantiate both networks, then compile them. compile_gan also compiles the
# discriminator in place and returns the stacked model.
generator = build_generator(seq_length=seq_length, depth=depth, latent_dim=latent_dim)
discriminator = build_discriminator(seq_length=seq_length, depth=depth)
gan = compile_gan(generator=generator, discriminator=discriminator)

In [41]:
# Run adversarial training over every file under `path`.
train_gan(
    generator,
    discriminator,
    gan,
    path=path,
    epochs=5,
    batch_size=22,
    latent_dim=latent_dim,
    demo_noise=demo_noise,
)


Epoch 1/5; FP 0.0021; FN 0.0027; G_Loss 9.8022
Epoch 1/5; FP 0.0029; FN 0.0035; G_Loss 9.2211
Epoch 1/5; FP 0.0022; FN 0.0029; G_Loss 11.9018
Epoch 1/5; FP 0.0019; FN 0.0033; G_Loss 10.2826
Epoch 1/5; FP 0.0015; FN 0.0029; G_Loss 10.7482
Epoch 1/5; FP 0.0018; FN 0.0021; G_Loss 17.3736
Epoch 1/5; FP 0.0019; FN 0.0024; G_Loss 15.9485
Epoch 1/5; FP 0.0011; FN 0.0016; G_Loss 21.3331
Epoch 1/5; FP 0.0003; FN 0.0003; G_Loss 50.8056
Epoch 1/5; FP 0.0001; FN 0.0001; G_Loss 42.4288
Epoch 1/5; FP 0.0000; FN 0.0000; G_Loss 123.6645
Epoch 1/5; FP 0.0001; FN 0.0001; G_Loss 95.0598
Epoch 1/5; FP 0.0001; FN 0.0001; G_Loss 135.1748
Epoch 1/5; FP 0.0000; FN 0.0000; G_Loss 74.2955
Epoch 1/5; FP 0.0005; FN 0.0005; G_Loss 120.3222
Epoch 1/5; FP 0.0009; FN 0.0009; G_Loss 47.4669
Epoch 1/5; FP 0.0017; FN 0.0018; G_Loss 33.4660
Epoch 1/5; FP 0.0020; FN 0.0021; G_Loss 20.0430
Epoch 1/5; FP 0.0017; FN 0.0020; G_Loss 18.0887
Epoch 1/5; FP 0.0020; FN 0.0027; G_Loss 15.6475
Epoch 2/5; FP 0.0020; FN 0.0025; G_Loss

In [46]:
# Vocabulary array shipped with the repository checkout.
aa_vocal = np.load("/content/DreamWalker/model_weights/PepTV_vocal.npy")

# Inverse lookup layer: integer indices -> token strings. The first vocabulary
# entry is dropped and the out-of-vocabulary token is the empty string.
# NOTE(review): presumably entry 0 is a padding/OOV placeholder; confirm
# against how PepTV_vocal.npy was produced.
pep_decoder = tf.keras.layers.StringLookup(
    invert=True,
    oov_token='',
    vocabulary=aa_vocal[1:],
)

In [47]:
# After GAN is trained
def generate_sequences(generator, latent_dim, num_sequences):
    """Sample ``num_sequences`` latent vectors and decode the generator output.

    Noise is drawn uniformly from [-1, 1) with NumPy's global RNG, passed
    through ``generator.predict`` and converted to strings by ``onehot2seq``.

    Returns:
        A list of ``num_sequences`` decoded strings.
    """
    latent_batch = 2 * (np.random.rand(num_sequences, latent_dim) - 0.5)
    return onehot2seq(generator.predict(latent_batch, verbose=0))

def onehot2seq(onehot):
    """Turn per-position scores into a list of strings.

    Takes the argmax along axis 2 of ``onehot``, maps the resulting indices to
    tokens with the module-level ``pep_decoder`` and joins each row's tokens.

    Returns:
        A list with one concatenated string per row of ``onehot``.
    """
    token_ids = tf.math.argmax(onehot, axis=2)
    tokens = pep_decoder(token_ids).numpy().astype('str')
    return ["".join(row) for row in tokens]

In [50]:
# Draw a batch of sequences from the trained generator and display them.
num_sequences = 100
generated_seqs = generate_sequences(generator, latent_dim, num_sequences)
generated_seqs

['MCLSIIQGGKGSR',
 'MSTDSRAVYRVPGAAGWIAAQSWGQQS',
 'GPVLLPESNRDCDS',
 'MWTKLSEVNPSK',
 'MRIKKERSRKRHDYGHKR',
 'CAVPAGKVVGIVD',
 'ALSSFSKNLKS',
 'ALSYIVGNFVDIL',
 'FWVKKWTFNRSP',
 'CVKGVLPFVWPPGNDGWDAASAWEQ',
 'MDTKIEGRFVEQG',
 'MLLLYVSGTKCFGCGV',
 'CQQGEDPVLAKEEELGVRGLEGWDTPQPAQPTE',
 'GLVYFLLFLVIYPLAVL',
 'MDHSLPPAFAGVRRLW',
 'GGVKESCVNISD',
 'ADPACSKVLRR',
 'AYSARLKGLMIMNPLLPSLEQPRVQ',
 'GKVIEAKFLAAK',
 'MHRRKWKRRCF',
 'CPGCKKKDEGKPDEEAERSRWP',
 'AREIRGMWISKPKINPAGFRRRMDR',
 'MLRHRVQGCSCSADVT',
 'MEWRHDPSDKAVTDYGTGS',
 'MPVASVIKRRVPVALAVVGAGAWTGASPFQPT',
 'MAAFISGLYKD',
 'LPWLDGLDSLVVTWCAPMAAGAG',
 'MLPMYVSGPVISTSDWD',
 'CLSIVCCAPALERSGVVCGGAP',
 'GYWAVAKVNCSL',
 'MLPPAVPACLGR',
 'MNLLINGLPKDYGVSQVIKT',
 'MLPKRAFRGRNR',
 'CFKLASNVKIVRLVDVV',
 'MLTEQVPCPAGV',
 'MHQRRKKVRRC',
 'GPVPAWKVNRR',
 'CLVILVESHADETEEREEMQFKPPVM',
 'MNDMIVEQEIVQDNWV',
 'CEYAPGPKLTLDVQLLVRTKS',
 'GPMGARERPRVRP',
 'CPVKASIVNRVP',
 'CNLQVEGLVVSYGESWVYKS',
 'MLWRAGSRPRRRR',
 'MLLPPNPPPVCGR',
 'MRAKIITRIRNMNNISYLVII

In [51]:
# Persist both trained networks inside the repository checkout.
generator.save(
    filepath="/content/DreamWalker/model_weights/GANWeights/PeptideGenerator.keras",
)
discriminator.save(
    filepath="/content/DreamWalker/model_weights/GANWeights/PeptideDiscriminator.keras",
)