<a href="https://colab.research.google.com/github/EarlLem/462-GAN/blob/main/Neural_architecture.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Первая рабочая версия

## Импорты

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
import numpy as np
from scipy.stats import binom, uniform
import random

## Генератор ридов

In [None]:
def get_rand_ind(left: int, right: int, count: int) -> list:
    """Return ``count`` distinct random integers drawn from ``range(left, right)``.

    Args:
        left: Inclusive lower bound of the index range.
        right: Exclusive upper bound of the index range.
        count: Number of distinct indices to draw; values <= 0 yield ``[]``.

    Returns:
        A list of ``count`` unique indices in unspecified order.

    Raises:
        ValueError: If ``count`` is larger than the number of available
            indices. The earlier rejection-sampling loop never terminated
            in that situation.
    """
    count = int(count)  # accept NumPy integer scalars such as binom.rvs results
    if count <= 0:
        return []
    return random.sample(range(left, right), count)


## Генерация датасета

In [None]:
# Every read has the same fixed length.
read_length = 60
# Bounds of the interval the error rate is drawn from.
a = 0.001
b = 0.05
# Number of samples in the training set and in the test set.
train_size, test_size = 1000, 500
# Number of reads in one sample.
N_reads = 1000

In [None]:
# Random 0/1 class label per sample: round a uniform draw from [0, 1).
train_y = np.rint(np.random.uniform(low=0.0, high=1.0, size=train_size)).astype('int32')
test_y = np.rint(np.random.uniform(low=0.0, high=1.0, size=test_size)).astype('int32')

In [None]:
def dataset_generator(dataset_identifier):
  """Simulate one read-error matrix per class label.

  Every sample is an ``(N_reads, read_length)`` matrix in which 1 marks an
  erroneous position and 0 a correct one. A per-sample error rate ``err`` is
  drawn log-uniformly from ``[a, b]``.

  * Label 0: every position is an error with probability ``err``.
  * Label 1: position ``i`` (1-based) is an error with probability
    ``corr / i``, where ``corr = read_length * err / sum(1/i)`` rescales the
    1/i profile so the expected number of errors per read is also
    ``read_length * err``.

  Sampling is vectorised with NumPy's global RNG instead of nested Python
  loops. For label 0, drawing ``k ~ Binomial(read_length, err)`` and then
  ``k`` distinct uniform positions is the same distribution as independent
  Bernoulli(``err``) positions, so the statistics of the data are unchanged.

  Args:
    dataset_identifier: 1-D NumPy array of class labels (0 or 1).

  Returns:
    Float array of shape ``(len(dataset_identifier), N_reads, read_length)``.

  Raises:
    ValueError: If a label is neither 0 nor 1.
  """
  n_samples = dataset_identifier.shape[0]
  dataset = np.empty((n_samples, N_reads, read_length))
  # The 1/i position profile and its sum do not depend on the sample.
  profile = 1.0 / np.arange(1, read_length + 1)
  sum_rl = profile.sum()
  for idx in range(n_samples):
    label = dataset_identifier[idx]
    err = np.exp(np.random.uniform(np.log(a), np.log(b)))
    if label == 0:
      probs = err
    elif label == 1:
      corr = read_length * err / sum_rl
      probs = profile * corr
    else:
      raise ValueError(f"unknown dataset label {label!r} at index {idx}; expected 0 or 1")
    # A position is an error when its uniform draw falls below its probability.
    dataset[idx] = np.random.random((N_reads, read_length)) < probs
  return dataset

In [None]:
# Build the training and test matrices from their labels.
train_x, test_x = dataset_generator(train_y), dataset_generator(test_y)
# Sanity check that the shapes match the parameters. Generation is slow,
# so try a small N_reads first.
print(train_x.shape, train_x[0].shape, train_x[0][0].shape)

In [None]:
# Split the training array into mini-batches.
BATCH_SIZE = 50
reads_tensor = tf.convert_to_tensor(train_x)
train_dataset = tf.data.Dataset.from_tensor_slices(reads_tensor)
train_dataset = train_dataset.batch(BATCH_SIZE)

## Инициализация и обучение модели

In [None]:
# Averages a tensor along axis 1.
def meanlayer(tensors):
  """Return the mean of ``tensors`` taken along axis 1."""
  return tf.reduce_mean(tensors, axis=1)

In [None]:
model = Sequential()
# Convolution with kernel size 1 that extracts a feature vector.
# The filter count must be an integer, so use floor division
# (read_length/4 is a float).
model.add(tf.keras.layers.Conv1D(read_length // 4, 1, activation='relu', input_shape=(N_reads, read_length)))
# Layer that averages the feature vectors (mean over axis 1).
model.add(tf.keras.layers.Lambda(meanlayer))
# Dense hidden layers with read_length units each.
model.add(tf.keras.layers.Dense(read_length, activation='relu'))
model.add(tf.keras.layers.Dense(read_length, activation='relu'))
# Output layer: two error types, hence two units.
model.add(tf.keras.layers.Dense(2, activation=tf.nn.softmax))
# Compile.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train.
model.fit(train_x, train_y, epochs=20)

In [None]:
# Evaluate on the test dataset.
test_metrics = model.evaluate(test_x, test_y)
val_loss, val_acc = test_metrics
print(val_loss, val_acc)

In [None]:
from tensorflow import keras

# Baseline classifier: flatten each read matrix and feed it to a small dense network.
model = keras.Sequential([
    keras.layers.Flatten(input_shape=(N_reads, read_length)),
    keras.layers.Dense(45, activation=tf.nn.relu),
    keras.layers.Dense(20, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(train_x, train_y, epochs=50)

# Evaluate on the test split generated earlier (test_x / test_y);
# X_test / y_test are not defined at this point.
test_loss, test_acc = model.evaluate(test_x, test_y)
print('Test accuracy:', test_acc)

# GAN try 1

In [None]:
# Label every sample 1 and regenerate the training matrices for the GAN.
train_y = np.ones(train_size)
train_x = dataset_generator(train_y)
# Print the shapes of the dataset, of one sample and of one read.
print(train_x.shape, train_x[0].shape, train_x[0][0].shape)

In [None]:
# Re-batch the regenerated training matrices.
BATCH_SIZE = 50
reads_tensor = tf.convert_to_tensor(train_x)
train_dataset = tf.data.Dataset.from_tensor_slices(reads_tensor)
train_dataset = train_dataset.batch(BATCH_SIZE)

In [None]:
def make_discriminator_model():
  """Build the uncompiled discriminator network.

  Layers, in order: Conv1D (15 filters, kernel 1, ReLU) on inputs of shape
  ``(N_reads, read_length)``, the ``meanlayer`` Lambda, two ReLU Dense
  layers of ``read_length`` units, and a single sigmoid output unit.
  """
  layers = tf.keras.layers
  return Sequential([
      # Convolution that extracts a feature vector.
      layers.Conv1D(15, 1, activation='relu', input_shape=(N_reads, read_length)),
      # Averages the feature vectors.
      layers.Lambda(meanlayer),
      # Dense hidden layers with read_length units each.
      layers.Dense(read_length, activation='relu'),
      layers.Dense(read_length, activation='relu'),
      # Output layer.
      layers.Dense(1, activation='sigmoid'),
  ])

In [None]:
# Instantiate the discriminator network.
discriminator = make_discriminator_model()

In [None]:
def make_generator_model():
  """Build the generator: noise of length ``N_reads`` -> matrix ``(N_reads, read_length)``.

  The reshape size and the number of output channels are derived from
  ``N_reads`` and ``read_length`` rather than the literals 1000 and 60, so
  the model stays consistent when those parameters change.
  """
  model = Sequential()
  model.add(tf.keras.layers.Dense(N_reads, use_bias=False, input_shape=(N_reads, )))
  model.add(tf.keras.layers.LeakyReLU())
  # One channel per read so the transposed convolutions can widen it.
  model.add(tf.keras.layers.Reshape((N_reads, 1)))
  model.add(tf.keras.layers.Conv1DTranspose(12, 1, padding="same"))
  # Sigmoid keeps every generated value in (0, 1).
  model.add(tf.keras.layers.Conv1DTranspose(read_length, 1, padding="same", activation="sigmoid"))
  return model

In [None]:
# Instantiate the generator network.
generator = make_generator_model()

In [None]:
# The discriminator ends in a sigmoid unit, so its outputs are probabilities,
# not logits. from_logits must therefore be False; True would apply a second
# sigmoid inside the loss.
cross_entropy = tf.losses.BinaryCrossentropy(from_logits=False)

def discriminator_loss(real_output, fake_output):
  """Return the discriminator loss.

  Sums the cross-entropy of ``real_output`` against all-ones targets and of
  ``fake_output`` against all-zeros targets.
  """
  loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
  loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
  return loss_on_real + loss_on_fake

def generator_loss(fake_output):
  """Return the cross-entropy of ``fake_output`` against all-ones targets."""
  wanted = tf.ones_like(fake_output)
  return cross_entropy(wanted, fake_output)

In [None]:
# Separate Adam optimizers (learning rate 1e-4) for the generator and the discriminator.
generator_optimizer = tf.keras.optimizers.Adam(1e-4)
discriminator_optimizer = tf.keras.optimizers.Adam(1e-4)

In [None]:
import time
# Number of training epochs.
EPOCHS = 50
# Length of the generator's noise vector (set equal to the number of reads).
noise_dim = N_reads

In [None]:
def train_step(matrices):
    """Run one adversarial update on a batch of real matrices.

    Draws ``BATCH_SIZE`` normal noise vectors, generates fake matrices,
    scores real and fake batches with the discriminator, applies one
    optimizer step to each network and prints both losses.

    Args:
        matrices: Batch of real samples fed to the discriminator.
    """
    latent = tf.random.normal([BATCH_SIZE, noise_dim])

    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fakes = generator(latent, training=True)
        # NOTE(review): the one-argument histogram() is only defined further
        # down in this file; confirm it is in scope before training starts.
        histogram(fakes)

        real_output = discriminator(matrices, training=True)
        fake_output = discriminator(fakes, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)

    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables
    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

    print("generator loss: ", np.mean(gen_loss))
    print("discriminator loss: ", np.mean(disc_loss))

In [None]:
def train(dataset, epochs):
  """Call ``train_step`` on every batch of ``dataset`` for ``epochs`` epochs.

  After each epoch the elapsed wall-clock time is printed.
  """
  for epoch_number in range(1, epochs + 1):
    started_at = time.time()

    for batch in dataset:
      train_step(batch)

    print ('Time for epoch {} is {} sec'.format(epoch_number, time.time()-started_at))

In [None]:
# Run adversarial training over the batched training dataset.
train(train_dataset, EPOCHS)

# GAN try Тая

In [None]:
def dataset_generator(dataset_identifier):
  """Simulate one read-error matrix per class label.

  Every sample is an ``(N_reads, read_length)`` matrix. A per-sample error
  rate ``err`` is drawn log-uniformly from ``[a, b]``.

  * Label 0: every position is 1 (error) with probability ``err``, else 0.
  * Label 1: position ``i`` (1-based) is 1 with probability ``corr / i``,
    where ``corr = read_length * err / sum(1/i)``; other positions are 0.1.

  Sampling is vectorised with NumPy's global RNG instead of nested Python
  loops. For label 0, drawing ``k ~ Binomial(read_length, err)`` and then
  ``k`` distinct uniform positions is the same distribution as independent
  Bernoulli(``err``) positions, so the statistics of the data are unchanged.

  Args:
    dataset_identifier: 1-D NumPy array of class labels (0 or 1).

  Returns:
    Float array of shape ``(len(dataset_identifier), N_reads, read_length)``.

  Raises:
    ValueError: If a label is neither 0 nor 1.
  """
  n_samples = dataset_identifier.shape[0]
  dataset = np.empty((n_samples, N_reads, read_length))
  # The 1/i position profile and its sum do not depend on the sample.
  profile = 1.0 / np.arange(1, read_length + 1)
  sum_rl = profile.sum()
  for idx in range(n_samples):
    label = dataset_identifier[idx]
    err = np.exp(np.random.uniform(np.log(a), np.log(b)))
    draws = np.random.random((N_reads, read_length))
    if label == 0:
      dataset[idx] = draws < err
    elif label == 1:
      corr = read_length * err / sum_rl
      # NOTE(review): non-error positions are filled with 0.1 here, whereas
      # the earlier definition of this function uses 0. Kept as is; confirm
      # that this is intended.
      dataset[idx] = np.where(draws < profile * corr, 1.0, 0.1)
    else:
      raise ValueError(f"unknown dataset label {label!r} at index {idx}; expected 0 or 1")
  return dataset

In [None]:
def get_rand_ind(left: int, right: int, count: int) -> list:
    """Return ``count`` distinct random integers drawn from ``range(left, right)``.

    Args:
        left: Inclusive lower bound of the index range.
        right: Exclusive upper bound of the index range.
        count: Number of distinct indices to draw; values <= 0 yield ``[]``.

    Returns:
        A list of ``count`` unique indices in unspecified order.

    Raises:
        ValueError: If ``count`` is larger than the number of available
            indices. The earlier rejection-sampling loop never terminated
            in that situation.
    """
    count = int(count)  # accept NumPy integer scalars such as binom.rvs results
    if count <= 0:
        return []
    return random.sample(range(left, right), count)

In [None]:
# Averages a tensor along axis 1.
def meanlayer(tensors):
  """Return the mean of ``tensors`` taken along axis 1."""
  return tf.reduce_mean(tensors, axis=1)

In [None]:
# Every read has the same fixed length.
read_length = 60
# Bounds of the interval the error rate is drawn from.
a = 0.001
b = 0.05
# Number of samples in the training set and in the test set.
train_size, test_size = 1000, 50
# Number of reads in one sample.
N_reads = 1000

In [None]:
import matplotlib.pyplot as plt
def histogram(data, size):
  """Plot the sum of ``data`` over its first two axes against read position.

  ``size`` is accepted but not used; the normalisation by
  ``N_reads * size`` is currently disabled.
  """
  positions = np.arange(read_length)
  totals = np.sum(data, axis=(1, 0))
  plt.plot(positions, totals, "o")
  plt.show()

In [None]:
# Disabled alternative: random labels drawn from [0.6, 1) and rounded.
# y_train = np.random.uniform(low=0.6, high=1, size=train_size)
# y_train = y_train.round().astype('int32')
# Label every sample 1 and generate the training matrices.
y_train = np.ones(train_size)
X_train = dataset_generator(y_train)

In [None]:
# Notebook display: shape of the generated training set.
X_train.shape

In [None]:
# Notebook display: the first sample matrix.
X_train[0]

In [None]:
# Plot the per-position sums over the whole training set.
histogram(X_train, train_size)

In [None]:
# Mini-batch size passed to train().
batch_size = 100
# Number of training epochs (read by train()).
epoches = 200
# Divisor of read_length that sets the last dimension of the generator's noise input.
const = 4

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import UpSampling2D
from tensorflow.keras.layers import Lambda
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Conv2D, Conv1DTranspose, Conv1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.optimizers import SGD

import numpy as np

import math


def generator_model():
    """Build the generator network.

    Input is noise of shape ``(N_reads, int(read_length / const))``; a Dense
    layer acts on the last axis and a transposed convolution (kernel 3,
    'same' padding) widens it to ``read_length`` channels, giving an output
    of shape ``(N_reads, read_length)``.
    """
    latent_width = int(read_length / const)
    model = Sequential()
    # The input shape is declared on the first layer. It used to be passed to
    # the Conv1DTranspose layer further down, where Sequential does not use
    # it to define the model input.
    model.add(Dense(latent_width, input_shape=(N_reads, latent_width)))
    model.add(Activation('relu'))
    model.add(Conv1DTranspose(read_length, kernel_size=3, padding='same'))
    model.add(Activation('relu'))

    return model

def discriminator_model():
    """Build the uncompiled discriminator.

    Layers, in order: Conv1D (15 filters, kernel 3, ReLU), the ``meanlayer``
    Lambda, two ReLU Dense layers of ``read_length`` units, and a single
    sigmoid output unit. No input shape is declared.
    """
    return Sequential([
        Conv1D(15, 3, activation='relu'),
        Lambda(meanlayer),
        Dense(read_length, activation='relu'),
        Dense(read_length, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])

def generator_containing_discriminator(generator, discriminator):
    """Stack ``generator`` and ``discriminator`` into one Sequential model.

    Side effect: sets ``discriminator.trainable`` to False.
    """
    discriminator.trainable = False
    return Sequential([generator, discriminator])

In [None]:
def train(BATCH_SIZE):
    """Train a fresh generator/discriminator pair on the global ``X_train``.

    Runs ``epoches`` epochs. For every mini-batch the generator output is
    plotted with ``histogram``, the discriminator is trained on real samples
    (target 1) plus generated samples (target 0), and the generator is then
    trained through the frozen discriminator towards target 1. Weights are
    saved on every tenth batch.

    Args:
        BATCH_SIZE: Number of real (and of generated) samples per step.
    """
    discriminator = discriminator_model()
    generator = generator_model()
    discriminator_on_generator = \
        generator_containing_discriminator(generator, discriminator)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    d_optim = tf.keras.optimizers.Adam(learning_rate=0.0003, beta_1=0.5)
    g_optim = tf.keras.optimizers.Adam(learning_rate=0.0003, beta_1=0.5)
    generator.compile(loss='binary_crossentropy', optimizer='adam')
    discriminator_on_generator.compile(
        loss='binary_crossentropy', optimizer=g_optim)
    discriminator.trainable = True
    # The discriminator has one sigmoid output and 0/1 targets, which calls
    # for binary cross-entropy; the sparse categorical loss expects one
    # output unit per class.
    discriminator.compile(loss='binary_crossentropy', optimizer=d_optim)

    n_batches = int(X_train.shape[0] / BATCH_SIZE)
    # Both noise draws use the BATCH_SIZE argument; the second one used to
    # read the global `batch_size`, which may differ from it.
    noise_shape = (BATCH_SIZE, N_reads, int(read_length / const))
    # Targets: real samples first (1), generated samples second (0).
    d_targets = np.array([1] * BATCH_SIZE + [0] * BATCH_SIZE).reshape(2*BATCH_SIZE, 1)
    g_targets = np.array([1] * BATCH_SIZE)

    for epoch in range(epoches):
        print("Epoch is", epoch)
        print("Number of batches", n_batches)
        for index in range(n_batches):
            noise = np.random.uniform(0, 1, size=noise_shape)
            data_batch = X_train[index*BATCH_SIZE:(index+1)*BATCH_SIZE]
            generated_data = generator.predict(noise, verbose=0)
            histogram(generated_data, BATCH_SIZE)  # plot the generated batch
            X = np.concatenate((data_batch, generated_data))
            d_loss = discriminator.train_on_batch(X, d_targets)
            print("batch %d d_loss : %f" % (index, d_loss))

            noise = np.random.uniform(0, 1, size=noise_shape)
            # Freeze the discriminator while the generator is updated.
            discriminator.trainable = False
            g_loss = discriminator_on_generator.train_on_batch(noise, g_targets)
            discriminator.trainable = True
            print("batch %d g_loss : %f" % (index, g_loss))
            if index % 10 == 9:
                generator.save_weights('generator', True)
                discriminator.save_weights('discriminator', True)

In [None]:
# Start GAN training with the configured mini-batch size.
train(BATCH_SIZE=batch_size)

# Ган by Лидия

In [None]:
def define_generator():
  """Build the generator: input ``(N_reads, read_length)`` -> output of the same shape.

  Changes from the earlier version:
    * ``read_length // 4`` gives the Dense layer an integer unit count
      (``read_length/4`` is a float).
    * The Lambda no longer declares ``output_shape=(N_reads, 1)``, which did
      not match what ``meanlayer`` returns for the preceding Conv1D output.
    * The output size comes from ``N_reads`` and ``read_length``, the same
      values used for the input shape, instead of the global ``train_x``.
  """
  sample_shape = (N_reads, read_length)
  model = Sequential()
  model.add(tf.keras.layers.Conv1D(15, 1, activation='relu', input_shape=sample_shape))
  model.add(tf.keras.layers.Lambda(meanlayer))
  model.add(tf.keras.layers.Dense(2))
  model.add(tf.keras.layers.Dense(read_length, activation = 'relu'))
  model.add(tf.keras.layers.BatchNormalization(momentum=0.8))
  model.add(tf.keras.layers.Dense(read_length // 4))
  # One tanh output per matrix entry, reshaped back into a sample.
  model.add(tf.keras.layers.Dense(N_reads * read_length, activation='tanh'))
  model.add(tf.keras.layers.Reshape(sample_shape))
  return model

def define_gan(g_model, d_model):
  """Stack generator and discriminator into one compiled model for generator updates.

  Side effect: sets ``d_model.trainable`` to False.

  Args:
    g_model: Generator model.
    d_model: Discriminator model.

  Returns:
    Sequential model ``g_model -> d_model`` compiled with binary
    cross-entropy and Adam (learning rate 0.0002, beta_1 0.5).
  """
  # make weights in the discriminator not trainable
  d_model.trainable = False
  # connect generator and discriminator
  model = Sequential()
  model.add(g_model)
  model.add(d_model)
  # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
  opt = tf.keras.optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
  model.compile(loss='binary_crossentropy', optimizer=opt)
  return model

# Draw random integer latent points used as generator input.
def generate_latent_points(latent_dim, n_samples, n):
  """Return random integers in ``[0, 60)`` with shape ``(n_samples, n, latent_dim)``."""
  batch_shape = (n_samples, n, latent_dim)
  return np.random.randint(60, size=batch_shape)

def generate_fake_samples(g_model, latent_dim, n_samples, n):
  """Generate ``n_samples`` fake samples together with their labels.

  Returns:
    Tuple ``(X, y)``: ``X`` is ``g_model.predict`` applied to fresh latent
    points, ``y`` is an ``(n_samples, 1)`` array of zeros (the 'fake' class).
  """
  latent = generate_latent_points(latent_dim, n_samples, n)
  generated = g_model.predict(latent)
  return generated, np.zeros((n_samples, 1))

def histogram(data):
  """Plot the sum of ``data`` over its first two axes against positions 1..read_length."""
  positions = np.arange(1, read_length + 1)
  totals = np.sum(data, axis=(0, 1))
  plt.plot(positions, totals, "o")
  plt.show()

def summarize_performance(epoch, g_model, d_model, X, Y):
  """Print discriminator accuracy on real and fake data and save the generator.

  Args:
    epoch: Zero-based epoch index; the saved file name uses ``epoch + 1``.
    g_model: Generator model; saved to ``generator_model_NNN.h5``.
    d_model: Discriminator model whose ``evaluate`` returns (loss, accuracy).
    X: Real samples.
    Y: Labels of the real samples.
  """
  # accuracy on real examples
  _loss_real, acc_real = d_model.evaluate(X, Y, verbose=0)
  # accuracy on freshly generated fake examples
  fake_samples, fake_labels = generate_fake_samples(g_model, read_length, N_reads, N_reads)
  _loss_fake, acc_fake = d_model.evaluate(fake_samples, fake_labels, verbose=0)
  report = '>Accuracy real: %.0f%%, fake: %.0f%%' % (acc_real*100, acc_fake*100)
  print(report)
  # save the generator model to a file
  g_model.save('generator_model_%03d.h5' % (epoch+1))

# train the generator and discriminator
def train(g_model, d_model, gan_model, train_x, train_y, n_epochs=100, n_batch=50):
  """Alternate discriminator and generator updates over mini-batches.

  Each step trains the discriminator on the j-th mini-batch of real data and
  on an equally sized batch of rounded generator output, then trains the
  generator through ``gan_model`` towards the 'real' label. Previously every
  step used the whole training set, and fake batches were sized by
  ``N_reads`` rather than ``n_batch``. Every tenth epoch the accuracies are
  reported and the last fake batch is plotted.

  Args:
    g_model: Generator model.
    d_model: Compiled discriminator whose ``train_on_batch`` returns
      (loss, accuracy).
    gan_model: Compiled generator-plus-discriminator model.
    train_x: Real samples.
    train_y: Labels of the real samples.
    n_epochs: Number of epochs.
    n_batch: Mini-batch size for both real and fake samples.
  """
  bat_per_epo = int(train_x.shape[0] / n_batch)
  X_fake = None
  # manually enumerate epochs
  for i in range(n_epochs):
    # enumerate batches over the training set
    for j in range(bat_per_epo):
      batch = slice(j * n_batch, (j + 1) * n_batch)
      # update the discriminator on the current real mini-batch
      d_loss1, _ = d_model.train_on_batch(train_x[batch], train_y[batch])
      # generate an equally sized batch of 'fake' examples
      X_fake, y_fake = generate_fake_samples(g_model, read_length, n_batch, N_reads)
      X_fake = tf.math.round(X_fake)
      # update the discriminator on the fake batch
      d_loss2, _ = d_model.train_on_batch(X_fake, y_fake)
      # prepare points in latent space as input for the generator
      X_gan = generate_latent_points(read_length, n_batch, N_reads)
      # inverted labels: the generator is rewarded when fakes are judged real
      y_gan = np.ones((n_batch, 1))
      # update the generator via the discriminator's error
      g_loss = gan_model.train_on_batch(X_gan, y_gan)
    # evaluate the model performance every tenth epoch
    if (i+1) % 10 == 0:
      summarize_performance(i, g_model, d_model, train_x, train_y)
      # X_fake stays None when the data set is smaller than one batch
      if X_fake is not None:
        histogram(X_fake)

def define_discriminator():
  """Build and compile the discriminator.

  Layers, in order: Conv1D (15 filters, kernel 1, ReLU) on inputs of shape
  ``(N_reads, read_length)``, the ``meanlayer`` Lambda, two ReLU Dense
  layers of ``read_length`` units, and a single sigmoid output unit.
  Compiled with binary cross-entropy, Adam and the accuracy metric.
  """
  layers = tf.keras.layers
  model = Sequential([
      # Convolution that extracts a feature vector.
      layers.Conv1D(15, 1, activation='relu', input_shape=(N_reads, read_length)),
      # Averages the feature vectors.
      layers.Lambda(meanlayer),
      # Dense hidden layers with read_length units each.
      layers.Dense(read_length, activation='relu'),
      layers.Dense(read_length, activation='relu'),
      # Output layer.
      layers.Dense(1, activation='sigmoid'),
  ])
  # Compile.
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
# build the discriminator and the generator
d_model = define_discriminator()
g_model = define_generator()
# create the gan; the discriminator built above is d_model
# (`descriminator` was an undefined name)
gan_model = define_gan(g_model, d_model)
# train model
train(g_model, d_model, gan_model, train_x, train_y)