In [1]:
import numpy as np
import pandas as pd
import scanpy.api as sc
import anndata



import os
import scipy
from scipy import sparse
import sys

%matplotlib inline 
import matplotlib.pyplot as plt
import datetime as datetime

import tensorflow as tf
import keras
import tensorboard
from keras.layers import Dense, Flatten, Reshape, BatchNormalization
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adam
from keras.optimizers import RMSprop



from sklearn.preprocessing import MinMaxScaler

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from umap import UMAP
import seaborn as sns 

# Global scanpy configuration for this notebook.
sc.settings.verbosity = 3 
sc.settings.set_figure_params(dpi=80)  # low dpi (dots per inch) yields small inline figures
# Print package version information so the run environment is recorded.
sc.logging.print_versions()




KeyboardInterrupt: 

# Open TensorBoard and set the log directory

In [None]:
# Directory that receives the TensorBoard event files.
log_dir = "./tmp/logs/"
# Writer used by train() to log d_loss, g_loss and accuracy at every iteration.
summary_writer = tf.summary.create_file_writer(logdir=log_dir)
# NOTE(review): the path passed to %tensorboard below duplicates log_dir;
# keep the two in sync when changing either one.
%load_ext tensorboard
%tensorboard --logdir ./tmp/logs/

# Read in the already preprocessed input data

In [None]:
# Scale every gene (column) to [-1, 1] so that real cells live in the same
# range as the generator's tanh output.
scaler = MinMaxScaler(feature_range=(-1, 1))

# Labelled copy of the matrix. After the transpose its columns are reused as
# column labels when generated cells are turned back into a DataFrame.
exp_mat = pd.read_csv("Gan_input_3_mg.csv", sep='\t', index_col = 0)
exp_mat = exp_mat.T

# Purely numeric copy of the same file (header line skipped), transposed the
# same way as exp_mat.
input_matrix = np.genfromtxt('Gan_input_3_mg.csv', skip_header=1)
input_matrix = input_matrix.T

# Drop the first row of the transposed matrix: it comes from the file's first
# column, which pandas treats as the index column above.
input_matrix = np.delete(input_matrix, (0), axis=0)

input_matrix = scaler.fit_transform(input_matrix)

# Hold out a validation set of distinct cells (rows).
# The previous code drew indices with np.random.randint bounded by the number
# of columns and with replacement, so the hold-out could contain duplicates,
# fewer rows than intended were removed from the training data, and indices
# could fall outside the valid row range.
n_validation = 500
validation_idx = np.random.choice(input_matrix.shape[0], size=n_validation, replace=False)
validation = input_matrix[validation_idx]
input_matrix = np.delete(input_matrix, validation_idx, 0)






In [None]:
# Sanity check: shape of the training matrix after the validation rows were removed.
input_matrix.shape

In [None]:
# Dimensions shared by the generator and the discriminator.
z_dim = 100                          # length of the latent noise vector
cell = np.shape(input_matrix[1])     # shape of a single cell (one row of the matrix)
out_dim = cell[0]                    # number of values per cell

In [None]:
# Peek at one scaled training cell (row 1 of the matrix).
input_matrix[1]

In [None]:
# Generator as a function. Returns the generator network model.

def build_generator(cell, z_dim):
  """Build the generator network that maps a latent vector to one synthetic cell.

  Args:
    cell: shape tuple of a single cell; it sizes the output layer and is
      passed to the final Reshape layer.
    z_dim: length of the latent noise vector fed to the first Dense layer.

  Returns:
    An uncompiled keras Sequential model whose output has shape `cell` and
    tanh-bounded values.
  """
  # Derive the output width from the `cell` argument instead of relying on a
  # module-level `out_dim`, so the function works for whatever shape it is given.
  n_outputs = int(np.prod(cell))

  model = Sequential()

  # First fully connected layer: 128 units fed by the latent vector.
  model.add(Dense(128, input_dim=z_dim))
  model.add(LeakyReLU(alpha=0.2))
  model.add(BatchNormalization())

  # Second fully connected layer: 200 units.
  model.add(Dense(200))
  model.add(LeakyReLU(alpha=0.2))
  model.add(BatchNormalization())

  # Output layer: one tanh unit per value of a cell.
  model.add(Dense(n_outputs, activation='tanh'))

  model.add(Reshape(cell))

  return model

In [None]:
# Defining the Discriminator network

def build_discriminator(cell):
  """Build the discriminator: a cell of shape `cell` in, one sigmoid score out.

  Returns an uncompiled keras Sequential model.
  """
  # Layers are listed in the exact order in which they are stacked.
  stacked_layers = [
      Dense(250, input_shape=cell),
      BatchNormalization(),
      LeakyReLU(alpha=0.2),
      Dense(250),
      LeakyReLU(alpha=0.2),
      BatchNormalization(),
      Dense(1, activation='sigmoid'),
  ]
  return Sequential(stacked_layers)

In [None]:
# Building the GAN

def build_gan(generator, discrimator):
  """Chain the generator and the discriminator into one Sequential model.

  The returned model is uncompiled; its input is the generator's input and
  its output is the discriminator's output.
  """
  return Sequential([generator, discrimator])

# Build and compile the discriminator on its own first; this standalone model
# is the one train() updates through discrimator.train_on_batch.
discrimator = build_discriminator(cell)
discrimator.compile(loss = 'binary_crossentropy',
                    optimizer = Adam(),
                    metrics = ['accuracy'])

generator = build_generator(cell, z_dim)
# Freeze the discriminator before it is embedded in the combined model.
# NOTE(review): the intent appears to be that gan.train_on_batch only updates
# the generator; this relies on Keras reading `trainable` at compile time, so
# the order of these statements matters -- confirm for the installed version.
discrimator.trainable = False

# Combined generator -> discriminator model, compiled with a loss only (no metrics).
gan = build_gan(generator, discrimator)
gan.compile(loss='binary_crossentropy', optimizer=Adam())


In [None]:
def generate_cells(number_cells):
    """Generate synthetic cells with the current generator network.

    Args:
        number_cells: how many cells to generate; must be at least 1.

    Returns:
        pandas DataFrame with one row per generated cell, indexed
        "cell-1" ... "cell-<number_cells>", with the columns of `exp_mat`.
        Values are mapped back from the (-1, 1) MinMax range to the original
        scale with `scaler.inverse_transform`.

    Raises:
        ValueError: if number_cells is smaller than 1.
    """
    # The original body ignored this parameter and read a global `num_cells`.
    if number_cells < 1:
        raise ValueError("number_cells must be at least 1, got {}".format(number_cells))

    # One latent vector per requested cell, drawn from a standard normal.
    z = np.random.normal(0, 1, (number_cells, z_dim))

    # A single batched forward pass replaces the former one-cell-at-a-time
    # loop and its repeated np.append copies.
    gen_scaled = generator.predict(z)

    # Undo the MinMax scaling to (-1, 1) that was needed for the tanh output.
    gen_expr = scaler.inverse_transform(gen_scaled)

    cell_names = ["cell-{}".format(k + 1) for k in range(number_cells)]

    # Final frame: generated cells as rows, labelled with exp_mat's columns.
    return pd.DataFrame(data=gen_expr, columns=exp_mat.columns, index=cell_names)


In [None]:
# Module-level containers that train() fills while it runs.
# NOTE(review): `losses` is never written to by the visible code.
losses = []
accuracies = []             # discriminator accuracy (in %) at every sample_interval
iteration_checkpoints = []  # iteration numbers matching `accuracies`
Generated_cells = []        # DataFrames returned by generate_cells()
sample_checkpoints = []     # iteration numbers matching `Generated_cells`

d_hist = []                 # discriminator loss at every iteration
g_loss_hist = []            # generator loss at every iteration

def train(iterations, batch_size, sample_interval, generate_cells_every, num_cells):
  """Alternate discriminator and generator updates for `iterations` steps.

  Args:
    iterations: number of training iterations.
    batch_size: number of real and of generated cells per update.
    sample_interval: every this many iterations, record the losses and the
      accuracy and print a progress line.
    generate_cells_every: every this many iterations, store a snapshot of
      generated cells.
    num_cells: number of cells requested for each stored snapshot.

  Side effects:
    Appends to the module-level lists d_hist, g_loss_hist, losses, accuracies,
    iteration_checkpoints, Generated_cells and sample_checkpoints, and writes
    d_loss / g_loss / accuracy scalars through summary_writer.
  """
  X_train = input_matrix

  # Target labels: 1 for real cells, 0 for generated cells.
  real = np.ones((batch_size, 1))
  fake = np.zeros((batch_size, 1))

  # A snapshot is also taken at this fixed iteration, in addition to the
  # regular generate_cells_every schedule.
  early_snapshot_iteration = 1000

  for iteration in range(iterations):
    step = iteration + 1

    # ---- Discriminator update: one real batch, one generated batch ----
    idx = np.random.randint(X_train.shape[0], size=batch_size)
    real_cell = X_train[idx]

    z = np.random.normal(0, 1, (batch_size, z_dim))
    fake_cell = generator.predict(z)

    d_loss_real = discrimator.train_on_batch(real_cell, real)
    d_loss_fake = discrimator.train_on_batch(fake_cell, fake)
    # Each result is [loss, accuracy]; average the two batches element-wise.
    d_loss, accuracy = 0.5 * np.add(d_loss_real, d_loss_fake)
    d_hist.append(d_loss)

    # ---- Generator update through the combined model ----
    # Fresh noise sized by z_dim (was hard-coded to 100). The generator is
    # trained to make the discriminator answer "real". The unused extra
    # generator.predict() call that used to sit here has been removed.
    z = np.random.normal(0, 1, (batch_size, z_dim))
    g_loss = gan.train_on_batch(z, real)
    g_loss_hist.append(g_loss)

    with summary_writer.as_default():
      tf.summary.scalar('d_loss', data=d_loss, step=step)
      tf.summary.scalar('g_loss', data=g_loss, step=step)
      tf.summary.scalar('accuracy', data=accuracy, step=step)

    if step % sample_interval == 0:
      # `losses` was declared next to the other history lists but never filled.
      losses.append((d_loss, g_loss))
      accuracies.append(100.0 * accuracy)
      iteration_checkpoints.append(step)

      print("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" %
            (step, d_loss, 100.0 * accuracy, g_loss))

    if step % generate_cells_every == 0 or step == early_snapshot_iteration:
      # Honour the num_cells argument (was a hard-coded 500) and keep the
      # snapshot in its own name instead of overwriting the training batch.
      Generated_cells.append(generate_cells(num_cells))
      sample_checkpoints.append(step)
      

In [None]:
# Create the output folder for the correlation histograms. Unlike a plain
# `mkdir`, this also creates the parent `figures/` folder when needed and does
# not fail when the notebook is re-run and the folder already exists.
os.makedirs("figures/Vanilla_GAN_793_genes_results", exist_ok=True)

In [None]:
def plot_history(c1_hist, g_hist, name):
    """Plot, save and show the discriminator and generator loss curves.

    The curves are drawn one after the other. Each one is saved in the
    working directory as 'Discriminator_<name>' and 'Generator_loss_<name>'
    respectively, and then shown.
    """
    # (series, legend label, extra line options, output file pattern)
    curves = (
        (c1_hist, 'Discriminator Loss', {}, 'Discriminator_{}'),
        (g_hist, 'Generator loss', {'color': "orange"}, 'Generator_loss_{}'),
    )
    for history, legend_label, line_options, file_pattern in curves:
        plt.plot(history, label=legend_label, **line_options)
        plt.legend(fontsize = 'small')
        plt.savefig(file_pattern.format(name))
        plt.show()
    plt.close()

In [None]:
# Run the model.

# Hyperparameters for this training run.
iterations = 20000            # total number of training iterations
batch_size = 32               # cells per discriminator / generator update
sample_interval = 1000        # how often progress is recorded and printed
generate_cells_every = 5000   # how often a snapshot of generated cells is stored
num_cells = 500               # cells per stored snapshot

train(
    iterations=iterations,
    batch_size=batch_size,
    sample_interval=sample_interval,
    generate_cells_every=generate_cells_every,
    num_cells=num_cells,
)

In [None]:
# Plot and save both loss curves; the given name is appended to the
# 'Discriminator_' and 'Generator_loss_' file name prefixes.
plot_history(d_hist, g_loss_hist, "Vanilla_gan_loss_graph.png")

In [2]:
# For every stored snapshot, plot the distribution of pairwise correlations
# between the generated cells.
for checkpoint, dataset in zip(sample_checkpoints, Generated_cells):
    # DataFrame.corr() works column-wise, so transpose to put the cells in columns.
    # NOTE: the flattened matrix includes each cell's correlation with itself.
    cell_corr = dataset.T.corr().to_numpy().ravel()

    fig, ax = plt.subplots()
    ax.hist(cell_corr, density=True, range=(0, 1))  # `density=False` would make counts
    ax.set_title("{} Iterations".format(checkpoint))
    ax.set_ylabel('Density')
    ax.set_xlabel('Correlation')
    fig.savefig('figures/Vanilla_GAN_793_genes_results/{}'.format(checkpoint))

    # Show, then release the figure so a long list of snapshots does not keep
    # every figure open in memory.
    plt.show()
    plt.close(fig)

NameError: name 'Generated_cells' is not defined

In [None]:
# Top-ranked genes per Louvain cluster, keyed by training checkpoint.
# The table used to be built inside the loop and immediately discarded.
top_ranked_genes = {}

for checkpoint, dataset in zip(sample_checkpoints, Generated_cells):
    adata = sc.AnnData(dataset)

    # Neighbourhood graph -> UMAP embedding -> Louvain clustering.
    sc.pp.neighbors(adata)
    sc.tl.umap(adata)
    sc.tl.louvain(adata, resolution=1, key_added='louvain_r1')
    sc.pl.umap(
        adata,
        color='louvain_r1',
        save="vanilla_GAN_greyscale_Vanilla_GAN_UMAP_at_{}.png".format(checkpoint),
        title="{} iterations".format(checkpoint),
    )

    # Rank genes per cluster with the 'logreg' method.
    sc.tl.rank_genes_groups(adata, 'louvain_r1', method='logreg')
    sc.pl.rank_genes_groups(
        adata,
        n_genes=20,
        save="Vanilla_GAN_greyscale_Vanilla_GAN_Gene_rank_at_{}.png".format(checkpoint),
    )

    # One '<cluster>_n' (names) and one '<cluster>_s' (scores) column per cluster.
    result = adata.uns['rank_genes_groups']
    groups = result['names'].dtype.names
    top_ranked_genes[checkpoint] = pd.DataFrame(
        {group + '_' + key[:1]: result[key][group]
         for group in groups for key in ['names', 'scores']}
    ).head(10)
        


In [None]:
# The held-out real cells are stored in `validation` (the name `val` was never
# defined). They are still MinMax-scaled, whereas generate_cells() returns
# inverse-transformed values, so bring them back to the same scale first.
validation_expr = scaler.inverse_transform(validation)

for checkpoint, dataset in zip(sample_checkpoints, Generated_cells):
    # Pair generated cell k with validation cell k; use only as many pairs as
    # both sets provide.
    n_pairs = min(len(dataset), len(validation_expr))
    gen = dataset.iloc[:n_pairs]

    # corrwith() aligns on row and column labels. A frame with default integer
    # labels shares none with `gen` and would give only NaN, so reuse the
    # labels of the generated frame.
    val = pd.DataFrame(validation_expr[:n_pairs], index=gen.index, columns=gen.columns)

    # Row-wise correlation: one value per (generated, validation) pair.
    corr = gen.corrwith(val, axis=1).to_numpy()

    fig, ax = plt.subplots()
    ax.hist(corr, density=True, range=(0, 1))  # `density=False` would make counts
    ax.set_title("{} Iterations".format(checkpoint))
    ax.set_ylabel('Density')
    ax.set_xlabel('Correlation')
    plt.show()
    plt.close(fig)

            
            
        