Data Analysis Pipeline:


*  Clone the repo to carry the dependencies over and to invoke imagene.py.
*  Import all necessary modules.


*   Simulate data for ancient, moderate-strength selection (`selrange = seq 0 200 200`, `timerange = 0.1`, i.e. about 100 kya).
*   Build and compile baseline model.


*   Train model on training data and gather metrics.

*   Test the trained network on unseen data.













In [1]:
# Cloning the GitHub repository
!git clone https://github.com/Djinho/EvoNet-CNN-Insight.git

# Change directories into the specified directory
%cd EvoNet-CNN-Insight/Model_training_3/Ancient_moderate



Cloning into 'EvoNet-CNN-Insight'...
remote: Enumerating objects: 2536, done.[K
remote: Counting objects: 100% (409/409), done.[K
remote: Compressing objects: 100% (301/301), done.[K
remote: Total 2536 (delta 203), reused 216 (delta 108), pack-reused 2127[K
Receiving objects: 100% (2536/2536), 4.62 GiB | 29.18 MiB/s, done.
Resolving deltas: 100% (589/589), done.
Updating files: 100% (379/379), done.
/content/EvoNet-CNN-Insight/Model_training_3/Ancient_moderate


In [2]:
import os
import gzip
import _pickle as pickle

import numpy as np
import scipy.stats
import arviz

import tensorflow as tf
from tensorflow.keras import models, layers, optimizers, regularizers
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import load_model
from tensorflow.keras import layers, models, optimizers



# Set seeds for reproducibility: fix NumPy's global RNG and TensorFlow's
# global seed to the same value (42).
# NOTE(review): Python's built-in `random` module is not seeded here; add it
# if any code in this notebook relies on it.
np.random.seed(42)
tf.random.set_seed(42)

from sklearn.model_selection import train_test_split


import itertools
import matplotlib.pyplot as plt
import skimage.transform
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_curve, auc
import pydot  # Optional, but required by keras to plot the model

In [3]:
# Execute ImaGene.py inside the notebook's own namespace (-i), so the names it
# defines are usable directly in the cells below.
# NOTE(review): ImaFile, load_imagene, get_index_random and to_binary are used
# later without an import and presumably come from this script — confirm.
%run -i ../ImaGene.py

ImaGene uses msms to run the simulations used for training. Run `../generate_dataset.sh` with a parameter file such as `../params.txt` (modify as needed); the cell below passes `params_ANT_moderate.txt`.

The script simulates 200,000 loci (80 kbp) under either neutral evolution or positive selection (selection coefficient of 1.5%). The mutation rate is 1.5e-8 and the recombination rate is 1e-8. The demographic model follows Marth et al. (2004), and 198 chromosomal copies are sampled.

Specify directories for msms and simulation storage, then run the command. The script splits simulations into batches for training.

In [None]:
# Optional step: run this cell only when new training data has to be
# simulated; skip it otherwise.
import subprocess

# Launch the dataset-generation script with this experiment's parameter file.
# The command is given as an explicit argument list (no shell involved).
# The exit status returned by call() is not checked; the trailing semicolon
# only keeps it out of the cell output.
subprocess.call(["bash", "../generate_dataset.sh", "params_ANT_moderate.txt"]);

In [None]:
# Base directory (relative to the current working directory) used as the
# prefix for files this notebook saves and reloads, e.g. 'gene_sim.binary'.
path = './'

In [None]:

# Base directory of the simulation output; 'AM/Simulations<i>' is appended to
# it when the simulation batches are read.
path_sim = './'

In [None]:
# Describe the first simulation batch (AM/Simulations1): 198 sampled
# chromosomal copies, generated under the 'Marth-3epoch-CEU' model.
file_sim = ImaFile(
    simulations_folder=path_sim + 'AM/Simulations1',
    nr_samples=198,
    model_name='Marth-3epoch-CEU',
)

Populate an ImaGene object by specifying the variable to estimate (selection_coeff_hetero) and the number of data points per class. Use 8000 data points per class as an example.

In [None]:
# Load the simulations into an ImaGene object. The variable to estimate is
# 'selection_coeff_hetero', and at most 8000 data points are read per class.
gene_sim = file_sim.read_simulations(
    parameter_name='selection_coeff_hetero',
    max_nrepl=8000,
)

In [None]:
# Both calls modify gene_sim in place, so re-running this cell applies them
# again to the already processed object.
# NOTE(review): presumably removes sites whose allele frequency is below the
# 0.01 threshold — confirm against ImaGene.py.
gene_sim.filter_freq(0.01);
# NOTE(review): presumably reorders the rows of each image by frequency —
# confirm against ImaGene.py.
gene_sim.sort('rows_freq');


In [None]:
# Resize the images to a common (198, 192) size; 198 equals the nr_samples
# value given to ImaFile.
# NOTE(review): the tuple is presumably (rows, columns) — confirm.
gene_sim.resize((198, 192));

In [None]:
# Convert the image data in place with flipping enabled.
# NOTE(review): the exact numeric conversion and the meaning of flip=True are
# defined in ImaGene.py and are not visible here — confirm before changing.
gene_sim.convert(flip=True);


Shuffle the images before using them to train the network.


In [None]:
# Shuffle the data set: get_index_random() supplies an index for gene_sim and
# subset() reorders the object in place according to it.
# NOTE(review): the order is presumably governed by the NumPy seed set in the
# import cell — confirm in ImaGene.py.
gene_sim.subset(get_index_random(gene_sim));

In [None]:
# Replace the targets with their to_binary() encoding.
# NOTE(review): presumably maps the two simulated classes to 0/1 labels for
# binary classification — confirm against ImaGene.py.
gene_sim.targets = to_binary(gene_sim.targets);

In [None]:
# Write the preprocessed ImaGene object to '<path>gene_sim.binary' so it can
# be reloaded later without repeating the preprocessing steps.
gene_sim.save(file=path + 'gene_sim.binary')

In [None]:
# Reload the preprocessed ImaGene object from '<path>gene_sim.binary'.
# NOTE(review): if load_imagene() unpickles the file, only load files you
# created yourself — confirm the storage format in ImaGene.py.
gene_sim = load_imagene(file=path + 'gene_sim.binary')

In [None]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
from tensorflow.keras import backend as K

# Hyperopt search space for the CNN hyper-parameters sampled on each trial.
#   num_layers    - number of Conv2D + MaxPooling2D stages (1 to 3)
#   filters_1..3  - filter count of the first / second / third Conv2D layer
#   kernel_size   - kernel size shared by all Conv2D layers
#   pool_size     - pool size shared by all MaxPooling2D layers
#   dense_units, dropout, learning_rate - presumably consumed by the dense
#       head and the optimizer built in objective() (confirm there).
space = {
    'num_layers': hp.choice('num_layers', [1, 2, 3]),
    'filters_1': hp.choice('filters_1', [32, 64, 128]),
    'filters_2': hp.choice('filters_2', [32, 64, 128]),
    'filters_3': hp.choice('filters_3', [64, 128, 256]),
    'kernel_size': hp.choice('kernel_size', [(3,3), (5,5), (7,7)]),
    'pool_size': hp.choice('pool_size', [(2,2), (3,3)]),
    'dense_units': hp.choice('dense_units', [64, 128, 256]),
    'dropout': hp.uniform('dropout', 0.2, 0.5),
    # hp.loguniform(label, low, high) returns exp(uniform(low, high)), i.e. its
    # bounds are natural logarithms of the desired range. Passing -4 and -2
    # directly would sample learning rates from [e^-4, e^-2] ~ [0.018, 0.135];
    # taking the log of the end points gives the range 1e-4 .. 1e-2.
    'learning_rate': hp.loguniform('learning_rate', np.log(1e-4), np.log(1e-2))
}


In [None]:
def objective(params):
    # Preprocess the data for training batches
    losses = []
    val_losses = []
    accuracies = []
    val_accuracies = []
    total_epochs = 0

    for i in range(1, 10):
        file_sim = ImaFile(simulations_folder=path_sim + 'AM/Simulations' + str(i), nr_samples=198, model_name='Marth-3epoch-CEU')
        gene_sim = file_sim.read_simulations(parameter_name='selection_coeff_hetero', max_nrepl=8000)

        gene_sim.filter_freq(0.01)
        gene_sim.sort('rows_freq')
        gene_sim.resize((198, 192))
        gene_sim.convert(flip=True)

        gene_sim.subset(get_index_random(gene_sim))
        gene_sim.targets = to_binary(gene_sim.targets)

        model = models.Sequential()
        model.add(layers.Conv2D(filters=int(params['filters_1']), kernel_size=params['kernel_size'], activation='relu', input_shape=gene_sim.data.shape[1:]))
        model.add(layers.MaxPooling2D(pool_size=params['pool_size']))

        if params['num_layers'] > 1:
            model.add(layers.Conv2D(filters=int(params['filters_2']), kernel_size=params['kernel_size'], activation='relu'))
            model.add(layers.MaxPooling2D(pool_size=params['pool_size']))

        if params['num_layers'] > 2:
            model.add(layers.Conv2D(filters=int(params['filters_3']), kernel_size=params['kernel