In [None]:
# Mount Google Drive at /content/drive so files stored there (one CSV is read
# from /content/drive/MyDrive further down) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from __future__ import print_function
import numpy as np
import random
import pandas as pd
from scipy import stats
import sys, os
import logging
import tensorflow as tf
from keras import layers, regularizers
from keras.models import Model, Sequential
from keras.layers import *
from keras.regularizers import l1, l2, L1L2
from sklearn.metrics.pairwise import cosine_similarity
# Instead of keras.utils.np_utils, use tensorflow.keras.utils
from tensorflow.keras.utils import to_categorical # Import to_categorical directly
from keras.optimizers import SGD
from keras.callbacks import EarlyStopping, Callback, ModelCheckpoint, ReduceLROnPlateau
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn import datasets, linear_model
import itertools
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import math as m
import keras.backend as K
import sklearn
import keras
from sklearn.model_selection import train_test_split

In [None]:
nb_classes = 4  # Width of the one-hot axis built with np.eye(nb_classes); adjust based on your dataset
a = 0.03  # ISRU activation parameter, read as a global inside isru()
epochs = 100  # NOTE(review): the model.fit calls pass a literal epochs=100 instead of reading this name
batch_size = 1  # NOTE(review): the model.fit calls pass a literal batch_size=1 instead of reading this name
# Inverse square root unit (ISRU) activation: x / sqrt(1 + a * x^2)
def isru(x):
    """Apply the ISRU activation elementwise, using the module-level parameter ``a``."""
    scale = tf.sqrt(1 + a * tf.square(x))
    return x / scale

In [None]:
# Load the two input tables: data1 from a local sheet export under /content and
# data2 from a CSV stored on the mounted Drive.
# NOTE(review): file_path is reused for both reads, so after this cell it holds
# the second path only.
file_path = '/content/dtf101 - Sheet1.csv'
data1 = pd.read_csv(file_path)
file_path = '/content/drive/MyDrive/Copy of Copy of cleaned_snp_dataset_DTF.csv'
data2 = pd.read_csv(file_path)

In [None]:
# Remove the annotation columns listed below; every other column is kept as is.
annotation_columns = ['alleles', 'chrom', 'pos', 'strand', 'assembly#', 'center',
                      'protLSID', 'assayLSID', 'panelLSID', 'QCcode']
data2 = data2.drop(annotation_columns, axis=1)

In [None]:
# Swap rows and columns so the former column labels become the row index.
# NOTE: re-running this cell flips the frame back, so run it exactly once.
data2 = data2.transpose()

In [None]:
# Attach the DTF_2019 value from data1 to every data2 row whose index label
# equals that row's '<Trait>' value. This is a single vectorized lookup instead
# of a Python-level iterrows loop with one .loc write per row.
# keep='last' reproduces the loop's behaviour when a trait occurs more than once
# in data1 (the later row wins) and gives Index.map the unique index it requires.
trait_to_dtf = (
    data1.drop_duplicates(subset='<Trait>', keep='last')
         .set_index('<Trait>')['DTF_2019']
)
# Index labels with no matching trait receive NaN in the new column.
data2['DTF_2019'] = data2.index.map(trait_to_dtf)

In [None]:
# Keep only the rows that contain no missing value in any column.
data2 = data2.dropna()

In [None]:
# Run a full garbage-collection pass to free memory held by unreachable objects.
# The number shown as the cell output is gc.collect()'s return value.
import gc
gc.collect()

201

In [None]:
# Integer code for each nucleotide letter; 'N' is encoded as -1.
encoding_dict = dict(zip('NACGT', (-1, 0, 1, 2, 3)))


def encode_batch(data, columns):
    """Replace nucleotide letters with integer codes in the given columns.

    Values that are not keys of ``encoding_dict`` are left unchanged.
    ``data`` is modified in place and is also returned.
    """
    for name in columns:
        original = data[name]
        data[name] = original.map(encoding_dict).fillna(original)
    return data

# Encode the object-dtype columns a chunk at a time (adjust the chunk size to
# fit your RAM).
# The chunk size has its own name so that it does not overwrite the training
# `batch_size` set in the configuration cell.
encode_chunk_size = 1000
object_columns = data2.select_dtypes(include='object').columns
for start in range(0, len(object_columns), encode_chunk_size):
    data2 = encode_batch(data2, object_columns[start:start + encode_chunk_size])

In [None]:
# Display the frame after encoding as a visual sanity check.
data2

In [None]:
# Load the encoded SNP table used for modelling. Without this read the cell only
# works when a later cell that defines `data` has already been executed.
file_path = '/content/encoded_SNP_matrix.csv'
data = pd.read_csv(file_path)

# Column 1 is used as the phenotype and columns 4 onward as the SNP codes.
# NOTE(review): confirm these positions against the CSV header.
SNP_matrix = data.iloc[:, 4:].apply(pd.to_numeric, errors='coerce').values
pheno = pd.to_numeric(data.iloc[:, 1], errors='coerce').values

# One-hot encode the SNP matrix.
# Codes outside 0..nb_classes-1 (the missing-call code -1, or NaN produced by
# errors='coerce') become an all-zero vector. Indexing np.eye(nb_classes) with
# -1 would select the LAST row and make a missing call identical to class 3.
snp_codes = np.nan_to_num(SNP_matrix, nan=-1).astype(int)
known_call = (snp_codes >= 0) & (snp_codes < nb_classes)
arr_SNP = np.zeros(snp_codes.shape + (nb_classes,))
arr_SNP[known_call, snp_codes[known_call]] = 1.0

# Split the data into train (60%) and test (40%) sets
trainSNP, testSNP, trainPheno, testPheno = train_test_split(arr_SNP, pheno, test_size=0.4, random_state=42)

# Define the model: two stacked convolutions plus a parallel shortcut
# convolution on the same input, summed, followed by one more convolution and a
# single ISRU-activated output unit.
inputs = Input(shape=(trainSNP.shape[1], nb_classes))

x = Conv1D(10, 4, padding='same', activation='linear', kernel_initializer='TruncatedNormal',
           kernel_regularizer=regularizers.l2(0.1), bias_regularizer=regularizers.l2(0.01))(inputs)
x = Conv1D(10, 20, padding='same', activation='linear', kernel_initializer='TruncatedNormal',
           kernel_regularizer=regularizers.l2(0.1), bias_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.75)(x)

shortcut = Conv1D(10, 4, padding='same', activation='linear', kernel_initializer='TruncatedNormal',
                  kernel_regularizer=regularizers.l2(0.1), bias_regularizer=regularizers.l2(0.01))(inputs)
x = layers.add([shortcut, x])

x = Conv1D(10, 4, padding='same', activation='linear', kernel_initializer='TruncatedNormal',
           kernel_regularizer=regularizers.l2(0.1), bias_regularizer=regularizers.l2(0.01))(x)
x = Dropout(0.75)(x)
x = Flatten()(x)
x = Dropout(0.75)(x)

outputs = Dense(1, activation=isru, bias_regularizer=regularizers.l2(0.01),
                kernel_initializer='TruncatedNormal', name='out')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['mae'])

# Train the model (no validation set, no callbacks: it always runs all epochs)
history = model.fit(trainSNP, trainPheno, batch_size=1, epochs=100, shuffle=True)

# Predict and calculate correlation. predict() returns one column per output
# unit, so flatten to 1-D for pearsonr.
pred = model.predict(testSNP).reshape(-1)
corr = pearsonr(pred, testPheno)[0]

# Output the results
print(f"Pearson correlation coefficient on the test set: {corr:.4f}")

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def compile_saliency_function(model):
    """Build a function that differentiates the model output w.r.t. its input.

    Parameters
    ----------
    model : callable
        Invoked as ``model(input_data, training=False)``; its result is reduced
        with ``tf.reduce_max(..., axis=1)``.

    Returns
    -------
    callable
        ``saliency_function(input_data)``: takes a ``tf.Tensor`` and returns
        the gradient of the reduced output with respect to that tensor.
    """
    def saliency_function(input_data):
        with tf.GradientTape() as tape:
            # The input is a plain tensor, not a variable, so it has to be
            # watched explicitly for the tape to record operations on it.
            tape.watch(input_data)
            # Run in inference mode so training-only layers such as Dropout
            # stay inactive and the gradients are deterministic.
            predictions = model(input_data, training=False)
            max_outp = tf.reduce_max(predictions, axis=1)
        return tape.gradient(max_outp, input_data)

    return saliency_function

def show_images_plot(saliency, outname, wald=None):
    """Plot the median saliency per position and save the figure to ``outname``.

    Parameters
    ----------
    saliency : array-like
        Reduced with ``np.median(saliency, axis=0)``; the result is drawn as
        one point per remaining position.
    outname : str
        Path handed to ``savefig``.
    wald : array-like, optional
        Values for a second panel labelled 'Wald'. When omitted, only the
        saliency panel is drawn. Previously the second panel was always drawn
        and showed the saliency threshold as a red line with no Wald data.

    The dashed line in each panel marks the 11th largest value, or the smallest
    value when fewer than 11 values are available.
    """
    def _threshold(values):
        # 11th largest value; fall back to the smallest for short inputs.
        ranked = sorted(values, reverse=True)
        return ranked[10] if len(ranked) > 10 else ranked[-1]

    x = np.median(saliency, axis=0)
    n_panels = 1 if wald is None else 2
    fig, axes = plt.subplots(n_panels, 1, figsize=(15, 8), facecolor='w', squeeze=False)
    try:
        top = axes[0, 0]
        top.plot(x, 'b.')
        top.axhline(y=_threshold(x), color='b', linestyle='--')
        top.set_ylabel('Saliency Value', fontsize=15)

        bottom = axes[-1, 0]
        if wald is not None:
            bottom.plot(wald, 'r.')
            bottom.axhline(y=_threshold(wald), color='r', linestyle='--')
            bottom.set_ylabel('Wald', fontsize=15)
        bottom.set_xlabel('SNPs', fontsize=15)

        fig.savefig(outname)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)


# Differentiate the trained model's output with respect to the held-out inputs.
saliency_fn = compile_saliency_function(model)
testSNP_tensor = tf.convert_to_tensor(testSNP, dtype=tf.float32)
saliency_output = saliency_fn(testSNP_tensor)

# Collapse the last axis by keeping the largest absolute gradient along it.
saliency_magnitude = np.abs(saliency_output.numpy())
saliency_values = saliency_magnitude.max(axis=-1)

# Save the saliency figure under this name.
output_image_name = 'saliency_plotxx.png'
show_images_plot(saliency_values, output_image_name)

In [None]:
# Sanity check: dimensions of the numeric SNP code matrix (before one-hot encoding).
SNP_matrix.shape

(9, 1020)

In [None]:
# Median over axis 0 of saliency_values, then the 11th largest of those medians
# (or the smallest one when there are 10 or fewer).
x = np.median(saliency_values, axis=0)
ranked = sorted(x, reverse=True)
line = ranked[10] if len(x) > 10 else ranked[-1]

In [None]:
# Display the median saliency values computed in the previous cell.
x

array([0.00081995, 0.00161227, 0.00123106, ..., 0.00261605, 0.00282956,
       0.00616691], dtype=float32)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Load the figure written by the most recent saliency cell. Reading
# `output_image_name` keeps this cell pointing at the file that was actually
# saved; the previous hard-coded '/content/saliency_plot.png' did not match the
# name used by the preceding save.
# NOTE(review): the import cell selects the non-interactive 'Agg' backend via
# matplotlib.use('Agg'); confirm that plt.show() renders in your environment.
img = mpimg.imread(output_image_name)

# Display the image using plt.imshow
plt.imshow(img)
plt.show()

In [None]:
from sklearn.model_selection import LeaveOneOut
nb_classes = 4  # Width of the one-hot axis built with np.eye(nb_classes); adjust based on your dataset
a = 0.03  # ISRU activation parameter, read as a global inside isru()
epochs = 100  # NOTE(review): the model.fit calls pass a literal epochs=100 instead of reading this name
batch_size = 1  # NOTE(review): the model.fit calls pass a literal batch_size=1 instead of reading this name

# Inverse square root unit (ISRU) activation: x / sqrt(1 + a * x^2)
def isru(x):
    """Apply the ISRU activation elementwise, using the module-level parameter ``a``."""
    scale = tf.sqrt(1 + a * tf.square(x))
    return x / scale

# Load the encoded SNP matrix dataset into `data`; the modelling cells slice
# this frame by column position (column 1 and columns 4 onward).
file_path = '/content/encoded_SNP_matrix.csv'
data = pd.read_csv(file_path)

In [None]:
# Display the loaded table.
data

Unnamed: 0,Folds,Normalized_DTF,index,DTF,S1_12107,S1_12152,S1_12345,S1_22061,S1_22142,S1_37908,...,S1_846429,S1_848286,S1_848455,S1_848475,S1_853390,S1_853511,S1_853551,S1_854269,S1_854608,S1_854910
0,1,-0.455842,MP_7_GW-1,30,-1,-1,-1,-1,-1,-1,...,-1,3,-1,-1,2,2,0,-1,-1,-1
1,2,-0.911685,MP_4_SOY-523,28,-1,-1,-1,-1,-1,-1,...,-1,3,-1,-1,2,-1,-1,-1,-1,-1
2,3,-1.139606,MP_4_SOY-520,27,-1,-1,-1,-1,-1,-1,...,-1,3,1,1,2,2,-1,-1,-1,-1
3,4,0.227921,MP_7_GW-6,33,-1,-1,-1,-1,-1,-1,...,-1,3,1,1,2,2,-1,-1,-1,-1
4,5,-0.911685,MP_7_GW-7,28,-1,-1,-1,-1,-1,-1,...,-1,2,-1,1,-1,2,-1,-1,-1,-1
5,6,2.05129,MP_3_SOY_460,41,-1,-1,1,-1,-1,-1,...,-1,2,-1,-1,3,2,-1,-1,-1,-1
6,7,0.683763,MP_7_GW-9,35,-1,-1,-1,-1,-1,-1,...,-1,-1,1,1,-1,2,0,-1,-1,-1
7,8,0.227921,MP_7_GW-10,33,-1,-1,-1,-1,-1,-1,...,-1,2,0,1,3,-1,-1,-1,-1,-1
8,9,0.227921,MP_7_GW-12,33,-1,-1,-1,-1,-1,-1,...,3,3,-1,1,2,2,-1,-1,-1,-1


In [None]:
from sklearn.model_selection import LeaveOneOut

nb_classes = 4  # Width of the one-hot axis; adjust based on your dataset
a = 0.03  # ISRU activation parameter, read as a global inside isru()
epochs = 100
batch_size = 1

# Inverse square root unit (ISRU) activation: x / sqrt(1 + a * x^2)
def isru(x):
    """Apply the ISRU activation elementwise, using the module-level parameter ``a``."""
    scale = tf.sqrt(1 + a * tf.square(x))
    return x / scale

# Load the encoded SNP matrix dataset
file_path = '/content/encoded_SNP_matrix.csv'
data = pd.read_csv(file_path)

# Separate features (SNP matrix) and labels (Phenotypes): column 1 is used as
# the phenotype and columns 4 onward as the SNP codes.
# NOTE(review): confirm these positions against the CSV header.
SNP_matrix = data.iloc[:, 4:].apply(pd.to_numeric, errors='coerce').values
pheno = pd.to_numeric(data.iloc[:, 1], errors='coerce').values

# One-hot encode the SNP matrix.
# Codes outside 0..nb_classes-1 (the missing-call code -1, or NaN produced by
# errors='coerce') become an all-zero vector. Indexing np.eye(nb_classes) with
# -1 would select the LAST row and make a missing call identical to class 3.
snp_codes = np.nan_to_num(SNP_matrix, nan=-1).astype(int)
known_call = (snp_codes >= 0) & (snp_codes < nb_classes)
arr_SNP = np.zeros(snp_codes.shape + (nb_classes,))
arr_SNP[known_call, snp_codes[known_call]] = 1.0

# Model factory used by LOOCV: every fold gets a freshly initialised network.
def create_model():
    """Build and compile a new Conv1D regressor with an additive shortcut branch.

    The input shape is taken from the module-level ``arr_SNP`` and
    ``nb_classes``. The returned model has a single ISRU-activated output unit
    and is compiled with mean-squared-error loss, Adam (learning rate 0.001)
    and an MAE metric.
    """
    def conv(kernel_size):
        # All convolutions share these settings; only the kernel size varies.
        return Conv1D(10, kernel_size, padding='same', activation='linear',
                      kernel_initializer='TruncatedNormal',
                      kernel_regularizer=regularizers.l2(0.1),
                      bias_regularizer=regularizers.l2(0.01))

    inputs = Input(shape=(arr_SNP.shape[1], nb_classes))

    # Main branch: two stacked convolutions followed by dropout.
    main = conv(4)(inputs)
    main = conv(20)(main)
    main = Dropout(0.75)(main)

    # Shortcut branch: one convolution on the raw input, summed with the main branch.
    shortcut = conv(4)(inputs)
    merged = layers.add([shortcut, main])

    # Head: convolution, dropout, flatten, dropout, single output unit.
    head = conv(4)(merged)
    head = Dropout(0.75)(head)
    head = Flatten()(head)
    head = Dropout(0.75)(head)

    outputs = Dense(1, activation=isru, bias_regularizer=regularizers.l2(0.01),
                    kernel_initializer='TruncatedNormal', name='out')(head)

    model = Model(inputs=inputs, outputs=outputs)
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['mae'])

    return model

# Leave-One-Out Cross-Validation (LOOCV): each sample is held out once, a new
# model is trained on the remaining samples, and the held-out prediction is
# collected for the pooled metrics below.
loo = LeaveOneOut()
y_true, y_pred = [], []

for train_index, test_index in loo.split(arr_SNP):
    trainSNP, testSNP = arr_SNP[train_index], arr_SNP[test_index]
    trainPheno, testPheno = pheno[train_index], pheno[test_index]

    # Drop the Keras state left behind by the previous fold's model so memory
    # does not grow with the number of folds. Clearing happens before building
    # the new model, so the last fold's `model` is still available afterwards.
    K.clear_session()
    model = create_model()

    model.fit(trainSNP, trainPheno, batch_size=1, epochs=100, shuffle=True, verbose=0)

    # verbose=0 avoids printing one progress bar per fold.
    pred = model.predict(testSNP, verbose=0)
    y_pred.append(pred[0][0])
    y_true.append(testPheno[0])

# Calculate Pearson correlation
corr = pearsonr(y_pred, y_true)[0]

# Output the results
print(f"Pearson correlation coefficient (LOOCV): {corr:.4f}")

# Calculate Mean Squared Error
mse = mean_squared_error(y_true, y_pred)
print(f"Mean Squared Error (LOOCV): {mse:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 285ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 292ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 184ms/step
Pearson correlation coefficient (LOOCV): -0.4656
Mean Squared Error (LOOCV): 1.0341


In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

def compile_saliency_function(model):
    """Build a function that differentiates the model output w.r.t. its input.

    Parameters
    ----------
    model : callable
        Invoked as ``model(input_data, training=False)``; its result is reduced
        with ``tf.reduce_max(..., axis=1)``.

    Returns
    -------
    callable
        ``saliency_function(input_data)``: takes a ``tf.Tensor`` and returns
        the gradient of the reduced output with respect to that tensor.
    """
    def saliency_function(input_data):
        with tf.GradientTape() as tape:
            # The input is a plain tensor, not a variable, so it has to be
            # watched explicitly for the tape to record operations on it.
            tape.watch(input_data)
            # Run in inference mode so training-only layers such as Dropout
            # stay inactive and the gradients are deterministic.
            predictions = model(input_data, training=False)
            max_outp = tf.reduce_max(predictions, axis=1)
        return tape.gradient(max_outp, input_data)

    return saliency_function

def show_images_plot(saliency, wald, outname):
    """Plot median saliency and Wald statistics in two stacked panels and save them.

    Parameters
    ----------
    saliency : array-like
        Reduced with ``np.median(saliency, axis=0)`` so that one value per
        position along the remaining axis is plotted. The median was previously
        taken over the last axis, which reduced over the positions themselves.
    wald : array-like
        Values drawn in the lower panel.
    outname : str
        Path handed to ``savefig``.

    The dashed line in each panel marks the 11th largest value, or the smallest
    value when fewer than 11 values are available.
    """
    def _threshold(values):
        # 11th largest value; fall back to the smallest for short inputs.
        ranked = sorted(values, reverse=True)
        return ranked[10] if len(ranked) > 10 else ranked[-1]

    x = np.median(saliency, axis=0)
    fig, (top, bottom) = plt.subplots(2, 1, figsize=(15, 8), facecolor='w')
    try:
        top.plot(x, 'b.')
        top.axhline(y=_threshold(x), color='b', linestyle='--')
        top.set_ylabel('Saliency Value', fontsize=15)

        bottom.plot(wald, 'r.')
        bottom.axhline(y=_threshold(wald), color='r', linestyle='--')
        bottom.set_xlabel('SNPs', fontsize=15)
        bottom.set_ylabel('Wald', fontsize=15)

        fig.savefig(outname)
    finally:
        # Always release the figure, even when plotting or saving fails.
        plt.close(fig)


# Differentiate the current model's output with respect to the current testSNP
# (after the LOOCV cell these are the last fold's model and held-out sample).
saliency_fn = compile_saliency_function(model)
testSNP_tensor = tf.convert_to_tensor(testSNP, dtype=tf.float32)
saliency_output = saliency_fn(testSNP_tensor)

# Collapse the last axis by keeping the largest absolute gradient along it.
saliency_magnitude = np.abs(saliency_output.numpy())
saliency_values = saliency_magnitude.max(axis=-1)

# Placeholder only: these are uniform random numbers, not real Wald statistics.
# Replace them with actual Wald statistics if available.
wald_statistics = np.random.rand(testSNP.shape[1])

output_image_name = 'saliency_plot.png'
show_images_plot(saliency_values, wald_statistics, output_image_name)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Load the figure written by the most recent saliency cell. Reading
# `output_image_name` ties this cell to the path the figure was saved under
# instead of assuming the working directory is /content.
# NOTE(review): the import cell selects the non-interactive 'Agg' backend via
# matplotlib.use('Agg'); confirm that plt.show() renders in your environment.
img = mpimg.imread(output_image_name)
plt.imshow(img)
plt.show()