In [1]:
# Base Imports
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
# Train test split and performance measures
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,accuracy_score
# class balancing
from sklearn.utils.class_weight import compute_sample_weight
# SVD and scaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
# Tensorflow model
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Bidirectional, Dropout, LSTM

In [2]:
Names = pd.read_csv("C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample_function.csv")
Names = Names.drop('t_tax_seq_id', axis=1)
Names = Names.drop_duplicates()

In [3]:
Function = pd.read_csv("C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample_function.csv")
Function = Function.drop('go_function',axis=1)
final_table_columns = ['GO:0008150','GO:0005488','GO:0003824','GO:0005515',
                       'GO:0005575','GO:0008152','GO:0009987','GO:0003674',
                       'GO:0110165','GO:0071704']
Function = Function[Function['go_go_id'].isin(final_table_columns)]

In [4]:
one_hot_encoded = pd.get_dummies(Function, columns=['go_go_id'])
one_hot_encoded = one_hot_encoded.groupby('t_tax_seq_id').sum().reset_index()
one_hot_encoded.columns = one_hot_encoded.columns.str.replace('go_go_id_', '')

In [5]:
Seq = pd.read_csv("C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample.csv")
Seq = Seq.drop('tax_id',axis=1)

In [6]:
Merge_df = pd.merge(Seq, one_hot_encoded, left_on='tax_seq_id', right_on='t_tax_seq_id', how='inner')

In [7]:
Merge_df = Merge_df.drop(columns=['t_tax_seq_id'])
Merge_df.set_index('tax_seq_id', inplace=True)
#Merge_df['seq_length'] = Merge_df['seq'].apply(lambda x: len(str(x)))
#Merge_df = Merge_df[Merge_df['seq_length']<=500]
#Merge_df = Merge_df.drop(columns=['seq_length'])

In [8]:
X = Merge_df.drop(columns=Merge_df.columns[1:])
Y = Merge_df.drop(columns=Merge_df.columns[0])
total_words = 24

In [9]:
max_length_column1 = 500

In [10]:
len(X)

3131

In [11]:
def tokenized(sequence):
    tokenizer = {'A': 1,'C': 2,'D': 3,'E': 4,'F': 5,'G': 6,
               'H': 7,'I': 8,'K': 9,'L': 10,'M': 11,'N': 12,
               'P': 13,'Q': 14,'R': 15,'S': 16,'T': 17,
               'V': 18,'W': 19,'Y': 20, 'O':21, 'U':22, 'Z':23}
    sequence_values = []
    for i, letter in enumerate(sequence):
        if i < 500 and letter in tokenizer:
            sequence_values.append(tokenizer[letter])
    return sequence_values

In [12]:
def n_gramer(seq_list,max_length_column1):
    result_list_of_lists = [[0] * (len(seq_list) - j - 1) + seq_list[:j + 1] for j in range(len(seq_list))]
    padded = padding(result_list_of_lists,max_length_column1)
    padded = np.array(padded)
    return padded

In [13]:
def padding(result_list_of_lists,max_length):
    result_2d_array = [sublist + [0] * (max_length - len(sublist)) for sublist in result_list_of_lists]
    return result_2d_array

In [14]:
for index, row in X.iterrows():
    seq = row['seq']
    temp = tokenized(seq)
    temp = n_gramer(temp, max_length_column1)
    row['seq'] = temp

In [15]:
# Convert Data to a 3D array
def three_d_array(df):
    num_samples = len(df)
    max_rows = df['seq'].apply(lambda arr: arr.shape[0]).max()
    num_columns = df['seq'].iloc[0].shape[1]
    result = np.full((num_samples, max_rows, num_columns), np.nan)
    for i, seq in enumerate(df['seq']):
        start_row = max_rows - seq.shape[0]
        result[i, start_row:, :] = seq
    return result

In [16]:
new_X = three_d_array(X)
new_X = np.nan_to_num(new_X)

In [17]:
def S_V_D(X):
    n_components = 28
    X = StandardScaler().fit_transform(X)
    tsvd = TruncatedSVD(n_components=n_components)
    reduced_array = tsvd.fit_transform(X)
    reduced_array = reduced_array.T
    reduced_array = tsvd.fit_transform(reduced_array)
    reduced_array = reduced_array.T
    return reduced_array

In [18]:
reduced_X = np.empty((3131, 28, 28))
for i, row in enumerate(new_X):
    temp = row
    temp = S_V_D(temp)
    reduced_X[i] = temp

In [19]:
X_train, X_test, y_train, y_test = train_test_split(reduced_X, Y, test_size=0.2)

In [20]:
model = Sequential()
model.add(Bidirectional(LSTM(24, input_shape=(new_X.shape[1:]),activation='tanh',return_sequences=True)))
model.add(Dropout(0.5))
model.add(LSTM(32,activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(16,activation='tanh'))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',metrics=[tf.keras.metrics.BinaryAccuracy()])

In [21]:
Records = pd.DataFrame(columns=['Go-Ids', 'F1-Score','Accuracy'])
for column in y_train.columns:
    temp = []
    a = y_train[column]
    b = y_test[column]
    sample_weights = compute_sample_weight(class_weight='balanced', y=a)
    history = model.fit(X_train,a,epochs=10,sample_weight =sample_weights)
    #history = model.fit(X_train,a,epochs=10)
    y_pred = model.predict(X_test)
    y_pred = [0 if value < 0.5 else 1 for value in y_pred]
    temp.append(column)
    temp.append(f1_score(b, y_pred, average='binary'))
    temp.append(accuracy_score(b,y_pred))
    Records.loc[len(Records)] = temp

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10


Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
Records = Records.merge(Names, left_on='Go-Ids', right_on='go_go_id')
Records = Records.drop(columns=['go_go_id'])

In [23]:
Records =  Records.iloc[:,[0,3,1,2]]

In [24]:
Records =  Records.sort_values(by=['F1-Score','Accuracy'], ascending=False)
Records

Unnamed: 0,Go-Ids,go_function,F1-Score,Accuracy
0,GO:0003674,molecular_function,0.78337,0.684211
1,GO:0003824,catalytic activity,0.580645,0.626794
6,GO:0008152,metabolic process,0.487603,0.604466
8,GO:0071704,organic substance metabolic process,0.470588,0.555024
2,GO:0005488,binding,0.448567,0.478469
7,GO:0009987,cellular process,0.422764,0.547049
5,GO:0008150,biological_process,0.393195,0.488038
9,GO:0110165,cellular anatomical entity,0.359862,0.704944
4,GO:0005575,cellular_component,0.339869,0.677831
3,GO:0005515,protein binding,0.330935,0.555024
