In [1]:
# Base Imports
import numpy as np
import pandas as pd
import os
import warnings
warnings.filterwarnings('ignore')
# SVD and scaler
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler,normalize
# Image creating and Processing
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
# Load the function CSV, discard the sequence-id column, and keep only the
# distinct remaining rows.
Names = (
    pd.read_csv("C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample_function.csv")
    .drop(columns='t_tax_seq_id')
    .drop_duplicates()
)

In [3]:
# Load the function CSV a second time, this time without the 'go_function' column.
Function = pd.read_csv(
    "C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample_function.csv"
).drop(columns='go_function')
# The GO ids that are kept; rows carrying any other 'go_go_id' are filtered out.
final_table_columns = [
    'GO:0008150', 'GO:0005488', 'GO:0003824', 'GO:0005515',
    'GO:0005575', 'GO:0008152', 'GO:0009987', 'GO:0003674',
    'GO:0110165', 'GO:0071704',
]
Function = Function.loc[Function['go_go_id'].isin(final_table_columns)]

In [4]:
# Expand 'go_go_id' into indicator columns, then collapse to one row per
# 't_tax_seq_id' by summing the indicators.
one_hot_encoded = (
    pd.get_dummies(Function, columns=['go_go_id'])
    .groupby('t_tax_seq_id')
    .sum()
    .reset_index()
)
# Strip the prefix that get_dummies added so the columns are named by the bare id.
one_hot_encoded.columns = one_hot_encoded.columns.str.replace('go_go_id_', '')

In [5]:
# Load the sequence CSV and discard its 'tax_id' column.
Seq = pd.read_csv(
    "C:/Users/ritwi/OneDrive/Documents/MS_DataScience/Thesis/Project/Data/Sample/sample.csv"
).drop(columns='tax_id')

In [6]:
# Inner join: keep only sequences that have a row in the indicator table.
Merge_df = Seq.merge(
    one_hot_encoded,
    how='inner',
    left_on='tax_seq_id',
    right_on='t_tax_seq_id',
)

In [7]:
# Drop the duplicate join key and index the table by 'tax_seq_id'.
Merge_df = Merge_df.drop(columns=['t_tax_seq_id']).set_index('tax_seq_id')
# Keep only rows whose sequence (measured on its string form) has at most 500 characters.
Merge_df = Merge_df[Merge_df['seq'].apply(lambda x: len(str(x))) <= 500]

In [8]:
# X keeps only the first column of Merge_df (every later column is dropped).
X = Merge_df.drop(columns=list(Merge_df.columns[1:]))
# Y keeps everything except that first column.
Y = Merge_df.drop(columns=[Merge_df.columns[0]])
# presumably the vocabulary size (23 residue codes plus the padding value 0) — TODO confirm
total_words = 24

In [9]:
# Length of the longest entry in the 'seq' column; used later as the padding width.
max_length_column1 = X['seq'].map(len).max()

In [10]:
def tokenized(sequence):
    """Translate a sequence of letters into a list of integer codes.

    Each letter present in the code table contributes its code, in the order
    it appears; letters that are not in the table are skipped.
    """
    tokenizer = {
        'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
        'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
        'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
        'S': 16, 'T': 17, 'V': 18, 'W': 19, 'Y': 20,
        'O': 21, 'U': 22, 'Z': 23,
    }
    return [tokenizer[symbol] for symbol in sequence if symbol in tokenizer]

In [11]:
def range_finder(seq_list):
    """Split a list of codes into runs of consecutive codes from the same group.

    Codes are assigned to one of seven groups by ``mapper``. A new run starts
    whenever the group changes. A code that belongs to no group ends the
    current run and is itself discarded.

    Returns a list of runs, each run being a list of codes.
    """
    mapper = {1: [1, 8, 10, 11, 18], 2: [5, 19, 20], 3: [12, 2, 14, 16, 17],
              4: [3, 4], 5: [15, 7, 9], 6: [6, 13], 7: [21, 22, 23]}
    # Reverse lookup: code -> group id.
    group_of = {code: gid for gid, codes in mapper.items() for code in codes}

    result = []
    run = []
    run_group = None
    for number in seq_list:
        gid = group_of.get(number)
        if gid is not None and gid == run_group:
            # Same group as the run in progress: extend it.
            run.append(number)
            continue
        # Group changed (or code is unknown): close the run in progress.
        if run:
            result.append(run)
        run = [] if gid is None else [number]
        run_group = gid
    if run:
        result.append(run)
    return result

In [12]:
def n_gramer(seq_list, max_length_column1):
    """Build the cumulative-prefix matrix for one tokenised sequence.

    The codes are split into runs with ``range_finder``; row ``k`` of the
    result holds the codes of the first ``k + 1`` runs concatenated. All rows
    are then left-padded to ``max_length_column1`` by ``padding``.
    """
    prefixes = []
    running = []
    for run in range_finder(seq_list):
        # Concatenation creates a new list, so earlier prefixes are not affected.
        running = running + run
        prefixes.append(running)
    return padding(prefixes, max_length_column1)

In [13]:
def padding(input_seq, max_length_column1):
    """Left-pad each row of ``input_seq`` with zeros to ``max_length_column1``.

    Returns a float array of shape ``(len(input_seq), max_length_column1)``
    in which every row's values are right-aligned.
    """
    width = max_length_column1
    padded = np.zeros((len(input_seq), width))
    for row_idx, values in enumerate(input_seq):
        # Right-align: the values occupy the last len(values) positions.
        padded[row_idx, width - len(values):] = values
    return padded

In [14]:
# Replace every raw sequence in X['seq'] with its padded prefix matrix.
# The matrices are collected first and then assigned as a whole column:
# writing to the row objects yielded by ``DataFrame.iterrows`` is not
# guaranteed to reach the DataFrame (pandas may hand out copies), so an
# in-loop ``row['seq'] = ...`` can silently leave X unchanged.
encoded_seqs = np.empty(len(X), dtype=object)  # object dtype: one 2-D matrix per cell
for position, seq in enumerate(X['seq']):
    temp = tokenized(seq)
    encoded_seqs[position] = n_gramer(temp, max_length_column1)
X['seq'] = encoded_seqs

In [15]:
#Convert Data to a 3D array
def three_d_array(df):
    """Stack the 2-D arrays stored in ``df['seq']`` into one 3-D array.

    The result has shape ``(len(df), max_rows, num_columns)``, where
    ``max_rows`` is the largest row count among the stored arrays and
    ``num_columns`` is taken from the first array. Each array is aligned to
    the bottom of its slice; the unused rows above it are left as NaN.
    """
    column = df['seq']
    width = column.iloc[0].shape[1]
    heights = [matrix.shape[0] for matrix in column]
    tallest = max(heights)
    cube = np.full((len(df), tallest, width), np.nan)
    for idx, (matrix, height) in enumerate(zip(column, heights)):
        # Bottom-align: the matrix occupies the last `height` rows of its slice.
        cube[idx, tallest - height:, :] = matrix
    return cube

In [16]:
# Stack all sequence matrices into one 3-D array and turn the NaN filler into zeros.
new_X = np.nan_to_num(three_d_array(X))

In [17]:
def S_V_D(X, n_components=28, random_state=None):
    """Reduce a 2-D matrix to a small square array and log-scale its entries.

    Steps:
      1. Standardise the columns of ``X`` with ``StandardScaler``.
      2. Apply ``TruncatedSVD`` to reduce the column axis to ``n_components``.
      3. Transpose and apply ``TruncatedSVD`` again to reduce the other axis.
      4. Transpose back and apply a signed log: positive entries become
         ``log(x)``, all other entries become ``-log(-x)``.

    Parameters
    ----------
    X : 2-D array-like
        Matrix to reduce. Presumably needs at least ``n_components`` rows and
        columns for both SVD passes to succeed — verify against the
        TruncatedSVD documentation.
    n_components : int, default 28
        Target size of each axis.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``TruncatedSVD``. Pass an integer to make repeated runs
        produce the same output; ``None`` keeps the estimator's default
        behaviour.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_components, n_components)``.

    Notes
    -----
    An entry that is exactly zero falls in the non-positive branch and maps
    to ``+inf`` (``-log(0)``).
    """
    X = StandardScaler().fit_transform(X)
    tsvd = TruncatedSVD(n_components=n_components, random_state=random_state)
    # First pass: (rows, cols) -> (rows, n_components).
    reduced_array = tsvd.fit_transform(X)
    # Second pass on the transpose: (n_components, rows) -> (n_components, n_components).
    reduced_array = reduced_array.T
    reduced_array = tsvd.fit_transform(reduced_array)
    reduced_array = reduced_array.T
    # The mask is computed once, before any entry is overwritten, so the two
    # assignments below never act on an already-transformed value.
    positive_mask = reduced_array > 0
    reduced_array[~positive_mask] = -np.log(-reduced_array[~positive_mask])
    reduced_array[positive_mask] = np.log(reduced_array[positive_mask])
    return reduced_array

In [18]:
# Reduce every per-sequence matrix with S_V_D and collect the 28 x 28 results.
# The buffer is sized from the data rather than a fixed sample count:
# np.empty does not initialise memory, so a buffer larger than the data would
# keep garbage rows, and a smaller one would raise IndexError.
reduced_X = np.empty((len(new_X), 28, 28))
for i, row in enumerate(new_X):
    reduced_X[i] = S_V_D(row)

In [19]:
# Render each reduced matrix as a borderless heatmap and save it as
# Heatmaps/<index>.png (1 x 1 inch figure at 28 dpi).
os.makedirs('Heatmaps', exist_ok=True)  # savefig does not create a missing directory
for i, row in enumerate(reduced_X):
    fig = plt.figure(figsize=(1, 1))
    plt.imshow(row, cmap='Spectral', interpolation='nearest')
    # Remove ticks and margins so only the image itself is saved.
    plt.xticks([])
    plt.yticks([])
    plt.subplots_adjust(left=0, right=1, top=1, bottom=0)
    plt.savefig(f'Heatmaps/{i}.png', bbox_inches='tight', pad_inches=0, dpi=28)
    # Close this figure explicitly so figures do not accumulate in memory.
    plt.close(fig)