## Load & Run Initialize Model (before training)

In [99]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"

import random
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from utils.load_networkx import networkx_feat
from utils.macro_dataset import MacroDataset
from utils import macro_unsupervised as unsup
from utils.macro_supervised import MacroSupervised
from utils.macro_attribution import Attribution
from utils import plot

In [100]:
MON_SMILES = 'tables/SMILES_peptides_monomer.txt'
BOND_SMILES = 'tables/SMILES_peptides_bond.txt'

DESCRIPTORS = 'unique_descriptors.json'

TXT_DATA_PATH = 'dataset/classification/'
DF_PATH = 'tables/immuno_peptides.txt'
MODEL_PATH = './'
FIG_PATH = './'

In [3]:
FEAT = 'fp'
NUM_DESCRIPTORS = 126

SEED = 111
TASK = 'classification'
MODEL = 'AttentiveFP'
LABELNAME = 'immunogenic'
SPLIT = '0.6,0.2,0.2'
NORM = 'qt'

NUM_EPOCHS = 10
NUM_WORKERS = 1

SAVE_MODEL = False
SAVE_OPT = False
SAVE_CONFIG = False

PLOT_TYPE = 'val'

CUSTOM_PARAMS = {}

In [4]:
NX_GRAPHS = networkx_feat(
    TXT_DATA_PATH = TXT_DATA_PATH, 
    MON_SMILES = MON_SMILES, 
    BOND_SMILES = BOND_SMILES, 
    FEAT = FEAT, 
    DESCRIPTORS = DESCRIPTORS
)

In [5]:
dgl_dict = MacroDataset(
    DF_PATH = DF_PATH, 
    SEED = SEED, 
    TASK = TASK, 
    LABELNAME = LABELNAME, 
    MODEL = MODEL, 
    NX_GRAPHS = NX_GRAPHS, 
    NORM = NORM)

In [6]:
macro_supervised = MacroSupervised(
    MacroDataset = dgl_dict, 
    MON_SMILES = MON_SMILES, 
    BOND_SMILES = BOND_SMILES, 
    FEAT = FEAT, 
    NUM_DESCRIPTORS = NUM_DESCRIPTORS,
    SEED = SEED, 
    MODEL = MODEL, 
    SPLIT = SPLIT, 
    NUM_EPOCHS = NUM_EPOCHS, 
    NUM_WORKERS = NUM_WORKERS, 
    CUSTOM_PARAMS = CUSTOM_PARAMS, 
    MODEL_PATH = MODEL_PATH, 
    SAVE_MODEL = SAVE_MODEL, 
    SAVE_OPT = SAVE_OPT, 
    SAVE_CONFIG = SAVE_CONFIG)

Directory ./ already exists.


## Analyze Frequency Data From Train/Val/Split Procedure

In [241]:
import pandas as pd
from collections import Counter
import math
import numpy as np

In [322]:
df = pd.read_csv("tables/immuno_peptides.txt")
ID_to_seq = pd.Series(df.sequence.values,index=df.ID).to_dict()

In [323]:
sum(df.ID.str.count("UID"))
len(df.ID)

7652

In [490]:
split_data = pd.read_csv("split_data/111.txt")

train = split_data['train'].dropna()
val = split_data['val'].dropna()
test = split_data['test'].dropna()

print(len(train)) # num of datapoints
# sum(test.str.count("UID")) # num negative
# sum(train.str.count("GID")) # num positive
split_data = split_data.applymap(lambda x: ID_to_seq[x] if not pd.isna(x) else "")

4591


In [491]:
is_amd = split_data.applymap(lambda x: '_amd' in x if isinstance(x, str) else False)
aa_freq = split_data.applymap(lambda x: x.replace('_amd', ''))
aa_freq[is_amd] = split_data[is_amd].applymap(lambda x: x[:-5] if isinstance(x, str) else x).fillna("")
aa_freq = aa_freq.apply(lambda x: Counter(''.join(x)), axis=0)
total_aa = aa_freq.apply(lambda x: sum(x.values()))

aa_df = pd.DataFrame.from_dict(dict(aa_freq), orient='index').fillna(0)
aa_df = aa_df.apply(lambda x: round((x / total_aa)*100, 2), axis=0)
aa_df

# aa_freq = split_data[is_amd].applymap(lambda x: x[-5] if isinstance(x, str) else x).fillna("")

# aa_freq = train.str.replace('_amd', '')
# aa_freq[is_amd] = aa_freq[is_amd].map(lambda x: x[:-1])
# aa_freq = Counter(''.join(aa_freq))

Unnamed: 0,R,V,S,N,L,T,I,H,P,D,Y,G,K,F,M,A,C,E,W,Q
train,6.46,6.9,5.96,3.63,9.78,4.35,6.95,1.69,4.97,3.46,2.62,8.12,8.1,4.96,2.74,6.78,4.47,3.37,1.86,2.81
val,6.15,6.91,5.95,3.7,9.96,4.39,6.9,1.78,4.93,3.39,2.78,8.16,8.16,5.09,2.64,6.73,4.44,3.28,1.72,2.91
test,6.58,6.55,5.84,3.64,9.96,4.31,6.78,1.67,5.11,3.51,2.7,7.9,8.14,4.88,2.78,6.86,4.44,3.51,1.96,2.87


In [451]:
# print(split_data)
is_amd = split_data.applymap(lambda x: '_amd' in x if isinstance(x, str) else False)
amd_freq = split_data[is_amd].applymap(lambda x: x[-5] if isinstance(x, str) else x).fillna("")
amd_freq = amd_freq.apply(lambda x: Counter(''.join(x)), axis=0)
total_amd = amd_freq.apply(lambda x: sum(x.values()))

df = pd.DataFrame.from_dict(dict(amd_freq), orient='index').fillna(0)
df = df.apply(lambda x: round((x / total_amd)*100, 2), axis=0)
df = df.add_suffix('_amd')
df

Unnamed: 0,K_amd,L_amd,A_amd,W_amd,R_amd,F_amd,Q_amd,I_amd,V_amd,H_amd,C_amd,P_amd,S_amd,G_amd,N_amd,M_amd,Y_amd,D_amd,T_amd,E_amd
train,16.89,24.9,3.91,3.12,12.79,12.21,1.86,5.96,3.81,1.07,0.39,0.88,3.61,5.37,1.76,0.39,0.49,0.1,0.29,0.2
val,20.56,26.2,2.54,3.66,12.39,11.27,2.25,5.35,3.94,0.85,0.28,0.85,1.69,5.92,0.28,0.28,1.13,0.0,0.56,0.0
test,20.95,24.86,3.35,1.96,15.92,10.34,1.12,4.19,2.79,0.28,1.4,0.56,3.91,3.63,1.96,0.0,1.68,0.28,0.56,0.28


In [331]:
train_df = train.to_frame('ID')
train_df['sequence'] = train_df['ID'].map(lambda x: ID_to_seq[x])

In [498]:
class computeMetrics():
    def __init__(self, full_data_path, split_data_path):
        
        self._df = pd.read_csv(full_data_path)
        self._ID_to_seq = pd.Series(self._df.sequence.values,index=self._df.ID).to_dict()
        
        self._split_data = pd.read_csv(split_data_path)

        self._ID_df = pd.concat([self._df['ID'], self._split_data['train'], self._split_data['val'],
                               self._split_data['test']], axis=1, keys=['full_db', 'train', 'val', 'test'])

        self._seq_df = pd.concat([self._df['sequence'], self.get_sequence(self._split_data['train']), 
                                  self.get_sequence(self._split_data['val']), 
                                  self.get_sequence(self._split_data['test'])], axis=1, 
                                 keys=['full_db', 'train', 'val', 'test'])

        self._metrics_df = pd.DataFrame({'Split': ['Full Dataset (RAW)', 'Train', 'Validation', 'Test']})

    def get_sequence(self, series):
        return series.map(lambda x: self._ID_to_seq[x] if not pd.isna(x) else np.nan)

    def compare_labels(self):
        num_neg = self._ID_df.apply(lambda x: x.str.count("UID").sum(), axis=0)
        num_pos = self._ID_df.apply(lambda x: x.str.count("GID").sum(), axis=0)
        total = self._ID_df.apply(lambda x: x.count(), axis=0)

        percent_pos = round((num_pos / total)*100, 2)
        percent_neg = round((num_neg / total)*100, 2)

        self.add_metric(percent_pos.values, 'percent_pos')
        self.add_metric(percent_neg.values, 'percent_neg')

    def compare_AA_freq(self):

        # Num occurences of each amidated AA in set
        is_amd = self._seq_df.applymap(lambda x: '_amd' in x if isinstance(x, str) else False)
        amd_freq = self._seq_df[is_amd].applymap(lambda x: x[-5] if isinstance(x, str) else x).fillna("")
        amd_freq = amd_freq.apply(lambda x: Counter(''.join(x)), axis=0)

        total_amd = amd_freq.apply(lambda x: sum(x.values()))
        
        amd_df = pd.DataFrame.from_dict(dict(amd_freq), orient='index').fillna(0)
        amd_df = amd_df.apply(lambda x: round((x / total_amd)*100, 2), axis=0)
        amd_df = amd_df.add_suffix('_amd')

        # Num occurences of each AA in set
        aa_freq = self._seq_df.applymap(lambda x: x.replace('_amd', '') if isinstance(x, str) else x)
        # aa_freq = self._seq_df.applymap(lambda x: x.str.replace('_amd', ''))
        aa_freq[is_amd] = self._seq_df[is_amd].applymap(lambda x: x[:-5] if isinstance(x, str) else x).fillna("")
        aa_freq = aa_freq.apply(lambda x: Counter(''.join(x)), axis=0)
        total_aa = aa_freq.apply(lambda x: sum(x.values()))
        
        aa_df = pd.DataFrame.from_dict(dict(aa_freq), orient='index').fillna(0)
        aa_df = aa_df.apply(lambda x: round((x / total_aa)*100, 2), axis=0)
        
        self.add_metric(aa_df.values, aa_df.columns)
        self.add_metric(amd_df.values, amd_df.columns)

    def add_metric(self, data, col_name):
        self._metrics_df[col_name] = data
        print(self._metrics_df)
        

In [499]:
a = computeMetrics("tables/immuno_peptides.txt", "split_data/111.txt")
a.compare_labels()
a.compare_AA_freq()

                Split  percent_pos
0  Full Dataset (RAW)        22.70
1               Train        22.30
2          Validation        23.20
3                Test        23.38
                Split  percent_pos  percent_neg
0  Full Dataset (RAW)        22.70        77.30
1               Train        22.30        77.70
2          Validation        23.20        76.80
3                Test        23.38        76.62


AttributeError: 'str' object has no attribute 'str'

In [104]:
train = train.map(lambda x: ID_to_seq[x])

In [129]:
# Num occurences of each amidated AA in set
is_amd = train.str.contains('_amd')
amd_freq = train[is_amd].map(lambda x: x[len(x)-5])
amd_freq = Counter(''.join(amd_freq))

# Num occurences of each AA in set
aa_freq = train.str.replace('_amd', '')
aa_freq[is_amd] = aa_freq[is_amd].map(lambda x: x[:-1])
aa_freq = Counter(''.join(aa_freq))

total_amd = sum(amd_freq.values()) # Total amidated AA
total_aa = sum(aa_freq.values()) # Total AA

a = amd_freq.items()
for aa, count in a:
    print(str(aa) + ": " + str(round(count/total_amd*100,2)))

K: 16.89
L: 24.9
A: 3.91
W: 3.12
R: 12.79
F: 12.21
Q: 1.86
I: 5.96
V: 3.81
H: 1.07
C: 0.39
P: 0.88
S: 3.61
G: 5.37
N: 1.76
M: 0.39
Y: 0.49
D: 0.1
T: 0.29
E: 0.2


In [None]:
# Total counts
total_non_amidated = sum(non_amidated_counter.values())
total_amidated = sum(amidated_counter.values())

# Relative frequencies
non_amidated_freq = {aa: count / total_non_amidated for aa, count in non_amidated_counter.items()}
amidated_freq = {aa: count / total_amidated for aa, count in amidated_counter.items()}

In [19]:
# Distribution of Labels (= % amd), relative frequency of each AAs, relative frequency of each AMD among AMD

In [7]:
# for i in range(100):
#     random.randint(0,2**32 - 1)


        # self._seq_df = 
        # pd.concat([df1['c'], df2['c']], axis=1, keys=['df1', 'df2'])
        
        # self._train_data = self.create_df(split_data['train'].dropna())
        # self._val_data = self.create_df(split_data['val'].dropna())
        # self._test_data = self.create_df(split_data['test'].dropna())

    # def create_df(self, series):
    #     df = series.to_frame('ID')
    #     df['sequence'] = df['ID'].map(lambda x: ID_to_seq[x])
    #     return df