In [361]:
"""Calculate correlations between learned and known features."""

'Calculate correlations between learned and known features.'

In [362]:
import os
import numpy as np
import tensorflow as tf
import src.models.utils
import src.utils

class MaskedConv1D(tf.keras.layers.Conv1D):
    """Conv1D layer that declares itself as supporting masking.

    Construction is delegated unchanged to `tf.keras.layers.Conv1D`; the only
    addition is setting `supports_masking` to True on the new instance.
    """

    def __init__(self, filters, kernel_size, **kwargs):
        """Build the underlying Conv1D, then switch on the masking flag."""
        super().__init__(filters=filters, kernel_size=kernel_size, **kwargs)
        self.supports_masking = True

In [363]:
# Symbols and label weights handed to the batch generator below
alphabet = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
weights = {'0': 1, '1': 2, '-': 3}
batch_size = 32
model_acc = ["3_6_1", "6_2", "7_1_2", "7_2_2", "7_3_2"]  # Suffixes of the saved models to analyze

# Input FASTA files. The sequence file is read twice below (once for batching, once for its
# headers), so each path is named once to keep both readers pointed at the same file.
SEQS_PATH = '../../mobidb-pdb_validation/split_data/out/all_seqs.fasta'
LABELS_PATH = '../../mobidb-pdb_validation/split_data/out/all_labels.fasta'

batch_data = src.models.utils.load_data(SEQS_PATH, LABELS_PATH)
# NOTE(review): shuffle=False presumably keeps batches in file order, which the later cells rely on
# when zipping accessions, batch_data and extracted features together -- confirm in BatchGenerator
batches = src.models.utils.BatchGenerator(batch_data, batch_size, alphabet, weights, shuffle=False)

seq_fasta = src.utils.read_fasta(SEQS_PATH)

# Window sizes for which feature map files are looked up when building records
window_sizes = [10,20,30,40,50]

# Known features: {feature name: callable taking a sequence}.
# Key order is significant: a later cell zips these keys against the columns of the feature map files.
# The helpers called inside the lambdas (get_X_frac, get_hydrophobicity, get_polarity,
# get_pair_repeat_frac) are not defined in this part of the notebook; they are only looked up
# when a lambda is actually called.
feature_dict = {
    # Fractions keyed by a single residue
    "S_frac": lambda seq: get_X_frac(seq, "S"),
    "P_frac": lambda seq: get_X_frac(seq, "P"),
    "T_frac": lambda seq: get_X_frac(seq, "T"),
    "A_frac": lambda seq: get_X_frac(seq, "A"),
    "H_frac": lambda seq: get_X_frac(seq, "H"),
    "Q_frac": lambda seq: get_X_frac(seq, "Q"),
    "N_frac": lambda seq: get_X_frac(seq, "N"),
    "G_frac": lambda seq: get_X_frac(seq, "G"),

    # Charge-related features built from the D/E and R/K fractions
    "charge_frac": lambda seq: get_X_frac(seq, "DERK"),
    "net_charge": lambda seq: get_X_frac(seq, "RK") - get_X_frac(seq, "DE"),
    "RK_ratio": lambda seq: (get_X_frac(seq, "R") + 1) / (get_X_frac(seq, "K") + 1),
    "ED_ratio": lambda seq: (get_X_frac(seq, "E") + 1) / (get_X_frac(seq, "D") + 1),
    "acidic_frac": lambda seq: get_X_frac(seq, "DE"),
    "basic_frac": lambda seq: get_X_frac(seq, "RK"),

    # Physicochemical scores and residue-group fractions
    "hydrophobicity": lambda seq: get_hydrophobicity(seq),
    "aliphatic_frac": lambda seq: get_X_frac(seq, "ALMIV"),
    "polarity": lambda seq: get_polarity(seq),
    "polar_frac": lambda seq: get_X_frac(seq, "QNSTGCH"),
    "chain_expanding_frac": lambda seq: get_X_frac(seq, "EDRKP"),
    "aromatic_frac": lambda seq: get_X_frac(seq, "FYW"),
    "disorder_promoting_frac": lambda seq: get_X_frac(seq, "TAGRDHQKSEP"),

    # Repeat features for doubled residues
    "Q_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "QQ"),
    "N_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "NN"),
    "S_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "SS"),
    "G_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "GG"),
    "E_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "EE"),
    "D_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "DD"),
    "K_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "KK"),
    "R_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "RR"),
    "P_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "PP"),

    # Repeat features for mixed motifs (the last two pass three-residue motifs)
    "QN_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "QN"),
    "RG_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "RG"),
    "FG_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "FG"),
    "SG_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "SG"),
    "SR_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "SR"),
    "KAP_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "KAP"),
    "PTS_repeat_frac": lambda seq: get_pair_repeat_frac(seq, "PTS"),
}

In [364]:
# Load each saved model and index it by its full model name
model_dict = {}
for acc in model_acc:
    model_name = f"mobidb-pdb_cnn_{acc}"
    model_path = f"../../models/{model_name}/out_model/{model_name}.h5"
    # The saved models reference the MaskedConv1D class by name, so it is passed in as a custom object
    model_dict[model_name] = model = tf.keras.models.load_model(
        model_path,
        custom_objects={"MaskedConv1D": MaskedConv1D},
    )

In [365]:
# Calculate learned features
model_learned_features = {}  # {"model_name":[features learned by model for each protein]}
for model_key, model in model_dict.items():
    # Build a model that shares the original inputs but stops at the "conv1d2" layer,
    # so calling it returns that layer's output
    layer = model.get_layer("conv1d2")
    feature_extractor = tf.keras.Model(inputs=model.inputs, outputs=layer.output)

    learned_features = []
    # Predict method was acting strange, so extract individual batches.
    # The loop variable is deliberately not called `input`: at notebook top level that name
    # would shadow the builtin input() for every later cell.
    for batch_input, _, training_weights in batches:
        features = feature_extractor(batch_input).numpy()
        for protein_features, protein_weights in zip(features, training_weights):
            # Drop padding: keep only the positions whose weight is non-zero
            learned_features.append(protein_features[protein_weights != 0])

    model_learned_features[model_key] = learned_features

KeyboardInterrupt: 

In [None]:
# Extract accessions: take the first "|"-delimited field of every header and drop its
# first character (presumably the leading ">" -- confirm against src.utils.read_fasta)
accessions = [header.split("|")[0][1:] for header, _ in seq_fasta]

In [None]:
# Construct new record objects that contain known and learned features
def _read_feature_map(path, feature_names):
    """Read a tab-separated feature map file into one list of values per feature.

    Parameters
    ----------
    path : str
        Location of the feature map file.
    feature_names : list of str
        Feature names, in the same order as the columns of the file.

    Returns
    -------
    dict
        {feature_name: [value taken from each row]}. Values are kept as the strings read
        from the file; no numeric conversion is done here.
    """
    columns = {feature: [] for feature in feature_names}
    with open(path) as file:
        for line in file:
            # Strip the newline itself rather than slicing off the last field: slicing also
            # discards the final column whenever a row does not end with a tab
            line_values = line.rstrip("\n").split("\t")
            if line_values[-1] == "":  # Left behind by a trailing tab (or a blank line)
                line_values.pop()
            for feature, value in zip(feature_names, line_values):
                columns[feature].append(value)
    return columns


feature_names = list(feature_dict)  # Column order of the feature map files follows feature_dict

records = {}  # {model_key:{accession:{seq:,labels:,learned_features:,known_features:{window_size:{known_feature:}}}}}
for model_key in model_dict:
    model_records = {}

    # accessions, batch_data and the learned features are matched purely by position
    for accession, (seq, labels), learned_features in zip(accessions, batch_data, model_learned_features[model_key]):
        known_features = {}
        for window_size in window_sizes:
            map_path = f"../generate_maps/out/window_size{window_size}/{accession}_feature_map{window_size}.tsv"
            known_features[f"window{window_size}"] = _read_feature_map(map_path, feature_names)

        model_records[accession] = {"seq": seq,
                                    "labels": labels,
                                    "learned_features": learned_features,
                                    "known_features": known_features}

    records[model_key] = model_records