In [1]:
import pandas as pd
import torch
import argparse
from torch.utils.data import DataLoader
import xlsxwriter
import numpy as np

from collections import defaultdict
import os

from deepcpf1_network import SeqDeepCpf1Net
from deepcpf1_network import SequenceDataset
from deepcpf1_network import predict
from deepcpf1_network import decoding

In [2]:
class dotdict(dict):
    """Dictionary whose keys can also be read, written and deleted as attributes.

    Reading an attribute that is not a key yields ``None`` instead of raising;
    deleting an attribute that is not a key raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; missing keys give None.
        return self.get(name)

    def __setattr__(self, name, value):
        # Attribute assignment stores the value as a dictionary item.
        self[name] = value

    def __delattr__(self, name):
        # Attribute deletion removes the dictionary item (KeyError if absent).
        del self[name]

In [3]:
# Inference configuration, wrapped in dotdict so entries can be read as attributes
# (e.g. args.model_path).
args = dotdict({
    'test': "./data/test_gtoe_50.csv",        # CSV with the test sequences
    'output': "output.csv",
    'model_path': "./weights/T_50_corr_test",  # directory scanned for *.pt checkpoints
    'seed': 1,
    'sequence_length': 29,
    'kernel_size': 5,
    'pool_size': 2,
    'no_cuda': False,
})


# Collect the checkpoint files found in args.model_path.
# os.listdir() returns entries in arbitrary order, so the list is sorted to make
# the model index (position in this list) stable from one run to the next.
print("Listing weights for the models")
model_state_paths = sorted(
    os.path.join(args.model_path, file)
    for file in os.listdir(args.model_path)
    if file.endswith(".pt")
)

print(f"{len(model_state_paths)} models are loaded.")


Listing weights for the models
20 models are loaded.


In [4]:
# Use CUDA only when it is available and has not been disabled through args.no_cuda.
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")

# shuffle=False keeps the batches in dataset order.
test_kwargs = {'batch_size': 300, 'shuffle': False}

# Rebuild the checkpoint list from args.model_path.
# os.listdir() returns entries in arbitrary order, so the list is sorted to make
# the model index (position in this list) stable from one run to the next.
print("Listing weights for the models")
model_state_paths = sorted(
    os.path.join(args.model_path, file)
    for file in os.listdir(args.model_path)
    if file.endswith(".pt")
)

print(f"{len(model_state_paths)} models are loaded.")

Listing weights for the models
20 models are loaded.


In [5]:
# Instantiate the network from the configuration, then move it to the chosen device.
seq_deep_cpf1 = SeqDeepCpf1Net(args)
seq_deep_cpf1 = seq_deep_cpf1.to(device)

# Wrap the test CSV in a dataset and expose it through a DataLoader
# configured by test_kwargs.
testing_data = SequenceDataset(args=args, csv_file=args.test)
test_dataloader = DataLoader(dataset=testing_data, **test_kwargs)

  torch.nn.init.xavier_uniform(self.Seq_deepCpf1_C1.weight)


    29 bp input sequence (2 bp + 5 bp TAM + 20 bp spacer + 2 bp)  \
0                        CGTTGATATCTCCGCCCTTGTGTTCTCAG             
1                        CTTTGATAATAGAACCTATTTCTGCTTGA             
2                        GATTGATGGTCGCCATAGTAGAACAATCC             
3                        AATTGATAAGATATCGGGGTCCGACGTCG             
4                        TGTTGATCTGTGGTAGCGGAAAGAAACTA             
..                                                 ...             
517                      CTTTGATCTGTGTATCCCCGTAGAGCGCA             
518                      CTTTGATATGATGGCCTAAGGCCAGCGCC             
519                      CGTTGATGCTAGGAAGAAACTGCATCAAT             
520                      GTTTGATAGAAAGCCGTCAAGATCAACCA             
521                      TCTTGATGACGTTAACGGTGAATTTCGAC             

     Indel freqeuncy\n(Background substracted, %)  CA  
0                                        9.688581 NaN  
1                                       11.940299 NaN  
2              

In [6]:
# Per-model results, filled in by the prediction loop and keyed by model index.
# A plain dict is used: defaultdict() without a default factory behaves exactly
# like a dict (missing keys still raise KeyError) and only obscures the intent.
predictions_by_models = {}

`predictions_by_models` maps each model index to a tuple: `{model_no: (sequence_vectors, y_true, y_pred)}`

Each element of the tuple is a NumPy array.

In [7]:
# Evaluate every checkpoint on the test set, reusing the single network instance,
# and store each model's outputs under its index in model_state_paths.
# NOTE(review): assumes predict() puts the model into eval mode — confirm.
for idx, model_path in enumerate(model_state_paths):
    # map_location remaps the stored tensors onto the active device, so checkpoints
    # saved on a GPU can still be loaded when running on CPU.
    seq_deep_cpf1.load_state_dict(torch.load(model_path, map_location=device))

    print(f"Predicting on test data: {idx}/{len(model_state_paths)}-th model tested")
    sequence_vectors, y_true, y_pred = predict(seq_deep_cpf1, device, test_dataloader)
    predictions_by_models[idx] = (sequence_vectors, y_true, y_pred)
    

Predicting on test data: 0/20-th model tested
Predicting on test data: 1/20-th model tested
Predicting on test data: 2/20-th model tested
Predicting on test data: 3/20-th model tested
Predicting on test data: 4/20-th model tested
Predicting on test data: 5/20-th model tested
Predicting on test data: 6/20-th model tested
Predicting on test data: 7/20-th model tested
Predicting on test data: 8/20-th model tested
Predicting on test data: 9/20-th model tested
Predicting on test data: 10/20-th model tested
Predicting on test data: 11/20-th model tested
Predicting on test data: 12/20-th model tested
Predicting on test data: 13/20-th model tested
Predicting on test data: 14/20-th model tested
Predicting on test data: 15/20-th model tested
Predicting on test data: 16/20-th model tested
Predicting on test data: 17/20-th model tested
Predicting on test data: 18/20-th model tested
Predicting on test data: 19/20-th model tested


Make an output dataframe

In [8]:
# Read the raw test CSV; this frame is the base table that the per-model
# prediction columns are merged into further down.
test_df = pd.read_csv(args.test)
# Plain-text dump of the frame (pandas elides the middle rows of long frames).
print(test_df)

    29 bp input sequence (2 bp + 5 bp TAM + 20 bp spacer + 2 bp)  \
0                        CGTTGATATCTCCGCCCTTGTGTTCTCAG             
1                        CTTTGATAATAGAACCTATTTCTGCTTGA             
2                        GATTGATGGTCGCCATAGTAGAACAATCC             
3                        AATTGATAAGATATCGGGGTCCGACGTCG             
4                        TGTTGATCTGTGGTAGCGGAAAGAAACTA             
..                                                 ...             
517                      CTTTGATCTGTGTATCCCCGTAGAGCGCA             
518                      CTTTGATATGATGGCCTAAGGCCAGCGCC             
519                      CGTTGATGCTAGGAAGAAACTGCATCAAT             
520                      GTTTGATAGAAAGCCGTCAAGATCAACCA             
521                      TCTTGATGACGTTAACGGTGAATTTCGAC             

     Indel freqeuncy\n(Background substracted, %)  CA  
0                                        9.688581 NaN  
1                                       11.940299 NaN  
2              

In [9]:
# Snapshot of the column labels of test_df; columns[0] is later used as the merge key.
columns = test_df.columns.tolist()

In [13]:
# Attach every model's true/predicted values to the test table and export it.
#
# The merge starts from the original columns only (captured in `columns`), so
# re-running this cell does not merge onto already-merged prediction columns
# (which would produce `_x` / `_y` suffixed duplicates).
merged_df = test_df[columns]

for model_no, (sequence_vectors, y_true, y_pred) in predictions_by_models.items():
    # Fetching original sequence data
    original_sequences = decoding(sequence_vectors, args.sequence_length)
    data = {
        columns[0]: original_sequences,
        f"{model_no}_true": y_true.squeeze(axis=1),  # Dimension handling
        f"{model_no}_pred": y_pred.squeeze(axis=1),  # Dimension handling
    }

    # Keep one row per sequence on the right-hand side: with repeated sequences a
    # left merge would otherwise multiply rows, compounding with every model.
    # For a repeated sequence the first occurrence's values are the ones kept.
    df = pd.DataFrame(data=data).drop_duplicates(subset=columns[0])

    merged_df = pd.merge(merged_df, df, on=columns[0], how='left')

test_df = merged_df

test_df.to_excel("pred_result.xlsx", engine='xlsxwriter')