In [2]:
import pandas as pd
import torch
import argparse
from torch.utils.data import DataLoader
import xlsxwriter
import numpy as np

from collections import defaultdict
import os

from deepcpf1_network import SeqDeepCpf1Net
from deepcpf1_network import SequenceDataset
from deepcpf1_network import predict
from deepcpf1_network import decoding

In [3]:
class dotdict(dict):
    """Dictionary whose keys can also be read, set and removed as attributes.

    Reading an attribute that is not a key yields ``None`` (the ``dict.get``
    default); deleting an attribute that is not a key raises ``KeyError``.
    """

    def __getattr__(self, name):
        # Only reached when regular attribute lookup fails, so real dict
        # methods (keys, items, ...) are never shadowed by stored keys.
        return dict.get(self, name)

    def __setattr__(self, name, value):
        # Attribute assignment stores a dictionary entry, not an instance attribute.
        dict.__setitem__(self, name, value)

    def __delattr__(self, name):
        # Attribute deletion removes the dictionary entry of the same name.
        dict.__delitem__(self, name)

In [5]:
# Run configuration, wrapped in a dotdict so each setting can be read as an
# attribute (args.test, args.model_path, ...).
args = dotdict({
    'test': "./data/test.csv",
    'output': "output.csv",
    'model_path': "./weights/",
    'seed': 1,
    'sequence_length': 34,
    'kernel_size': 5,
    'pool_size': 2,
    'no_cuda': True,
})

# Collect the checkpoint files (*.pt) found in the weights directory.
# os.listdir() returns entries in arbitrary order, so the paths are sorted:
# a checkpoint's position in this list is later used as its model number,
# and that numbering has to be the same on every run and every machine.
print("Listing weights for the models")
model_state_paths = sorted(
    os.path.join(args.model_path, file_name)
    for file_name in os.listdir(args.model_path)
    if file_name.endswith(".pt")
)

print(f"{len(model_state_paths)} models are loaded.")


Listing weights for the models
50 models are loaded.


In [6]:
# Use CUDA only when the configuration allows it and a GPU is actually present.
use_cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if use_cuda else "cpu")

# DataLoader options for the test set; shuffling is disabled so the batch
# order is fixed.
test_kwargs = {'batch_size': 300, 'shuffle': False}

# Collect the checkpoint files (*.pt) found in the weights directory.
# os.listdir() returns entries in arbitrary order, so the paths are sorted:
# a checkpoint's position in this list is later used as its model number,
# and that numbering has to be the same on every run and every machine.
print("Listing weights for the models")
model_state_paths = sorted(
    os.path.join(args.model_path, file_name)
    for file_name in os.listdir(args.model_path)
    if file_name.endswith(".pt")
)

print(f"{len(model_state_paths)} models are loaded.")

Listing weights for the models
50 models are loaded.


In [7]:
# Build the network from the run configuration, then move it to the selected device.
seq_deep_cpf1 = SeqDeepCpf1Net(args)
seq_deep_cpf1 = seq_deep_cpf1.to(device)

# Wrap the test CSV in a dataset and batch it with the inference settings.
testing_data = SequenceDataset(args=args, csv_file=args.test)
test_dataloader = DataLoader(dataset=testing_data, **test_kwargs)

     34 bp synthetic target and target context sequence\n(4 bp + PAM + 23 bp protospacer + 3 bp)  \
0                    TTGCTTTAAAACTCGCAAGGCTTTCTGCTTGACC                                            
1                    CAGATTTAAAGACTTTCTGCTGTATTTGAGATGC                                            
2                    GTAGTTTAAAGCCTTTTTTATTGTATCTTGTTGC                                            
3                    CCCTTTTAAATTCTGTGCAGACCATAGTGCTGCT                                            
4                    TTACTTTAACACTCTCAGTTGGCCCATATTCACA                                            
...                                                 ...                                            
1287                 CACCTTTTTTGATTATGATTACGGTGCTCCCTGT                                            
1288                 CAGATTTTTTGCTGTTGGTGAAGGCCCTTGAAGA                                            
1289                 TTTCTTTTTTGGCATTGCGGAGCTTATACATTCC                                            


  torch.nn.init.xavier_uniform(self.Seq_deepCpf1_C1.weight)


In [8]:
# Per-model inference results, filled in by the prediction loop. A plain dict
# is used: a defaultdict created without a default factory behaves exactly
# like a dict, so the extra type only obscured the intent.
predictions_by_models = {}

`predictions_by_models` has the form `{model_no: (sequence_vectors, y_true, y_pred)}`.

Each element of the tuple is a NumPy array.

In [9]:
# Run every checkpoint over the test set and store its outputs under the
# checkpoint's position in model_state_paths.
for idx, model_path in enumerate(model_state_paths):
    # map_location remaps tensors that were saved from another device
    # (e.g. a GPU) onto the device chosen for this run; without it a
    # CUDA-saved checkpoint cannot be deserialized on a CPU-only host.
    # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
    state_dict = torch.load(model_path, map_location=device)
    seq_deep_cpf1.load_state_dict(state_dict)

    print(f"Predicting on test data: {idx}/{len(model_state_paths)}-th model tested")
    sequence_vectors, y_true, y_pred = predict(seq_deep_cpf1, device, test_dataloader)
    predictions_by_models[idx] = (sequence_vectors, y_true, y_pred)
    

Predicting on test data: 0/50-th model tested
Predicting on test data: 1/50-th model tested
Predicting on test data: 2/50-th model tested
Predicting on test data: 3/50-th model tested
Predicting on test data: 4/50-th model tested
Predicting on test data: 5/50-th model tested
Predicting on test data: 6/50-th model tested
Predicting on test data: 7/50-th model tested
Predicting on test data: 8/50-th model tested
Predicting on test data: 9/50-th model tested
Predicting on test data: 10/50-th model tested
Predicting on test data: 11/50-th model tested
Predicting on test data: 12/50-th model tested
Predicting on test data: 13/50-th model tested
Predicting on test data: 14/50-th model tested
Predicting on test data: 15/50-th model tested
Predicting on test data: 16/50-th model tested
Predicting on test data: 17/50-th model tested
Predicting on test data: 18/50-th model tested
Predicting on test data: 19/50-th model tested
Predicting on test data: 20/50-th model tested
Predicting on test data

Make an output dataframe

In [10]:
# Load the raw test CSV; the prediction columns are attached to it further down.
test_df = pd.read_csv(filepath_or_buffer=args.test)
print(test_df)

     34 bp synthetic target and target context sequence\n(4 bp + PAM + 23 bp protospacer + 3 bp)  \
0                    TTGCTTTAAAACTCGCAAGGCTTTCTGCTTGACC                                            
1                    CAGATTTAAAGACTTTCTGCTGTATTTGAGATGC                                            
2                    GTAGTTTAAAGCCTTTTTTATTGTATCTTGTTGC                                            
3                    CCCTTTTAAATTCTGTGCAGACCATAGTGCTGCT                                            
4                    TTACTTTAACACTCTCAGTTGGCCCATATTCACA                                            
...                                                 ...                                            
1287                 CACCTTTTTTGATTATGATTACGGTGCTCCCTGT                                            
1288                 CAGATTTTTTGCTGTTGGTGAAGGCCCTTGAAGA                                            
1289                 TTTCTTTTTTGGCATTGCGGAGCTTATACATTCC                                            


In [11]:
# Column names of the test CSV as a plain list; the first one is used as the merge key.
columns = test_df.columns.tolist()

In [12]:
# Column holding the sequence; it is the join key between the test CSV and
# each model's predictions.
key_column = columns[0]

# Start from the original CSV columns only, so re-running this cell does not
# merge new prediction columns on top of ones added by an earlier run (which
# would produce "_x"/"_y" suffixed duplicates).
merged_df = test_df[columns]

for model_no, (sequence_vectors, y_true, y_pred) in predictions_by_models.items():
    # Recover the original sequence strings from their encoded vectors.
    original_sequences = decoding(sequence_vectors, args.sequence_length)

    # reshape(-1) flattens (N, 1) arrays to (N,) like squeeze(axis=1) did,
    # and also accepts arrays that are already one-dimensional.
    model_df = pd.DataFrame({
        key_column: original_sequences,
        f"{model_no}_true": np.asarray(y_true).reshape(-1),
        f"{model_no}_pred": np.asarray(y_pred).reshape(-1),
    })

    # validate="many_to_one" makes pandas raise if a sequence occurs more than
    # once in the predictions; otherwise each successive left merge would
    # silently multiply the matching rows.
    merged_df = pd.merge(
        merged_df,
        model_df,
        on=key_column,
        how='left',
        validate="many_to_one",
    )

# Keep the historical name bound to the merged result, then export it.
test_df = merged_df

test_df.to_excel("pred_result.xlsx", engine='xlsxwriter')