In [31]:
from Bio import SeqIO, Entrez
import os
from urllib.error import HTTPError
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import permutations, product
import functools
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
import tqdm
from numba import njit,jit
import numpy as np
from numpy import mean
from numpy import std
import pickle
from os import path
from sklearn.model_selection import cross_val_score
from warnings import simplefilter
from collections import OrderedDict
import threading
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)

Entrez.tool = "Zoonosis predictor"

Entrez.email = input("Enter an email address to use NCBI e-utils: ")


In [32]:
def resetkmerdict(permset)->OrderedDict:
        kmerdict = OrderedDict()
        for i in permset:
            kmerdict[i]=0
        return kmerdict

def assign_kmers_to_dict(st, permset, kmer):
    kmerdict=resetkmerdict(permset)
    # st = row[2] # tune for which column the sequence is in
    for j in range(len(st)-kmer+1):
        if not st[j:j+kmer] in permset: continue
        kmerdict[st[j:j+kmer]]+=1
    return kmerdict

def getTrainParams(mergedDf):
    kmer = 4
    print(mergedDf)
    s = product('acgt',repeat = kmer)
    permset = set(["".join(x) for x in list(s)])
    # print(permset)

    l = []
    
    for row in tqdm.tqdm(mergedDf.itertuples()):
        # print(row)
        l.append(assign_kmers_to_dict(row[2], permset, kmer))
        

    finalkmerdict=pd.DataFrame(l)
    # print(finalkmerdict)

    # print("finished")
    mergedDf.fillna(0, inplace=True)

    X = finalkmerdict
    Y = mergedDf['isZoonotic']

    print(X)
    print(Y)
    return train_test_split(X, Y, test_size=0.2, random_state=1)

In [33]:
def getSequences(mergedDf):
    accession_list = [] # maintain order
    isZoonotic_list = [] # maintain order
    accession_set = set()
    isZoonotic_set = set()


    for row in tqdm.tqdm(mergedDf.itertuples()):
        # row[13] = accession, row[15] = infects humans
        for single_acc in row[14].split("; "):
            if single_acc not in accession_set:
                accession_list.append(single_acc)
                isZoonotic_list.append(row[15])
                accession_set.add(single_acc)
                isZoonotic_set.add(row[15])

    ls = []

    # TODO: RUN MULTIPLE THREADS TO SPEED UP

    for index, ID in enumerate(tqdm.tqdm(accession_list[:100])): #only read the first 100 lol
        # multithread for speed up
        
        try:
            QueryHandle = Entrez.efetch(db="nucleotide", id=ID, 
                                        rettype="gb", retmode="text")
        except HTTPError as Error:
            if Error.code == 400:  # Bad request
                raise ValueError(f"Accession number not found: {ID}")
            else:
                raise

        SeqRec = SeqIO.read(QueryHandle, "genbank")
        # print(str(SeqRec.seq))
        ls.append({'accession': ID, 'sequence': str(SeqRec.seq).lower(), 'isZoonotic': isZoonotic_list[index]})

    df = pd.DataFrame(ls)

    return df
    


In [34]:
mergedDf = pd.read_csv("FinalData_Cleaned.csv")
sequences = getSequences(mergedDf)
X_train, X_test, y_train, y_test = getTrainParams(sequences)

913it [00:00, 554985.44it/s]
100%|██████████| 100/100 [00:43<00:00,  2.31it/s]


      accession                                           sequence  isZoonotic
0   NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...           1
1   NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...           1
2   NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...           1
3   NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...           1
4   NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...           1
..          ...                                                ...         ...
95  NC_004218.1  gtattaaatttttgtaagtcgttatggaattatttagtgacagtgg...           1
96  NC_004219.1  gtatttaaaattcatgtttttgcatcatggcgtgggttacgcaagc...           1
97  NC_004220.1  gtattaaaaattacaagaacctaacattgcaatggagatcttgaga...           1
98  NC_004221.1  gtatttaaaattatagaaagttctgaacctaggggtctttctgtct...           1
99  NC_004204.1  gtattaaaattcagcaattgtccaatttaggaaacattctgtttaa...           1

[100 rows x 3 columns]


100it [00:00, 317.72it/s]

    acca  atgt  catc  cctc  gcgg  tagg  acgt  aaaa  cgta  gacc  ...  gtac  \
0     77    80    95    38    12    28     5   148     5    33  ...    26   
1     90    73    93    39    11    34    11   131    12    52  ...    25   
2     57    79    54    22     7    35     4   296     9    20  ...    25   
3     34     6    27    35    25     5    20    26     6    31  ...     8   
4     31    15    20    32    21     2    11    30     5    28  ...    15   
..   ...   ...   ...   ...   ...   ...   ...   ...   ...   ...  ...   ...   
95     5    14     8     0     2     8     8    16    10     3  ...    10   
96     7     9     5     1     1     7     6    15     4     4  ...     2   
97     2    10     4     2     2     3    12    20     8     1  ...     9   
98     2     9     7     0     1     3    12    15     6     2  ...     4   
99     3     9     3     0     1     1     2     6     1     5  ...     2   

    ctgg  gtgc  cagt  gact  tccg  cgga  ggaa  ttat  cagc  
0     48    46  




In [38]:
best_gradBoost = pickle.load(open('curr_models/gradBoost.pkl', 'rb'))


kmer = 4
print(sequences)

s = product('acgt',repeat = kmer)
permset = set(["".join(x) for x in list(s)])

pred_arr = []
for ind, file in enumerate(os.listdir("./virome_contigs")):
    fasta_sequences = SeqIO.parse(open(f"./virome_contigs/{file}"),'fasta')

    fasta = [x for x in fasta_sequences][0]
    # print(fasta)
    
    name, sequence = fasta.id, str(fasta.seq)
    X_info = sequence.lower()

    oDict = assign_kmers_to_dict(X_info, permset, kmer) # convert ordereddict to pandas dataframe

    kmer_df = pd.DataFrame()

    for i in oDict:
        kmer_df.at[0, i]=oDict[i]
    cols_when_model_builds = best_gradBoost.feature_names_in_
    kmer_df=kmer_df[cols_when_model_builds]
    
    pred_arr.append(best_gradBoost.predict(kmer_df))
    
pred_arr = np.asarray(pred_arr)
    # print(best_gradBoost.predict(kmer_df), sequences.loc[ind]['isZoonotic'])
    # print(best_gradBoost.predict(kmer_df), sequences['isZoonotic'])
        # print(accuracy_score())

print(pred_arr[pred_arr ==1])
print(len(pred_arr))

print(sequences['isZoonotic'][sequences['isZoonotic'] == 1].to_numpy())
print(len(sequences['isZoonotic'][sequences['isZoonotic'] == 1].to_numpy()))

    # with open(output_file) as out_file:
    #     for fasta in fasta_sequences:
    #         name, sequence = fasta.id, str(fasta.seq)
    #         new_sequence = some_function(sequence)
    #         write_fasta(out_file)

      accession                                           sequence  isZoonotic
0   NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...           1
1   NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...           1
2   NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...           1
3   NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...           1
4   NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...           1
..          ...                                                ...         ...
95  NC_004218.1  gtattaaatttttgtaagtcgttatggaattatttagtgacagtgg...           1
96  NC_004219.1  gtatttaaaattcatgtttttgcatcatggcgtgggttacgcaagc...           1
97  NC_004220.1  gtattaaaaattacaagaacctaacattgcaatggagatcttgaga...           1
98  NC_004221.1  gtatttaaaattatagaaagttctgaacctaggggtctttctgtct...           1
99  NC_004204.1  gtattaaaattcagcaattgtccaatttaggaaacattctgtttaa...           1

[100 rows x 3 columns]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 

In [40]:
print(accuracy_score(sequences['isZoonotic'], pred_arr[:100]))

0.33


In [42]:
def getSingleSequence(accessionID):
    try:
        QueryHandle = Entrez.efetch(db="nucleotide", id=accessionID, 
                                    rettype="gb", retmode="text")
    except HTTPError as Error:
        if Error.code == 400:  # Bad request
            raise ValueError(f"Accession number not found: {accessionID}")
        else:
            raise

    SeqRec = SeqIO.read(QueryHandle, "genbank")
    print(str(SeqRec))

    oDict = assign_kmers_to_dict(X_info, permset, kmer) # convert ordereddict to pandas dataframe

    kmer_df = pd.DataFrame()

    for i in oDict:
        kmer_df.at[0, i]=oDict[i]
    # print(best_gradBoost.predict_proba(kmer_df))

    cols_when_model_builds = best_gradBoost.feature_names_in_
    kmer_df=kmer_df[cols_when_model_builds]
    
    print([round(x, 2) for x in best_gradBoost.predict_proba(kmer_df).tolist()[0]])
    print(best_gradBoost.predict(kmer_df).tolist()[0])

    # ls = []
    # ls.append({'accession': accessionID, 'sequence': str(SeqRec.seq).lower()})
    # df = pd.DataFrame(ls)
    # print(df)

getSingleSequence("PA544053")

ID: PA544053.1
Name: PA544053
Description: WO 2022071435-A/1: SARS-CoV-2 PROTEIN-DERIVED PEPTIDES AND VACCINES INCLUDING THE SAME
Number of features: 1
/molecule_type=DNA
/topology=linear
/data_file_division=PAT
/date=29-JUL-2022
/accessions=['PA544053']
/sequence_version=1
/keywords=['WO 2022071435-A/1']
/source=Severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)
/organism=Severe acute respiratory syndrome coronavirus 2
/taxonomy=['Viruses', 'Riboviria', 'Orthornavirae', 'Pisuviricota', 'Pisoniviricetes', 'Nidovirales', 'Cornidovirineae', 'Coronaviridae', 'Orthocoronavirinae', 'Betacoronavirus', 'Sarbecovirus']
/references=[Reference(title='SARS-CoV-2 PROTEIN-DERIVED PEPTIDES AND VACCINES INCLUDING THE SAME', ...)]
/comment=OS   Severe acute respiratory syndrome coronavirus 2 PN   WO
2022071435-A/1
PD   07-APR-2022
PF   29-SEP-2021 WO 2021JP035967
PR   30-SEP-2020 JP 2020-164630         ,30-APR-2021 JP
PR   JP2021/017159       ,
PR   25-AUG-2021 US 63/236927
PA   ONCOTHERAPY 