In [2]:
from Bio import SeqIO, Entrez
import os
from urllib.error import HTTPError
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
from itertools import permutations, product
from sklearn.metrics import accuracy_score, confusion_matrix, balanced_accuracy_score
import tqdm
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split, cross_val_score

from numpy import mean
from numpy import std
import pickle
from os import path
from sklearn.model_selection import cross_val_score
from warnings import simplefilter
from collections import OrderedDict
from sklearn.metrics import accuracy_score, auc, confusion_matrix, balanced_accuracy_score, precision_recall_curve, auc, roc_curve, roc_auc_score

from concurrent.futures import ThreadPoolExecutor, as_completed
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.ensemble import BalancedBaggingClassifier

from sklearn.model_selection import GridSearchCV

# Silence pandas PerformanceWarning and FutureWarning for the rest of the notebook.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)

# Identify this client to NCBI E-utilities.
Entrez.tool = "Zoonosis predictor"

# Prefer the NCBI_EMAIL environment variable so the notebook can run
# unattended (Restart & Run All); fall back to the interactive prompt
# when the variable is unset or empty.
Entrez.email = os.environ.get("NCBI_EMAIL") or input("Enter an email address to use NCBI e-utils: ")

In [3]:
def saveModel(model, name, X_test, y_test, params=None, dir='models/curr_models', gradBoost=False, xgBoost=False):
    """Keep whichever of ``model`` and the pickled ``{dir}/{name}.pkl`` scores better.

    If no pickle exists yet, ``model`` (and ``params``, when given) is written
    and returned. Otherwise both models are scored with ``accuracy_score`` on
    ``(X_test, y_test)``; the pickle is overwritten only when ``model`` is
    strictly more accurate than the stored one.

    Args:
        model: Fitted estimator exposing ``predict``.
        name: File stem of the pickle inside ``dir``.
        X_test: Test features; indexed by column name when ``gradBoost`` or
            ``xgBoost`` is set.
        y_test: True labels for ``X_test``.
        params: Optional object saved as ``{dir}/{name}-params.pkl`` whenever
            ``model`` is written.
        dir: Directory holding the pickles.
        gradBoost: Reorder ``X_test`` columns to the stored model's
            ``feature_names_in_`` before scoring it.
        xgBoost: Reorder ``X_test`` columns to the stored model's
            ``get_booster().feature_names`` before scoring it.

    Returns:
        ``model`` if it was written to disk, otherwise the stored model.
    """
    model_path = f'{dir}/{name}.pkl'
    params_path = f'{dir}/{name}-params.pkl'

    if not path.exists(model_path):
        print("does not exist")

        with open(model_path, 'wb') as fh:
            pickle.dump(model, fh)
        # Save params on first write too, so the params file always exists
        # alongside a model that was saved with params.
        if params is not None:
            with open(params_path, 'wb') as fh:
                pickle.dump(params, fh)
        return model

    # Score the candidate with X_test exactly as passed in, before any
    # column reordering done for the stored model.
    predictions = model.predict(X_test)
    currAcc = accuracy_score(y_test, predictions)

    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(model_path, 'rb') as fh:
        pickled_model = pickle.load(fh)

    if gradBoost:
        # put features into the order the stored model was fitted with
        cols_when_model_builds = pickled_model.feature_names_in_
        X_test = X_test[cols_when_model_builds]
    elif xgBoost:
        # put features into the same order that the model was trained in
        cols_when_model_builds = pickled_model.get_booster().feature_names
        X_test = X_test[cols_when_model_builds]

    picklePredictions = pickled_model.predict(X_test)
    pickleAcc = accuracy_score(y_test, picklePredictions)

    if currAcc > pickleAcc:
        print("update!")

        # Confusion matrix of the *stored* model; for binary labels
        # ravel() yields tn, fp, fn, tp.
        print(confusion_matrix(y_test, picklePredictions).ravel())

        print("curr", currAcc, "pickle", pickleAcc)
        with open(model_path, 'wb') as fh:
            pickle.dump(model, fh)

        if params is not None:
            with open(params_path, 'wb') as fh:
                pickle.dump(params, fh)
    else:
        print("no update")
        print("curr", currAcc, "pickle", pickleAcc)

        # Confusion matrix of the stored model; for binary labels
        # ravel() yields tn, fp, fn, tp.
        print(confusion_matrix(y_test, picklePredictions).ravel())

        model = pickled_model
    return model

In [4]:
def resetkmerdict(permset)->OrderedDict:
    """Return a fresh ``OrderedDict`` mapping every item of ``permset`` to 0.

    Keys are inserted in the iteration order of ``permset``.
    """
    return OrderedDict.fromkeys(permset, 0)

def assign_kmers_to_dict(st, permset, kmer):
    """Count how often each k-mer from ``permset`` occurs in ``st``.

    Every window of length ``kmer`` in ``st`` (overlapping, stride 1) is
    looked up in ``permset``; windows that are not members are skipped.

    Returns:
        The dict built by ``resetkmerdict(permset)`` with the counts filled in.
    """
    counts = resetkmerdict(permset)
    for start in range(len(st) - kmer + 1):
        window = st[start:start + kmer]
        if window in permset:
            counts[window] += 1
    return counts

def getTrainParams(mergedDf, kmer):
    """Build row-normalised k-mer count features and labels from ``mergedDf``.

    Args:
        mergedDf: DataFrame with an ``isZoonotic`` column. The sequence is
            read positionally as ``row[2]`` of ``itertuples()`` (index is
            ``row[0]``), i.e. the second column — NOTE(review): confirm the
            sequence really is the second column of the frame passed in.
        kmer: Length of the k-mers to count over the alphabet ``acgt``.

    Returns:
        ``(X, Y)`` where ``X`` has one column per k-mer, each row min-max
        scaled to [0, 1] across its own counts, and ``Y`` is
        ``mergedDf['isZoonotic']``.
    """
    # product() yields k-mers in lexicographic order. Keep that list for the
    # column order: iterating a set of strings gives an order that changes
    # between interpreter runs (hash randomization), which made the feature
    # column order unstable.
    ordered_kmers = ["".join(letters) for letters in product('acgt', repeat=kmer)]
    # Set copy for O(1) membership tests while counting.
    permset = set(ordered_kmers)

    rows = [
        assign_kmers_to_dict(row[2], permset, kmer)
        for row in tqdm.tqdm(mergedDf.itertuples())
    ]

    finalkmerdict = pd.DataFrame(rows, columns=ordered_kmers)

    # Per-row min-max scaling. A row whose counts are all equal (e.g. an
    # empty sequence) has max == min and therefore becomes NaN.
    X = finalkmerdict.apply(lambda x: (x-x.min())/(x.max()-x.min()), axis=1)
    Y = mergedDf['isZoonotic']

    return (X, Y)

In [5]:
def queryKmer(ID, isZoonotic_list, index, everything):
    """Fetch one GenBank record from NCBI and record its sequence and label.

    Args:
        ID: Nucleotide accession passed to ``Entrez.efetch``.
        isZoonotic_list: Labels; ``isZoonotic_list[index]`` is stored for ``ID``.
        index: Position of ``ID``'s label in ``isZoonotic_list``.
        everything: List that the resulting record dict is appended to.

    Side effects:
        Appends ``{'accession', 'sequence', 'isZoonotic'}`` to ``everything``
        and pickles the same dict to ``sequences/{ID}.pkl``.

    Raises:
        ValueError: If NCBI answers HTTP 400 for ``ID``.
        HTTPError: For any other HTTP failure.
    """
    try:
        QueryHandle = Entrez.efetch(db="nucleotide", id=ID,
                                    rettype="gb", retmode="text")
    except HTTPError as Error:
        if Error.code == 400:  # Bad request
            raise ValueError(f"Accession number not found: {ID}") from Error
        raise

    # Close the network handle even if parsing fails.
    try:
        SeqRec = SeqIO.read(QueryHandle, "genbank")
    finally:
        QueryHandle.close()

    info = {'accession': ID, 'sequence': str(SeqRec.seq).lower(), 'isZoonotic': isZoonotic_list[index]}
    everything.append(info)

    with open(f"sequences/{ID}.pkl", "wb") as fh:
        pickle.dump(info, fh)

In [6]:
def getSequences(mergedDf, max_records=1000):
    """Download sequences for the accessions listed in ``mergedDf``.

    Args:
        mergedDf: DataFrame read positionally via ``itertuples()``:
            ``row[14]`` is a ``"; "``-separated string of accessions and
            ``row[16]`` is truthy when the virus is labelled zoonotic
            (``row[0]`` is the index) — NOTE(review): confirm these column
            positions against the frame passed in.
        max_records: Only the first ``max_records`` unique accessions are
            fetched; ``None`` fetches all of them.

    Returns:
        DataFrame built from the records collected by ``queryKmer``; the
        same frame is also written to ``data/nardus_sequences.csv``.
    """
    accession_list = []  # maintain order
    isZoonotic_list = []  # parallel to accession_list
    accession_set = set()  # accessions already queued

    for row in tqdm.tqdm(mergedDf.itertuples()):
        for single_acc in row[14].split("; "):
            # Skip accessions already queued so each one is fetched once;
            # the first label seen for an accession wins.
            if single_acc in accession_set:
                continue
            accession_set.add(single_acc)
            accession_list.append(single_acc)
            isZoonotic_list.append(0 if not row[16] else 1)

    print("passed local retrieval")
    # TODO: RUN MULTIPLE THREADS TO SPEED UP
    vals = []

    # Sequential fetch, capped at max_records accessions.
    for index, ID in enumerate(tqdm.tqdm(accession_list[:max_records])):
        queryKmer(ID, isZoonotic_list, index, vals)

    df = pd.DataFrame(vals)
    df.to_csv("data/nardus_sequences.csv")

    return df
    


In [41]:
# Load previously saved models. Each file is opened in a `with` block so the
# handle is closed as soon as the object is unpickled.
# NOTE: unpickling executes arbitrary code; only load files you trust.
with open('models/nardus_gridsearch.pkl', 'rb') as fh:
    gradBoost = pickle.load(fh)
with open('models/curr_models/knn.pkl', 'rb') as fh:
    knn = pickle.load(fh)
with open('models/curr_models/randforest.pkl', 'rb') as fh:
    randforest = pickle.load(fh)
with open('models/curr_models/gradBoost.pkl', 'rb') as fh:
    nardusother = pickle.load(fh)

# The grid-search pickle exposes its best hyper-parameters and their score.
print(gradBoost.best_params_, gradBoost.best_score_)

{'learning_rate': 0.1, 'max_depth': 9, 'min_samples_split': 20, 'n_estimators': 140, 'subsample': 0.9} 0.9525684121565281


In [42]:
def getSingleSequence(accessionID):
    """Fetch one GenBank record and print the saved model's class-1 probability.

    The record's sequence is printed, turned into 4-mer counts over ``acgt``,
    min-max scaled across the row, reordered to the feature order stored in
    ``models/nardus_testing/gradBoost.pkl`` and passed to that model's
    ``predict_proba``; column 1 of the result is printed.

    Args:
        accessionID: Nucleotide accession passed to ``Entrez.efetch``.

    Raises:
        ValueError: If NCBI answers HTTP 400 for ``accessionID``.
        HTTPError: For any other HTTP failure.
    """
    try:
        QueryHandle = Entrez.efetch(db="nucleotide", id=accessionID,
                                    rettype="gb", retmode="text")
    except HTTPError as Error:
        if Error.code == 400:  # Bad request
            raise ValueError(f"Accession number not found: {accessionID}") from Error
        raise

    # Close the network handle even if parsing fails.
    try:
        SeqRec = SeqIO.read(QueryHandle, "genbank")
    finally:
        QueryHandle.close()

    print(SeqRec.seq)
    # Plain lower-case str, the same representation queryKmer stores.
    X_info = str(SeqRec.seq).lower()
    kmer = 4
    permset = {"".join(x) for x in product('acgt', repeat=kmer)}

    oDict = assign_kmers_to_dict(X_info, permset, kmer)

    # Build the one-row frame in a single step instead of inserting one
    # column at a time.
    kmer_df = pd.DataFrame([oDict])
    # Min-max scale across the row; NaN if every count is identical.
    kmer_df = kmer_df.apply(lambda x: (x-x.min())/(x.max()-x.min()), axis=1)

    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open('models/nardus_testing/gradBoost.pkl', 'rb') as fh:
        best_gradBoost = pickle.load(fh)
    # Put the features into the order the model was fitted with.
    cols_when_model_builds = best_gradBoost.feature_names_in_
    kmer_df = kmer_df[cols_when_model_builds]

    print(best_gradBoost.predict_proba(kmer_df)[:,1])

    # ls = []
    # ls.append({'accession': accessionID, 'sequence': str(SeqRec.seq).lower()})
    # df = pd.DataFrame(ls)
    # print(df)
# Spot-check the saved model on a handful of reference genomes, in order.
for _accession in (
    "NC_002549.1",  # ebola - need to remove
    # west nile is ALREADY validated - https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5658487/
    "MF797870",
    "NC_001474",    # dengue - need to remove
    "MG645981",     # zika - DOES NOT have?? - check other strains though
    # "CP026379",   # salmonella - too large
    "NC_028246.1",  # non-zoonotic cattle virus - dataset does not include this?
):
    getSingleSequence(_accession)

CGGACACACAAAAAGAAAGAAGAATTTTTAGGATCTTTTGTGTGCGAATAACTATGAGGAAGATTAATAATTTTCCTCTCATTGAAATTTATATCGGAATTTAAATTGAAATTGTTACTGTAATCACACCTGGTTTGTTTCAGAGCCACATCACAAAGATAGAGAACAACCTAGGTCTCCGAAGGGAGCAAGGGCATCAGTGTGCTCAGTTGAAAATCCCTTGTCAACACCTAGGTCTTATCACATCACAAGTTCCACCTCAGACTCTGCAGGGTGATCCAACAACCTTAATAGAAACATTATTGTTAAAGGACAGCATTAGTTCACAGTCAAACAAGCAAGATTGAGAATTAACCTTGGTTTTGAACTTGAACACTTAGGGGATTGAAGATTCAACAACCCTAAAGCTTGGGGTAAAACATTGGAAATAGTTAAAAGACAAATTGCTCGGAATCACAAAATTCCGAGTATGGATTCTCGTCCTCAGAAAATCTGGATGGCGCCGAGTCTCACTGAATCTGACATGGATTACCACAAGATCTTGACAGCAGGTCTGTCCGTTCAACAGGGGATTGTTCGGCAAAGAGTCATCCCAGTGTATCAAGTAAACAATCTTGAAGAAATTTGCCAACTTATCATACAGGCCTTTGAAGCAGGTGTTGATTTTCAAGAGAGTGCGGACAGTTTCCTTCTCATGCTTTGTCTTCATCATGCGTACCAGGGAGATTACAAACTTTTCTTGGAAAGTGGCGCAGTCAAGTATTTGGAAGGGCACGGGTTCCGTTTTGAAGTCAAGAAGCGTGATGGAGTGAAGCGCCTTGAGGAATTGCTGCCAGCAGTATCTAGTGGAAAAAACATTAAGAGAACACTTGCTGCCATGCCGGAAGAGGAGACAACTGAAGCTAATGCCGGTCAGTTTCTCTCCTTTGCAAGTCTATTCCTTCCGAAATTGGTAGTAGGAGAAAAGGCTTGCCTTGAGAAGGTTCAAAGGCAAATTCAA

In [87]:
def getFromSeq(X_info):
    """Print each saved model's class-1 probability for a raw sequence.

    ``X_info`` is lower-cased, turned into 4-mer counts over ``acgt`` and
    min-max scaled across the row; the resulting one-row frame is printed.
    Four pickled models are then loaded in turn and column 1 of each model's
    ``predict_proba`` is printed, in this order:
    ``models/nardus_testing/gradBoost.pkl``, ``models/curr_models/gradBoost.pkl``,
    ``models/curr_models/knn.pkl``, ``models/curr_models/randforest.pkl``.

    Args:
        X_info: Nucleotide sequence as a string.
    """
    kmer = 4
    permset = {"".join(x) for x in product('acgt', repeat=kmer)}
    X_info = X_info.lower()

    oDict = assign_kmers_to_dict(X_info, permset, kmer)

    # Build the one-row frame in a single step instead of inserting one
    # column at a time.
    kmer_df = pd.DataFrame([oDict])
    # Min-max scale across the row; NaN if every count is identical.
    kmer_df = kmer_df.apply(lambda x: (x-x.min())/(x.max()-x.min()), axis=1)

    print(kmer_df)

    model_paths = (
        'models/nardus_testing/gradBoost.pkl',
        'models/curr_models/gradBoost.pkl',
        'models/curr_models/knn.pkl',
        'models/curr_models/randforest.pkl',
    )
    for model_path in model_paths:
        # NOTE: unpickling executes arbitrary code; only load files you trust.
        with open(model_path, 'rb') as fh:
            model = pickle.load(fh)

        # Give every model its own fitted column order when it recorded one;
        # a model without feature names gets the most recent ordering.
        feature_order = getattr(model, 'feature_names_in_', None)
        if feature_order is not None:
            kmer_df = kmer_df[feature_order]

        print(model.predict_proba(kmer_df)[:,1])

In [86]:
# Take the first line of test.txt (trailing newline included, if any) and
# close the file before doing any model work.
with open('test.txt') as f:
    lines = f.readlines()[0]

# Echo the sequence, then print the saved models' predictions for it.
print(lines)
getFromSeq(lines)

accggatggccgcccgaaatttcgtggtgggcccccccttgtcggccaatgatattcgtctctcgaatcctagataagtggatagtgacgtatgcttttgtctttatatacgtcagccctaattttaaactttaaccatgtgggatcctttacttaacgaatttcctgaaaccgttcacggttttaggtgtatgcttgcggttaagtacttgttatcggtagaagctacatactctccggataccataggttatgatttgattcgtgatcttataggtgtcattcgtgccaagaactatgtcgaagcgtcctgcagatatagggattttcactcccgtctccaaggtacggcgccgtctgaacttcgacagcccgtatgccaaccgtgtgagtgcccccattgccctcgccacaaaccgaaggagagcatgggctcaaaggcccatgtatcggaagcccaggatgtacagaatgtacagaagccctgatgtgcctaaggggtgtgaaggcccgtgtaaggtccagtcattcgagaagaaaaatgatgttggtcattctggtactctgctctgtgtatctgatattacccgtggtaatgggcttactcatcgtgttgggaagagattttgtataaaatcagtttatataattggtaaaatatggatggatgaaaatattaagaccaagaatcacactaacaacgtgttattctggttggttagagatagacgacctggttcaaccccttatggatttcaggaggcattcaatatgtttgagaatgaacccagcacggcgactgttaagcaagaattaagagatcgtttgcaggttttacacagatttagtgcgactgttactggtggacaatatgcgtctaaggaacaagcgatcataaaacggttttggaagttgaatcatcatgtcacttacaatcatcaagagcaggctaaatatgagaatcatactgaaaatgctttattattgtatatggctgcta

In [43]:
df = pd.read_csv('data/info.csv')
# Fixed seed so the split is the same on every run; with random_state=None
# each run scored models on a different test set.
# NOTE(review): test_size=0.6 holds out 60% of the rows for testing —
# confirm that this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    df.loc[:, df.columns != 'isZoonotic'],
    df['isZoonotic'],
    test_size=0.6,
    random_state=42,
)