In [1]:
from itertools import permutations, product

import tqdm

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from os import path

import matplotlib.pyplot as plt

from warnings import simplefilter
from collections import OrderedDict

if (os.path.abspath('').split('/')[-1] == 'project'):
    %cd utils
elif (os.path.abspath('').split('/')[-1] == 'train_and_vis'):
    %cd ../utils

import query_utils
import model_utils
import validation_utils

if (os.path.abspath('').split('/')[-1] == 'utils'):
    %cd ..

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)

/Users/benjaminli/Documents/coding/scires/project/utils
/Users/benjaminli/Documents/coding/scires/project


In [17]:
df = pd.read_csv('data/dataset1/human_infecting_virus', delimiter='\t', header=None)
df[['ID', 'DNA Sequence']] = df[0].str.split(expand=True)
df = df.drop(0, axis=1)
df['isZoonotic'] = 1


df2 = pd.read_csv('data/dataset1/Other_viruses', delimiter='\t', header=None)
df2[['ID', 'DNA Sequence']] = df2[0].str.split(expand=True)
df2 = df2.drop(0, axis=1)
df2['isZoonotic'] = 0


nardus = pd.read_csv('data/dataset2/nardus_sequences.csv')
nardus.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)

# inconsistency with tax IDS
mergedDf = pd.concat([df, df2], axis=0, ignore_index=True)
mergedDf = mergedDf.drop_duplicates(subset=['DNA Sequence'])
mergedDf.reset_index(drop=True, inplace=True)

In [18]:
print(mergedDf)

          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


In [32]:

def resetkmerdict(permset)->OrderedDict:
        kmerdict = OrderedDict()
        for i in permset:
            kmerdict[i]=0
        return kmerdict

def assign_kmers_to_dict(row, permset, kmer):
    kmerdict=resetkmerdict(permset)
    st = row[2] # tune for which column the sequence is in
    for j in range(len(st)-kmer+1):
        if not st[j:j+kmer] in permset: continue
        kmerdict[st[j:j+kmer]]+=1
    return kmerdict

def getTrainParams(mergedDf, kmer, f):
    print(mergedDf)
    s = product('acgt',repeat = kmer)
    permset = set(["".join(x) for x in list(s)])

    l = []
    
    for row in tqdm.tqdm(mergedDf.itertuples()):
        l.append(assign_kmers_to_dict(row, permset, kmer))

    finalkmerdict=pd.DataFrame(l)
    
    # shouldn't need to fill NAs
    # mergedDf.fillna(0, inplace=True)

    X = finalkmerdict
    Y = mergedDf['isZoonotic']
    # also test simple average

    vec = pd.concat([X, Y], axis=1)
    vec.to_csv(f'data/{f}/kmers-{str(kmer)}.csv', index=False)
    
    # try other method as well
    X = X.apply(lambda x: (x-x.min())/(x.max()-x.min()), axis=1)
    # print(X.head())

    place = pd.concat([X, Y], axis=1)
    
    # print(place)

    place.to_csv(f'data/{f}/normalized-{str(kmer)}.csv', index=False)

    return train_test_split(X, Y, test_size=0.2, random_state=1)

In [34]:
for kmer in range(3, 7):
    # based on literature
    X_train, X_test, y_train, y_test = getTrainParams(mergedDf, kmer, "dataset1")
    zz = X_train.head()

    X_train, X_test, y_train, y_test = getTrainParams(nardus, kmer, "dataset2")
    zz = X_train.head()

          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


9329it [01:33, 99.82it/s] 


               ID                                       DNA Sequence  \
0     NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...   
1     NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...   
2     NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...   
3     NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...   
4     NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...   
...           ...                                                ...   
1843  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
1844  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
1845  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
1846  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
1847  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

      isZoonotic  
0              0  
1              0  
2              0  
3              1  
4              1  
...          ...  
18

1848it [00:06, 301.87it/s]


          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


9329it [01:37, 95.23it/s] 


               ID                                       DNA Sequence  \
0     NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...   
1     NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...   
2     NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...   
3     NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...   
4     NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...   
...           ...                                                ...   
1843  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
1844  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
1845  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
1846  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
1847  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

      isZoonotic  
0              0  
1              0  
2              0  
3              1  
4              1  
...          ...  
18

1848it [00:06, 286.86it/s]


          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


9329it [01:41, 92.22it/s] 


               ID                                       DNA Sequence  \
0     NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...   
1     NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...   
2     NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...   
3     NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...   
4     NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...   
...           ...                                                ...   
1843  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
1844  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
1845  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
1846  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
1847  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

      isZoonotic  
0              0  
1              0  
2              0  
3              1  
4              1  
...          ...  
18

1848it [00:06, 270.74it/s]


          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


9329it [01:53, 82.31it/s] 


               ID                                       DNA Sequence  \
0     NC_025403.1  accagagggaaaatataacaatgtcgttttatagcgatgtaaataa...   
1     NC_025404.1  accagagggaaaattaagaaaggtcgttccaagacgacttaaaaga...   
2     NC_028246.1  acggagaaaaacaaaaaaactatagtgattagataaataaggaaaa...   
3     NC_002077.1  ttgcccactccctctctgcgcgctcgctcgctcggtggggcctgcg...   
4     NC_006152.1  ctctcccccctgtcgcgttcgctcgctcgctggctcgtttgggggg...   
...           ...                                                ...   
1843  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
1844  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
1845  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
1846  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
1847  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

      isZoonotic  
0              0  
1              0  
2              0  
3              1  
4              1  
...          ...  
18

1848it [00:07, 237.70it/s]
