In [4]:
from itertools import permutations, product

import tqdm

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from os import path

import matplotlib.pyplot as plt

from warnings import simplefilter
from collections import OrderedDict
from ctgan import CTGANSynthesizer

if (os.path.abspath('').split('/')[-1] == 'project'):
    %cd utils
elif (os.path.abspath('').split('/')[-1] == 'train_and_vis'):
    %cd ../utils

import query_utils
import model_utils
import validation_utils

if (os.path.abspath('').split('/')[-1] == 'utils'):
    %cd ..

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)

/Users/benjaminli/Documents/coding/scires/project/utils
/Users/benjaminli/Documents/coding/scires/project


In [5]:
df = pd.read_csv('data/dataset1/human_infecting_virus', delimiter='\t', header=None)
df[['ID', 'DNA Sequence']] = df[0].str.split(expand=True)
df = df.drop(0, axis=1)
df['isZoonotic'] = 1


df2 = pd.read_csv('data/dataset1/Other_viruses', delimiter='\t', header=None)
df2[['ID', 'DNA Sequence']] = df2[0].str.split(expand=True)
df2 = df2.drop(0, axis=1)
df2['isZoonotic'] = 0


nardus = pd.read_csv('data/dataset2/nardus_sequences.csv')
nardus.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)
df2.reset_index(drop=True, inplace=True)

# inconsistency with tax IDS
mergedDf = pd.concat([df, df2, nardus], axis=0, ignore_index=True)
mergedDf = mergedDf.drop_duplicates(subset=['DNA Sequence'])
mergedDf.reset_index(drop=True, inplace=True)

In [12]:
print(mergedDf)

          ID                                       DNA Sequence  isZoonotic
0      37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...           1
1     129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...           1
2      59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...           1
3     120686  ttgttctacttcttactcattattataaattataatgtttgtataa...           1
4      99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...           1
...      ...                                                ...         ...
9324  358769  gtatacgaggttagttcattctcgtatgcatgattggacaaattaa...           0
9325  268315  ggatccacagaactccttgtatgtacagccgcgggtacccacagca...           0
9326  130329  ctcatgtcattaataagaccatgcagaaaatgcaaatgaggcgaag...           0
9327  358812  gtatacgaggttagttcattctcgtatacacgattggacaaatcaa...           0
9328   11287  atggaaggaggaattcgtgcagcgttttcaggcctgaatgatgtta...           0

[9329 rows x 3 columns]


In [7]:

def resetkmerdict(permset)->OrderedDict:
        kmerdict = OrderedDict()
        for i in permset:
            kmerdict[i]=0
        return kmerdict

def assign_kmers_to_dict(row, permset, kmer):
    kmerdict=resetkmerdict(permset)
    st = row[2] # tune for which column the sequence is in
    for j in range(len(st)-kmer+1):
        if not st[j:j+kmer] in permset: continue
        kmerdict[st[j:j+kmer]]+=1
    return kmerdict

def getTrainParams(mergedDf, kmer, f, synthetic_pos=0, synthetic_neg=0, save_reg=False, save_new=False):
    print(mergedDf)
    s = product('acgt',repeat = kmer)
    permset = set(["".join(x) for x in list(s)])

    l = []
    
    for row in tqdm.tqdm(mergedDf.itertuples()):
        l.append(assign_kmers_to_dict(row, permset, kmer))

    finalkmerdict=pd.DataFrame(l)
    
    # shouldn't need to fill NAs
    # mergedDf.fillna(0, inplace=True)

    X = finalkmerdict
    Y = mergedDf['isZoonotic']
    # also test simple average

    # feature defs
    vec = pd.concat([X, Y], axis=1)
    
    if save_reg:
        vec.to_csv(f'data/{f}/kmers-{str(kmer)}.csv', index=False)
    
    X_norm = X.apply(lambda x: (x-x.min())/(x.max()-x.min()), axis=1)

    place = pd.concat([X_norm, Y], axis=1)

    if save_reg:
        place.to_csv(f'data/{f}/normalized-{str(kmer)}.csv', index=False)

    div = X.apply(lambda x: x/(len(x)-kmer+1), axis=1)
    div = pd.concat([div, Y], axis=1)
    if save_new:
        div.to_csv(f'data/{f}/lengthdiv-{str(kmer)}.csv', index=False)

    # generate based on normalized data
    if (synthetic_neg != 0):
        # # check if current model is better than pickled model
        # posGanModel.save('models/curr_models/posGanModel.pkl')
        notZoonotic = place.loc[place['isZoonotic']==0]
        notZoonotic = isZoonotic.loc[:, isZoonotic.columns != 'isZoonotic']
        # print(notZoonotic)
        negGanModel = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
        negGanModel.fit(notZoonotic)

        # negGanModel.save('models/curr_models/negGanModel.pkl')
        # generate negative samples
        print('Generating negative samples...')
        negSamples = negGanModel.sample(synthetic_neg)
        negSamples['isZoonotic'] = 0
        print('Negative samples generated')
        print(negSamples)
        print(negSamples.shape)
        place = pd.concat([place, negSamples], axis=0, ignore_index=True)
        return train_test_split(place.drop('isZoonotic', axis=1), place['isZoonotic'], test_size=0.2, random_state=1)

    # generate based on normalized data
    if (synthetic_pos != 0):
        isZoonotic = place.loc[place['isZoonotic']==1]
        isZoonotic = isZoonotic.loc[:, isZoonotic.columns != 'isZoonotic']
        # print(isZoonotic)

        posGanModel = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
        posGanModel.fit(isZoonotic)

        print('Generating negative samples...')
        posSamples = posGanModel.sample(synthetic_neg)
        posSamples['isZoonotic'] = 1
        print('Negative samples generated')
        print(posSamples)
        print(posSamples.shape)
        place = pd.concat([place, posSamples], axis=0, ignore_index=True)
        return train_test_split(place.drop('isZoonotic', axis=1), place['isZoonotic'], test_size=0.2, random_state=1)

        # negGanModel.save('models/curr_models/negGanModel.pkl')

    return train_test_split(X, Y, test_size=0.2, random_state=1)

In [8]:
for kmer in range(3, 7):
    # based on literature
    X_train, X_test, y_train, y_test = getTrainParams(mergedDf, kmer, "merged", save_reg=True, save_new=True)
    zz = X_train.head()

                ID                                       DNA Sequence  \
0            37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...   
1           129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...   
2            59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...   
3           120686  ttgttctacttcttactcattattataaattataatgtttgtataa...   
4            99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...   
...            ...                                                ...   
10622  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
10623  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
10624  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
10625  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
10626  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

       isZoonotic  
0               1  
1               1  
2               1  
3               1  
4               1  
...

10627it [01:23, 127.09it/s] 


                ID                                       DNA Sequence  \
0            37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...   
1           129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...   
2            59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...   
3           120686  ttgttctacttcttactcattattataaattataatgtttgtataa...   
4            99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...   
...            ...                                                ...   
10622  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
10623  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
10624  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
10625  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
10626  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

       isZoonotic  
0               1  
1               1  
2               1  
3               1  
4               1  
...

10627it [01:22, 128.87it/s] 


                ID                                       DNA Sequence  \
0            37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...   
1           129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...   
2            59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...   
3           120686  ttgttctacttcttactcattattataaattataatgtttgtataa...   
4            99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...   
...            ...                                                ...   
10622  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
10623  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
10624  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
10625  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
10626  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

       isZoonotic  
0               1  
1               1  
2               1  
3               1  
4               1  
...

10627it [01:25, 124.35it/s]


                ID                                       DNA Sequence  \
0            37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...   
1           129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...   
2            59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...   
3           120686  ttgttctacttcttactcattattataaattataatgtttgtataa...   
4            99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...   
...            ...                                                ...   
10622  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
10623  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
10624  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
10625  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
10626  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

       isZoonotic  
0               1  
1               1  
2               1  
3               1  
4               1  
...

10627it [01:33, 113.20it/s]


In [29]:
X_train, X_test, y_train, y_test = getTrainParams(mergedDf, kmer=4, f="merged", save_new=True)

                ID                                       DNA Sequence  \
0            37121  cttttaacaatcatagttttataaaagggtgtaaccgaagcggttt...   
1           129724  atactttacaattatcttgtaaaaagtagggtgtaaccgaaaaggg...   
2            59303  atggcggacgtgtgacatcaccgttcgctctttctaggatcctttg...   
3           120686  ttgttctacttcttactcattattataaattataatgtttgtataa...   
4            99565  gtgaatgatgatggcgtcgaaagacgtcgttgcagctactgctgct...   
...            ...                                                ...   
10622  NC_007661.1  gttaaaactctcacacttatggtggaactggatctgcaaaaatggg...   
10623  NC_007662.1  gttaaaaatctggttgtatctactcttgaatgaacgtgcataaagc...   
10624  NC_007663.1  gttaaaatctggaaccaatatggaagggatttatgcacgtgctttt...   
10625  NC_007664.1  gttaaaaaagagtgcagatgtcgagaatagtcttactaacaccagg...   
10626  NC_007665.1  gttaaaaacctcaagatgcatgccgctattacttcaatgaaacgtg...   

       isZoonotic  
0               1  
1               1  
2               1  
3               1  
4               1  
...

10627it [01:18, 134.93it/s] 


In [23]:
print(X_train)

          cccg      ccct      gcgc      ctgc      ggag      tgga      cgcg  \
3601  0.355263  0.315789  0.065789  0.381579  0.578947  0.618421  0.065789   
2819  0.135922  0.378641  0.194175  0.572816  0.330097  0.563107  0.058252   
8292  0.017699  0.212389  0.026549  0.159292  0.318584  0.690265  0.000000   
7135  0.493363  0.125369  0.614307  0.320796  0.055310  0.092920  0.672566   
142   0.065574  0.327869  0.016393  0.262295  0.459016  0.754098  0.000000   
...        ...       ...       ...       ...       ...       ...       ...   
2895  0.188889  0.055556  0.233333  0.366667  0.155556  0.433333  0.044444   
7813  0.127820  0.210526  0.000000  0.082707  0.210526  0.578947  0.015038   
905   0.021071  0.033655  0.016389  0.071115  0.139889  0.310506  0.036289   
5192  0.412621  0.157767  0.502427  0.597087  0.400485  0.546117  0.383495   
235   0.196970  0.212121  0.136364  0.196970  0.439394  0.727273  0.000000   

          aagt      tccc      gccc  ...      aatc      cagt    

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
