In [None]:
from itertools import permutations, product

import tqdm

import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from os import path

import matplotlib.pyplot as plt

from warnings import simplefilter
from collections import OrderedDict
from ctgan import CTGANSynthesizer

if (os.path.abspath('').split('/')[-1] == 'project'):
    %cd utils
elif (os.path.abspath('').split('/')[-1] == 'train_and_vis'):
    %cd ../utils

import query_utils
import model_utils
import validation_utils

if (os.path.abspath('').split('/')[-1] == 'utils'):
    %cd ..

simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action='ignore', category=FutureWarning)

In [None]:
def _read_labelled_sequences(source_path, label):
    """Read one headerless, tab-delimited sequence file and attach a class label.

    Column 0 of the file is split on whitespace into 'ID' and 'DNA Sequence',
    the raw column is dropped, and every row gets ``isZoonotic = label``.
    """
    raw = pd.read_csv(source_path, delimiter='\t', header=None)
    raw[['ID', 'DNA Sequence']] = raw[0].str.split(expand=True)
    labelled = raw.drop(0, axis=1)
    labelled['isZoonotic'] = label
    return labelled.reset_index(drop=True)


# Positive class (label 1) and negative class (label 0) from dataset1.
df = _read_labelled_sequences('data/dataset1/human_infecting_virus', 1)
df2 = _read_labelled_sequences('data/dataset1/Other_viruses', 0)

# dataset2 is read as a regular CSV with its own header row.
nardus = pd.read_csv('data/dataset2/nardus_sequences.csv').reset_index(drop=True)

# Original note: tax IDs are inconsistent between the sources -- presumably
# why duplicates are dropped by sequence rather than by ID (TODO confirm).
mergedDf = (
    pd.concat([df, df2, nardus], axis=0, ignore_index=True)
    .drop_duplicates(subset=['DNA Sequence'])
    .reset_index(drop=True)
)

In [None]:
# Inspect the combined, sequence-de-duplicated table built in the previous cell.
print(mergedDf)

In [None]:

def resetkmerdict(permset)->OrderedDict:
    """Return a fresh counter mapping every k-mer in ``permset`` to 0.

    Keys are inserted in the iteration order of ``permset``.
    """
    return OrderedDict.fromkeys(permset, 0)

def assign_kmers_to_dict(row, permset, kmer, seq_col=2):
    """Count how often each k-mer of ``permset`` occurs in one row's sequence.

    Parameters
    ----------
    row : sequence
        Positional record, e.g. a tuple produced by ``DataFrame.itertuples()``.
        The sequence string is read from ``row[seq_col]``.
    permset : iterable of str
        The k-mers to count.  Windows of the sequence that are not in this
        collection are skipped.
    kmer : int
        Window length used when sliding over the sequence.
    seq_col : int, default 2
        Position of the sequence inside ``row``.  With ``itertuples()`` position
        0 is the index, so 2 is the second DataFrame column.

    Returns
    -------
    OrderedDict
        One entry per element of ``permset`` (in its iteration order), holding
        the number of windows equal to that k-mer; k-mers never seen stay at 0.
    """
    # Start every k-mer at zero so absent k-mers still appear in the result.
    kmerdict = OrderedDict.fromkeys(permset, 0)
    st = row[seq_col]
    # A sequence shorter than `kmer` yields an empty range and all-zero counts.
    for j in range(len(st) - kmer + 1):
        window = st[j:j + kmer]
        # The counter's keys are exactly the requested k-mers, so this lookup
        # doubles as the membership test.
        if window in kmerdict:
            kmerdict[window] += 1
    return kmerdict

def _augment_with_ctgan(place, label, n_samples):
    """Append ``n_samples`` CTGAN-generated rows of class ``label`` to ``place``.

    A new CTGANSynthesizer is fitted on the feature columns of the rows whose
    ``isZoonotic`` equals ``label``; the sampled rows are tagged with that label
    and concatenated below the existing rows (index is renumbered).
    """
    kind = 'negative' if label == 0 else 'positive'
    class_features = place.loc[place['isZoonotic'] == label].drop(columns='isZoonotic')

    gan = CTGANSynthesizer(batch_size=60, epochs=10, verbose=True)
    gan.fit(class_features)

    print(f'Generating {kind} samples...')
    samples = gan.sample(n_samples)
    samples['isZoonotic'] = label
    print(f'{kind.capitalize()} samples generated')
    print(samples)
    print(samples.shape)
    return pd.concat([place, samples], axis=0, ignore_index=True)


def getTrainParams(df, kmer, f="merged", synthetic_pos=0, synthetic_neg=0, save_reg=False, save_new=False, save_merged=True):
    """Turn sequences into k-mer count features and return a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``isZoonotic`` column; the sequence is read from the
        second column (position 2 of each ``itertuples()`` record).
    kmer : int
        k-mer length; features are all ``4**kmer`` words over ``'acgt'``.
    f : str
        Sub-directory of ``data/`` that receives the CSV files.
    synthetic_pos, synthetic_neg : int
        Number of CTGAN-generated rows to add for class 1 / class 0.  Both may
        be requested in the same call.
    save_reg : bool
        Write ``kmers-<k>.csv`` (raw counts) and ``normalized-<k>.csv``.
    save_new : bool
        Write ``lengthdiv-<k>.csv`` (counts divided by the window count).
    save_merged : bool
        Accepted for compatibility; not used by this function.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` from ``train_test_split`` with
        ``test_size=0.2, random_state=1``.  When synthetic rows were requested
        the split is taken from the row-normalised table including those rows;
        otherwise it is taken from the raw count table.
    """
    permset = {"".join(letters) for letters in product('acgt', repeat=kmer)}

    # Position of the sequence in an itertuples() record; this must match the
    # column that assign_kmers_to_dict reads.
    seq_pos = 2

    counts = []
    windows = []
    for row in tqdm.tqdm(df.itertuples(), total=len(df)):
        counts.append(assign_kmers_to_dict(row, permset, kmer))
        # Number of k-mer windows in this sequence, floored at 1 so that a
        # sequence shorter than `kmer` (all counts zero) does not divide by <= 0.
        windows.append(max(len(row[seq_pos]) - kmer + 1, 1))

    # Reuse df's index so the features line up with the labels even when df
    # does not carry a default 0..n-1 index.
    X = pd.DataFrame(counts, index=df.index)
    Y = df['isZoonotic']

    # sort X by alphabetical order of its columns
    X = X.reindex(sorted(X.columns), axis=1)
    print(X.columns)

    if save_reg:
        pd.concat([X, Y], axis=1).to_csv(f'data/{f}/kmers-{kmer}.csv', index=False)

    # Row-wise min-max scaling.  A row whose counts are all equal has a zero
    # range and therefore becomes NaN.
    row_min = X.min(axis=1)
    row_range = X.max(axis=1) - row_min
    X_norm = X.sub(row_min, axis=0).div(row_range, axis=0)
    place = pd.concat([X_norm, Y], axis=1)

    if save_reg:
        place.to_csv(f'data/{f}/normalized-{kmer}.csv', index=False)

    if save_new:
        # Divide by the number of windows in the *sequence*; len() of a count
        # row would be the number of k-mer columns, not the sequence length.
        div = X.div(pd.Series(windows, index=X.index), axis=0)
        pd.concat([div, Y], axis=1).to_csv(f'data/{f}/lengthdiv-{kmer}.csv', index=False)

    # Synthetic rows are generated from the normalised data.  Each class model
    # is fitted only on rows carrying that class label, so generating negatives
    # first does not affect the positive model.
    augmented = False
    if synthetic_neg != 0:
        place = _augment_with_ctgan(place, 0, synthetic_neg)
        augmented = True
    if synthetic_pos != 0:
        place = _augment_with_ctgan(place, 1, synthetic_pos)
        augmented = True

    if augmented:
        return train_test_split(place.drop('isZoonotic', axis=1), place['isZoonotic'], test_size=0.2, random_state=1)

    return train_test_split(X, Y, test_size=0.2, random_state=1)

In [None]:
# Build the feature tables for k = 3..6.  With save_reg and save_new set, each
# pass writes kmers-<k>.csv, normalized-<k>.csv and lengthdiv-<k>.csv under
# data/merged/.  The split variables are overwritten on every pass, so only the
# k = 6 split (and kmer == 6) remains after the loop.
for kmer in range(3, 7):
    # based on literature
    X_train, X_test, y_train, y_test = getTrainParams(mergedDf, kmer, f="merged", save_reg=True, save_new=True)
    zz = X_train.head()

In [None]:
# Recompute the k = 4 features so X_train/X_test/y_train/y_test hold that split
# (raw counts, since no synthetic rows are requested).  save_new=True also
# rewrites data/merged/lengthdiv-4.csv.
X_train, X_test, y_train, y_test = getTrainParams(mergedDf, kmer=4, f="merged", save_new=True)

In [None]:
# Inspect the training features returned by the k = 4 call above.
print(X_train)

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier, BalancedRandomForestClassifier
