# Load libraries

In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

  import pandas.util.testing as tm


# All features


In [None]:
# Build the combined dataset with every feature column.
# Each CSV is parsed once and then sliced (the original re-read ppnnw5.csv and
# h1w5.csv for every slice taken from them).
ppnn_w5 = pd.read_csv('drive/My Drive/ppnnw5.csv')
ppnn_w6 = pd.read_csv('drive/My Drive/ppnnw6.csv')
h1_w5 = pd.read_csv('drive/My Drive/h1w5.csv')
h1_w6p1 = pd.read_csv('drive/My Drive/h1w6p1.csv')
h1_w6p2 = pd.read_csv('drive/My Drive/h1w6p2.csv')

# Class 1 (Gen = 1): Gen/die/Fam first, then every column from position 10
# onward of the two ppnn files.
df1 = pd.concat([ppnn_w5[['Gen', 'die', 'Fam']],
                 ppnn_w5.iloc[:, 10:],
                 ppnn_w6.iloc[:, 10:]], axis = 1)

# Keep only rows with die == 'no'. The explicit .copy() makes the filtered
# frame independent, so the assignment below cannot trigger
# SettingWithCopyWarning.
df1 = df1.loc[df1['die'] == 'no'].copy()
df1['Gen'] = 1

# Class 0 (Gen = 0): 'die' first, then every column from position 5 onward
# of the three h1 files.
df2 = pd.concat([h1_w5[['die']],
                 h1_w5.iloc[:, 5:],
                 h1_w6p1.iloc[:, 5:],
                 h1_w6p2.iloc[:, 5:]], axis = 1)
df2 = df2.loc[df2['die'] == 'no'].copy()
# Presumably 'Gen' is not among the selected h1 columns, so this appends it as
# the LAST column; the LOFO cells read df2's label via iloc[:, -1].
df2['Gen'] = 0

# The raw frames are no longer needed; release them so they do not stay alive
# in the kernel.
del ppnn_w5, ppnn_w6, h1_w5, h1_w6p1, h1_w6p2

# Stack both classes; columns are aligned by name.
data = pd.concat([df1, df2], axis = 0).reset_index(drop = True)

## Model performance with all features

In [None]:
# Repeated hold-out evaluation of a linear-kernel SVM on all features:
# five random 75/25 splits, each with 10-fold CV on the training part.
temp = []

# Features and labels do not change between repetitions, so build them once.
# Column 0 of `data` is the Gen label; feature columns start at position 3.
X = data.iloc[:, 3:].values.astype(float)
y = data.iloc[:, 0].values.astype(float)

for i in range(5):
  # Seeding with the repetition index gives five different but reproducible splits
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = i)

  # Feature scaling: statistics are fitted on the training split only
  sc = StandardScaler()
  X_train = sc.fit_transform(X_train)
  X_test = sc.transform(X_test)

  # Training the linear-kernel SVM on the training split
  classifier = SVC(kernel = 'linear')
  classifier.fit(X_train, y_train)

  # Predicting the test split results
  y_pred = classifier.predict(X_test)

  # Confusion matrix: rows are true labels, columns are predicted labels
  cm = confusion_matrix(y_test, y_pred)

  # Evaluation. One cross_validate call scores both metrics on the same 10
  # fits instead of fitting every fold twice with two cross_val_score calls.
  # NOTE(review): the CV folds see X_train scaled with whole-training-split
  # statistics; use a Pipeline(scaler, SVC) if strictly leak-free CV is needed.
  cv_scores = cross_validate(classifier, X_train, y_train, cv=10, scoring=['f1', 'f1_macro'])
  cv_bin = cv_scores['test_f1']
  cv_mac = cv_scores['test_f1_macro']
  f1bin = f1_score(y_test, y_pred, average='binary')
  f1mac = f1_score(y_test, y_pred, average='macro')

  # Build the result row once and reuse it for both the table and the log line
  row = (cm[1,1], cm[0,1], cm[0,0], cm[1,0], cv_bin.mean(), cv_bin.std(), cv_mac.mean(), cv_mac.std(), f1bin, f1mac)
  temp.append(row)
  print('correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** cv_bin mean: %f ** cv_bin std: %f ** cv_mac mean: %f ** cv_mac std: %f ** f1bin: %f ** f1mac: %f' 
        % row)

# Column name fixed: the original spelled 'cv_mac std' as 'cv_man std'
svmallf = pd.DataFrame(temp, columns = ['TP', 'FP', 'TN', 'FN', 'cv_bin mean', 'cv_bin std', 'cv_mac mean', 'cv_mac std', 'f1bin', 'f1mac'])

correct/TP: 1056 ** FP: 3 **  TN: 3838 ** FN: 1 ** cv_bin mean: 0.997389 ** cv_bin std: 0.001817 ** cv_mac mean: 0.998350 ** cv_mac std: 0.001148 ** f1bin: 0.998110 ** f1mac: 0.998794
correct/TP: 1013 ** FP: 6 **  TN: 3878 ** FN: 1 ** cv_bin mean: 0.996623 ** cv_bin std: 0.001824 ** cv_mac mean: 0.997858 ** cv_mac std: 0.001157 ** f1bin: 0.996557 ** f1mac: 0.997828
correct/TP: 1067 ** FP: 3 **  TN: 3828 ** FN: 0 ** cv_bin mean: 0.997053 ** cv_bin std: 0.001602 ** cv_mac mean: 0.998140 ** cv_mac std: 0.001011 ** f1bin: 0.998596 ** f1mac: 0.999102
correct/TP: 1010 ** FP: 2 **  TN: 3885 ** FN: 1 ** cv_bin mean: 0.995825 ** cv_bin std: 0.000784 ** cv_mac mean: 0.997351 ** cv_mac std: 0.000498 ** f1bin: 0.998517 ** f1mac: 0.999066
correct/TP: 1012 ** FP: 6 **  TN: 3879 ** FN: 1 ** cv_bin mean: 0.997749 ** cv_bin std: 0.001636 ** cv_mac mean: 0.998572 ** cv_mac std: 0.001038 ** f1bin: 0.996553 ** f1mac: 0.997826


## LOFO CV (leave-one-family-out cross-validation)

In [None]:
# Leave-one-family-out evaluation on all features: every value of df1['Fam']
# is held out in turn and the linear SVM is trained on the remaining families.
temp = []

# A fixed 30% of df2 is held out once and reused as the class-0 test rows for
# every family; seeded so that the split is reproducible.
df2test = df2.sample(frac = 0.3, random_state = 0)
df2train = df2[~df2.index.isin(df2test.index)]

# NOTE(review): df1 is used here as left by the loading cell, i.e. already
# filtered to die == 'no', so the filter inside the loop is a no-op and the
# held-out families contain only 'no' rows. The 194- and 68-feature LOFO cells
# reload df1 unfiltered -- confirm which behaviour is intended.

# Sorted unique families give a deterministic order (iteration order of a set
# of strings can change between interpreter runs); rows with a missing Fam
# are skipped because they would produce an empty held-out set.
for fam in sorted(df1['Fam'].dropna().unique()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    # df1 layout: Gen, die, Fam, features...; df2 layout: die, features..., Gen
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    # Scale with statistics fitted on the training rows only
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting the linear-kernel SVM to the training set
    classifier = SVC(kernel = 'linear')
    classifier.fit(X_train, y_train)
    
    # Predicting the test set results
    y_pred = classifier.predict(X_test)
    
    # Confusion matrix: rows are true labels, columns are predicted labels
    cm = confusion_matrix(y_test, y_pred)
    
    # Evaluation
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')
    
    # Build the result row once and reuse it for both the table and the log line
    row = (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac)
    temp.append(row)

    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % row)
  
svmlofocv_allf = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 

Fam: togaviridae ** total: 74 ** correct/TP: 74 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.942675 ** f1mac: 0.970853
Fam: barnaviridae ** total: 2 ** correct/TP: 2 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.307692 ** f1mac: 0.653361
Fam: arteriviridae ** total: 48 ** correct/TP: 48 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.914286 ** f1mac: 0.956658
Fam: chuviridae ** total: 4 ** correct/TP: 4 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.470588 ** f1mac: 0.734809
Fam: potyviridae ** total: 340 ** correct/TP: 340 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.986938 ** f1mac: 0.992984
Fam: nodaviridae ** total: 20 ** correct/TP: 20 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.816327 ** f1mac: 0.907678
Fam: hypoviridae ** total: 24 ** correct/TP: 24 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.842105 ** f1mac: 0.920568
Fam: artoviridae ** total: 2 ** correct/TP: 2 ** FP: 9 **  TN: 4634 ** FN: 0 ** f1bin: 0.307692 ** f1mac: 0.653361
Fam: filoviridae ** total: 24 ** correct/TP: 24 ** FP: 9 **  TN: 4

In [None]:
# Display the per-family LOFO results table for the all-features model
svmlofocv_allf

Unnamed: 0,Fam,Total,Correct/TP,FP,TN,FN,f1bin,f1mac
0,togaviridae,74,74,9,4634,0,0.942675,0.970853
1,barnaviridae,2,2,9,4634,0,0.307692,0.653361
2,arteriviridae,48,48,9,4634,0,0.914286,0.956658
3,chuviridae,4,4,9,4634,0,0.470588,0.734809
4,potyviridae,340,340,9,4634,0,0.986938,0.992984
5,nodaviridae,20,20,9,4634,0,0.816327,0.907678
6,hypoviridae,24,24,9,4634,0,0.842105,0.920568
7,artoviridae,2,2,9,4634,0,0.307692,0.653361
8,filoviridae,24,24,9,4634,0,0.842105,0.920568
9,virgaviridae,82,82,7,4636,0,0.959064,0.979155


## Model performance with 194 features

In [None]:
# Names of the k-mer feature columns kept for this reduced-feature model
# (the entries are 2 to 6 characters long).
# NOTE(review): the section heading says 194 features, but this list holds
# several hundred entries -- confirm which count is intended.
l =    ['CAATCG', 'GGTA', 'GTTGA', 'CATACG', 'TGAT', 'GTCGAA', 'CCAATT', 'TGTCGA', 'GTTGAC', 
        'CGGTTA', 'CGATA', 'TAGCGT', 'AAAAAA', 'GGTTGA', 'AAAAA', 'CAAT', 'CGTCAA', 'CGGTAA', 
        'CGTTGA', 'CAAC', 'ACCAAT', 'TTGACG', 'TCAATC', 'GTTG', 'CAATTG', 'CGCAAT', 'GGTT', 
        'TTGTCG', 'TCAA', 'CCAAT', 'ATACGC', 'CGATAG', 'TTGA', 'CATCAA', 'GTTGAT', 'ATACGG', 
        'CGATC', 'GTTGGT', 'GATC', 'CCGATA', 'GTCAAT', 'CGATAA', 'CGCGTT', 'ATCGTA', 'TCGGTT', 
        'ATCAAC', 'ATTGG', 'TCGA', 'GCAATC', 'CAATCT', 'GCGTAC', 'TTCGAC', 'GGTTG', 'TCCAAT', 
        'CGTAGT', 'TTGCG', 'CGGTT', 'CGATTA', 'ACGGTT', 'ATCAAT', 'TTGCGC', 'CAATC', 'CGAT', 
        'GCGTTA', 'CCGTTA', 'TTGAC', 'CGTCGA', 'ATAGCG', 'GTTGAG', 'TATCCG', 'GGTAGT', 'CGTTGC', 
        'TCGGTA', 'TGGTT', 'AGGTTG', 'AGGAA', 'TCGAGT', 'TCAAT', 'ATAGGG', 'GTACGC', 'ACGCAA', 
        'GGTACG', 'TCCTG', 'CAGAG', 'TTGATG', 'GTCGAC', 'CGGTTG', 'ACATCG', 'CAGC', 'TTTTTT', 
        'GGTATG', 'TAGGGT', 'GGTTC', 'TGGTTC', 'CGCGTA', 'AACC', 'CTCGAT', 'AACGGT', 'ATCAA', 
        'TCCAAC', 'AAAATG', 'AAATG', 'GTTGT', 'GTCATA', 'TCAATT', 'ACAATC', 'AATAAA', 'AATTGG', 
        'ATCATA', 'GGCGTA', 'AAATAA', 'TCAAC', 'CTAACG', 'ACGATA', 'TCAACC', 'ACCGGT', 'CAGA', 
        'GCATAC', 'CTGTA', 'CGGTA', 'GTTGC', 'CTGT', 'ATTGGT', 'GGTCAA', 'CGTA', 'GTCAA', 'TCGCAA', 
        'AGCTG', 'ATGGTA', 'CGCAA', 'GTCAAA', 'GATTGG', 'CTGGA', 'TTCGCA', 'ATCCAA', 'AACCGA', 
        'CCGCAA', 'GTCGA', 'TAACAC', 'TTTTTA', 'ATAACG', 'TATGGT', 'TTGGCG', 'TGGT', 'TCCAT', 
        'CAATT', 'GCGTAA', 'TCGT', 'CCGTAA', 'GGGTTA', 'CAATTA', 'AGCCT', 'TAGCGA', 'TACCAC', 
        'CGCAAA', 'ATATCG', 'CTGCA', 'CGTACC', 'ATCGGT', 'TCGATA', 'TCGTTG', 'TTGGT', 'CGACCA', 
        'GAAGA', 'GT', 'CTGAG', 'ATGCGA', 'CAG', 'GGTAC', 'GGTTGT', 'CCATAC', 'GTTGCG', 'GATAGG', 
        'GAT', 'CAA', 'CATACC', 'AACCGT', 'TTGCGA', 'CGTAT', 'CGTAAT', 'CGATGT', 'TTCCT', 'AC', 
        'CGATCG', 'CGCTAA', 'CACAAC', 'TACC', 'TTGAG', 'CCAATC', 'TATGCG', 'TATCG', 'AGCAG', 'GGTAG', 
        'TCGTAG', 'AACCCG', 'AGAG', 'GAATCA', 'GAAGTT', 'GAAGAT', 'TCTTTA', 'CGAAGA', 'TCGAT', 'CAGT', 
        'CTTGA', 'TACG', 'CGTATG', 'CAAACA', 'TTCG', 'AGGGTT', 'TCGACC', 'CGTT', 'CTGTGA', 'TCAGT', 
        'CAGCT', 'GTGTCT', 'TGTAAA', 'ATGCCT', 'TAAACC', 'ATACCG', 'AACCG', 'CAACAA', 'CATTGG', 
        'GTACTC', 'TGTGAA', 'ATACCC', 'AACACC', 'TGCAGT', 'ATGTAA', 'AGGA', 'AAGCAC', 'GTAGGT', 
        'GTAGCT', 'CGTAGA', 'ACCCAA', 'CCAATA', 'TATTTT', 'CTTG', 'TTTTT', 'GCAATG', 'GTCGTC', 
        'GTCTAA', 'ATGCTG', 'CGTATC', 'TAGTTA', 'CAATCC', 'TTGGTC', 'TCGCA', 'GTCCAA', 'TAGGTC', 
        'CGAC', 'TGATC', 'CATAAC', 'TGATTA', 'CAATAG', 'TCTCGA', 'CTACAG', 'CTTTGG', 'TACGTC', 
        'CTCAGT', 'TTGG', 'TATTTG', 'AAACTG', 'TTCGCC', 'TGACAC', 'ACTCGT', 'CACAAA', 'TGACGA', 
        'CTTGAT', 'TCGACT', 'TGATCG', 'TAACG', 'ATATTT', 'TAGAGG', 'CCGATC', 'TCAACG', 'GTCGAT', 
        'GCGATA', 'ATGCCG', 'ACAAC', 'ACACCG', 'TTACCG', 'CGTAC', 'GTATTT', 'TTGCGT', 'CTCAG', 
        'CTTGCG', 'TTTCGA', 'TCGTA', 'TCCTGA', 'TACGCC', 'GAAG', 'GCTATC', 'CTAGTG', 'CCCTAT', 
        'ACCAGA', 'GTACTG', 'CATTTA', 'GTTAGA', 'ATAATC', 'AGGCAT', 'GATGTT', 'TAGATT', 'CAACGC', 
        'CGATTG', 'ACATTT', 'AAATGT', 'CCGAT', 'GTTATA', 'TGTTG', 'TTCAAT', 'GTCGTT', 'ACCGAT', 
        'CGATAC', 'TCGATG', 'CCACAC', 'CCAT', 'GGGTGT', 'CTATC', 'GTTGTG', 'CTTTCG', 'ATAGAT', 
        'TATGGC', 'AGCTGG', 'CTCT', 'CACAGA', 'TGGTTG', 'CTAG', 'ACGGGT', 'CATTTT', 'TTTTAG', 
        'CATAAT', 'AACCCA', 'CATAG', 'ACGGGA', 'GGTGTT', 'CCGTCC', 'AAGACC', 'ATTGCG', 'CAGTTT', 
        'TGAAGA', 'ATCG', 'AATTTG', 'TCAAAA', 'CCAAC', 'ATCA', 'TGCAG', 'CAAATT', 'ACGTTA', 'CGACTA', 
        'ACCCCT', 'CCCACT', 'CCGTAT', 'CAACGG', 'TCCATC', 'ACTAGG', 'AAGCAG', 'TTTGTA', 'GGGTCA', 
        'CGGTGT', 'GCAGTT', 'ACAATT', 'ACAACC', 'ACGTTG', 'AAATAC', 'TTCAGT', 'AGTTGA', 'GACATA', 
        'CTGACT', 'CTGTTT', 'CAAGGG', 'ATTTTA', 'CAAAC', 'TAGG', 'CTGAA', 'GCGATC', 'CAACCA', 
        'ATTCGC', 'CAATTC', 'CGTTAG', 'CACTAG', 'AGCAGT', 'CTTCTT', 'CAACTG', 'GGTGAG', 'TTCAG', 
        'AAATA', 'CGTTGT', 'TATCGC', 'AGCCA', 'GGTGAT', 'GGATAA', 'GCTGTA', 'GAAGTA', 'TTGAT', 
        'ACCGAC', 'TAGTCG', 'CTGTG', 'ATCTA', 'ACACCA', 'ATGGAT', 'CCTC', 'TGAAG', 'CGCACT', 'TAACCC', 
        'ACGTCA', 'TGATAG', 'TCGAA', 'ATACCA', 'AGGGTA', 'GATCGT', 'ATCCAG', 'TCAGC', 'GCAGAT', 
        'TACCTG', 'ACCCGT', 'CCATA', 'AATTGA', 'AGGTAC', 'GGT', 'CTGCCT', 'ACTAGT', 'TTGTTC', 'TTACTT', 
        'TTTTA', 'CAGAC', 'GCGTAG', 'GAGCAT', 'ATC', 'ATGG', 'AAGCCT', 'CACTCG', 'TCAGTG', 'CTGGAG', 
        'CTTCCT', 'CGTCAT', 'CCGTTG', 'TAGTTG', 'CGTCCA', 'GCCCAG', 'ATGGT', 'TACAC', 'GATAGT', 
        'ACAACA', 'CTATAG', 'AATGGT', 'AGAAA', 'ATGACG', 'GATAAC', 'CAGCTT', 'CATGAT', 'TTCGTT', 
        'GTGTAC', 'TTGAAC', 'CACAAG', 'AATTG', 'GTCA', 'TGCCT', 'CTCGAC', 'CTATCT', 'TGACGG', 
        'CCATAT', 'ACCATT', 'AAAGAG', 'ATTAGA', 'GCGTAT', 'CGTTG', 'TACGT', 'ACGAT', 'TCCATG', 
        'CCTG', 'GTCAAC', 'TTGTGA', 'ACTAG', 'GCGACA', 'TGAAGC', 'TACAAC', 'ACGGTA', 'TAGCGG', 
        'ATTGAT', 'ATCACG', 'ATTCGG', 'TATTTC', 'TAAAAT', 'GTATCG', 'CAACCG', 'CCCCAA', 'CGTAAC',
        'CTAGGG', 'ATCAAA', 'ATTGGA', 'TCGTC', 'GCAACT', 'CACTA', 'CCATAG', 'CACAA', 'ATAGGT', 
        'ACGTAG', 'CATGCG', 'ATGATC', 'CCGTCA', 'CGATT', 'TTTGAC', 'CGTTA', 'TCTGTA', 'TAGATC', 
        'CATAGC', 'TCCAGA', 'ACACTC', 'CGATCA', 'GGATGT', 'TTACGC', 'AACTGC', 'ATCCAT', 'CCACTC', 
        'GTCCAT', 'ACCACT', 'GCGCGG', 'GTCAAG', 'AAGTGT', 'ATGATA', 'AGCT', 'CTAGAG', 'GCAGA', 
        'GTCATG', 'CTGTAG', 'TCATGC', 'CAGAA', 'ACACGG', 'CGGTAT', 'TTTTAA', 'TCAAAG', 'TTACAC', 
        'TCCGCT', 'CCCCCC', 'GCACAA', 'TTCTGT', 'TATGTA', 'GGTAAC', 'TAGCG', 'ATACC', 'GGTATC', 
        'ACAA', 'TCTGT', 'ACTA', 'TCCGTA', 'CTATCA', 'CGCATA', 'TCTCAG', 'CATGGA', 'ATCT', 'TGCGAC', 
        'ACAC', 'GACCAT', 'CAGCAT', 'ACG', 'CTAGTA', 'TTAGCG', 'TAACGA', 'ACTGAG', 'GAAGT', 'CTTGAC', 
        'CGTAGG', 'TCGCTC', 'CTGAAG', 'TTGACA', 'CGGCAA', 'CTCAAG', 'ACGTA', 'CAAGTG', 'TCAGA', 'GAGCAA', 
        'CGAAAG', 'ACTTTT', 'TACCG', 'GTTGTT', 'GCCATA', 'GCGTA', 'ACGA', 'TCGACG', 'ATACG', 'GTAC', 
        'TTGCCG', 'GGCGGG', 'CTAGT', 'GAACCA', 'TTTCTG', 'TCTTGA', 'TAAAA', 'TTGAGG', 'TAACGC', 'TATAGG',
        'GTCGTA', 'GCATTT', 'TACTG', 'TATCGA', 'CGTAA', 'AGGCTT', 'CGTTAT', 'TATC', 'TTGATT', 'TGCAGA', 
        'TAGGCG', 'GCTTGA', 'ATCTAT', 'GGTTAG', 'TACAAT']

# Route each selected k-mer to the file that holds its column:
#   c  - names shorter than 6 characters (read from the *w5 files)
#   b  - 6-character names (read from ppnnw6.csv)
#   b1 - 6-character names starting with A or T (read from h1w6p1.csv)
#   b2 - 6-character names starting with G or C (read from h1w6p2.csv)
c = [f for f in l if len(f) < 6]
b = [f for f in l if len(f) == 6]
b1 = [f for f in b if f[0] in ('A', 'T')]
b2 = [f for f in b if f[0] in ('G', 'C')]

meta_cols = ['Gen', 'die', 'Fam']

# usecols parses only the needed columns instead of the whole file; read_csv
# returns them in file order, so the trailing [cols] selection restores the
# requested order exactly as the original full-file read + selection did.
df1 = pd.concat([pd.read_csv('drive/My Drive/ppnnw5.csv', usecols = meta_cols + c)[meta_cols + c],
                 pd.read_csv('drive/My Drive/ppnnw6.csv', usecols = b)[b]], axis = 1)

# Keep only rows with die == 'no'; .copy() makes the filtered frame
# independent so the assignment below cannot trigger SettingWithCopyWarning.
df1 = df1.loc[df1['die'] == 'no'].copy()
df1['Gen'] = 1

df2 = pd.concat([pd.read_csv('drive/My Drive/h1w5.csv', usecols = ['die'] + c)[['die'] + c],
                 pd.read_csv('drive/My Drive/h1w6p1.csv', usecols = b1)[b1],
                 pd.read_csv('drive/My Drive/h1w6p2.csv', usecols = b2)[b2]], axis = 1)
df2 = df2.loc[df2['die'] == 'no'].copy()
# 'Gen' is not among the selected df2 columns, so it is appended as the LAST
# column; the LOFO cell reads df2's label via iloc[:, -1].
df2['Gen'] = 0

# Stack both classes; columns are aligned by name.
data = pd.concat([df1, df2], axis = 0).reset_index(drop = True)

In [None]:
# Repeated hold-out evaluation of a linear-kernel SVM on the reduced feature
# set: five random 75/25 splits, each with 10-fold CV on the training part.
temp = []

# Features and labels do not change between repetitions, so build them once.
# Column 0 of `data` is the Gen label; feature columns start at position 3.
X = data.iloc[:, 3:].values.astype(float)
y = data.iloc[:, 0].values.astype(float)

for i in range(5):
  # Seeding with the repetition index gives five different but reproducible splits
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = i)

  # Feature scaling: statistics are fitted on the training split only
  sc = StandardScaler()
  X_train = sc.fit_transform(X_train)
  X_test = sc.transform(X_test)

  # Training the linear-kernel SVM on the training split
  classifier = SVC(kernel = 'linear')
  classifier.fit(X_train, y_train)

  # Predicting the test split results
  y_pred = classifier.predict(X_test)

  # Confusion matrix: rows are true labels, columns are predicted labels
  cm = confusion_matrix(y_test, y_pred)

  # Evaluation. One cross_validate call scores both metrics on the same 10
  # fits instead of fitting every fold twice with two cross_val_score calls.
  cv_scores = cross_validate(classifier, X_train, y_train, cv=10, scoring=['f1', 'f1_macro'])
  cv_bin = cv_scores['test_f1']
  cv_mac = cv_scores['test_f1_macro']
  f1bin = f1_score(y_test, y_pred, average='binary')
  f1mac = f1_score(y_test, y_pred, average='macro')

  # Build the result row once and reuse it for both the table and the log line
  row = (cm[1,1], cm[0,1], cm[0,0], cm[1,0], cv_bin.mean(), cv_bin.std(), cv_mac.mean(), cv_mac.std(), f1bin, f1mac)
  temp.append(row)
  print('correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** cv_bin mean: %f ** cv_bin std: %f ** cv_mac mean: %f ** cv_mac std: %f ** f1bin: %f ** f1mac: %f' 
        % row)

# Keep the per-split results as a table, mirroring svmallf; without this the
# rows are lost when the next cell resets `temp`.
svm194f = pd.DataFrame(temp, columns = ['TP', 'FP', 'TN', 'FN', 'cv_bin mean', 'cv_bin std', 'cv_mac mean', 'cv_mac std', 'f1bin', 'f1mac'])

correct/TP: 1051 ** FP: 0 **  TN: 3847 ** FN: 0 ** cv_bin mean: 0.998860 ** cv_bin std: 0.001272 ** cv_mac mean: 0.999279 ** cv_mac std: 0.000804 ** f1bin: 1.000000 ** f1mac: 1.000000
correct/TP: 1081 ** FP: 0 **  TN: 3817 ** FN: 0 ** cv_bin mean: 0.998850 ** cv_bin std: 0.001281 ** cv_mac mean: 0.999275 ** cv_mac std: 0.000808 ** f1bin: 1.000000 ** f1mac: 1.000000
correct/TP: 1030 ** FP: 1 **  TN: 3867 ** FN: 0 ** cv_bin mean: 0.998867 ** cv_bin std: 0.000742 ** cv_mac mean: 0.999282 ** cv_mac std: 0.000470 ** f1bin: 0.999515 ** f1mac: 0.999693
correct/TP: 1020 ** FP: 3 **  TN: 3875 ** FN: 0 ** cv_bin mean: 0.999355 ** cv_bin std: 0.001070 ** cv_mac mean: 0.999591 ** cv_mac std: 0.000678 ** f1bin: 0.998532 ** f1mac: 0.999072
correct/TP: 1043 ** FP: 0 **  TN: 3855 ** FN: 0 ** cv_bin mean: 0.999025 ** cv_bin std: 0.001077 ** cv_mac mean: 0.999383 ** cv_mac std: 0.000682 ** f1bin: 1.000000 ** f1mac: 1.000000


## LOFO CV (leave-one-family-out cross-validation)

In [None]:
# Leave-one-family-out evaluation on the reduced feature set: every value of
# df1['Fam'] is held out in turn and the linear SVM is trained on the rest.
temp = []

# Reload df1 WITHOUT the die == 'no' filter so each held-out family is tested
# on all of its rows; the filter is applied to the training rows only (inside
# the loop). usecols parses just the needed columns, and the trailing [cols]
# selection restores the requested column order.
meta_cols = ['Gen', 'die', 'Fam']
df1 = pd.concat([pd.read_csv('drive/My Drive/ppnnw5.csv', usecols = meta_cols + c)[meta_cols + c],
                 pd.read_csv('drive/My Drive/ppnnw6.csv', usecols = b)[b]], axis = 1)
df1['Gen'] = 1

# A fixed 30% of df2 is held out once and reused as the class-0 test rows for
# every family; seeded so that the split is reproducible.
df2test = df2.sample(frac = 0.3, random_state = 0)
df2train = df2[~df2.index.isin(df2test.index)]

# Sorted unique families give a deterministic order (iteration order of a set
# of strings can change between interpreter runs); rows with a missing Fam
# are skipped because they would produce an empty held-out set.
for fam in sorted(df1['Fam'].dropna().unique()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    # df1 layout: Gen, die, Fam, features...; df2 layout: die, features..., Gen
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    # Scale with statistics fitted on the training rows only
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting the linear-kernel SVM to the training set
    classifier = SVC(kernel = 'linear')
    classifier.fit(X_train, y_train)
    
    # Predicting the test set results
    y_pred = classifier.predict(X_test)
    
    # Confusion matrix: rows are true labels, columns are predicted labels
    cm = confusion_matrix(y_test, y_pred)
    
    # Evaluation
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')
    
    # Build the result row once and reuse it for both the table and the log line
    row = (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac)
    temp.append(row)

    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % row)
  
svmlofocv_194f = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 

Fam: togaviridae ** total: 84 ** correct/TP: 84 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.994083 ** f1mac: 0.996988
Fam: barnaviridae ** total: 2 ** correct/TP: 2 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.800000 ** f1mac: 0.899946
Fam: arteriviridae ** total: 48 ** correct/TP: 48 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.989691 ** f1mac: 0.994792
Fam: chuviridae ** total: 4 ** correct/TP: 4 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.888889 ** f1mac: 0.944391
Fam: potyviridae ** total: 362 ** correct/TP: 362 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.998621 ** f1mac: 0.999256
Fam: nodaviridae ** total: 20 ** correct/TP: 20 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.975610 ** f1mac: 0.987751
Fam: hypoviridae ** total: 24 ** correct/TP: 24 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.979592 ** f1mac: 0.989742
Fam: artoviridae ** total: 2 ** correct/TP: 2 ** FP: 1 **  TN: 4642 ** FN: 0 ** f1bin: 0.800000 ** f1mac: 0.899946
Fam: filoviridae ** total: 32 ** correct/TP: 32 ** FP: 1 **  TN: 4

In [None]:
# Display the per-family LOFO results table for the reduced-feature model
svmlofocv_194f

Unnamed: 0,Fam,Total,Correct/TP,FP,TN,FN,f1bin,f1mac
0,togaviridae,84,84,1,4642,0,0.994083,0.996988
1,barnaviridae,2,2,1,4642,0,0.8,0.899946
2,arteriviridae,48,48,1,4642,0,0.989691,0.994792
3,chuviridae,4,4,1,4642,0,0.888889,0.944391
4,potyviridae,362,362,1,4642,0,0.998621,0.999256
5,nodaviridae,20,20,1,4642,0,0.97561,0.987751
6,hypoviridae,24,24,1,4642,0,0.979592,0.989742
7,artoviridae,2,2,1,4642,0,0.8,0.899946
8,filoviridae,32,32,1,4642,0,0.984615,0.992254
9,virgaviridae,86,86,0,4643,0,1.0,1.0


# 68 Features

In [None]:
# The 68 k-mer feature column names kept for this model (4 to 6 characters
# long); the following statements split them by length and first letter to
# pick the matching CSV columns.
l =    ['TCAATC', 'CCAATT', 'GTTGAC', 'TGAT', 'GGTT', 'GGTTGA', 'GTTGA',
       'TAGCGT', 'CGGTTA', 'TTGACG', 'CGATA', 'CGTCAA', 'CAATCG', 'CGTTGA',
       'GTCGAA', 'GTTG', 'CGGTAA', 'CATACG', 'AAAAAA', 'ACCAAT', 'TCAA',
       'TTGTCG', 'CGATAG', 'AAAAA', 'CAAC', 'ATACGC', 'CAATTG', 'CCAAT',
       'CGCAAT', 'GGTA', 'CAAT', 'TGTCGA', 'CGAT', 'CAATCT', 'GCGTAC', 'GGTTG',
       'TTGA', 'GTTGGT', 'TCCAAT', 'TTGCG', 'ACGGTT', 'CGATTA', 'ATCAAT',
       'GCAATC', 'ATCAAC', 'GTTGAT', 'GTCAAT', 'CGTCGA', 'ATTGG', 'CGTAGT',
       'ATAGCG', 'CCGATA', 'CAATC', 'CATCAA', 'TTCGAC', 'TTGAC', 'TTGCGC',
       'GCGTTA', 'CCGTTA', 'ATACGG', 'CGCGTT', 'CGATC', 'CGATAA', 'ATCGTA',
       'TCGA', 'TCGGTT', 'GATC', 'CGGTT']

# Route each selected k-mer to the file that holds its column:
#   c  - names shorter than 6 characters (read from the *w5 files)
#   b  - 6-character names (read from ppnnw6.csv)
#   b1 - 6-character names starting with A or T (read from h1w6p1.csv)
#   b2 - 6-character names starting with G or C (read from h1w6p2.csv)
c = [f for f in l if len(f) < 6]
b = [f for f in l if len(f) == 6]
b1 = [f for f in b if f[0] in ('A', 'T')]
b2 = [f for f in b if f[0] in ('G', 'C')]

meta_cols = ['Gen', 'die', 'Fam']

# usecols parses only the needed columns instead of the whole file; read_csv
# returns them in file order, so the trailing [cols] selection restores the
# requested order exactly as the original full-file read + selection did.
df1 = pd.concat([pd.read_csv('drive/My Drive/ppnnw5.csv', usecols = meta_cols + c)[meta_cols + c],
                 pd.read_csv('drive/My Drive/ppnnw6.csv', usecols = b)[b]], axis = 1)

# Keep only rows with die == 'no'; .copy() makes the filtered frame
# independent so the assignment below cannot trigger SettingWithCopyWarning.
df1 = df1.loc[df1['die'] == 'no'].copy()
df1['Gen'] = 1

df2 = pd.concat([pd.read_csv('drive/My Drive/h1w5.csv', usecols = ['die'] + c)[['die'] + c],
                 pd.read_csv('drive/My Drive/h1w6p1.csv', usecols = b1)[b1],
                 pd.read_csv('drive/My Drive/h1w6p2.csv', usecols = b2)[b2]], axis = 1)
df2 = df2.loc[df2['die'] == 'no'].copy()
# 'Gen' is not among the selected df2 columns, so it is appended as the LAST
# column; the LOFO cell reads df2's label via iloc[:, -1].
df2['Gen'] = 0

# Stack both classes; columns are aligned by name.
data = pd.concat([df1, df2], axis = 0).reset_index(drop = True)

In [None]:
# Repeated hold-out evaluation of a linear-kernel SVM on the 68-feature set:
# five random 75/25 splits, each with 10-fold CV on the training part.
temp = []

# Features and labels do not change between repetitions, so build them once.
# Column 0 of `data` is the Gen label; feature columns start at position 3.
X = data.iloc[:, 3:].values.astype(float)
y = data.iloc[:, 0].values.astype(float)

for i in range(5):
  # Seeding with the repetition index gives five different but reproducible splits
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = i)

  # Feature scaling: statistics are fitted on the training split only
  sc = StandardScaler()
  X_train = sc.fit_transform(X_train)
  X_test = sc.transform(X_test)

  # Training the linear-kernel SVM on the training split
  classifier = SVC(kernel = 'linear')
  classifier.fit(X_train, y_train)

  # Predicting the test split results
  y_pred = classifier.predict(X_test)

  # Confusion matrix: rows are true labels, columns are predicted labels
  cm = confusion_matrix(y_test, y_pred)

  # Evaluation. One cross_validate call scores both metrics on the same 10
  # fits instead of fitting every fold twice with two cross_val_score calls.
  cv_scores = cross_validate(classifier, X_train, y_train, cv=10, scoring=['f1', 'f1_macro'])
  cv_bin = cv_scores['test_f1']
  cv_mac = cv_scores['test_f1_macro']
  f1bin = f1_score(y_test, y_pred, average='binary')
  f1mac = f1_score(y_test, y_pred, average='macro')

  # Build the result row once and reuse it for both the table and the log line
  row = (cm[1,1], cm[0,1], cm[0,0], cm[1,0], cv_bin.mean(), cv_bin.std(), cv_mac.mean(), cv_mac.std(), f1bin, f1mac)
  temp.append(row)
  print('correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** cv_bin mean: %f ** cv_bin std: %f ** cv_mac mean: %f ** cv_mac std: %f ** f1bin: %f ** f1mac: %f' 
        % row)

# Keep the per-split results as a table, mirroring svmallf; without this the
# rows are lost when the next cell resets `temp`.
svm68f = pd.DataFrame(temp, columns = ['TP', 'FP', 'TN', 'FN', 'cv_bin mean', 'cv_bin std', 'cv_mac mean', 'cv_mac std', 'f1bin', 'f1mac'])

correct/TP: 1016 ** FP: 11 **  TN: 3854 ** FN: 17 ** cv_bin mean: 0.984278 ** cv_bin std: 0.004516 ** cv_mac mean: 0.990050 ** cv_mac std: 0.002851 ** f1bin: 0.986408 ** f1mac: 0.991394
correct/TP: 954 ** FP: 13 **  TN: 3922 ** FN: 9 ** cv_bin mean: 0.985245 ** cv_bin std: 0.005137 ** cv_mac mean: 0.990608 ** cv_mac std: 0.003267 ** f1bin: 0.988601 ** f1mac: 0.992902
correct/TP: 995 ** FP: 19 **  TN: 3863 ** FN: 21 ** cv_bin mean: 0.986937 ** cv_bin std: 0.005561 ** cv_mac mean: 0.991721 ** cv_mac std: 0.003523 ** f1bin: 0.980296 ** f1mac: 0.987572
correct/TP: 1037 ** FP: 23 **  TN: 3823 ** FN: 15 ** cv_bin mean: 0.986466 ** cv_bin std: 0.005747 ** cv_mac mean: 0.991448 ** cv_mac std: 0.003630 ** f1bin: 0.982008 ** f1mac: 0.988531
correct/TP: 982 ** FP: 14 **  TN: 3888 ** FN: 14 ** cv_bin mean: 0.985283 ** cv_bin std: 0.003978 ** cv_mac mean: 0.990653 ** cv_mac std: 0.002525 ** f1bin: 0.985944 ** f1mac: 0.991178


In [None]:
# Leave-one-family-out evaluation on the 68-feature set: every value of
# df1['Fam'] is held out in turn and the linear SVM is trained on the rest.
temp = []

# Reload df1 WITHOUT the die == 'no' filter so each held-out family is tested
# on all of its rows; the filter is applied to the training rows only (inside
# the loop). usecols parses just the needed columns, and the trailing [cols]
# selection restores the requested column order.
meta_cols = ['Gen', 'die', 'Fam']
df1 = pd.concat([pd.read_csv('drive/My Drive/ppnnw5.csv', usecols = meta_cols + c)[meta_cols + c],
                 pd.read_csv('drive/My Drive/ppnnw6.csv', usecols = b)[b]], axis = 1)
df1['Gen'] = 1

# A fixed 30% of df2 is held out once and reused as the class-0 test rows for
# every family; seeded so that the split is reproducible.
df2test = df2.sample(frac = 0.3, random_state = 0)
df2train = df2[~df2.index.isin(df2test.index)]

# Sorted unique families give a deterministic order (iteration order of a set
# of strings can change between interpreter runs); rows with a missing Fam
# are skipped because they would produce an empty held-out set.
for fam in sorted(df1['Fam'].dropna().unique()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    # df1 layout: Gen, die, Fam, features...; df2 layout: die, features..., Gen
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    # Scale with statistics fitted on the training rows only
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting the linear-kernel SVM to the training set
    classifier = SVC(kernel = 'linear')
    classifier.fit(X_train, y_train)
    
    # Predicting the test set results
    y_pred = classifier.predict(X_test)
    
    # Confusion matrix: rows are true labels, columns are predicted labels
    cm = confusion_matrix(y_test, y_pred)
    
    # Evaluation
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')
    
    # Build the result row once and reuse it for both the table and the log line
    row = (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac)
    temp.append(row)

    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % row)
  
svmlofocv_68f = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 

Fam: togaviridae ** total: 84 ** correct/TP: 84 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.898396 ** f1mac: 0.948173
Fam: barnaviridae ** total: 2 ** correct/TP: 2 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.173913 ** f1mac: 0.585931
Fam: arteriviridae ** total: 48 ** correct/TP: 48 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.834783 ** f1mac: 0.916366
Fam: chuviridae ** total: 4 ** correct/TP: 4 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.296296 ** f1mac: 0.647123
Fam: potyviridae ** total: 362 ** correct/TP: 361 ** FP: 19 **  TN: 4624 ** FN: 1 ** f1bin: 0.973046 ** f1mac: 0.985444
Fam: nodaviridae ** total: 20 ** correct/TP: 20 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.677966 ** f1mac: 0.837958
Fam: hypoviridae ** total: 24 ** correct/TP: 24 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.716418 ** f1mac: 0.857184
Fam: artoviridae ** total: 2 ** correct/TP: 2 ** FP: 19 **  TN: 4624 ** FN: 0 ** f1bin: 0.173913 ** f1mac: 0.585931
Fam: filoviridae ** total: 32 ** correct/TP: 32 ** FP: 19 

In [None]:
# Display the per-family LOFO results table for the 68-feature model
svmlofocv_68f

Unnamed: 0,Fam,Total,Correct/TP,FP,TN,FN,f1bin,f1mac
0,togaviridae,84,84,19,4624,0,0.898396,0.948173
1,barnaviridae,2,2,19,4624,0,0.173913,0.585931
2,arteriviridae,48,48,19,4624,0,0.834783,0.916366
3,chuviridae,4,4,19,4624,0,0.296296,0.647123
4,potyviridae,362,361,19,4624,1,0.973046,0.985444
5,nodaviridae,20,20,19,4624,0,0.677966,0.837958
6,hypoviridae,24,24,19,4624,0,0.716418,0.857184
7,artoviridae,2,2,19,4624,0,0.173913,0.585931
8,filoviridae,32,32,19,4624,0,0.771084,0.884517
9,virgaviridae,86,86,19,4624,0,0.900524,0.949237
