# Using all 5460 features

In [2]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqUtils
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFECV
from sklearn.metrics import confusion_matrix
import scikitplot as skplt
from sklearn import metrics
from collections import OrderedDict
from sklearn.datasets import make_classification

In [None]:
#Load virus data
df1 = pd.concat([pd.read_csv('ppnnw5.csv')[['Gen', 'die', 'Fam']],
                 pd.read_csv('ppnnw5.csv').iloc[:,10:],
                 pd.read_csv('ppnnw6.csv').iloc[:,10:]], axis = 1)

#df1 = df1.loc[df1['die'] == 'no']
df1['Gen'] = 1

#load human data
df2 = pd.concat([pd.read_csv('h1w5.csv')[['die']],
                 pd.read_csv('h1w5.csv').iloc[:, 5:], 
                 pd.read_csv('h1w6p1.csv').iloc[:,5:],
                 pd.read_csv('h1w6p2.csv').iloc[:,5:]], axis = 1)
df2 = df2.loc[df2['die'] == 'no']
df2['Gen'] = 0

In [None]:
temp = []

df2test = df2.sample(frac = 0.3)
df2train = df2[~df2.index.isin(df2test.index)]

for fam in set(df1['Fam'].to_list()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    #evaluation and visualization
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')
    
    temp.append((fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))

    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))

#saving results
lofocv_allf = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 
lofocv_allf.to_csv('drive/My Drive/lofocvallf')

In [None]:
# Using 194 features

In [3]:
l =    ['CAATCG', 'GGTA', 'GTTGA', 'CATACG', 'TGAT', 'GTCGAA', 'CCAATT', 'TGTCGA', 'GTTGAC', 
        'CGGTTA', 'CGATA', 'TAGCGT', 'AAAAAA', 'GGTTGA', 'AAAAA', 'CAAT', 'CGTCAA', 'CGGTAA', 
        'CGTTGA', 'CAAC', 'ACCAAT', 'TTGACG', 'TCAATC', 'GTTG', 'CAATTG', 'CGCAAT', 'GGTT', 
        'TTGTCG', 'TCAA', 'CCAAT', 'ATACGC', 'CGATAG', 'TTGA', 'CATCAA', 'GTTGAT', 'ATACGG', 
        'CGATC', 'GTTGGT', 'GATC', 'CCGATA', 'GTCAAT', 'CGATAA', 'CGCGTT', 'ATCGTA', 'TCGGTT', 
        'ATCAAC', 'ATTGG', 'TCGA', 'GCAATC', 'CAATCT', 'GCGTAC', 'TTCGAC', 'GGTTG', 'TCCAAT', 
        'CGTAGT', 'TTGCG', 'CGGTT', 'CGATTA', 'ACGGTT', 'ATCAAT', 'TTGCGC', 'CAATC', 'CGAT', 
        'GCGTTA', 'CCGTTA', 'TTGAC', 'CGTCGA', 'ATAGCG', 'GTTGAG', 'TATCCG', 'GGTAGT', 'CGTTGC', 
        'TCGGTA', 'TGGTT', 'AGGTTG', 'AGGAA', 'TCGAGT', 'TCAAT', 'ATAGGG', 'GTACGC', 'ACGCAA', 
        'GGTACG', 'TCCTG', 'CAGAG', 'TTGATG', 'GTCGAC', 'CGGTTG', 'ACATCG', 'CAGC', 'TTTTTT', 
        'GGTATG', 'TAGGGT', 'GGTTC', 'TGGTTC', 'CGCGTA', 'AACC', 'CTCGAT', 'AACGGT', 'ATCAA', 
        'TCCAAC', 'AAAATG', 'AAATG', 'GTTGT', 'GTCATA', 'TCAATT', 'ACAATC', 'AATAAA', 'AATTGG', 
        'ATCATA', 'GGCGTA', 'AAATAA', 'TCAAC', 'CTAACG', 'ACGATA', 'TCAACC', 'ACCGGT', 'CAGA', 
        'GCATAC', 'CTGTA', 'CGGTA', 'GTTGC', 'CTGT', 'ATTGGT', 'GGTCAA', 'CGTA', 'GTCAA', 'TCGCAA', 
        'AGCTG', 'ATGGTA', 'CGCAA', 'GTCAAA', 'GATTGG', 'CTGGA', 'TTCGCA', 'ATCCAA', 'AACCGA', 
        'CCGCAA', 'GTCGA', 'TAACAC', 'TTTTTA', 'ATAACG', 'TATGGT', 'TTGGCG', 'TGGT', 'TCCAT', 
        'CAATT', 'GCGTAA', 'TCGT', 'CCGTAA', 'GGGTTA', 'CAATTA', 'AGCCT', 'TAGCGA', 'TACCAC', 
        'CGCAAA', 'ATATCG', 'CTGCA', 'CGTACC', 'ATCGGT', 'TCGATA', 'TCGTTG', 'TTGGT', 'CGACCA', 
        'GAAGA', 'GT', 'CTGAG', 'ATGCGA', 'CAG', 'GGTAC', 'GGTTGT', 'CCATAC', 'GTTGCG', 'GATAGG', 
        'GAT', 'CAA', 'CATACC', 'AACCGT', 'TTGCGA', 'CGTAT', 'CGTAAT', 'CGATGT', 'TTCCT', 'AC', 
        'CGATCG', 'CGCTAA', 'CACAAC', 'TACC', 'TTGAG', 'CCAATC', 'TATGCG', 'TATCG', 'AGCAG', 'GGTAG', 
        'TCGTAG', 'AACCCG', 'AGAG', 'GAATCA', 'GAAGTT', 'GAAGAT', 'TCTTTA', 'CGAAGA', 'TCGAT', 'CAGT', 
        'CTTGA', 'TACG', 'CGTATG', 'CAAACA', 'TTCG', 'AGGGTT', 'TCGACC', 'CGTT', 'CTGTGA', 'TCAGT', 
        'CAGCT', 'GTGTCT', 'TGTAAA', 'ATGCCT', 'TAAACC', 'ATACCG', 'AACCG', 'CAACAA', 'CATTGG', 
        'GTACTC', 'TGTGAA', 'ATACCC', 'AACACC', 'TGCAGT', 'ATGTAA', 'AGGA', 'AAGCAC', 'GTAGGT', 
        'GTAGCT', 'CGTAGA', 'ACCCAA', 'CCAATA', 'TATTTT', 'CTTG', 'TTTTT', 'GCAATG', 'GTCGTC', 
        'GTCTAA', 'ATGCTG', 'CGTATC', 'TAGTTA', 'CAATCC', 'TTGGTC', 'TCGCA', 'GTCCAA', 'TAGGTC', 
        'CGAC', 'TGATC', 'CATAAC', 'TGATTA', 'CAATAG', 'TCTCGA', 'CTACAG', 'CTTTGG', 'TACGTC', 
        'CTCAGT', 'TTGG', 'TATTTG', 'AAACTG', 'TTCGCC', 'TGACAC', 'ACTCGT', 'CACAAA', 'TGACGA', 
        'CTTGAT', 'TCGACT', 'TGATCG', 'TAACG', 'ATATTT', 'TAGAGG', 'CCGATC', 'TCAACG', 'GTCGAT', 
        'GCGATA', 'ATGCCG', 'ACAAC', 'ACACCG', 'TTACCG', 'CGTAC', 'GTATTT', 'TTGCGT', 'CTCAG', 
        'CTTGCG', 'TTTCGA', 'TCGTA', 'TCCTGA', 'TACGCC', 'GAAG', 'GCTATC', 'CTAGTG', 'CCCTAT', 
        'ACCAGA', 'GTACTG', 'CATTTA', 'GTTAGA', 'ATAATC', 'AGGCAT', 'GATGTT', 'TAGATT', 'CAACGC', 
        'CGATTG', 'ACATTT', 'AAATGT', 'CCGAT', 'GTTATA', 'TGTTG', 'TTCAAT', 'GTCGTT', 'ACCGAT', 
        'CGATAC', 'TCGATG', 'CCACAC', 'CCAT', 'GGGTGT', 'CTATC', 'GTTGTG', 'CTTTCG', 'ATAGAT', 
        'TATGGC', 'AGCTGG', 'CTCT', 'CACAGA', 'TGGTTG', 'CTAG', 'ACGGGT', 'CATTTT', 'TTTTAG', 
        'CATAAT', 'AACCCA', 'CATAG', 'ACGGGA', 'GGTGTT', 'CCGTCC', 'AAGACC', 'ATTGCG', 'CAGTTT', 
        'TGAAGA', 'ATCG', 'AATTTG', 'TCAAAA', 'CCAAC', 'ATCA', 'TGCAG', 'CAAATT', 'ACGTTA', 'CGACTA', 
        'ACCCCT', 'CCCACT', 'CCGTAT', 'CAACGG', 'TCCATC', 'ACTAGG', 'AAGCAG', 'TTTGTA', 'GGGTCA', 
        'CGGTGT', 'GCAGTT', 'ACAATT', 'ACAACC', 'ACGTTG', 'AAATAC', 'TTCAGT', 'AGTTGA', 'GACATA', 
        'CTGACT', 'CTGTTT', 'CAAGGG', 'ATTTTA', 'CAAAC', 'TAGG', 'CTGAA', 'GCGATC', 'CAACCA', 
        'ATTCGC', 'CAATTC', 'CGTTAG', 'CACTAG', 'AGCAGT', 'CTTCTT', 'CAACTG', 'GGTGAG', 'TTCAG', 
        'AAATA', 'CGTTGT', 'TATCGC', 'AGCCA', 'GGTGAT', 'GGATAA', 'GCTGTA', 'GAAGTA', 'TTGAT', 
        'ACCGAC', 'TAGTCG', 'CTGTG', 'ATCTA', 'ACACCA', 'ATGGAT', 'CCTC', 'TGAAG', 'CGCACT', 'TAACCC', 
        'ACGTCA', 'TGATAG', 'TCGAA', 'ATACCA', 'AGGGTA', 'GATCGT', 'ATCCAG', 'TCAGC', 'GCAGAT', 
        'TACCTG', 'ACCCGT', 'CCATA', 'AATTGA', 'AGGTAC', 'GGT', 'CTGCCT', 'ACTAGT', 'TTGTTC', 'TTACTT', 
        'TTTTA', 'CAGAC', 'GCGTAG', 'GAGCAT', 'ATC', 'ATGG', 'AAGCCT', 'CACTCG', 'TCAGTG', 'CTGGAG', 
        'CTTCCT', 'CGTCAT', 'CCGTTG', 'TAGTTG', 'CGTCCA', 'GCCCAG', 'ATGGT', 'TACAC', 'GATAGT', 
        'ACAACA', 'CTATAG', 'AATGGT', 'AGAAA', 'ATGACG', 'GATAAC', 'CAGCTT', 'CATGAT', 'TTCGTT', 
        'GTGTAC', 'TTGAAC', 'CACAAG', 'AATTG', 'GTCA', 'TGCCT', 'CTCGAC', 'CTATCT', 'TGACGG', 
        'CCATAT', 'ACCATT', 'AAAGAG', 'ATTAGA', 'GCGTAT', 'CGTTG', 'TACGT', 'ACGAT', 'TCCATG', 
        'CCTG', 'GTCAAC', 'TTGTGA', 'ACTAG', 'GCGACA', 'TGAAGC', 'TACAAC', 'ACGGTA', 'TAGCGG', 
        'ATTGAT', 'ATCACG', 'ATTCGG', 'TATTTC', 'TAAAAT', 'GTATCG', 'CAACCG', 'CCCCAA', 'CGTAAC',
        'CTAGGG', 'ATCAAA', 'ATTGGA', 'TCGTC', 'GCAACT', 'CACTA', 'CCATAG', 'CACAA', 'ATAGGT', 
        'ACGTAG', 'CATGCG', 'ATGATC', 'CCGTCA', 'CGATT', 'TTTGAC', 'CGTTA', 'TCTGTA', 'TAGATC', 
        'CATAGC', 'TCCAGA', 'ACACTC', 'CGATCA', 'GGATGT', 'TTACGC', 'AACTGC', 'ATCCAT', 'CCACTC', 
        'GTCCAT', 'ACCACT', 'GCGCGG', 'GTCAAG', 'AAGTGT', 'ATGATA', 'AGCT', 'CTAGAG', 'GCAGA', 
        'GTCATG', 'CTGTAG', 'TCATGC', 'CAGAA', 'ACACGG', 'CGGTAT', 'TTTTAA', 'TCAAAG', 'TTACAC', 
        'TCCGCT', 'CCCCCC', 'GCACAA', 'TTCTGT', 'TATGTA', 'GGTAAC', 'TAGCG', 'ATACC', 'GGTATC', 
        'ACAA', 'TCTGT', 'ACTA', 'TCCGTA', 'CTATCA', 'CGCATA', 'TCTCAG', 'CATGGA', 'ATCT', 'TGCGAC', 
        'ACAC', 'GACCAT', 'CAGCAT', 'ACG', 'CTAGTA', 'TTAGCG', 'TAACGA', 'ACTGAG', 'GAAGT', 'CTTGAC', 
        'CGTAGG', 'TCGCTC', 'CTGAAG', 'TTGACA', 'CGGCAA', 'CTCAAG', 'ACGTA', 'CAAGTG', 'TCAGA', 'GAGCAA', 
        'CGAAAG', 'ACTTTT', 'TACCG', 'GTTGTT', 'GCCATA', 'GCGTA', 'ACGA', 'TCGACG', 'ATACG', 'GTAC', 
        'TTGCCG', 'GGCGGG', 'CTAGT', 'GAACCA', 'TTTCTG', 'TCTTGA', 'TAAAA', 'TTGAGG', 'TAACGC', 'TATAGG',
        'GTCGTA', 'GCATTT', 'TACTG', 'TATCGA', 'CGTAA', 'AGGCTT', 'CGTTAT', 'TATC', 'TTGATT', 'TGCAGA', 
        'TAGGCG', 'GCTTGA', 'ATCTAT', 'GGTTAG', 'TACAAT']

c = [f for f in l if len(f) < 6]
b = [f for f in l if len(f) == 6]
b1 = [f for f in b if f[0] == 'A' or f[0] == 'T']
b2 = [f for f in b if f[0] == 'G' or f[0] == 'C']

In [4]:
df1 = pd.concat([pd.read_csv('../ppnnw5.csv')[['Gen', 'die', 'Fam'] + c], 
                 pd.read_csv('../ppnnw6.csv')[b]], axis = 1)

#df1 = df1.loc[df1['die'] == 'no']
df1['Gen'] = 1

df2 = pd.concat([pd.read_csv('../h1w5.csv')[['die'] + c], 
                 pd.read_csv('../h1w6p1.csv')[b1],
                 pd.read_csv('../h1w6p2.csv')[b2]], axis = 1)
df2 = df2.loc[df2['die'] == 'no']
df2['Gen'] = 0

In [5]:
temp = []

df2test = df2.sample(frac = 0.3)
df2train = df2[~df2.index.isin(df2test.index)]

for fam in set(df1['Fam'].to_list()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    #scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    #evaluation and visualization
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')   
   
    temp.append((fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))
    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))

lofocv_194 = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 
lofocv_194.to_csv('drive/My Drive/lofocv194f')

NameError: name 'confusion_matrix' is not defined

# Using 68 features

In [52]:
l =    ['TCAATC', 'CCAATT', 'GTTGAC', 'TGAT', 'GGTT', 'GGTTGA', 'GTTGA',
       'TAGCGT', 'CGGTTA', 'TTGACG', 'CGATA', 'CGTCAA', 'CAATCG', 'CGTTGA',
       'GTCGAA', 'GTTG', 'CGGTAA', 'CATACG', 'AAAAAA', 'ACCAAT', 'TCAA',
       'TTGTCG', 'CGATAG', 'AAAAA', 'CAAC', 'ATACGC', 'CAATTG', 'CCAAT',
       'CGCAAT', 'GGTA', 'CAAT', 'TGTCGA', 'CGAT', 'CAATCT', 'GCGTAC', 'GGTTG',
       'TTGA', 'GTTGGT', 'TCCAAT', 'TTGCG', 'ACGGTT', 'CGATTA', 'ATCAAT',
       'GCAATC', 'ATCAAC', 'GTTGAT', 'GTCAAT', 'CGTCGA', 'ATTGG', 'CGTAGT',
       'ATAGCG', 'CCGATA', 'CAATC', 'CATCAA', 'TTCGAC', 'TTGAC', 'TTGCGC',
       'GCGTTA', 'CCGTTA', 'ATACGG', 'CGCGTT', 'CGATC', 'CGATAA', 'ATCGTA',
       'TCGA', 'TCGGTT', 'GATC', 'CGGTT']

c = [f for f in l if len(f) < 6]
b = [f for f in l if len(f) == 6]
b1 = [f for f in b if f[0] == 'A' or f[0] == 'T']
b2 = [f for f in b if f[0] == 'G' or f[0] == 'C']

In [59]:
df1 = pd.concat([pd.read_csv('../ppnnw5.csv')[['Gen', 'die', 'Fam'] + c], 
                 pd.read_csv('../ppnnw6.csv')[b]], axis = 1)

#df1 = df1.loc[df1['die'] == 'no']
df1['Gen'] = 1

df2 = pd.concat([pd.read_csv('../h1w5.csv')[['die'] + c], 
                 pd.read_csv('../h1w6p1.csv')[b1],
                 pd.read_csv('../h1w6p2.csv')[b2]], axis = 1)
df2 = df2.loc[df2['die'] == 'no']
df2['Gen'] = 0

In [62]:
temp = []

df2test = df2.sample(frac = 0.3)
df2train = df2[~df2.index.isin(df2test.index)]

for fam in set(df1['Fam'].to_list()):
    df1test = df1.loc[df1['Fam'] == fam]
    df1train = df1.loc[~(df1['Fam'] == fam)]
    df1train = df1train.loc[df1train['die'] == 'no']
    
    X_train = pd.concat([df1train.iloc[:,3:], df2train.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_train = pd.concat([df1train.iloc[:,0], df2train.iloc[:,-1]], axis = 0).reset_index(drop = True)

    X_test = pd.concat([df1test.iloc[:,3:], df2test.iloc[:,1:-1]], axis = 0).reset_index(drop = True)
    y_test = pd.concat([df1test.iloc[:,0], df2test.iloc[:,-1]], axis = 0).reset_index(drop = True)
    
    #scaling
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    
    # Fitting Random Forest Classification to the Training set
    classifier = RandomForestClassifier(n_estimators = 200, criterion = 'entropy')
    classifier.fit(X_train, y_train)
    
    # Predicting the Test set results
    y_pred = classifier.predict(X_test)
    
    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    
    #evaluation and visualization
    f1bin = f1_score(y_test, y_pred, average='binary')
    f1mac = f1_score(y_test, y_pred, average='macro')   
   
    temp.append((fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))
    print('Fam: %s ** total: %d ** correct/TP: %d ** FP: %d **  TN: %d ** FN: %d ** f1bin: %f ** f1mac: %f' 
          % (fam, len(df1test), cm[1,1], cm[0,1], cm[0,0], cm[1,0], f1bin, f1mac))

lofocv_68 = pd.DataFrame(temp, columns = ['Fam', 'Total', 'Correct/TP', 'FP', 'TN', 'FN', 'f1bin', 'f1mac']) 
lofocv_68.to_csv('drive/My Drive/lofocv68f')

Fam: kitaviridae ** total: 2 ** correct/TP: 2 ** FP: 5 **  TN: 4638 ** FN: 0 ** f1bin: 0.444444 ** f1mac: 0.721953
Fam: hepeviridae ** total: 32 ** correct/TP: 32 ** FP: 5 **  TN: 4638 ** FN: 0 ** f1bin: 0.927536 ** f1mac: 0.963499
Fam: caliciviridae ** total: 1762 ** correct/TP: 1576 ** FP: 5 **  TN: 4638 ** FN: 186 ** f1bin: 0.942866 ** f1mac: 0.961345
Fam: luteoviridae ** total: 106 ** correct/TP: 106 ** FP: 7 **  TN: 4636 ** FN: 0 ** f1bin: 0.968037 ** f1mac: 0.983641
Fam: mesoniviridae ** total: 26 ** correct/TP: 26 ** FP: 5 **  TN: 4638 ** FN: 0 ** f1bin: 0.912281 ** f1mac: 0.955871
Fam: tymoviridae ** total: 82 ** correct/TP: 73 ** FP: 6 **  TN: 4637 ** FN: 9 ** f1bin: 0.906832 ** f1mac: 0.952609
Fam: roniviridae ** total: 4 ** correct/TP: 4 ** FP: 7 **  TN: 4636 ** FN: 0 ** f1bin: 0.533333 ** f1mac: 0.766289
Fam: peribunyaviridae ** total: 4 ** correct/TP: 1 ** FP: 6 **  TN: 4637 ** FN: 3 ** f1bin: 0.181818 ** f1mac: 0.590424
Fam: narnaviridae ** total: 72 ** correct/TP: 62 ** 

ValueError: 7 columns passed, passed data had 8 columns