In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord
from Bio import SeqUtils
from sklearn.model_selection import cross_val_score

In [5]:
def split(df1, n_pos=2862, pos_chunk=1144, neg_chunk=112):
    """Shuffle the two row groups of ``df1`` and cut each into three parts.

    The frame is divided by index label into a leading group (labels up to
    ``n_pos - 1``) and a trailing group (labels from ``n_pos`` on). Each group
    is shuffled independently and cut into two chunks of fixed size plus a
    remainder.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Source table. The group boundary is label-based (``.loc``), so this
        assumes the default 0..N-1 integer index -- TODO confirm for other
        indexes. Presumably the leading rows are the "positive" class and the
        trailing rows the "negative" class; verify against the input file.
    n_pos : int, default 2862
        Number of leading rows forming the first group.
    pos_chunk : int, default 1144
        Size of the first and second chunk taken from the first group.
    neg_chunk : int, default 112
        Size of the first and second chunk taken from the second group.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(ps1, ns1, ps2, ns2, ps3, ns3)``: chunk 1, chunk 2 and the remainder
        of each group, interleaved first-group/second-group. Every part has a
        fresh 0-based index.

    Notes
    -----
    Shuffling uses ``DataFrame.sample`` without ``random_state``, so results
    follow NumPy's global random state.
    """
    # .loc slicing includes the end label, hence the "- 1".
    shuffled_pos = df1.loc[:n_pos - 1].sample(frac=1).reset_index(drop=True)
    shuffled_neg = df1.loc[n_pos:].sample(frac=1).reset_index(drop=True)

    pos_cut, neg_cut = 2 * pos_chunk, 2 * neg_chunk

    ps1 = shuffled_pos.iloc[:pos_chunk]
    ns1 = shuffled_neg.iloc[:neg_chunk]
    ps2 = shuffled_pos.iloc[pos_chunk:pos_cut].reset_index(drop=True)
    ns2 = shuffled_neg.iloc[neg_chunk:neg_cut].reset_index(drop=True)
    ps3 = shuffled_pos.iloc[pos_cut:].reset_index(drop=True)
    ns3 = shuffled_neg.iloc[neg_cut:].reset_index(drop=True)

    return (ps1, ns1, ps2, ns2, ps3, ns3)

def featsrc(rp1, rn1, n_feats=7):
    """Rank features by how little the two groups' central ranges overlap.

    For every column the 5/10/90/95 % quantiles are computed separately on
    ``rp1`` and ``rn1``. The overlap of the two groups' 5-95 % ranges (and of
    their 10-90 % ranges) is the interval between the larger lower quantile
    and the smaller upper quantile. The function counts how many values of the
    pooled data fall strictly inside each overlap interval and returns the
    columns with the fewest such values.

    Parameters
    ----------
    rp1, rn1 : pandas.DataFrame
        Feature tables of the two groups, with the same columns. Columns are
        assumed to be numeric and uniquely named -- TODO confirm.
    n_feats : int, default 7
        Number of column names to return.

    Returns
    -------
    pandas.Index
        The ``n_feats`` column names with the smallest ``'in 80'`` count, ties
        broken by the ``'in 90'`` count.

    Notes
    -----
    Only columns that received quantiles are counted; the pooled frame is
    restricted to those columns before comparison.
    """
    res = pd.concat([rp1, rn1], axis=0).reset_index(drop=True)

    levels = (0.05, 0.10, 0.90, 0.95)
    # Each quantile Series is named after its level, so the columns of these
    # tables are the levels and the rows are the feature names.
    rpd = pd.concat([rp1.quantile(q=q) for q in levels], axis=1)
    rnd = pd.concat([rn1.quantile(q=q) for q in levels], axis=1)
    # Look the second group's quantiles up by the first group's feature labels.
    rnd = rnd.loc[rpd.index]

    odf = pd.DataFrame(index=rpd.index)
    # Lower bounds: the larger of the two quantiles. `where` keeps rnd only
    # when it is strictly greater, which mirrors the built-in max(rpd, rnd).
    for q in (0.05, 0.10):
        odf[q] = rnd[q].where(rnd[q] > rpd[q], rpd[q])
    # Upper bounds: the smaller of the two quantiles, mirroring min(rpd, rnd).
    for q in (0.90, 0.95):
        odf[q] = rnd[q].where(rnd[q] < rpd[q], rpd[q])

    # Column-wise comparison of all pooled values against the bounds at once.
    values = res[odf.index]
    in_90 = values.gt(odf[0.05], axis=1) & values.lt(odf[0.95], axis=1)
    # A value only counts for the 80 % overlap if it also lies in the 90 % one.
    in_80 = in_90 & values.gt(odf[0.10], axis=1) & values.lt(odf[0.90], axis=1)

    odf['in 90'] = in_90.sum()
    odf['in 80'] = in_80.sum()

    return odf.sort_values(by=['in 80', 'in 90'])[:n_feats].index


def _rf_table(frame, feats):
    """Return ``(X, y)`` float arrays for ``frame`` restricted to ``feats``."""
    # Of the first eight columns, five descriptive ones are dropped; the
    # remaining three are followed by the selected feature columns.
    meta = frame.iloc[:, :8].drop(columns=['Id', 'MolTyp', 'Topol', 'Des', 'Len'])
    table = pd.concat([meta, frame[feats]], axis=1)
    X = table.iloc[:, 3:].values.astype(float)
    # The label is taken positionally: second remaining column.
    # presumably this is the 'Gen' column -- verify against the CSV layout.
    y = table.iloc[:, 1].values.astype(float)
    return X, y


def rfimp(feats, pnrtr, pnrts):
    """Cross-validate a random forest on the selected features.

    Parameters
    ----------
    feats : list-like of column labels
        Feature columns to use (e.g. the result of ``featsrc``).
    pnrtr, pnrts : pandas.DataFrame
        Two tables with identical layout: eight leading columns (including
        'Id', 'MolTyp', 'Topol', 'Des', 'Len') followed by feature columns.

    Returns
    -------
    numpy.ndarray
        The five ``f1_macro`` scores from 5-fold ``cross_val_score`` run on
        the rows of ``pnrtr`` followed by the rows of ``pnrts``.

    Notes
    -----
    The previous version also standard-scaled the data, fitted the forest on
    ``pnrtr``, predicted ``pnrts`` and built a confusion matrix, but none of
    that reached the return value: ``cross_val_score`` was already given the
    unscaled concatenated arrays. That dead work has been removed.

    Because that explicit fit is gone, invalid input no longer fails fast
    here; fitting errors are now handled by ``cross_val_score`` according to
    its ``error_score`` setting.

    NOTE(review): both tables are pooled before cross-validation, so
    ``pnrts`` is not used as a held-out set -- confirm this is intended.
    """
    from sklearn.ensemble import RandomForestClassifier

    X_train, y_train = _rf_table(pnrtr, feats)
    X_test, y_test = _rf_table(pnrts, feats)

    X = np.concatenate([X_train, X_test], axis=0)
    y = np.concatenate([y_train, y_test], axis=0)

    # Fixed seed keeps the forest itself deterministic for a given X, y.
    classifier = RandomForestClassifier(n_estimators=15, criterion='entropy', random_state=0)

    scores_mac = cross_val_score(classifier, X, y, cv=5, scoring='f1_macro')

    return scores_mac


# Shuffled-label run: 'Gen' is overwritten with random 0/1 labels before each
# round of split / feature selection / cross-validation.
pndf = pd.read_csv('pndf.csv')
res = pd.read_csv('res 340.csv')
df1 = pd.concat([pndf, res], axis = 1)

for i1 in range(10):
    # Fresh random labelling: 283 distinct rows get label 1, all others 0.
    # presumably 283 is the size of the smaller class -- TODO confirm.
    df1['Gen'] = 0
    rindex = np.random.choice(len(df1), 283, replace=False)
    # One vectorised assignment instead of a per-row .loc loop.
    df1.loc[rindex, 'Gen'] = 1

    aclist = []

    for i2 in range(5):
        a,b,c,d,e,f = split(df1)
        # First eight columns of the first chunks (not used further below).
        ps1 = a.iloc[:,:8]
        ns1 = b.iloc[:,:8]
        # Feature columns of the first chunks drive the feature selection.
        rp1 = a.iloc[:,8:]
        rn1 = b.iloc[:,8:]
        # Second and third chunks of both groups are handed to rfimp.
        pnrtr = pd.concat([c,d], axis = 0)
        pnrts = pd.concat([e,f], axis = 0)

        feats = featsrc(rp1,rn1)
        fmac = rfimp(feats, pnrtr, pnrts)

        aclist.append(fmac.mean())
        print(fmac.mean(), fmac.std())

    # Mean score over the five repeats of this labelling.
    print(np.mean(aclist))

0.47307856047108265
0.47307856047108265 0.0013652631810758956
0.48363973763116536
0.48363973763116536 0.012122975138786598
0.475857414992948
0.475857414992948 0.0009887612193222638
0.4785068137724691
0.4785068137724691 0.010010661557066383


KeyboardInterrupt: 

In [None]:
#challenger

In [7]:
# Run on the labels stored in the file: 'ssRNA(+)' is encoded as 0 and
# 'ssRNA(-)' as 1, then split / feature selection / cross-validation is
# repeated 20 times.
pndf = pd.read_csv('pndf.csv')
res = pd.read_csv('res 340.csv')
df1 = pd.concat([pndf, res], axis=1)

# The masks are built from the untouched pndf, so the two assignments cannot
# interfere with each other.
is_plus = pndf['Gen'] == 'ssRNA(+)'
df1.loc[is_plus, 'Gen'] = 0
is_minus = pndf['Gen'] == 'ssRNA(-)'
df1.loc[is_minus, 'Gen'] = 1

aclist = []

for i in range(20):
    a, b, c, d, e, f = split(df1)

    # First chunks: leading eight columns vs. feature columns.
    ps1, rp1 = a.iloc[:, :8], a.iloc[:, 8:]
    ns1, rn1 = b.iloc[:, :8], b.iloc[:, 8:]

    # Second and third chunks of both groups are handed to rfimp.
    pnrtr = pd.concat([c, d], axis=0)
    pnrts = pd.concat([e, f], axis=0)

    feats = featsrc(rp1, rn1)
    fmac = rfimp(feats, pnrtr, pnrts)

    mean_score = fmac.mean()
    aclist.append(mean_score)
    # Second printed value is twice the standard deviation of the fold scores.
    print(mean_score, fmac.std() * 2)

print(np.mean(aclist))

0.9207575896760926 0.0460903691917027
0.9412977975037589 0.04482698769864616
0.9395340335104635 0.03824609389016609
0.9244683788042651 0.053371526678654205
0.9353771025464184 0.052068139940026265
0.9334737930406167 0.0636438344598506
0.9462532495819532 0.022987400548310652
0.9358735483036249 0.021858984558209477
0.9435829220446307 0.04585539795637788
0.9372946134250281 0.030292373812348092
0.9379518908588519 0.03223200624229081
0.940094700470582 0.040488132459892995
0.9451731038701482 0.031790645490533044
0.9278534967227458 0.07480711377705579
0.9469624686865095 0.030440340435937765
0.9379419780843827 0.0621904672943766
0.9417141751672794 0.050057984027103576
0.9385583882781713 0.03274350870892962
0.9397745282351442 0.05152227599243765
0.9405825737586404 0.04112095924323731
0.9377260166284653
