In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, auc, roc_curve
from sklearn.ensemble import RandomForestClassifier

import random


import warnings 
warnings.filterwarnings("ignore") 

In [2]:
def genData(df_tmp,df_health,runID_pos,disease,randomState=0):
    df_pos = df_tmp[ ( df_tmp['SampleID'].isin(runID_pos) ) & ( df_tmp['disease'] == disease )].drop_duplicates()
    df_pos['label'] = 1
    
    random.seed(randomState)
    neg_idx = random.sample(range(len(df_health)), len(df_pos))
    df_neg = df_health.loc[neg_idx]
    df_neg['label'] = 0
    
    df_health_new = df_health.loc[~df_health.index.isin(neg_idx)]
    df_health_new = df_health_new.reset_index(drop=True)
    
    data = pd.concat([df_pos,df_neg])
    data = data.reset_index(drop=True)
    return data,df_health_new

In [3]:
def isDisease(x):
    if disease_name in x and len(x)==1:
        return 'Single'
    elif disease_name in x and len(x)>1:
        return 'Multiple'
    else:
        return 'Other'

In [4]:
def Eval(true, prob):
    fpr, tpr, thresholds = roc_curve(true, prob, pos_label=1)
    auc_tmp = auc(fpr, tpr)

    prob[prob<0.5] = 0
    prob[prob>=0.5] = 1
    return auc_tmp

In [5]:
def genData_test(df_tmp,df_health,runID_pos,disease,randomState=0):
    df_pos = df_tmp[ df_tmp['SampleID'].isin(runID_pos ) 
                    & ( df_tmp['disease'] == disease )].drop_duplicates()    
    df_pos['label'] = 1    
    df_pos = df_pos.reset_index(drop=True)
    
    random.seed(randomState)
    pos_idx = random.sample(range(len(df_pos)), len(df_health))
    
    data_Multiple = pd.concat([df_pos.loc[pos_idx,df_health.columns],df_health])
    data_Multiple = data_Multiple.reset_index(drop=True)   
    
    return data_Multiple

In [6]:
def result_single(disease_name):
    data_Single, df_health_new = genData(df_new,df_health,
                                                 set(disease_sta.loc[disease_sta['is_'+disease_name]=='Single','SampleID']),
                                                 disease_name)   


    train = data_Single[column_names+['label']]
    std_ = train[column_names].std()
    feats = [x for x in train[column_names].columns if x not in list(std_[std_==0].index)]


    auc_single_array = np.zeros(5)
    auc_multiple_array = np.zeros(5)

    auc_single_rf = np.zeros(5)
    auc_multiple_rf = np.zeros(5)

    count = 0


    for train_index, test_index in kf.split(train,train['label']):


        train_, test_ = train.loc[train_index], train.loc[test_index]
        y_test = test_['label']


        clf.fit(train_[feats],train_['label'])
        y_pred = clf.predict_proba(test_[feats])[:,1]    
        auc_single_array[count] = Eval(y_test, y_pred)

        rf.fit(train_[feats],train_['label'])
        y_pred = rf.predict_proba(test_[feats])[:,1] 
        auc_single_rf[count] = Eval(y_test, y_pred)


        data_Multiple = genData_test(df_new,train_[train_['label']==0],
                                     set(disease_sta.loc[disease_sta['is_'+disease_name]=='Multiple','SampleID']) ,disease_name)


        clf.fit(data_Multiple[feats],data_Multiple['label'])
        y_pred = clf.predict_proba(test_[feats])[:,1]
        auc_multiple_array[count] = Eval(y_test, y_pred)

        rf.fit(data_Multiple[feats],data_Multiple['label'])
        y_pred = rf.predict_proba(test_[feats])[:,1]
        auc_multiple_rf[count] = Eval(y_test, y_pred)

        count = count + 1
    
    print('For lightgbm,the AUC of SD is %5.4f, the AUC of MD is %5.4f.'%(np.mean(auc_single_array),np.mean(auc_multiple_array)))
    print('For randomforest,the AUC of SD is %5.4f, the AUC of MD is %5.4f.'%(np.mean(auc_single_rf),np.mean(auc_multiple_rf)))

In [7]:
df_disease = pd.read_csv('patients_ID.csv')
df_disease.columns = ['SampleID','disease']

In [8]:
df = pd.read_csv('disease_health.OTU.Abd_newID')

disease_SampleID = set(df['SampleID']) & set(df_disease['SampleID'])

health_SampleID = set(df['SampleID']) - set(df_disease['SampleID'])
df_health = pd.DataFrame({'SampleID':list(health_SampleID)})
df_health['disease'] = 'health'

df_all = pd.concat([df_disease.loc[df_disease['SampleID'].isin(disease_SampleID)],df_health])
df_all = df_all.reset_index(drop=True)

In [9]:
df_new = df_all.merge(df,how='left',on=['SampleID'])
disease_sta = pd.DataFrame(df_new[['SampleID','disease']].groupby('SampleID',as_index=False).agg(set))

In [10]:
disease_sta['isHealth'] = disease_sta['disease'].apply(lambda x: 1 if 'health' in x else 0)
runID_health = set(disease_sta.loc[disease_sta['isHealth']==1,'SampleID'])

df_health = df_new[df_new['SampleID'].isin(runID_health)]
df_health = df_health.reset_index(drop=True)

In [11]:
kf = StratifiedKFold(n_splits  = 5, shuffle=True, random_state=2020)

clf = lgb.LGBMClassifier(learning_rate=0.02,max_depth=6,n_estimators=1000,random_state=2020,num_leaves =64,
                         n_jobs=50,subsample=0.8,subsample_freq=5,colsample_bytree=0.7)
rf = RandomForestClassifier(n_estimators=500,n_jobs=50,random_state=2020)

In [12]:
column_names = [x for x in list(df_new.columns) if 'SampleID' != x and 'disease' != x]

In [13]:
disease_name = 'thyroid'
disease_sta['is_'+disease_name] = disease_sta['disease'].apply(lambda x: isDisease(x))

In [15]:
result_single('thyroid')

For lightgbm,the AUC of SD is 0.7281, the AUC of MD is 0.6997.
For randomforest,the AUC of SD is 0.7138, the AUC of MD is 0.6826.


In [16]:
data_Multiple, df_health_new = genData(df_new,df_health,
                                             set(disease_sta.loc[disease_sta['is_'+disease_name]=='Multiple','SampleID']),
                                             disease_name)   
pos = data_Multiple[data_Multiple['label']==1]
pos = pos.reset_index(drop=True)
random.seed(11)
pos_idx = random.sample(range(len(pos)), 698)

neg = data_Multiple[data_Multiple['label']==0]
neg = neg.reset_index(drop=True)
neg_idx = random.sample(range(len(neg)), 698)
    
data_Multiple = pd.concat([pos.loc[pos_idx],neg.loc[neg_idx]])
data_Multiple = data_Multiple.reset_index(drop=True)   

In [17]:
train = data_Multiple[column_names+['label']]
std_ = train[column_names].std()
feats = [x for x in train[column_names].columns if x not in list(std_[std_==0].index)]

auc_single_array = np.zeros(5)
auc_multiple_array = np.zeros(5)

auc_single_rf = np.zeros(5)
auc_multiple_rf = np.zeros(5)

count = 0


for train_index, test_index in kf.split(train,train['label']):
    
    
    train_, test_ = train.loc[train_index], train.loc[test_index]
    y_test = test_['label']
    
    clf.fit(train_[feats],train_['label'])
    y_pred = clf.predict_proba(test_[feats])[:,1]    
    auc_multiple_array[count] = Eval(y_test, y_pred)
    
    rf.fit(train_[feats],train_['label'])
    y_pred = rf.predict_proba(test_[feats])[:,1]
    auc_multiple_rf[count] = Eval(y_test, y_pred)
    
    
    data_Single = genData_test(df_new,train_[train_['label']==0],
                                 set(disease_sta.loc[disease_sta['is_'+disease_name]=='Single','SampleID']) ,disease_name)
    
    
    clf.fit(data_Single[feats],data_Single['label'])
    y_pred = clf.predict_proba(test_[feats])[:,1]
    auc_single_array[count] = Eval(y_test, y_pred)
    
    rf.fit(data_Single[feats],data_Single['label'])
    y_pred = rf.predict_proba(test_[feats])[:,1]
    auc_single_rf[count] = Eval(y_test, y_pred)
    
    count = count + 1
    
print('For lightgbm,the AUC of MD is %5.4f, the AUC of SD is %5.4f.'%(np.mean(auc_multiple_array),np.mean(auc_single_array)))
print('For randomforest,the AUC of MD is %5.4f, the AUC of SD is %5.4f'%(np.mean(auc_multiple_rf),np.mean(auc_single_rf)))

For lightgbm,the AUC of MD is 0.7938, the AUC of SD is 0.7642.
For randomforest,the AUC of MD is 0.7690, the AUC of SD is 0.7553
