# 人肝毒性数据集V3

## 预备工作

### (1)导入需要的包

In [1]:
import pandas as  pd
import numpy as np
import math
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from matplotlib import pyplot as plt
from sklearn.preprocessing import label_binarize
import copy
import pandas as pd
import numpy as np
import seaborn           as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn import preprocessing
from sklearn import decomposition
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import AffinityPropagation
from sklearn.datasets import load_iris
from sklearn.cluster import SpectralClustering
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn import tree
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from sklearn import preprocessing

### (2)加载数据集

In [2]:
path_V1 = r'V1_ECFP4.csv'
path_V2 = r'V2_ECFP4.csv'
path_V3 = r'V3_ECFP4.csv'

In [3]:
raw_df = pd.read_csv(path_V3)
raw_df.head()

Unnamed: 0,0_x,1_x,2_x,3_x,4_x,5_x,6_x,7_x,8_x,9_x,...,1027_y,57804,8061,10962,10153,5566,2597,874_y,57149,label
0,0,0,0,0,0,0,0,0,0,0,...,-0.02115,0.03211,-0.02618,0.00932,0.01491,0.00428,0.01452,-0.0026,-0.00472,1
1,0,0,0,0,0,0,0,0,0,0,...,-0.01372,-0.01007,0.00708,-0.0079,-0.01119,0.01697,-0.00627,-0.00721,0.0069,1
2,0,0,0,0,0,0,0,0,0,0,...,-0.01936,0.01332,0.03463,-0.00217,-0.01188,0.00317,0.0,-0.01888,-0.006,1
3,0,1,0,0,0,1,0,0,0,0,...,-0.08093,-0.00731,-0.06143,-0.00253,-0.04999,0.0173,0.00477,-0.00133,0.002,1
4,0,0,0,0,0,0,0,0,0,0,...,0.01924,-0.01487,0.01042,0.01155,-0.04849,-0.00483,0.0,-0.00921,0.01372,1


In [4]:
label = raw_df['label']
label.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

按照说明文件中的描述，把数据的不同属性分开，其中ECFP4为离散型数据，其余为连续型数据

In [5]:
X=raw_df.iloc[:,:-1]
ECFP4 = raw_df.iloc[:,0:2048]
phychem = raw_df.iloc[:,2048:2248]
L7 = raw_df.iloc[:,2248:-1]
phychem_ECFP4 = pd.concat([phychem,ECFP4],axis=1)
print(X.shape)
print(ECFP4.shape)
print(phychem.shape)
print(L7.shape)
print(phychem_ECFP4.shape)

(252, 3226)
(252, 2048)
(252, 200)
(252, 978)
(252, 2248)


### (3)函数定义

In [6]:
# 输入分类器，获取评价标准，如accuracy、precision等
def my_cross_validate_score(estimator,X,y,cv = 5,mean = False,ret_est=False):
    kf = model_selection.StratifiedKFold(n_splits=cv)
    #存储k次训练中得到的模型与其对应的分数
    res = dict()
    accuracy = []
    f1_score = []
    auc = []
    recall_0 = []
    recall_1 = []
    precision_0 = []
    precision_1 = []
    it=1
    #进行k次训练
    for train_index, test_index in kf.split(X,y):
        # print('train_index', train_index, 'test_index', test_index)
        train_X, train_y = X.iloc[train_index],y.iloc[train_index]
        test_X, test_y = X.iloc[test_index],y.iloc[test_index]
        estimator.fit(train_X,train_y)
        clf_predict = estimator.predict(test_X)
        report = metrics.classification_report(test_y,clf_predict,output_dict=True)
        
        accuracy.append(report['accuracy'])
        f1_score.append(report['macro avg']['f1-score'])
        try:
            if isinstance(estimator, SVC) or isinstance(estimator, LinearSVC):
                score = estimator.decision_function(test_X)
                test_y_hot = label_binarize(test_y, classes=(0, 1))
                fpr, tpr, thresholds = metrics.roc_curve(test_y_hot.ravel(), score.ravel())
                auc.append(metrics.auc(fpr, tpr))
            else:
                auc.append(metrics.roc_auc_score(test_y,estimator.predict_proba(test_X)[:,1]))
        except:
            auc.append(0)
        recall_0.append(report['0']['recall'])
        recall_1.append(report['1']['recall'])
        precision_0.append(report['0']['precision'])
        precision_1.append(report['1']['precision'])
        # print("iteration",it,".....")
        it+=1
        res['accuracy'] = accuracy
        res['f1_score'] = f1_score
        res['auc'] = auc
        res['recall_0'] = recall_0
        res['recall_1'] = recall_1
        res['precision_0'] = precision_0
        res['precision_1'] = precision_1
        
    if mean:
        for key in res.keys():
            res[key] = np.mean(res[key])
    if ret_est:
        return res,estimator
    else:
        return res


In [7]:
def show_heatmap(cm,col_names,figsize=(5, 5)):
    f, ax = plt.subplots(figsize=figsize)
    ax =  sns.heatmap(cm,cmap="YlGnBu_r",fmt="d",annot=True,ax=ax,xticklabels=col_names,yticklabels=col_names)
    ax.set_xlabel("cluster")
    ax.set_ylabel("truth")
    plt.show() 

In [8]:
rand_state = 2022528

### (4)模型构建的一些前置工作

In [None]:

als = [DecisionTreeClassifier(),RandomForestClassifier(),ExtraTreeClassifier(),GradientBoostingClassifier()
,xgb.XGBClassifier(probability=True,use_label_encoder=False),SVC(),LinearSVC(),KNeighborsClassifier()]
names = ['DecisionTreeClassifier','RandomForestClassifier','ExtraTreeClassifier',
'GradientBoostingClassifier','XGBClassifier','SVC','LinearSVC','KNeighborsClassifier']

for clf,name in zip(als,names):
    res = my_cross_validate_score(clf,ECFP4,label,mean=True,cv=10)
    print(name)
    print(res)

## 模型的实现

In [16]:
class My_up_sampler():
    def __init__(self):
        pass
    
    def up_sample(self,X_train,y_train):
        index_1 = y_train[y_train == 1].index
        index_0 = y_train[y_train == 0].index
        X_train_0 = X_train.loc[index_0]
        y_train_0 = y_train.loc[index_0]
        scale = len(index_1)/len(index_0)
        # print('scale:',scale)
        ret_X = copy.deepcopy(X_train)
        ret_y = copy.deepcopy(y_train)
        for i in range(math.ceil(scale)-1):
            ret_X = ret_X.append(X_train_0)
            ret_y = ret_y.append(y_train_0)
        return ret_X,ret_y  


In [9]:
class My_vote_select_with_up_sample_clf():
    def __init__(self, topN,score = 'recall',offset = 0):
        #使用phychem进行训练的基分类器
        #KNN不支持特征选择
        GDBT_phychem = GradientBoostingClassifier()
        RandomForest_phychem = RandomForestClassifier()
        XGB_phychem = xgb.XGBClassifier(verbosity=0,use_label_encoder=False)
        SVC_phychem = SVC(kernel="linear")
        # KNN_phychem = KNeighborsClassifier()
        #使用ECFP4_phychem进行训练的基分类器
        RandomForest_ECFP4_phychem = RandomForestClassifier()
        ExtraTree_ECFP4_phychem = ExtraTreeClassifier()
        SVC_ECFP4_phychem = SVC(kernel="linear")
        LinearSVC_ECFP4_phychem = LinearSVC()
        # KNN_ECFP4_phychem = KNeighborsClassifier()
        #使用ECFP4进行训练的基分类器
        RandomForest_ECFP4 = RandomForestClassifier()
        GDBT_ECFP4 = GradientBoostingClassifier()
        SVC_ECFP4 = SVC(kernel="linear")
        LinearSVC_ECFP4 = LinearSVC()
        # KNN_ECFP4 = KNeighborsClassifier()
        
        self.base_als_phychem = [GDBT_phychem,RandomForest_phychem,XGB_phychem,SVC_phychem]
        self.base_als_ECFP4_phychem = [RandomForest_ECFP4_phychem,ExtraTree_ECFP4_phychem,SVC_ECFP4_phychem,LinearSVC_ECFP4_phychem]
        self.base_als_ECFP4 = [RandomForest_ECFP4,GDBT_ECFP4,SVC_ECFP4,LinearSVC_ECFP4]
        self.all_base_algo = self.base_als_phychem+self.base_als_ECFP4_phychem+self.base_als_ECFP4
        self.algo_names = ['GDBT_phychem','RandomForest_phychem','XGB_phychem','SVC_phychem',
        'RandomForest_ECFP4_phychem','ExtraTree_ECFP4_phychem','SVC_ECFP4_phychem','LinearSVC_ECFP4_phychem',
        'RandomForest_ECFP4','GDBT_ECFP4','SVC_ECFP4','LinearSVC_ECFP4']
        selectors = []
        for algo in self.all_base_algo:
            selectors.append(SelectFromModel(estimator = algo))
        self.algo_selector_set = list(zip(self.all_base_algo,selectors,self.algo_names))

        self.res_dic = dict()
        self.keys = ['accuracy','f1_score','auc','recall_0','recall_1','precision_0','precision_1']

        self.kmeans = KMeans(n_clusters=2)
        # self.prepredictor = My_pre_clus()
        self.topN = topN
        self.score = score
        self.offset = offset

    def res_map(self,x):
        thres = int(self.topN/2)+self.offset
        # print(thres)
        if x<=thres:
            return 0
        else:
            return 1
        
    def fit(self,X_train,y_train):
        my_up_sampler = My_up_sampler()
        # print('X_train.shape',X_train.shape)
        # print('y_train.shape',y_train.shape)
        X,y = my_up_sampler.up_sample(X_train,y_train)
        # print(X.shape)
        # print(y.shape)
        ECFP4 = X.iloc[:,0:1684]
        phychem = X.iloc[:,1684:1860]
        L7 = X.iloc[:,1860:]
        phychem_ECFP4 = pd.concat([phychem,ECFP4],axis=1)

        for key in self.keys:
            self.res_dic[key] = []

        for i in range(len(self.algo_selector_set)):
            t=self.algo_selector_set[i]
            # print('fitting:',t[2])
            if i in range(4):
                t[1].fit(phychem,y)
                train_selected = t[1].transform(phychem)
            elif i in range(4,8):
                t[1].fit(phychem_ECFP4,y)
                train_selected = t[1].transform(phychem_ECFP4)
            else:
                t[1].fit(ECFP4,y)
                train_selected = t[1].transform(ECFP4)
            
            train_selected = pd.DataFrame(train_selected)
            temp_res = my_cross_validate_score(t[0],train_selected,y,cv=10,mean=True)
            for key in self.keys:
                self.res_dic[key].append(temp_res[key])

        pca_2=decomposition.PCA(n_components=2)
        phychem_2d = pca_2.fit_transform(phychem)
        self.kmeans.fit(phychem_2d)
        # self.prepredictor.fit(X_train,y_train)

        # print(self.res_dic)
    def predict(self,X_test):
        ECFP4 = X_test.iloc[:,0:1684]
        phychem = X_test.iloc[:,1684:1860]
        L7 = X_test.iloc[:,1860:]
        phychem_ECFP4 = pd.concat([phychem,ECFP4],axis=1)
        
        if self.score in ['recall','precision']:
            clf41_index = pd.DataFrame(self.res_dic).sort_values([self.score+'_1'],ascending=False)[:self.topN].index
            clf40_index = pd.DataFrame(self.res_dic).sort_values([self.score+'_0'],ascending=False)[:self.topN].index
        else:
            clf41_index = pd.DataFrame(self.res_dic).sort_values([self.score],ascending=False)[:self.topN].index
            clf40_index = pd.DataFrame(self.res_dic).sort_values([self.score],ascending=False)[:self.topN].index 


        clf41 = []
        clf40 = []
        for i,j in zip(clf41_index,clf40_index):
            clf41.append(copy.deepcopy(self.algo_selector_set[i]))
            clf40.append(copy.deepcopy(self.algo_selector_set[j]))
        
        pca_2=decomposition.PCA(n_components=2)
        phychem_2d = pca_2.fit_transform(phychem)
        prepred = self.kmeans.predict(phychem_2d)
        # prepred = self.prepredictor.predict(X_test)
        clf41_preds = []
        clf40_preds = []

        for i in range(len(clf41)):
            t = clf41[i]
            if clf41_index[i] in range(0,4):
                test_data = phychem
            elif clf41_index[i] in range(4,8):
                test_data = phychem_ECFP4
            else:
                test_data = ECFP4
            clf41_preds.append(t[0].predict(t[1].transform(test_data)))

        for i in range(len(clf40)):
            t = clf40[i]
            if clf40_index[i] in range(0,4):
                test_data = phychem
            elif clf40_index[i] in range(4,8):
                test_data = phychem_ECFP4
            else:
                test_data = ECFP4
            clf40_preds.append(t[0].predict(t[1].transform(test_data)))

        # col_names = ['clf0','clf1','clf2','clf3','clf4']
        clf41_preds_dic = dict()
        clf40_preds_dic = dict()
        
        for i  in range(self.topN):
            key = 'clf'+str(i)
            clf41_preds_dic[key] = clf41_preds[i]
            clf40_preds_dic[key] = clf40_preds[i]
        
        clf41_preds_df = pd.DataFrame(clf41_preds_dic)
        clf40_preds_df = pd.DataFrame(clf40_preds_dic)

        clf41_voted_pred = clf41_preds_df.sum(axis=1).map(self.res_map)
        clf40_voted_pred = clf40_preds_df.sum(axis=1).map(self.res_map)
        
        res = []

        for i in range(len(prepred)):
            if prepred[i] == 1:
                res.append(clf41_voted_pred[i])
            else:
                res.append(clf40_voted_pred[i])

        # print('prepred',prepred)
        # print('clf41',clf41)
        # print('clf40',clf40)
        # print('clf41_preds_dic',clf41_preds_dic)
        # print('clf40_preds_dic',clf40_preds_dic)
        # print('clf41_voted_pred',clf41_voted_pred)
        # print('clf40_voted_pred',clf40_voted_pred)
        return res        

In [19]:
scores = ['accuracy','precision','recall','f1_score']
topNs = range(1,6,2)
reports = []

for score in scores :
    for topN in topNs:
        reports.append("{}-{}".format(score,topN))
        my_vote_clf = My_vote_select_with_up_sample_clf(topN,score=score)
        report = my_cross_validate_score(my_vote_clf,X,label,cv = 10,mean=True)
        #reports.append(metrics.classification_report(y_test,my_vote_clf_pred))
        reports.append(report)
        print("Done {}-{}".format(score,topN))
for report in reports:
    print(report)

Done accuracy-1
Done accuracy-3
Done accuracy-5
Done precision-1
Done precision-3
Done precision-5
Done recall-1




Done recall-3
Done recall-5




Done f1_score-1
Done f1_score-3
Done f1_score-5
accuracy-1
{'accuracy': 0.6466153846153846, 'f1_score': 0.5345747930009426, 'auc': 0.0, 'recall_0': 0.30476190476190473, 'recall_1': 0.7713450292397661, 'precision_0': 0.3246428571428571, 'precision_1': 0.7527157465857156}
accuracy-3
{'accuracy': 0.6349230769230768, 'f1_score': 0.5250185063265558, 'auc': 0.0, 'recall_0': 0.30476190476190473, 'recall_1': 0.7555555555555556, 'precision_0': 0.29797619047619045, 'precision_1': 0.7496535456287778}
accuracy-5
{'accuracy': 0.6467692307692307, 'f1_score': 0.5325242026635214, 'auc': 0.0, 'recall_0': 0.29047619047619044, 'recall_1': 0.7766081871345029, 'precision_0': 0.32662698412698415, 'precision_1': 0.7489340385276918}
precision-1
{'accuracy': 0.6429230769230767, 'f1_score': 0.5263546660418735, 'auc': 0.0, 'recall_0': 0.3047619047619047, 'recall_1': 0.7660818713450293, 'precision_0': 0.3004906204906205, 'precision_1': 0.7525994520615263}
precision-3
{'accuracy': 0.6429230769230768, 'f1_score': 0