In [1]:
import math
import os
from os.path import join as opj
import warnings
warnings.filterwarnings('ignore')
import time

import pandas as pd
import numpy as np
import neurolab as nl
from minepy import MINE
import matplotlib.pyplot as plt
import lime
from lime import lime_tabular
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import ClassifierMixin, BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, ShuffleSplit, RepeatedKFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from mlxtend.classifier import EnsembleVoteClassifier
from lshash.lshash import LSHash
from minepy import MINE
from tqdm import tqdm
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from communities.algorithms import louvain_method, girvan_newman
from communities.visualization import draw_communities
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from pyitlib.discrete_random_variable import information_mutual_conditional as imc
from somlearn import SOM
import networkx as nx
import seaborn as sns
from collections import Counter
from sklearn.feature_selection import SelectFromModel
from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import RidgeClassifierCV, RidgeClassifier

In [2]:
def decompose_fusion_feature(fid: int, gene_num=36):
    """
        Split a fusion-feature index into the ROI index and the gene index
        it was built from.

        Fusion features are numbered so that
            fid = ROI_id * gene_num + gene_id
        hence the quotient of ``fid`` by ``gene_num`` is the ROI index and
        the remainder is the gene index.

        fid: index of the fusion feature
        gene_num: number of genes per ROI (36 by default)

        return: tuple ``(ROI_id, gene_id)``
    """
    # divmod yields (fid // gene_num, fid % gene_num) in a single call.
    return divmod(fid, gene_num)

In [4]:
class EnsembleVote(BaseEstimator, ClassifierMixin):
    """Soft-voting combiner over a fixed list of classifiers.

    Every member classifier is paired with a column selector. At prediction
    time member ``i`` receives ``X[:, oob_features[i]]`` and the members'
    ``predict_proba`` outputs are summed and divided by the number of members.
    This class never fits its members; they are used exactly as passed in.

    Parameters
    ----------
    clfs : list
        Classifiers exposing ``predict_proba``.
    oob_features : list
        ``oob_features[i]`` is the column selector applied to ``X`` before
        calling ``clfs[i]``.
    """

    def __init__(self, clfs, oob_features):
        # Shallow copies: later changes to the caller's lists do not affect
        # this ensemble.
        # NOTE(review): sklearn.base.clone verifies constructor parameters by
        # identity, so storing copies presumably makes clone() of this object
        # fail -- confirm before using it where cloning happens.
        self.clfs = clfs.copy()
        self.oob_features = oob_features.copy()

    def fit(self, X, y):
        """Do nothing (members are used as given) and return ``self``.

        Returning ``self`` instead of ``None`` allows the usual
        ``estimator.fit(X, y).predict(X)`` chaining.
        """
        return self

    def _member_probas(self, X):
        """Return the list of ``predict_proba`` outputs, one per member, each
        computed on that member's own columns of ``X``."""
        return [
            clf.predict_proba(X[:, self.oob_features[i]])
            for i, clf in enumerate(self.clfs)
        ]

    def predict(self, X):
        """Return, per row, the column index of the largest averaged
        probability (indices are not mapped through any ``classes_``)."""
        return np.argmax(self.predict_proba(X), axis=1)

    def predict_proba(self, X):
        """Return the unweighted mean of the members' probability matrices."""
        summed = np.sum(self._member_probas(X), axis=0)
        return summed / len(self.clfs)

    def fit_predict(self, X):
        """Alias of :meth:`predict`; no fitting takes place."""
        return self.predict(X)

    def fit_predict_proba(self, X):
        """Alias of :meth:`predict_proba`; no fitting takes place."""
        return self.predict_proba(X)

    def predict_w(self, X, w):
        """Like :meth:`predict`, but based on :meth:`predict_proba_w`."""
        return np.argmax(self.predict_proba_w(X, w), axis=1)

    def predict_proba_w(self, X, w):
        """Return the weighted combination of the members' probabilities.

        Member ``i``'s probabilities are multiplied by ``w[i]``; the sum is
        divided by the number of members (not by the sum of the weights), so
        rows only sum to 1 when the weights average to 1.
        """
        weighted = [proba * w[i] for i, proba in enumerate(self._member_probas(X))]
        return np.sum(weighted, axis=0) / len(self.clfs)

In [5]:
class TDCOSRM(BaseEstimator, ClassifierMixin):
    """Bagging ensemble pruned by SOM clustering and LSH search, combined by stacking.

    ``fit`` performs the following steps:

    1. hold out 30% of the data and fit a ``BaggingClassifier`` on the rest;
    2. describe every base estimator by a vector made of a mean-centred
       feature-usage part and its predictions on the held-out samples;
    3. cluster these vectors with a SOM and index them in an LSH table;
    4. merge clusters with fewer than 5 estimators into the cluster most
       common among their LSH neighbours;
    5. in every cluster keep up to 40% of the members, chosen among the LSH
       neighbours of the member with the best hold-out accuracy;
    6. wrap every non-empty retained group in an ``EnsembleVote`` and fit a
       logistic-regression meta classifier on top (mlxtend ``StackingClassifier``).

    Parameters
    ----------
    base_estimators, n_estimators, max_samples, max_features, bootstrap, bootstrap_features
        Forwarded to ``BaggingClassifier`` (which is always created with
        ``oob_score=True`` and ``n_jobs=-1``).
    hash_size, num_hashtables
        Forwarded to ``LSHash``.
    n_rows, n_columns
        Forwarded to ``SOM``.
    num_results
        Number of LSH neighbours requested when reassigning small clusters.
    """

    def __init__(self, base_estimators, n_estimators, max_samples, max_features, bootstrap, bootstrap_features, hash_size, num_hashtables, n_rows, n_columns, num_results):
        self.base_estimators = base_estimators
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.bootstrap_features = bootstrap_features

        self.hash_size = hash_size
        self.num_hashtables = num_hashtables

        self.n_rows = n_rows
        self.n_columns = n_columns
        self.num_results = num_results

        # Kept here so that ``bagging`` / ``som`` exist right after construction.
        self._build_components()

    def _build_components(self):
        """(Re)create the bagging ensemble and the SOM from the current hyper-parameters."""
        self.bagging = BaggingClassifier(self.base_estimators,
                                         n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         bootstrap_features=self.bootstrap_features,
                                         oob_score=True,
                                         n_jobs=-1)

        self.som = SOM(n_columns=self.n_columns, n_rows=self.n_rows)

    def fit(self, X, y):
        """Fit the bagging ensemble, prune it and fit the stacking meta classifier.

        ``X`` must support ``X[:, columns]`` indexing and expose ``shape``.
        Returns ``self``. Raises ``ValueError`` when pruning leaves no estimator.
        """
        # set_params() only assigns attributes, so rebuild the components here;
        # otherwise hyper-parameters changed after construction (for example by
        # GridSearchCV) would be silently ignored.
        self._build_components()
        # Remembered for get_coef().
        self.n_features_in_ = X.shape[1]

        train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=.3)
        self.bagging.fit(train_x, train_y)

        # All base learners of the ensemble and the features each one selected.
        self.estimators = self.bagging.estimators_
        self.features = self.bagging.estimators_features_

        cl_expression = []
        valid_predictions = []
        self.lsh = LSHash(hash_size=self.hash_size, input_dim=X.shape[1] + valid_x.shape[0], num_hashtables=self.num_hashtables)

        # Pre-bucketing: describe and index every base learner.
        for i, esti in enumerate(self.estimators):
            selected = self.features[i]

            # Structural part: every selected feature receives the value
            # ``len(selected)`` (identical to adding 1 that many times).
            # NOTE(review): the original loop never used its index, so counting
            # how often each feature was drawn was probably intended. Left
            # unchanged because it would alter the clustering.
            structure_ = np.zeros((valid_x.shape[1]))
            structure_[selected] += selected.shape[0]
            structure_ -= np.mean(structure_)

            # Functional part: predictions on the held-out samples.
            functional_ = esti.predict(valid_x[:, selected])
            valid_predictions.append(functional_)

            # Similarity between estimators is described from two angles:
            # structural similarity and functional similarity.
            cl_expression_ = np.hstack((structure_, functional_))
            cl_expression.append(cl_expression_)

            # The estimator index is stored as extra data so that query results
            # can be mapped back to estimators.
            self.lsh.index(cl_expression_.astype(int), extra_data=str(i))

        self.som_labels = self.som.fit_predict(cl_expression)
        cluster_ids = list(set(self.som_labels))
        clusters = {cid: np.where(self.som_labels == cid)[0].tolist() for cid in cluster_ids}

        final_cluster = dict()
        for cid in cluster_ids:
            members = clusters[cid]
            # A cluster with fewer than 5 estimators is dissolved: all its
            # estimators are handed over to another cluster.
            if len(members) < 5:
                # First determine, for every estimator, the most frequent
                # cluster among its approximate nearest neighbours.
                member_votes = []
                for e in members:
                    query_res = self.lsh.query(cl_expression[e], num_results=self.num_results)
                    # Clusters of the neighbours; res[0][-1] is presumably the
                    # extra_data string stored at index time.
                    neighbor_class = [self.som_labels[int(res[0][-1])] for res in query_res]
                    if not neighbor_class:
                        # No neighbour found: vote for the estimator's own
                        # cluster instead of failing on an empty Counter.
                        neighbor_class = [cid]
                    # Majority vote over the neighbours' clusters.
                    member_votes.append(Counter(neighbor_class).most_common()[0][0])
                target = Counter(member_votes).most_common()[0][0]
            else:
                target = cid
            final_cluster.setdefault(target, []).extend(members)

        keeped_cluster = dict()
        for cid, ens in final_cluster.items():
            cluster_scale = len(ens)
            ens_lookup = set(ens)

            # Hold-out accuracy of every estimator in the cluster.
            valid_acc = [metrics.accuracy_score(valid_predictions[i], valid_y) for i in ens]
            # Position (within ``ens``) of the best-performing estimator.
            best_acc = np.argmax(valid_acc)

            # Estimators most similar to the best one according to LSH.
            query_res = self.lsh.query(cl_expression[ens[best_acc]])
            sim_ = [int(res[0][-1]) for res in query_res]
            # Of those, keep the ones belonging to the same cluster, up to 40%
            # of the cluster size.
            keeped_num = int(cluster_scale * 0.4)

            keeped_ = []
            for sim in sim_:
                if len(keeped_) >= keeped_num:
                    break
                if sim in ens_lookup:
                    keeped_.append(sim)
            keeped_cluster[cid] = keeped_
        self.keeped_cluster = keeped_cluster

        ensembles = []
        for k, v in keeped_cluster.items():
            if v:
                ensembles.append(EnsembleVote([self.estimators[_] for _ in v], [self.features[_] for _ in v]))
        self.ensembles = ensembles

        if not ensembles:
            raise ValueError(
                "TDCOSRM pruning kept no base estimator; increase n_estimators "
                "or adjust the clustering / LSH settings."
            )

        # The retained ensembles are used as they are (fit_base_estimators=False);
        # only the logistic-regression meta classifier is trained here.
        self.votingClassifier = StackingClassifier(ensembles, fit_base_estimators=False, meta_classifier=LogisticRegression(n_jobs=-1))
        self.votingClassifier.fit(X, y)

        return self

    def predict(self, X):
        """Predict with the stacking meta classifier.

        Note that this is not ``argmax`` of :meth:`predict_proba`, which
        averages the retained ensembles directly.
        """
        return self.votingClassifier.predict(X)

    def predict_proba(self, X):
        """Return the unweighted mean of the retained ensembles' probabilities."""
        predict_proba = []
        for i in range(len(self.ensembles)):
            predict_proba.append(self.ensembles[i].predict_proba(X))

        predicted_probabilitiy = np.sum(predict_proba, axis=0)

        return predicted_probabilitiy / len(self.ensembles)

    def get_coef(self):
        """Spread the meta classifier's coefficients over the input features.

        The meta coefficients are L2-normalised and flattened; the weight of
        ensemble ``i`` is then added to every feature used by each of its
        member estimators. The result has one entry per feature seen in
        :meth:`fit`.

        NOTE(review): ``w[i]`` indexes the flattened coefficient matrix, which
        presumably matches the ensembles one-to-one only for a single-row
        ``coef_`` (binary problem) -- confirm for multi-class use.
        """
        w = preprocessing.normalize(self.votingClassifier.meta_clf_.coef_, norm='l2').reshape(-1, )

        # Sized from the data seen in fit() rather than from a notebook global.
        coef = np.zeros((self.n_features_in_, ))

        for i, en in enumerate(self.votingClassifier.clfs_):
            for fea in en.oob_features:
                coef[fea] += w[i]
        return coef