In [None]:
import re
import time
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, confusion_matrix, multilabel_confusion_matrix, accuracy_score, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, cross_validate, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.utils import shuffle
from xgboost import XGBClassifier

from utils import drawing_tools
from utils import embedding_tools
from utils import process_pssm_feature
from utils import training_tools

import warnings
warnings.filterwarnings("ignore")


# Names of the four label classes.
# NOTE(review): presumably these pair with the O/I/M/S per-class MCC scorers
# in that order — confirm against the label encoding of the datasets.
classes = ('out', 'inner', 'matrix', 'space')


In [None]:
def _class_mcc(y_true, y_pred, class_index):
    """Return the one-vs-rest Matthews correlation coefficient of one class.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        class_index: Row of ``multilabel_confusion_matrix(y_true, y_pred)`` to
            score.
            NOTE(review): sklearn orders those rows by the sorted labels found
            in ``y_true``/``y_pred``, so an index only maps to a fixed class
            when every class occurs in the fold — confirm for small folds.

    Returns:
        The MCC as a float. When any marginal count is zero the coefficient is
        undefined; 0.0 is returned (the convention ``matthews_corrcoef`` uses)
        instead of NaN, so that fold averages stay finite.
    """
    cm = multilabel_confusion_matrix(y_true, y_pred)[class_index]
    # Each 2x2 matrix is laid out [[tn, fp], [fn, tp]]; Python ints keep the
    # four-way product below exact.
    tn, fp, fn, tp = (int(count) for count in cm.ravel())
    denominator = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    if denominator == 0:
        return 0.0
    return float((tp * tn - fp * fn) / denominator)


def mcc_O(y_true, y_pred):
    """MCC of the class at confusion-matrix index 0."""
    return _class_mcc(y_true, y_pred, 0)


def mcc_I(y_true, y_pred):
    """MCC of the class at confusion-matrix index 1."""
    return _class_mcc(y_true, y_pred, 1)


def mcc_M(y_true, y_pred):
    """MCC of the class at confusion-matrix index 2."""
    return _class_mcc(y_true, y_pred, 2)


def mcc_S(y_true, y_pred):
    """MCC of the class at confusion-matrix index 3."""
    return _class_mcc(y_true, y_pred, 3)

# Scorer dictionary passed as ``scoring=`` to the search / cross-validation
# calls: one scorer per class-wise MCC plus the overall multiclass MCC.
scoring = {
    metric_name: make_scorer(metric_fn)
    for metric_name, metric_fn in (
        ('MCC(O)', mcc_O),
        ('MCC(I)', mcc_I),
        ('MCC(M)', mcc_M),
        ('MCC(S)', mcc_S),
        ('mcc', matthews_corrcoef),
    )
}

def printMcc(cv_result):
    """Print the fold-averaged class-wise and overall MCC on one line.

    Args:
        cv_result: Mapping holding the arrays ``test_MCC(O)``, ``test_MCC(I)``,
            ``test_MCC(M)``, ``test_MCC(S)`` and ``test_mcc``; each is averaged
            with ``.mean()`` and shown with two decimals.
    """
    score_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
    averages = [cv_result[key].mean() for key in score_keys]
    template = 'mcc(O):{:.2f}, mcc(I):{:.2f}, mcc(M):{:.2f}, mcc(S):{:.2f}, mcc:{:.2f}'
    print(template.format(*averages))

In [None]:
# SM766 dataset. Built from path components instead of a backslash literal so
# the path resolves on every OS and no invalid escape sequence is involved.
SM766_PATH = Path('SM766-20') / 'SM766-20.csv'

# Seeded shuffle so the row order is reproducible across kernel restarts.
data = shuffle(pd.read_csv(SM766_PATH, usecols=[1, 2]), random_state=42)

sequences, labels = data['sequence'].values, data['label'].values

print(len(data))

# Hold out 10% of SM766 as a test split.
train_sequences, test_sequences, train_labels, test_labels = train_test_split(sequences, labels, test_size=0.1, random_state=42)
print(train_sequences.shape, train_labels.shape, test_sequences.shape, test_labels.shape)

In [None]:
# SM424 dataset. Path components replace the Windows-only backslash literal.
SM424_PATH = Path('data') / 'protein_data' / 'submitochondrial' / 'SM424-18' / 'SM424-18.csv'
# Seeded shuffle so the row order is reproducible across kernel restarts.
sm424_data = shuffle(pd.read_csv(SM424_PATH, usecols=[1, 2]), random_state=42)

sm424_sequences, sm424_labels = sm424_data['sequence'].values, sm424_data['label'].values

print(len(sm424_sequences))

In [None]:
# M983 / M495 datasets. Path components replace the Windows-only backslash
# literals.
m983_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M983.csv'
m495_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M495.csv'

# Seeded shuffles so the row order is reproducible across kernel restarts.
m983 = shuffle(pd.read_csv(m983_path, usecols=[0, 1, 2]), random_state=42)
m495 = shuffle(pd.read_csv(m495_path, usecols=[0, 1, 2]), random_state=42)

# ids, sequences and labels all come from the same shuffled frame, so they
# stay row-aligned with each other.
m983_id, m983_sequences, m983_labels = m983['protein_id'], m983['sequence'].values, m983['label'].values
m495_id, m495_sequences, m495_labels = m495['protein_id'], m495['sequence'].values, m495['label'].values

print(len(m983_sequences), len(m495_sequences))

In [None]:
# Files holding the saved dm models.
MOD_PATH_DM = Path('output') / 'doc2vec_models' / 'dm'
# Sorted so that positional indices used by later cells (e.g. 40, 44) refer to
# the same model on every machine; glob order alone is filesystem-dependent.
model_path_dm = sorted(MOD_PATH_DM.glob('*.pkl'))
# File stems are '_'-separated; field 0 goes to the k list and field 2 to the
# size list (both kept as strings).
k_list_dm = [p.stem.split('_')[0] for p in model_path_dm]
size_list_dm = [p.stem.split('_')[2] for p in model_path_dm]

In [None]:
# Show index, path, k and embedding size of every dm model.
for i, dm_path in enumerate(model_path_dm):
    print(i, dm_path, k_list_dm[i], size_list_dm[i])

In [None]:
# Files holding the saved dbow models.
# Fixed: this previously pointed at the 'dm' directory (copy-paste slip), so
# the "dbow" lists silently duplicated the dm models.
MOD_PATH_DBOW = Path('output') / 'doc2vec_models' / 'dbow'
# Sorted so positional indices line up with the (sorted) dm list and are
# stable across machines; glob order alone is filesystem-dependent.
model_path_dbow = sorted(MOD_PATH_DBOW.glob('*.pkl'))
# File stems are '_'-separated; field 0 goes to the k list and field 2 to the
# size list (both kept as strings).
k_list_dbow = [p.stem.split('_')[0] for p in model_path_dbow]
size_list_dbow = [p.stem.split('_')[2] for p in model_path_dbow]

In [None]:
# Show index, path, k and embedding size of every dbow model. The loop is
# driven by the dbow list itself (it used to be bounded by the dm list length).
for i, dbow_path in enumerate(model_path_dbow):
    print(i, dbow_path, k_list_dbow[i], size_list_dbow[i])

In [None]:
# doc2vec models (dbow and dm) trained on the m983 / m495 datasets.
# File stems are '_'-separated: field 0 -> k list, field 1 -> content-word
# list, field 2 -> size list (all kept as strings). Both listings are sorted so
# that index i addresses the same configuration in the dbow and dm lists and
# the order is stable across machines.

# dbow
M_DBOW_PATH = Path('output') / 'doc2vec_models' / 'm983_m495' / 'dbow'
m_dbow_path_list = sorted(M_DBOW_PATH.glob('*.pkl'))
m_dbow_k_list = [p.stem.split('_')[0] for p in m_dbow_path_list]
m_dbow_contWord_list = [p.stem.split('_')[1] for p in m_dbow_path_list]
m_dbow_size_list = [p.stem.split('_')[2] for p in m_dbow_path_list]

# dm
M_DM_PATH = Path('output') / 'doc2vec_models' / 'm983_m495' / 'dm'
m_dm_path_list = sorted(M_DM_PATH.glob('*.pkl'))
m_dm_k_list = [p.stem.split('_')[0] for p in m_dm_path_list]
m_dm_contWord_list = [p.stem.split('_')[1] for p in m_dm_path_list]
m_dm_size_list = [p.stem.split('_')[2] for p in m_dm_path_list]


In [None]:
# Show index, path, k and embedding size of every m983/m495 dbow model.
for i, dbow_path in enumerate(m_dbow_path_list):
    print(i, dbow_path, m_dbow_k_list[i], m_dbow_size_list[i])

In [None]:
# Show index, path, k and embedding size of every m983/m495 dm model.
# Fixed: the k value is now read from the dm list (it was taken from the dbow
# list, which mislabels rows whenever the two listings differ).
for i, dm_path in enumerate(m_dm_path_list):
    print(i, dm_path, m_dm_k_list[i], m_dm_size_list[i])

In [None]:
# Pool (average or sum) the vectors of a sequence's sub-sentences.
def getVecs_mean(model, sequences, k, mean=True):
    """Infer one pooled vector per sequence.

    Each sequence is split with ``embedding_tools.seq_to_k_sentence(sequence,
    int(k))``; every resulting sentence is embedded with
    ``model.infer_vector`` and the sentence vectors are reduced along axis 0.

    Args:
        model: Object exposing ``infer_vector``.
        sequences: Iterable of sequences.
        k: Passed to ``seq_to_k_sentence`` after ``int()`` conversion.
        mean: Average the sentence vectors when exactly ``True``; sum otherwise.

    Returns:
        A list with one pooled vector per input sequence.
    """
    pooled = []
    for seq in sequences:
        sub_vectors = np.array([
            model.infer_vector(sub_sentence)
            for sub_sentence in embedding_tools.seq_to_k_sentence(seq, int(k))
        ])
        pooled.append(sub_vectors.mean(0) if mean is True else sub_vectors.sum(0))
    return pooled

# Concatenate the vectors of every sub-sentence into one long vector.
def getVecs(model, sequences, k, mean=True):
    """Infer one concatenated vector per sequence.

    Each sequence is split with ``embedding_tools.seq_to_k_sentence(sequence,
    int(k))`` and the inferred sentence vectors are laid end to end.

    Args:
        model: Object exposing ``infer_vector``.
        sequences: Iterable of sequences.
        k: Passed to ``seq_to_k_sentence`` after ``int()`` conversion.
        mean: Accepted for signature parity with ``getVecs_mean``; not used.

    Returns:
        ``np.array`` built from the per-sequence concatenated vectors.
    """
    rows = []
    for seq in sequences:
        flat = [
            component
            for sub_sentence in embedding_tools.seq_to_k_sentence(seq, int(k))
            for component in model.infer_vector(sub_sentence)
        ]
        rows.append(flat)
    return np.array(rows)


def get_vectors(dm, dbow, sequences, k, mean=True):
    """Build feature rows by joining dm vectors and dbow vectors column-wise.

    Both models are run through ``getVecs`` (dm first, then dbow) and the two
    results are concatenated along axis 1. ``mean`` is only forwarded.
    """
    per_model = [getVecs(model, sequences, k, mean) for model in (dm, dbow)]
    return np.concatenate(per_model, axis=1)

In [None]:
# Hyper-parameter search spaces, one per classifier family.
parameters = {'C': [1e0, 1e1, 1e2, 1e3, 1e4, 1e5],
              'gamma': [1e-4, 1e-3, 1e-2, 1e-1, 1e0]}
lg_parameters = {'C': [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]}
knc_parameters = {'n_neighbors': [5, 10, 15, 20, 25, 30],
                  'leaf_size': [20, 30, 40],
                  'weights': ['distance', 'uniform'],
                  'p': [1, 2]}
# (100,) is written as a real 1-tuple; the former "(100)" was just the int 100.
mplc_parameters = {'hidden_layer_sizes': [(100,), (100, 100), (100, 100, 100)]}

# Built from components: the old literal mixed a backslash escape with '/'.
cs_path = Path('output') / 'cv_result' / '11-16'

# Stand-alone SVC (default C / gamma) fitted directly by some later cells.
svc = SVC(decision_function_shape='ovo', cache_size=2000, class_weight='balanced')
adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

# The three lists below are index-aligned: name, estimator, search space.
classifier_methods = [SVC(decision_function_shape='ovo', cache_size=2000, class_weight='balanced'),
                      LogisticRegression(class_weight='balanced', multi_class='multinomial'),
                      KNeighborsClassifier(),
                      MLPClassifier()]

classifier_names = ['SVM',
                    'LogisticRegression',
                    'KNN',
                    'Neural Network']

clssifier_parameters = [parameters, lg_parameters, knc_parameters, mplc_parameters]

def evaluate(estimator, parameters, X, y):
    """Tune ``estimator`` with a randomized search, then cross-validate the winner.

    Args:
        estimator: Estimator handed to ``RandomizedSearchCV``.
        parameters: Search space (``param_distributions``).
        X: Feature matrix used for both the search and the cross-validation.
        y: Labels matching ``X``.

    Returns:
        The ``cross_validate`` result (10 folds, module-level ``scoring``) of
        the estimator refit on the best ``'mcc'`` configuration.
    """
    search = RandomizedSearchCV(estimator=estimator, param_distributions=parameters, cv=10, scoring=scoring, refit='mcc')
    best = search.fit(X, y).best_estimator_
    return cross_validate(best, X, y, cv=10, scoring=scoring)

In [None]:
# Registry used by run(): short name -> (estimator, randomized-search space).
# NOTE(review): eval_metric='rmse' on XGBClassifier is a regression metric; no
# eval_set is passed anywhere in this notebook, so it presumably only serves to
# pin the metric explicitly — confirm, and consider 'mlogloss' for multiclass.
classifier_dic = {'SVC': (SVC( decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced'), 
                         {'C': [1e0,1e1,1e2,1e3,1e4,1e5], 
                          'gamma': [1e-4,1e-3,1e-2,1e-1,1e0]}),
                 'LR': (LogisticRegression(class_weight='balanced', multi_class='multinomial'),
                        {'C': [1e-3,1e-2,1e-1,1e0,1e1,1e2,1e3]}),
                 'KNN': (KNeighborsClassifier(), {'n_neighbors': [5,10,15,20,25,30],
                                                  'leaf_size': [20, 30, 40],
                                                  'weights': ['distance', 'uniform'],
                                                  'p': [1, 2]}),
              #    'NN': (MLPClassifier(), {'hidden_layer_sizes': [(100),(100,100),(100,100,100)]}),
                 'XGB': (XGBClassifier(n_jobs=-1, eval_metric='rmse'), {'max_depth': [5, 10], 'learning_rate': [0.5, 0.05]})}

# Oversampler applied to every class except the majority one.
adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

def evaluate(estimator, parameters, X, y):
    """Run a randomized hyper-parameter search and cross-validate its best model.

    The search uses 10 folds and the module-level ``scoring`` dict, refitting
    on ``'mcc'``; the refit estimator is then scored again with a 10-fold
    ``cross_validate`` on the same ``X`` / ``y``, and that result is returned.
    """
    tuner = RandomizedSearchCV(estimator=estimator, param_distributions=parameters, cv=10, scoring=scoring, refit='mcc')
    tuner.fit(X, y)
    winner = tuner.best_estimator_
    return cross_validate(winner, X, y, cv=10, scoring=scoring)

def _tune_and_report(estimator, parameter, fit_X, fit_y, test_X, test_y, title):
    """Tune one estimator on (fit_X, fit_y) and print its CV / held-out MCC table."""
    clf = RandomizedSearchCV(estimator=estimator, param_distributions=parameter, cv=10, scoring=scoring, refit='mcc')
    clf.fit(fit_X, fit_y)
    best_estimator = clf.best_estimator_
    cv_results = cross_validate(best_estimator, fit_X, fit_y, cv=10, scoring=scoring)
    test_pred = best_estimator.predict(test_X)
    mcc = matthews_corrcoef(test_y, test_pred)

    row = '{:.4f} \t|\t {:.4f} \t|\t {:.4f} \t|\t {:.4f}'
    print(title)
    print('\t\tmcc(O) \t|\t mcc(I) \t|\t mcc(M) \t|\t mcc')
    # First row: fold-averaged cross-validation scores on the fitting data.
    print(('\t训练集:' + row).format(cv_results['test_MCC(O)'].mean(),
                                  cv_results['test_MCC(I)'].mean(),
                                  cv_results['test_MCC(M)'].mean(),
                                  cv_results['test_mcc'].mean()))
    # Second row: scores of the refit best estimator on the held-out set.
    print(('\t测试集:' + row).format(mcc_O(test_y, test_pred),
                                  mcc_I(test_y, test_pred),
                                  mcc_M(test_y, test_pred),
                                  mcc))


def run(train_X, train_y, test_X, test_y):
    """Benchmark every classifier in ``classifier_dic`` with and without ADASYN.

    For each (estimator, search space) pair a randomized search is run twice:
    once on the raw training data and once on the ADASYN-resampled data. Each
    pass prints the cross-validated MCC values and the MCC values on
    ``(test_X, test_y)``.

    NOTE(review): the second pass cross-validates on already-oversampled data,
    so synthetic samples land in the validation folds; treat its "training"
    row as optimistic.

    Args:
        train_X: Training feature matrix.
        train_y: Training labels.
        test_X: Held-out feature matrix.
        test_y: Held-out labels.
    """
    X_res, y_res = adasyn.fit_resample(train_X, train_y)
    for name, (estimator, parameter) in classifier_dic.items():
        print('-' * 100)
        print('\tClassifer: %s' %name)
        _tune_and_report(estimator, parameter, train_X, train_y, test_X, test_y, '\t未采样')
        print()
        _tune_and_report(estimator, parameter, X_res, y_res, test_X, test_y, '\t采样')
        print('-' * 100)
        print()
    print()

In [None]:
# XGB on the vectors of the 3-6-128 model (taken to be index 40 of the lists).
# Dedicated names are used for the unpacked pair: rebinding the module-level
# `parameters` (the SVC grid) to the XGB grid broke the SVC search that a later
# cell builds from `parameters`.
xgb_estimator, xgb_parameters = classifier_dic['XGB']
dm = Doc2Vec.load(str(model_path_dm[40]))
dbow = Doc2Vec.load(str(model_path_dbow[40]))
X = get_vectors(dm, dbow, sequences, k_list_dbow[40])
y = labels

X_res, y_res = adasyn.fit_resample(X, y)
cv_results = evaluate(xgb_estimator, xgb_parameters, X_res, y_res)
print('\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'.format(cv_results['test_MCC(O)'].mean(), cv_results['test_MCC(I)'].mean(), cv_results['test_MCC(M)'].mean(), cv_results['test_MCC(S)'].mean(),cv_results['test_mcc'].mean()))

In [None]:
# SVC randomized search on the SM766 train split for the first 75 model pairs,
# without and with ADASYN oversampling; each pass is also scored on the test
# split.
mcc_list = []
# Bounded by the list lengths so fewer than 75 saved models cannot raise
# IndexError.
for index in range(min(75, len(model_path_dm), len(model_path_dbow))):
    print('-'*80)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X_train = get_vectors(dm, dbow, train_sequences, k_list_dbow[index])
    X_test = get_vectors(dm, dbow, test_sequences, k_list_dbow[index])
    # The SVC grid is taken from the classifier table rather than the mutable
    # global `parameters`, which other cells rebind to unrelated grids.
    clf = RandomizedSearchCV(estimator=svc, param_distributions=clssifier_parameters[0], cv=10, scoring=scoring, refit='mcc')

    for resampled in (False, True):
        if resampled:
            print('采样：')
            X_res, y_res = adasyn.fit_resample(X_train, train_labels)
            print('采样后训练集大小：{}, {}'
                   .format(X_res.shape, y_res.shape))
            X_fit, y_fit = X_res, y_res
        else:
            print('未采样：')
            X_fit, y_fit = X_train, train_labels

        clf.fit(X_fit, y_fit)
        print(clf.best_params_)
        print('best mcc: %.2f' %clf.best_score_)
        best_estimator = clf.best_estimator_
        print('最优模型在训练集上10折交叉验证的结果：')
        cv_results = cross_validate(best_estimator, X_fit, y_fit, cv=10, scoring=scoring)
        # Confusion matrix of the refit model on the data it was fitted on.
        y_pred = best_estimator.predict(X_fit)
        cm = confusion_matrix(y_fit, y_pred)
        printMcc(cv_results)
        print(cm)

        print('测试集结果：')
        y_pred = best_estimator.predict(X_test)
        mcc = matthews_corrcoef(test_labels, y_pred)
        cm = confusion_matrix(test_labels, y_pred)
        print('mcc(O):{:.2f}, mcc(I):{:.2f}, mcc(M):{:.2f}, mcc(S):{:.2f}, mcc:{:.2f}'
               .format(mcc_O(test_labels, y_pred), mcc_I(test_labels, y_pred), mcc_M(test_labels, y_pred), mcc_S(test_labels, y_pred), mcc))
        print(cm)
    print('-'*80)
    print()

In [None]:
# Train on the train split, evaluate on the test split of SM766.
# The imbalanced training data is oversampled with SMOTE.
# No mean pooling is applied to the vectors.
# (SMOTE is imported with the other imblearn names at the top of the notebook.)
smote = SMOTE(sampling_strategy='not majority', random_state=42)

test_cm_list = []
test_mcm_list = []
# Bounded by the list lengths so fewer than 50 saved models cannot raise
# IndexError.
for i in range(min(50, len(model_path_dm), len(model_path_dbow))):
    print('-' * 80)
    print(i, model_path_dm[i], model_path_dbow[i])
    dm = Doc2Vec.load(str(model_path_dm[i]))
    dbow = Doc2Vec.load(str(model_path_dbow[i]))

    print('sequences -> vectors')
    train_vecs = get_vectors(dm, dbow, train_sequences, k_list_dm[i])
    test_vecs = get_vectors(dm, dbow, test_sequences, k_list_dm[i])
    print(train_vecs.shape, test_vecs.shape)

    print('re_sampling')
    X_res, y_res = smote.fit_resample(train_vecs, train_labels)

    print('train model')
    svc.fit(X_res, y_res)
    train_cm = confusion_matrix(y_res, svc.predict(X_res))
    print(train_cm)

    print('evaluate model on test dataset')
    test_pred = svc.predict(test_vecs)
    cm = confusion_matrix(test_labels, test_pred)
    # The result lists were declared but never filled; record each model's
    # test confusion matrices so they can be inspected afterwards.
    test_cm_list.append(cm)
    test_mcm_list.append(multilabel_confusion_matrix(test_labels, test_pred))
    # '%.3f' (three decimals) replaces '%3f', which only set a minimum width.
    print("mcc(O): %.3f \t mcc(I): %.3f \t mcc(M): %.3f \t mcc(S): %.3f \t "
    %(mcc_O(test_labels, test_pred), mcc_I(test_labels, test_pred), mcc_M(test_labels, test_pred), mcc_S(test_labels, test_pred)))
    print(cm)
    print('-' * 80)

In [None]:
# Train on the train split, evaluate on the test split of SM766.
# No oversampling.
# Features are standardised with StandardScaler.
# `scaler` was never defined anywhere in the notebook (NameError on a fresh
# kernel); it is created here.
scaler = StandardScaler()

test_cm_list = []
test_mcm_list = []
# Bounded by the list lengths so fewer than 50 saved models cannot raise
# IndexError.
for i in range(min(50, len(model_path_dm), len(model_path_dbow))):
    print('-' * 80)
    print(i, model_path_dm[i], model_path_dbow[i])
    dm = Doc2Vec.load(str(model_path_dm[i]))
    dbow = Doc2Vec.load(str(model_path_dbow[i]))

    print('sequences -> vectors')
    # Fit the scaler on the training vectors only, then reuse it for the test
    # vectors.
    train_vecs = scaler.fit_transform(get_vectors(dm, dbow, train_sequences, k_list_dm[i]))
    test_vecs = scaler.transform(get_vectors(dm, dbow, test_sequences, k_list_dm[i]))
    print(train_vecs.shape, test_vecs.shape)

    print('train model')
    svc.fit(train_vecs, train_labels)
    train_cm = confusion_matrix(train_labels, svc.predict(train_vecs))
    print(train_cm)

    print('evaluate model on test dataset')
    test_pred = svc.predict(test_vecs)
    cm = confusion_matrix(test_labels, test_pred)
    # The result lists were declared but never filled; record each model's
    # test confusion matrices so they can be inspected afterwards.
    test_cm_list.append(cm)
    test_mcm_list.append(multilabel_confusion_matrix(test_labels, test_pred))
    # '%.3f' (three decimals) replaces '%3f', which only set a minimum width.
    print("mcc(O): %.3f \t mcc(I): %.3f \t mcc(M): %.3f \t mcc(S): %.3f \t "
    %(mcc_O(test_labels, test_pred), mcc_I(test_labels, test_pred), mcc_M(test_labels, test_pred), mcc_S(test_labels, test_pred)))
    print(cm)
    print('-' * 80)

In [None]:
# Roles swapped: the model is fitted on the test split and evaluated on the
# train split.
# No oversampling.
# Features are standardised with StandardScaler.
# `scaler` was never defined anywhere in the notebook (NameError on a fresh
# kernel); it is created here.
scaler = StandardScaler()

# Bounded by the list lengths so fewer than 50 saved models cannot raise
# IndexError.
for i in range(min(50, len(model_path_dm), len(model_path_dbow))):
    print('-' * 80)
    print(i, model_path_dm[i], model_path_dbow[i])
    dm = Doc2Vec.load(str(model_path_dm[i]))
    dbow = Doc2Vec.load(str(model_path_dbow[i]))

    print('sequences -> vectors')
    # The scaler is fitted on the split used for fitting (the test split here).
    test_vecs = scaler.fit_transform(get_vectors(dm, dbow, test_sequences, k_list_dm[i]))
    train_vecs = scaler.transform(get_vectors(dm, dbow, train_sequences, k_list_dm[i]))
    print(train_vecs.shape, test_vecs.shape)

    print('train model')
    svc.fit(test_vecs, test_labels)
    test_pred = svc.predict(test_vecs)
    # Confusion matrix / MCC on the data the model was fitted on.
    train_cm = confusion_matrix(test_labels, test_pred)
    print(train_cm)
    print(matthews_corrcoef(test_labels, test_pred))

    print('evaluate model on test dataset')
    train_pred = svc.predict(train_vecs)
    cm = confusion_matrix(train_labels, train_pred)
    mcc = matthews_corrcoef(train_labels, train_pred)
    print(cm)
    print(mcc)
    print('-' * 80)

In [None]:
# SM766 dataset: cross-validate every classifier on the first 75 model pairs,
# without and with ADASYN oversampling.
no_resample_mcc_list = []
resample_mcc_list = []
report_row = '| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'
# Bounded by the list lengths so fewer than 75 saved models cannot raise
# IndexError.
for index in range(0, min(75, len(model_path_dbow), len(model_path_dm))):
    print('-'*80)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sequences, k_list_dbow[index])
    y = labels

    X_res, y_res = adasyn.fit_resample(X, y)

    # `param_grid` (not `parameters`) is the loop variable so the module-level
    # SVC grid named `parameters` is not overwritten by this cell.
    for (name, method, param_grid) in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for label, features, targets, mcc_log in (
            ('\t未采样结果:', X, y, no_resample_mcc_list),
            ('\t采样结果:', X_res, y_res, resample_mcc_list),
        ):
            cv_results = evaluate(method, param_grid, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print((label + report_row)
                   .format(cv_results['test_MCC(O)'].mean(),
                           cv_results['test_MCC(I)'].mean(),
                           cv_results['test_MCC(M)'].mean(),
                           cv_results['test_MCC(S)'].mean(),
                           cv_results['test_mcc'].mean()))
        print()
    print('-'*80)


In [None]:
# SM766 dataset: cross-validate every classifier on model pairs 75 and up,
# without and with ADASYN oversampling.
# NOTE(review): the two result lists are re-created here, which drops whatever
# the previous batch collected — confirm that is intended.
no_resample_mcc_list = []
resample_mcc_list = []
report_row = '| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'
for index in range(75, len(model_path_dbow)):
    print('-'*80)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sequences, k_list_dbow[index])
    y = labels

    X_res, y_res = adasyn.fit_resample(X, y)

    # `param_grid` (not `parameters`) is the loop variable so the module-level
    # SVC grid named `parameters` is not overwritten by this cell.
    for (name, method, param_grid) in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for label, features, targets, mcc_log in (
            ('\t未采样结果:', X, y, no_resample_mcc_list),
            ('\t采样结果:', X_res, y_res, resample_mcc_list),
        ):
            cv_results = evaluate(method, param_grid, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print((label + report_row)
                   .format(cv_results['test_MCC(O)'].mean(),
                           cv_results['test_MCC(I)'].mean(),
                           cv_results['test_MCC(M)'].mean(),
                           cv_results['test_MCC(S)'].mean(),
                           cv_results['test_mcc'].mean()))
        print()
    print('-'*80)

In [None]:
# SM766 dataset: cross-validate only the first classifier (the SVM entry) on
# model pairs 125 and up, without and with ADASYN oversampling.
no_resample_mcc_list = []
resample_mcc_list = []
for index in range(125, len(model_path_dbow)):
    print('-'*80)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sequences, k_list_dbow[index])
    y = labels

    X_res, y_res = adasyn.fit_resample(X, y)
    # Same evaluation twice: raw data first, then the resampled data.
    for headline, features, targets, mcc_log in (
        ('\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}', X, y, no_resample_mcc_list),
        ('\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}', X_res, y_res, resample_mcc_list),
    ):
        cv_results = evaluate(classifier_methods[0], clssifier_parameters[0], features, targets)
        fold_means = [cv_results[key].mean()
                      for key in ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')]
        mcc_log.append(fold_means[-1])
        print(headline.format(*fold_means))
    print()

    print('-'*80)

In [None]:
# SVC randomized search on ADASYN-resampled SM766 vectors of the 3_6_64 models.
# Paths are built from components (the old literals mixed backslash escapes
# with '/').
dbowpath = Path('output') / 'doc2vec_models' / 'dbow' / '3_6_64.pkl'
dmpath = Path('output') / 'doc2vec_models' / 'dm' / '3_6_64.pkl'
# Fixed: each model is now loaded from its own file (dm was loaded from the
# dbow path and vice versa).
dm = Doc2Vec.load(str(dmpath))
dbow = Doc2Vec.load(str(dbowpath))
X = get_vectors(dm, dbow, sequences, 3)
y = labels
print(X.shape, y.shape)

X_res, y_res = adasyn.fit_resample(X, y)
estimator, parameter = classifier_dic.get('SVC')
clf = RandomizedSearchCV(estimator=estimator, param_distributions=parameter, cv=10, scoring=scoring, refit='mcc')
clf.fit(X_res, y_res)
clf.best_params_

In [None]:
# Experiments on the M983 (training) and M495 (testing) datasets, model pairs
# 0-69.
m983_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M983.csv'
m495_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M495.csv'

m983 = shuffle(pd.read_csv(m983_path), random_state=42)
m495 = shuffle(pd.read_csv(m495_path), random_state=42)

# The ids are refreshed together with sequences and labels: the frames were
# just re-shuffled, and stale ids would no longer be row-aligned with the
# labels in the PSSM cells that iterate over them.
m983_id, m983_sequences, m983_labels = m983['protein_id'], m983['sequence'].values, m983['label'].values
m495_id, m495_sequences, m495_labels = m495['protein_id'], m495['sequence'].values, m495['label'].values

# Scorers for these datasets: no MCC(S) entry. This rebinds the module-level
# `scoring` used by run() and evaluate().
scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
# Bounded by the list lengths so fewer than 70 saved models cannot raise
# IndexError.
for index in range(0, min(70, len(m_dbow_path_list), len(m_dm_path_list))):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    X = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_X = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# Experiments on the M983 (training) and M495 (testing) datasets, model pairs
# 70 and up.
m983_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M983.csv'
m495_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M495.csv'

m983 = shuffle(pd.read_csv(m983_path), random_state=42)
m495 = shuffle(pd.read_csv(m495_path), random_state=42)

# The ids are refreshed together with sequences and labels: the frames were
# just re-shuffled, and stale ids would no longer be row-aligned with the
# labels in the PSSM cells that iterate over them.
m983_id, m983_sequences, m983_labels = m983['protein_id'], m983['sequence'].values, m983['label'].values
m495_id, m495_sequences, m495_labels = m495['protein_id'], m495['sequence'].values, m495['label'].values

# The unused `model_path_dm` / `model_path_dbow` single-Path assignments were
# removed: they replaced the model lists that the following SM766 cell indexes
# with `model_path_dm[index]`.

# Scorers for these datasets: no MCC(S) entry. This rebinds the module-level
# `scoring` used by run() and evaluate().
scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
for index in range(70, len(m_dbow_path_list)):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    X = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_X = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# Experiments on the M983 (training) and M495 (testing) datasets, model pairs
# 93 and up.
m983_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M983.csv'
m495_path = Path('data') / 'protein_data' / 'submitochondrial' / 'M495.csv'

m983 = shuffle(pd.read_csv(m983_path), random_state=42)
m495 = shuffle(pd.read_csv(m495_path), random_state=42)

# The ids are refreshed together with sequences and labels: the frames were
# just re-shuffled, and stale ids would no longer be row-aligned with the
# labels in the PSSM cells that iterate over them.
m983_id, m983_sequences, m983_labels = m983['protein_id'], m983['sequence'].values, m983['label'].values
m495_id, m495_sequences, m495_labels = m495['protein_id'], m495['sequence'].values, m495['label'].values

# The unused `model_path_dm` / `model_path_dbow` single-Path assignments were
# removed: they replaced the model lists that the following SM766 cell indexes
# with `model_path_dm[index]`.

# Scorers for these datasets: no MCC(S) entry. This rebinds the module-level
# `scoring` used by run() and evaluate().
scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
for index in range(93, len(m_dbow_path_list)):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    X = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_X = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# Train on SM766, test on M495.
# NOTE(review): requires `model_path_dm` / `model_path_dbow` to still be the
# model lists (not a single Path) when this cell runs.
index = 44
dm = Doc2Vec.load(str(model_path_dm[index]))
dbow = Doc2Vec.load(str(model_path_dbow[index]))

train_vecs = get_vectors(dm, dbow, sequences, k_list_dbow[index])
train_labels = labels

test_vecs = get_vectors(dm, dbow, m495_sequences, k_list_dbow[index])
test_labels = m495_labels

X_res, y_res = adasyn.fit_resample(train_vecs, train_labels)

# First classifier entry (SVM) with its own search space.
clf = RandomizedSearchCV(classifier_methods[0], clssifier_parameters[0], scoring=scoring, cv=10, refit='mcc')
clf.fit(X_res, y_res)

best_estimator = clf.best_estimator_

test_pred = best_estimator.predict(test_vecs)
cm = confusion_matrix(test_labels, test_pred)
print(cm)
mcc = matthews_corrcoef(test_labels, test_pred)
# The overall MCC was computed but never shown; it is now part of the report.
# '%.3f' (three decimals) replaces '%3f', which only set a minimum width.
print("mcc(O): %.3f \t mcc(I): %.3f \t mcc(M): %.3f \t mcc: %.3f"
    %(mcc_O(test_labels, test_pred), mcc_I(test_labels, test_pred), mcc_M(test_labels, test_pred), mcc))

In [None]:
# Train on M983, test on M495.

# Scorers for these datasets: no MCC(S) entry.
scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}

# Built from components (the old literals mixed backslash escapes with '/').
# NOTE: from here on these two names hold a single Path, no longer the model
# lists built near the top of the notebook.
model_path_dm = Path('output') / 'doc2vec_models' / 'm983_m495' / 'dm' / '3_6_64.pkl'
model_path_dbow = Path('output') / 'doc2vec_models' / 'm983_m495' / 'dbow' / '3_6_64.pkl'

dm = Doc2Vec.load(str(model_path_dm))
dbow = Doc2Vec.load(str(model_path_dbow))

train_vecs = get_vectors(dm, dbow, m983_sequences, 3)
train_labels = m983_labels

test_vecs = get_vectors(dm, dbow, m495_sequences, 3)
test_labels = m495_labels

X_res, y_res = adasyn.fit_resample(train_vecs, train_labels)

# First classifier entry (SVM) with its own search space.
clf = RandomizedSearchCV(classifier_methods[0], clssifier_parameters[0], scoring=scoring, cv=10, refit='mcc')
clf.fit(X_res, y_res)

best_estimator = clf.best_estimator_

test_pred = best_estimator.predict(test_vecs)
cm = confusion_matrix(test_labels, test_pred)
print(cm)
mcc = matthews_corrcoef(test_labels, test_pred)
# The overall MCC was computed but never shown; it is now part of the report.
# '%.3f' (three decimals) replaces '%3f', which only set a minimum width.
print("mcc(O): %.3f \t mcc(I): %.3f \t mcc(M): %.3f \t mcc: %.3f"
    %(mcc_O(test_labels, test_pred), mcc_I(test_labels, test_pred), mcc_M(test_labels, test_pred), mcc))

In [None]:
# PSSM matrices as features on M983 / M495. Per the original note, the rows of
# each matrix are simply summed into a 20-dimensional vector.
# The PSSM files are read in protein-id order.
scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}

# Directories built from components instead of Windows-only backslash literals.
m983_pssm_path = Path('data') / 'protein_data' / 'PSSM_feature' / 'M983'
m495_pssm_path = Path('data') / 'protein_data' / 'PSSM_feature' / 'M495'
train_vectors = []
test_vectors = []

# Ids and labels are read from the SAME current frame so feature rows and
# labels stay aligned; the frames are re-shuffled by earlier cells, which left
# the stored id series in a different row order than the stored labels.
train_ids, train_labels = m983['protein_id'], m983['label'].values
test_ids, test_labels = m495['protein_id'], m495['label'].values

for protein_id in train_ids:
    pssm_path = m983_pssm_path / '{}.txt'.format(protein_id)
    pssm = np.loadtxt(pssm_path)
    train_vectors.append(process_pssm_feature.addToOneLineSum(pssm))

for protein_id in test_ids:
    pssm_path = m495_pssm_path / '{}.txt'.format(protein_id)
    pssm = np.loadtxt(pssm_path)
    test_vectors.append(process_pssm_feature.addToOneLineSum(pssm))

train_vectors = np.array(train_vectors)
test_vectors = np.array(test_vectors)

run(train_vectors, train_labels, test_vectors, test_labels)

In [None]:
# Use the PSSM matrix as the feature on the M495 / M983 datasets: each L x 20 PSSM is
# turned into a 20 x 20 matrix, i.e. a 400-dimensional vector.
# The PSSM files are read in the order of the protein ids.

scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}

# Forward slashes: the previous backslash literals contained invalid escape sequences
# ('\p', '\P', '\M') and only resolved on Windows.
m983_pssm_path = 'data/protein_data/PSSM_feature/M983/'
m495_pssm_path = 'data/protein_data/PSSM_feature/M495/'

# getStandardPssm receives the file path (it is not handed a loaded matrix).
train_vectors = np.array([
    process_pssm_feature.getStandardPssm(m983_pssm_path + protein_id + '.txt')
    for protein_id in m983_id
])
test_vectors = np.array([
    process_pssm_feature.getStandardPssm(m495_pssm_path + protein_id + '.txt')
    for protein_id in m495_id
])
train_labels = m983_labels
test_labels = m495_labels

run(train_vectors, train_labels, test_vectors, test_labels)

In [None]:
# On the M983 / M495 datasets, use PSSM + Doc2Vec vectors as the feature vector
# (models with index 0-69).

# Forward slashes: the previous backslash literals contained invalid escape sequences
# ('\p', '\P', '\M') and only resolved on Windows.
m983_pssm_path = 'data/protein_data/PSSM_feature/M983/'
m495_pssm_path = 'data/protein_data/PSSM_feature/M495/'

# PSSM part of the features; computed once because it does not depend on the Doc2Vec model.
train_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m983_pssm_path + protein_id + '.txt'))
    for protein_id in m983_id
])
test_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m495_pssm_path + protein_id + '.txt'))
    for protein_id in m495_id
])

scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
# NOTE(review): the upper bound 70 is hard-coded; this raises IndexError if fewer models exist.
for index in range(0, 70):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    train_vecs = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_vecs = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    # Column-wise concatenation: Doc2Vec features first, PSSM features second.
    X = np.concatenate((train_vecs, train_vectors), axis=1)
    test_X = np.concatenate((test_vecs, test_vectors), axis=1)
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# On the M983 / M495 datasets, use PSSM + Doc2Vec vectors as the feature vector
# (models with index 70 to the end).

# Forward slashes: the previous backslash literals contained invalid escape sequences
# ('\p', '\P', '\M') and only resolved on Windows.
m983_pssm_path = 'data/protein_data/PSSM_feature/M983/'
m495_pssm_path = 'data/protein_data/PSSM_feature/M495/'

# PSSM part of the features; computed once because it does not depend on the Doc2Vec model.
train_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m983_pssm_path + protein_id + '.txt'))
    for protein_id in m983_id
])
test_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m495_pssm_path + protein_id + '.txt'))
    for protein_id in m495_id
])

scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
for index in range(70, len(m_dbow_path_list)):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    train_vecs = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_vecs = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    # Column-wise concatenation: Doc2Vec features first, PSSM features second.
    X = np.concatenate((train_vecs, train_vectors), axis=1)
    test_X = np.concatenate((test_vecs, test_vectors), axis=1)
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# On the M983 / M495 datasets, use PSSM + Doc2Vec vectors as the feature vector
# (models with index 80 to the end; the previous comment said 70, which did not match the loop).

# Forward slashes: the previous backslash literals contained invalid escape sequences
# ('\p', '\P', '\M') and only resolved on Windows.
m983_pssm_path = 'data/protein_data/PSSM_feature/M983/'
m495_pssm_path = 'data/protein_data/PSSM_feature/M495/'

# PSSM part of the features; computed once because it does not depend on the Doc2Vec model.
train_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m983_pssm_path + protein_id + '.txt'))
    for protein_id in m983_id
])
test_vectors = np.array([
    process_pssm_feature.addToOneLineSum(np.loadtxt(m495_pssm_path + protein_id + '.txt'))
    for protein_id in m495_id
])

scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}
y = m983_labels
test_y = m495_labels
for index in range(80, len(m_dbow_path_list)):
    dbow = Doc2Vec.load(str(m_dbow_path_list[index]))
    dm = Doc2Vec.load(str(m_dm_path_list[index]))
    train_vecs = get_vectors(dm, dbow, m983_sequences, m_dbow_k_list[index])
    test_vecs = get_vectors(dm, dbow, m495_sequences, m_dbow_k_list[index])
    # Column-wise concatenation: Doc2Vec features first, PSSM features second.
    X = np.concatenate((train_vecs, train_vectors), axis=1)
    test_X = np.concatenate((test_vecs, test_vectors), axis=1)
    print('index:{} | {}-mer | content word:{} | embedding size:{} | vector size:{}'
          .format(index, m_dbow_k_list[index], m_dbow_contWord_list[index], m_dbow_size_list[index], X[0].size))
    run(X, y, test_X, test_y)
    print()

In [None]:
# On the M983 / M495 datasets, use PSSM (400-d) + Doc2Vec (384-d) vectors as the feature vector.

scoring = {'MCC(O)': make_scorer(mcc_O),
          'MCC(I)': make_scorer(mcc_I),
          'MCC(M)': make_scorer(mcc_M),
          'mcc': make_scorer(matthews_corrcoef)}

# Forward slashes work on every platform; the previous backslash literals relied on
# invalid escape sequences ('\d', '\m') and only resolved on Windows.
# Dedicated names are used so the notebook-level `model_path_dm` / `model_path_dbow`
# collections (indexed by other cells) are not overwritten with a single Path.
m983_model_path_dm = Path('output/doc2vec_models/m983_m495/dm/3_6_64.pkl')
m983_model_path_dbow = Path('output/doc2vec_models/m983_m495/dbow/3_6_64.pkl')

dm = Doc2Vec.load(str(m983_model_path_dm))
dbow = Doc2Vec.load(str(m983_model_path_dbow))

train_vecs = get_vectors(dm, dbow, m983_sequences, 3)
train_y = m983_labels

test_vecs = get_vectors(dm, dbow, m495_sequences, 3)
test_y = m495_labels

print(train_vecs.shape)

m983_pssm_path = 'data/protein_data/PSSM_feature/M983/'
m495_pssm_path = 'data/protein_data/PSSM_feature/M495/'

# getStandardPssm receives the file path (it is not handed a loaded matrix).
train_vectors = np.array([
    process_pssm_feature.getStandardPssm(m983_pssm_path + protein_id + '.txt')
    for protein_id in m983_id
])
test_vectors = np.array([
    process_pssm_feature.getStandardPssm(m495_pssm_path + protein_id + '.txt')
    for protein_id in m495_id
])

# Column-wise concatenation: Doc2Vec features first, PSSM features second.
train_X = np.concatenate((train_vecs, train_vectors), axis=1)
print(train_X.shape)
test_X = np.concatenate((test_vecs, test_vectors), axis=1)
print(test_X.shape)

run(train_X, train_y, test_X, test_y)

In [None]:
# Used for the experiments on M983 and M495: list every configured classifier
# together with its parameter specification.
for name, pair in classifier_dic.items():
    clf, parameter = pair
    print(name, clf, parameter)

In [None]:
# For every Doc2Vec model pair from index 75 on, cross-validate each classifier on the
# raw vectors and on the ADASYN-resampled vectors, logging the mean overall MCC of both.
# NOTE(review): `no_resample_mcc_list` / `resample_mcc_list` are not created in this cell;
# they must already exist in the kernel.
rule = '-' * 80
metric_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
for index in range(75, len(model_path_dm)):
    print(rule)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, test_sequences, k_list_dbow[index])
    y = test_labels

    X_res, y_res = adasyn.fit_resample(X, y)

    # (features, labels, list collecting the mean MCC, report template) per variant.
    variants = (
        (X, y, no_resample_mcc_list,
         '\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
        (X_res, y_res, resample_mcc_list,
         '\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
    )
    for name, method, parameters in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for features, targets, mcc_log, template in variants:
            cv_results = evaluate(method, parameters, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print(template.format(*[cv_results[key].mean() for key in metric_keys]))
        print()
    print(rule)

In [None]:
# Evaluate on the 424 dataset. Recorded 10-fold cross-validation result on the 424 set:
# best was 3_6_64 (svm + adasyn sampling) mcc(O):0.9364 | mcc(I):0.7315 | mcc(M):0.6328 | mcc(S):1.0000 | mcc:0.8284
no_resample_mcc_list = []
resample_mcc_list = []
sampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
rule = '-' * 80
metric_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
for index in range(0, len(model_path_dm)):
    print(rule)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sm424_sequences, k_list_dbow[index])
    y = sm424_labels

    X_res, y_res = sampler.fit_resample(X, y)

    # (features, labels, list collecting the mean MCC, report template) per variant.
    variants = (
        (X, y, no_resample_mcc_list,
         '\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
        (X_res, y_res, resample_mcc_list,
         '\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
    )
    for name, method, parameters in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for features, targets, mcc_log, template in variants:
            cv_results = evaluate(method, parameters, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print(template.format(*[cv_results[key].mean() for key in metric_keys]))
        print()
    print(rule)

In [None]:
# Evaluate on the 424 dataset (model pairs from index 72 on).
no_resample_mcc_list = []
resample_mcc_list = []
sampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
rule = '-' * 80
metric_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
for index in range(72, len(model_path_dm)):
    print(rule)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sm424_sequences, k_list_dbow[index])
    y = sm424_labels

    X_res, y_res = sampler.fit_resample(X, y)

    # (features, labels, list collecting the mean MCC, report template) per variant.
    variants = (
        (X, y, no_resample_mcc_list,
         '\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
        (X_res, y_res, resample_mcc_list,
         '\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
    )
    for name, method, parameters in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for features, targets, mcc_log, template in variants:
            cv_results = evaluate(method, parameters, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print(template.format(*[cv_results[key].mean() for key in metric_keys]))
        print()
    print(rule)

In [None]:
# Evaluate on the 424 dataset (model pairs from index 100 on).
no_resample_mcc_list = []
resample_mcc_list = []
sampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
rule = '-' * 80
metric_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
for index in range(100, len(model_path_dm)):
    print(rule)
    print(index, model_path_dbow[index], model_path_dm[index])
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    X = get_vectors(dm, dbow, sm424_sequences, k_list_dbow[index])
    y = sm424_labels

    X_res, y_res = sampler.fit_resample(X, y)

    # (features, labels, list collecting the mean MCC, report template) per variant.
    variants = (
        (X, y, no_resample_mcc_list,
         '\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
        (X_res, y_res, resample_mcc_list,
         '\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
    )
    for name, method, parameters in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('\tClassifer: %s' %name)
        for features, targets, mcc_log, template in variants:
            cv_results = evaluate(method, parameters, features, targets)
            mcc_log.append(cv_results['test_mcc'].mean())
            print(template.format(*[cv_results[key].mean() for key in metric_keys]))
        print()
    print(rule)

In [None]:
# Same raw-vs-oversampled cross-validation comparison on the 424 dataset,
# restricted to the single model pair at index 44.
index = 44
sampler = RandomOverSampler(sampling_strategy='not majority', random_state=42)
rule = '-' * 80
metric_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
print(rule)
print(index, model_path_dbow[index], model_path_dm[index])
dm = Doc2Vec.load(str(model_path_dm[index]))
dbow = Doc2Vec.load(str(model_path_dbow[index]))
X = get_vectors(dm, dbow, sm424_sequences, k_list_dbow[index])
y = sm424_labels

X_res, y_res = sampler.fit_resample(X, y)

# (features, labels, report template) per variant.
variants = (
    (X, y, '\t未采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
    (X_res, y_res, '\t采样结果:| mcc(O):{:.4f} | mcc(I):{:.4f} | mcc(M):{:.4f} | mcc(S):{:.4f} | mcc:{:.4f}'),
)
for name, method, parameters in zip(classifier_names, classifier_methods, clssifier_parameters):
    print('\tClassifer: %s' %name)
    for features, targets, template in variants:
        cv_results = evaluate(method, parameters, features, targets)
        print(template.format(*[cv_results[key].mean() for key in metric_keys]))
    print()
print(rule)

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy='not majority', random_state=42)
from sklearn.ensemble import AdaBoostClassifier
# NOTE(review): recent scikit-learn releases rename `base_estimator` to `estimator`;
# adjust this keyword when upgrading. `ada_svc` is defined but deliberately left out of the
# comparison below (its evaluation calls were already disabled).
ada_svc = AdaBoostClassifier(base_estimator=SVC(C=100, gamma=0.01, decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced', probability=True), n_estimators=100, learning_rate=0.8)
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.8)

# (banner, classifier) pairs evaluated for every preprocessing variant.
classifier_runs = (('使用svc作为分类器', svc),
                   ('使用adaboost(base_estimator=cart)作为分类器', ada))


def _compare_classifiers(X_train, y_train, X_test, y_test):
    """Fit and report every classifier in `classifier_runs` on one train/test split."""
    # NOTE: `train_evaluate_model` is defined in a later cell; that cell must be run first.
    for banner, classifier in classifier_runs:
        print(banner)
        train_evaluate_model(classifier, X_train, y_train, X_test, y_test)


for i in range(0, 50):
    print(i, model_path_dm[i], model_path_dbow[i])
    dm = Doc2Vec.load(str(model_path_dm[i]))
    dbow = Doc2Vec.load(str(model_path_dbow[i]))

    print('sequences -> vectors')
    test_vecs = get_vectors(dm, dbow, test_sequences, k_list_dm[i])
    train_vecs = get_vectors(dm, dbow, train_sequences, k_list_dm[i])

    # Variant 1: raw vectors, no resampling.
    print('未标准化，不采样')
    _compare_classifiers(train_vecs, train_labels, test_vecs, test_labels)

    # Variant 2: scaled vectors (scaler fitted on the training set only), no resampling.
    print('标准化，不采样')
    X_train = scaler.fit_transform(train_vecs)
    X_test = scaler.transform(test_vecs)
    _compare_classifiers(X_train, train_labels, X_test, test_labels)

    # Variant 3: raw vectors, training set over-sampled with SMOTE.
    # `fit_resample` is the supported imbalanced-learn API (and what the rest of this
    # notebook uses); the `fit_sample` alias no longer exists in current releases.
    print('未标准化，训练集过采样')
    X_res, y_res = smote.fit_resample(train_vecs, train_labels)
    _compare_classifiers(X_res, y_res, test_vecs, test_labels)

    # Variant 4: over-sampled training set, then scaled. The SMOTE output of variant 3 is
    # reused: with a fixed random_state the second resampling call produced the same data.
    print('标准化，训练集过采样')
    X_train = scaler.fit_transform(X_res)
    X_test = scaler.transform(test_vecs)
    _compare_classifiers(X_train, y_res, X_test, test_labels)



In [None]:
from sklearn.metrics import matthews_corrcoef
def train_evaluate_model(clf, X_train, y_train, X_test, y_test):
    """Fit `clf` on the training data and print its MCC and confusion matrix.

    The metrics are reported twice: first for the training data the classifier was
    just fitted on, then for the held-out test data. Nothing is returned; `clf` is
    left fitted as a side effect.

    Args:
        clf: estimator exposing `fit(X, y)` and `predict(X)`.
        X_train, y_train: training features and labels.
        X_test, y_test: evaluation features and labels.
    """
    rule = '-' * 80
    print(rule)
    print('train model')
    clf.fit(X_train, y_train)

    # (optional banner, MCC message template, features, labels) for each report.
    reports = (
        (None, 'mcc of train dataset: {:.3f}', X_train, y_train),
        ('evaluate model on test dataset', 'mcc on test dataset: {:.3f}', X_test, y_test),
    )
    for banner, template, features, targets in reports:
        if banner is not None:
            print(banner)
        predicted = clf.predict(features)
        matrix = confusion_matrix(targets, predicted)
        print(template.format(matthews_corrcoef(targets, predicted)))
        print(matrix)
    print(rule + '\n')

In [None]:
# Concatenate the vectors of every sub-sentence.
def getVecs(model, sequences, k, mean=True):
    """Embed each sequence by concatenating the inferred vectors of its k sentences.

    Args:
        model: object exposing `infer_vector(sentence)` (a Doc2Vec model in this notebook).
        sequences: iterable of sequences to embed.
        k: passed as `int(k)` to `embedding_tools.seq_to_k_sentence`.
        mean: accepted for interface compatibility; not used by this function.

    Returns:
        `np.ndarray` built from one concatenated vector per sequence.
    """
    def _embed(sequence):
        # One flat list: the inferred vector of each sentence, in sentence order.
        return [
            component
            for sentence in embedding_tools.seq_to_k_sentence(sequence, int(k))
            for component in model.infer_vector(sentence)
        ]

    return np.array([_embed(sequence) for sequence in sequences])

In [None]:
# "Sum" variant (presumably how `get_vectors` combined the dm and dbow vectors when this
# cell was run - verify against the `get_vectors` definition in use).
for index in range(0, 50):
    print(index, str(model_path_dm[index]) + ' ' + str(model_path_dbow[index]))
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    # train_vecs = get_vectors(dm, dbow, train_sequences, k_list_dbow[index])
    test_vecs = get_vectors(dm, dbow, test_sequences, k_list_dbow[index])
    # `fit_resample` is the supported imbalanced-learn API (and what the rest of this
    # notebook uses); the `fit_sample` alias no longer exists in current releases.
    X_res, y_res = adasyn.fit_resample(test_vecs, test_labels)
    print(X_res.shape, y_res.shape)
    # 10-fold cross-validation on the resampled data.
    cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)
    print('-' * 80)
    printMcc(cv_result)
    print('-' * 80)
    print()

In [None]:
# "Concatenation" variant (presumably how `get_vectors` combined the dm and dbow vectors
# when this cell was run - verify against the `get_vectors` definition in use).
for index in range(0, 50):
    print(index, str(model_path_dm[index]) + ' ' + str(model_path_dbow[index]))
    dm = Doc2Vec.load(str(model_path_dm[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    test_vecs = get_vectors(dm, dbow, test_sequences, k_list_dbow[index])
    # `fit_resample` is the supported imbalanced-learn API (and what the rest of this
    # notebook uses); the `fit_sample` alias no longer exists in current releases.
    X_res, y_res = adasyn.fit_resample(test_vecs, test_labels)
    print(X_res.shape)
    # 10-fold cross-validation on the resampled data.
    cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)
    print('-' * 80)
    printMcc(cv_result)
    print('-' * 80)
    print()

In [None]:
# Train the model on the 766 set (the `test_*` variables) and evaluate it on the
# `train_*` dataset.
for index in range(0, 50):
    dm_path, dbow_path = model_path_dm[index], model_path_dbow[index]
    print(index, dm_path, dbow_path)
    dm = Doc2Vec.load(str(dm_path))
    dbow = Doc2Vec.load(str(dbow_path))
    k = k_list_dbow[index]
    train_vecs = get_vectors(dm, dbow, train_sequences, k)
    test_vecs = get_vectors(dm, dbow, test_sequences, k)

    # The classifier is fitted on the ADASYN-resampled `test_*` data.
    X_res, y_res = adasyn.fit_resample(test_vecs, test_labels)
    print(X_res.shape, train_vecs.shape)
    svc.fit(X_res, y_res)

    # First matrix: the (un-resampled) data the model was fitted from.
    test_pred = svc.predict(test_vecs)
    print(confusion_matrix(test_labels, test_pred))

    # Second matrix: the independent `train_*` data.
    train_pred = svc.predict(train_vecs)
    cm = confusion_matrix(train_labels, train_pred)
    print(cm)

In [None]:
# Build the vectors from the dm model only.
for index in range(0, 50):
    print(index, str(model_path_dm[index]) + ' ' + str(model_path_dbow[index]))
    dm = Doc2Vec.load(str(model_path_dm[index]))
    test_vecs = getVecs(dm, test_sequences, k_list_dm[index])
    # `fit_resample` is the supported imbalanced-learn API (and what the rest of this
    # notebook uses); the `fit_sample` alias no longer exists in current releases.
    X_res, y_res = adasyn.fit_resample(test_vecs, test_labels)
    print(X_res.shape)
    # 10-fold cross-validation on the resampled data.
    cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)
    print('-' * 80)
    printMcc(cv_result)
    print('-' * 80)
    print()

In [None]:
# Build the vectors from the dbow model only.
for index in range(0, 50):
    print(index, str(model_path_dm[index]) + ' ' + str(model_path_dbow[index]))
    dbow = Doc2Vec.load(str(model_path_dbow[index]))
    test_vecs = getVecs(dbow, test_sequences, k_list_dbow[index])
    # `fit_resample` is the supported imbalanced-learn API (and what the rest of this
    # notebook uses); the `fit_sample` alias no longer exists in current releases.
    X_res, y_res = adasyn.fit_resample(test_vecs, test_labels)
    print(X_res.shape)
    # 10-fold cross-validation on the resampled data.
    cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)
    print('-' * 80)
    printMcc(cv_result)
    print('-' * 80)
    print()

In [None]:
# Get the embedding vectors of the sequences (3_3_32).
# Loads the dm/dbow model pair at index 33 and embeds `test_sequences`; the result
# `X_vecs` is the input of the ROC comparison cell further down.
index = 33
dm = Doc2Vec.load(str(model_path_dm[index]))
dbow = Doc2Vec.load(str(model_path_dbow[index]))

X_vecs = get_vectors(dm, dbow, test_sequences, k_list_dbow[index])
print(X_vecs.shape)

In [None]:
def calculate_mult_roc(y_label, y_prob, classes):
    """Compute per-class, micro-averaged and macro-averaged ROC curves.

    Args:
        y_label: true labels, encoded as the integers 0 .. len(classes) - 1.
        y_prob: 2-D array of scores, indexed as `y_prob[:, i]` for class `i`.
        classes: sequence of class names; only its length is used.

    Returns:
        Tuple `(fpr, tpr, roc_auc)` of dicts keyed by the class index `i` and by the
        strings 'micro' and 'macro'.
    """
    n_classes = len(classes)
    class_ids = range(n_classes)
    # One-vs-rest indicator matrix of the true labels.
    y_label = label_binarize(y_label, classes=list(class_ids))

    fpr, tpr, roc_auc = {}, {}, {}

    # Per-class curves: class i against the rest.
    for class_id in class_ids:
        fpr[class_id], tpr[class_id], _ = roc_curve(y_label[:, class_id], y_prob[:, class_id])
        roc_auc[class_id] = auc(fpr[class_id], tpr[class_id])

    # Micro average: every (sample, class) pair is one binary decision.
    fpr['micro'], tpr['micro'], _ = roc_curve(y_label.ravel(), y_prob.ravel())
    roc_auc['micro'] = auc(fpr['micro'], tpr['micro'])

    # Macro average: interpolate each class curve on the union of all false positive
    # rates, then take the unweighted mean of the interpolated true positive rates.
    all_fpr = np.unique(np.concatenate([fpr[class_id] for class_id in class_ids]))
    mean_tpr = np.zeros_like(all_fpr)
    for class_id in class_ids:
        mean_tpr = mean_tpr + np.interp(all_fpr, fpr[class_id], tpr[class_id])
    mean_tpr = mean_tpr / n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(all_fpr, mean_tpr)

    return fpr, tpr, roc_auc

In [None]:
# Compare the ROC curves of the base SVC (no resampling) with those obtained after each
# over-sampling method. (The original comment spoke of dimensionality-reduction methods,
# but the code compares samplers.)
from imblearn.over_sampling import ADASYN, SMOTE, BorderlineSMOTE, KMeansSMOTE, RandomOverSampler, SMOTENC, SVMSMOTE
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import make_pipeline

from sklearn.model_selection import KFold
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

import time


def _cross_val_predictions(estimator, splitter, X, y):
    """Refit `estimator` on every fold and collect the out-of-fold predictions.

    Returns the lists `(y_true, y_pred, y_prob)` accumulated over all folds, where
    `y_prob` holds the `predict_proba` rows converted to plain lists.
    """
    y_true, y_pred, y_prob = [], [], []
    for train_index, test_index in splitter.split(X):
        estimator.fit(X[train_index], y[train_index])
        y_true.extend(y[test_index])
        y_pred.extend(estimator.predict(X[test_index]))
        y_prob.extend(estimator.predict_proba(X[test_index]).tolist())
    return y_true, y_pred, y_prob


def _add_roc_axis(figure, cell, title):
    """Create one labelled ROC axis in the grid cell `cell` of `figure`."""
    axis = figure.add_subplot(cell)
    axis.set_xlabel("False Positive Rate", fontsize=17)
    axis.set_ylabel("True Positive Rate", fontsize=17)
    axis.set_title(title)
    return axis


def _plot_roc_curves(axes_by_key, name, fpr, tpr, roc_auc):
    """Draw the curve stored under each key on its axis, labelled with `name` and the AUC."""
    for key, axis in axes_by_key.items():
        axis.plot(fpr[key], tpr[key],
                  label='{} (AUC={:.2f})'.format(name, roc_auc[key]))


sampling_methods = [SMOTE(sampling_strategy='not majority', random_state=42),
                   ADASYN(sampling_strategy='not majority', random_state=42)]
sampling_names = ['SMOTE',
                  'ADASYN']

svc = SVC(C=100, gamma=0.01, decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced', probability=True)

# NOTE(review): KFold is used without shuffling, and the samplers are applied before the
# split. If the sampler appends its synthetic samples after the original ones, the last
# folds are dominated by synthetic minority samples - confirm this is intended.
kf = KFold(n_splits=10)

# Figure 1: micro / macro averaged curves.
fig = plt.figure(figsize=[18, 10], constrained_layout=True)
gs = fig.add_gridspec(1, 2)
f_ax1 = _add_roc_axis(fig, gs[:, 0], 'Micro Roc Curve')
f_ax2 = _add_roc_axis(fig, gs[:, 1], 'Macro Roc Curve')

# Figure 2: one panel per class (class indices 0-3).
fig2 = plt.figure(figsize=[18, 10], constrained_layout=True)
f2_gs = fig2.add_gridspec(2, 2)
f2_ax1 = _add_roc_axis(fig2, f2_gs[0, 0], 'Outer Roc Curve')
f2_ax2 = _add_roc_axis(fig2, f2_gs[0, 1], 'Inner Roc Curve')
f2_ax3 = _add_roc_axis(fig2, f2_gs[1, 0], 'Matrix Roc Curve')
f2_ax4 = _add_roc_axis(fig2, f2_gs[1, 1], 'Space Roc Curve')

# Key of the fpr/tpr/roc_auc dicts -> axis the curve is drawn on.
roc_axes = {'micro': f_ax1, 'macro': f_ax2,
            0: f2_ax1, 1: f2_ax2, 2: f2_ax3, 3: f2_ax4}

# Base model: cross-validate on the original (un-resampled) vectors.
y_true, y_pred, y_prob = _cross_val_predictions(svc, kf, X_vecs, labels)
print(matthews_corrcoef(y_true, y_pred))
# calculate ROC_Curve and ROC_AUC area of all classes
fpr, tpr, roc_auc = calculate_mult_roc(y_true, np.array(y_prob), classes)
_plot_roc_curves(roc_axes, 'Base model', fpr, tpr, roc_auc)

# Same evaluation after resampling the whole data set with each method.
for (name, method) in zip(sampling_names, sampling_methods):
    print(name)
    X_res, y_res = method.fit_resample(X_vecs, labels)

    y_true, y_pred, y_prob = _cross_val_predictions(svc, kf, X_res, y_res)
    print(matthews_corrcoef(y_true, y_pred))
    # calculate ROC_Curve and ROC_AUC area of all classes
    fpr, tpr, roc_auc = calculate_mult_roc(y_true, np.array(y_prob), classes)
    _plot_roc_curves(roc_axes, name, fpr, tpr, roc_auc)

for axis in roc_axes.values():
    axis.legend(loc=4, fontsize=10)

plt.show()

In [None]:
# Show how many samples of each class the most recent resampling step produced
# (`y_res` is whatever the last executed cell left in the kernel).
from collections import Counter
Counter(y_res)

In [None]:
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer, matthews_corrcoef
# Samplers shared by other cells of the notebook (`adasyn` is used by cells above this
# one, so this cell has to be executed before them).
smote = SMOTE(sampling_strategy='not majority', random_state=42)
adasyn = ADASYN(sampling_strategy='not majority', random_state=42)
# `fit_resample` is the supported imbalanced-learn API (and what the rest of this
# notebook uses); the `fit_sample` alias no longer exists in current releases.
X_res, y_res = smote.fit_resample(test_vecs, test_labels)

# 10-fold cross-validation of the SVC on the SMOTE-resampled data.
cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)

In [None]:
# Mean cross-validated MCC of each class, followed by the overall MCC.
summary_keys = ('test_MCC(O)', 'test_MCC(I)', 'test_MCC(M)', 'test_MCC(S)', 'test_mcc')
print(*(cv_result[key].mean() for key in summary_keys))

In [None]:
# pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC(decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced', probability=True))])

# For each Doc2Vec model pair: scale the vectors, run the grid search on the training
# set, store the search results, and keep the test-set confusion matrices.
# NOTE(review): the loop starts at 1, so the model pair at index 0 is not evaluated.
test_cm_list = []
test_mcm_list = []
rule = '-' * 80
for i in range(1, 40):
    print(rule)
    print(i, model_path_dm[i], model_path_dbow[i])
    dm = Doc2Vec.load(str(model_path_dm[i]))
    dbow = Doc2Vec.load(str(model_path_dbow[i]))

    print('sequences -> vectors')
    k = k_list_dm[i]
    # The scaler is fitted on the training vectors only and then applied to the test vectors.
    train_vecs = scaler.fit_transform(get_vectors(dm, dbow, train_sequences, k))
    test_vecs = scaler.transform(get_vectors(dm, dbow, test_sequences, k))
    print(train_vecs.shape, test_vecs.shape)

    print('search optimal parameters')
    grid.fit(train_vecs, train_labels)
    print('best score:', grid.best_score_)
    print('best params:', grid.best_params_)
    # Persist the full search table, named after the dm model file.
    cv_result_df = pd.DataFrame(grid.cv_results_)
    cv_result_df.to_json(cs_path / model_path_dm[i].stem)

    print('evaluate best estimator on test dataset')
    print("Optimized Score:",grid.score(test_vecs, test_labels))
    test_pred = grid.predict(test_vecs)
    cm = confusion_matrix(test_labels, test_pred)
    mcm = multilabel_confusion_matrix(test_labels, test_pred)
    test_cm_list.append(cm)
    test_mcm_list.append(mcm)
    print(rule)



In [None]:
# Reload the confusion matrices stored in 'test-cm-list.npy' (this replaces the
# in-memory `test_cm_list`) and plot the one at index 20.
test_cm_list = np.load('test-cm-list.npy')
drawing_tools.plot_confusion_matrix(test_cm_list[20], classes)

In [None]:
# Rebuild the train/test vectors for the model pair at index 20.
dm = Doc2Vec.load(str(model_path_dm[20]))
dbow = Doc2Vec.load(str(model_path_dbow[20]))

train_vecs = get_vectors(dm, dbow, train_sequences, k_list_dm[20])
test_vecs = get_vectors(dm, dbow, test_sequences, k_list_dm[20])

# Scaled copies: the scaler is fitted on the training vectors only.
train_scal = scaler.fit_transform(train_vecs)
test_scal = scaler.transform(test_vecs)

In [None]:
# Plot whichever confusion matrix is currently bound to `cm` in the kernel.
drawing_tools.plot_confusion_matrix(cm, classes)

In [None]:
# Export the results table of the most recent grid search to 'cv_result.csv'.
# Note: this rebinds `cv_result`, which other cells use for `cross_validate` output.
cv_result = pd.DataFrame(grid.cv_results_)
cv_result.to_csv('cv_result.csv')

In [None]:
# Visualise the data with t-SNE.
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Model pair used for the visualisation.
index = 33
print(index, model_path_dm[index], model_path_dbow[index])
dm = Doc2Vec.load(str(model_path_dm[index]))
dbow = Doc2Vec.load(str(model_path_dbow[index]))

print('sequences -> vectors')
# Note: despite its name, `train_vecs` holds the vectors of `test_sequences` here.
train_vecs = get_vectors(dm, dbow, test_sequences, k_list_dm[index])

In [None]:
# Embed the (un-resampled) vectors with t-SNE.
tsne = TSNE(perplexity=20, learning_rate=500)
tsne.fit_transform(train_vecs)

# Rebinds `tsne` from the estimator to a DataFrame of the embedding, indexed by class
# label so that the plotting cells can select one class with `tsne.loc[label]`.
tsne = pd.DataFrame(tsne.embedding_, index=test_labels)

In [None]:
# Embed the ADASYN-resampled vectors with t-SNE.
# Note: this overwrites the `tsne` DataFrame built from the un-resampled vectors, so the
# plotting cells show whichever embedding cell was executed last.
tsne = TSNE(perplexity=20, learning_rate=500)
X_res, y_res = adasyn.fit_resample(train_vecs, test_labels)
tsne.fit_transform(X_res)

# Rebinds `tsne` to a DataFrame of the embedding, indexed by the resampled class labels.
tsne = pd.DataFrame(tsne.embedding_, index=y_res)

In [None]:
# Scatter the current t-SNE embedding, one marker style per class label 0-3
# (selected through the label index of the `tsne` DataFrame).
outer, inner, matrix, space = (tsne.loc[label] for label in range(4))

for points, marker in zip((outer, inner, matrix, space), ('r.', 'go', 'b*', 'yo')):
    print(points.shape)
    # Columns 0 and 1 are the two embedding coordinates.
    plt.plot(points[0], points[1], marker)

plt.show()

In [None]:
# Same scatter plot as the previous plotting cell; it shows whatever embedding is
# currently bound to `tsne`, one marker style per class label 0-3.
outer, inner, matrix, space = (tsne.loc[label] for label in range(4))

for points, marker in zip((outer, inner, matrix, space), ('r.', 'go', 'b*', 'yo')):
    print(points.shape)
    # Columns 0 and 1 are the two embedding coordinates.
    plt.plot(points[0], points[1], marker)

plt.show()

In [None]:
# Hyper-parameters of the RNN experiment.
# `batch_siez` (sic) is read under that exact name by the training-loop cell, so the
# spelling must stay in sync with it.
batch_siez = 16
lr = 0.001
epochs = 100

# Per-class weights handed to nn.CrossEntropyLoss in the training-loop cell.
# NOTE(review): presumably derived from the class frequencies - confirm the origin of these values.
weight = torch.tensor([10, 1.47, 6.25, 16.7], device=device)

In [None]:
class RNN(nn.Module):
    """Bidirectional vanilla RNN followed by a linear layer that emits 4 class scores.

    Args:
        input_size: feature size per time step handed to ``nn.RNN``. ``forward``
            appends a feature axis of size 1, so it only works with ``input_size=1``.
        hidden_size: number of hidden units per direction.
        num_layers: number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super().__init__()
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
        )
        # nn.RNN concatenates both directions, hence 2 * hidden_size input features.
        self.decoder = nn.Linear(2 * hidden_size, 4)

    def forward(self, inputs):
        """Return the 4 class scores for every row of a 2-D input tensor.

        A trailing feature axis of size 1 is appended, then the first two axes are
        swapped before the tensor is fed to the recurrent layer. Only the output
        at the last position of the recurrent layer's output is decoded.
        """
        # Swap the first two tensor axes, then encode.
        sequence_first = inputs.unsqueeze(-1).permute(1, 0, 2)
        hidden_states = self.rnn(sequence_first)[0]  # the returned h_n is not used
        # The original kept a disabled alternative that concatenated the first and
        # last outputs; that variant would need a 4 * hidden_size decoder.
        return self.decoder(hidden_states[-1])

In [None]:
# Accumulators filled by the RNN training loop below: one confusion matrix and one
# multilabel confusion matrix per vector file.
train_cm_list = []
train_mcm_list = []

In [None]:
# Train and evaluate one RNN per pre-computed vector file.
# NOTE(review): torch, nn, optim, DATA, device, vec_path_list and train_labels are not
# defined in the visible cells — confirm they are in scope before this cell runs.
for i in range(0, len(vec_path_list)):
    print(i, vec_path_list[i])
    train_vectors = torch.from_numpy(np.load(vec_path_list[i])).to(device)
    # The label tensor does not depend on i; it is rebuilt on every iteration.
    train_tensor_labels = torch.tensor(train_labels, device=device)
    train_dataset = DATA.TensorDataset(train_vectors, train_tensor_labels)
    train_dataloder = DATA.DataLoader(train_dataset, batch_size=batch_siez, shuffle=True)
    
    # Fresh model, optimizer and class-weighted loss for every vector file.
    rnn = RNN(1, 10, 1)
    optimizer = optim.Adam(rnn.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss(weight=weight)
    rnn.to(device)

    # start_time is recorded but never read in this cell.
    start_time = time.time()
    
    training_tools.train(train_dataloder, rnn, loss, optimizer, epochs)

    # Evaluation runs on the training loader itself, so these are training-set results.
    train_cm, train_mcm = training_tools.evaluate_model(rnn, train_dataloder)
    # test_cm, test_mcm = evaluate_model(ffn, test_dataloder)

    train_cm_list.append(train_cm)
    train_mcm_list.append(train_mcm)

    # test_cm_list.append(test_cm)
    # test_mcm_list.append(test_mcm)

    print('\n')

In [None]:
# Plot the confusion matrix of the first entry collected in train_cm_list.
drawing_tools.plot_confusion_matrix(train_cm_list[0], classes)

In [None]:
# NuSVC experiment: `clf` is the module-level classifier that train_model and
# evaluate_model (defined further down) fit and query.
from sklearn.svm import NuSVC
clf = NuSVC(nu=0.1, class_weight='balanced', decision_function_shape='ovo')
# NOTE: this rebinds train_cm_list / train_mcm_list, discarding the results the
# RNN loop appended to the lists of the same name.
train_cm_list = []
train_mcm_list = []

test_cm_list = []
test_mcm_list = []

sm424_cm_list = []
sm424_mcm_list = []

In [None]:
# Fit `clf` once per embedding model, then score it on the train, test and SM424-18 data.
# NOTE: train_model, evaluate_model and calculate_classification_index are defined in
# the cell that follows this one; that cell has to be executed first.
# NOTE(review): model_path_list, k_list, sm424_sequences and sm424_labels are not
# defined in the visible cells — confirm they are in scope.
for i in range(0, len(model_path_list)):
    print(i)
    print('model path:' + str(model_path_list[i]))
    print('training model:')
    print('performance on train dataset:')
    # train_model refits the shared classifier; the two evaluate_model calls below
    # reuse that fitted state.
    train_cm, train_mcm = train_model(train_sequences, train_labels, model_path_list[i], int(k_list[i]))
    calculate_classification_index(train_mcm)
    print('\n')
    train_cm_list.append(train_cm)
    train_mcm_list.append(train_mcm)

    print('performance on test dataset:')
    test_cm, test_mcm = evaluate_model(test_sequences, test_labels, model_path_list[i], int(k_list[i]))
    calculate_classification_index(test_mcm)
    print('\n')
    test_cm_list.append(test_cm)
    test_mcm_list.append(test_mcm)

    print('performance on SM424-18 dataset:')
    sm424_cm, sm424_mcm = evaluate_model(sm424_sequences, sm424_labels, model_path_list[i], int(k_list[i]))
    calculate_classification_index(sm424_mcm)
    print('\n')
    sm424_cm_list.append(sm424_cm)
    sm424_mcm_list.append(sm424_mcm)

In [None]:
def train_model(sequences, labels, model_file, k):
    """Fit the module-level ``clf`` on embedded sequences and score it on the same data.

    Args:
        sequences: sequences passed to ``embedding_tools.get_vectors``.
        labels: target labels, one per sequence.
        model_file: embedding model path passed to ``embedding_tools.get_vectors``.
        k: value forwarded to ``embedding_tools.get_vectors`` as its third argument.

    Returns:
        Tuple ``(confusion_matrix, multilabel_confusion_matrix)`` computed from the
        predictions on the training data itself. The accuracy is printed.
    """
    features = embedding_tools.get_vectors(sequences, model_file, k)
    clf.fit(features, labels)
    predictions = clf.predict(features)
    print('accuracy_score:', accuracy_score(labels, predictions))
    return (
        confusion_matrix(labels, predictions),
        multilabel_confusion_matrix(labels, predictions),
    )

def evaluate_model(sequences, labels, model_file, k):
    """Score the already-fitted module-level ``clf`` on embedded sequences.

    Args:
        sequences: sequences passed to ``embedding_tools.get_vectors``.
        labels: true labels, one per sequence.
        model_file: embedding model path passed to ``embedding_tools.get_vectors``.
        k: value forwarded to ``embedding_tools.get_vectors`` as its third argument.

    Returns:
        Tuple ``(confusion_matrix, multilabel_confusion_matrix)`` for the predictions.
        The accuracy is printed.
    """
    features = embedding_tools.get_vectors(sequences, model_file, k)
    predictions = clf.predict(features)
    print('accuracy_score:', accuracy_score(labels, predictions))
    return (
        confusion_matrix(labels, predictions),
        multilabel_confusion_matrix(labels, predictions),
    )

def calculate_classification_index(mcm):
    """Print per-class recall, precision, F1 and MCC from a multilabel confusion matrix.

    Args:
        mcm: array-like of shape (n_classes, 2, 2) laid out as
            ``[[tn, fp], [fn, tp]]`` per class, as returned by
            ``sklearn.metrics.multilabel_confusion_matrix``.

    Returns:
        None. The four metric arrays are printed, one line each.

    A metric whose denominator is zero (for example precision of a class that was
    never predicted) is reported as 0 instead of nan/inf.
    """

    def _safe_divide(numerator, denominator):
        # Element-wise division that yields 0 wherever the denominator is 0.
        numerator = np.asarray(numerator, dtype=float)
        denominator = np.asarray(denominator, dtype=float)
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(denominator),
            where=denominator != 0,
        )

    # Work in float so the four-factor product under the square root cannot
    # overflow integer arithmetic.
    mcm = np.asarray(mcm, dtype=float)
    tn = mcm[:, 0, 0]
    fn = mcm[:, 1, 0]
    fp = mcm[:, 0, 1]
    tp = mcm[:, 1, 1]

    recall = _safe_divide(tp, tp + fn)
    precesion = _safe_divide(tp, tp + fp)
    F1 = _safe_divide(2 * precesion * recall, precesion + recall)
    mcc = _safe_divide(
        tp * tn - fp * fn,
        np.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)),
    )

    # Output labels are kept exactly as before (including the 'precesion' spelling)
    # so existing logs stay comparable.
    print('recall:', recall)
    print('precesion:', precesion)
    print('f1:', F1)
    print('mcc:', mcc)

In [None]:
# Evaluate classification performance with PseAAC features
from sklearn.model_selection import cross_validate
from imblearn.over_sampling import ADASYN

# Built from path components instead of a backslash-separated string literal:
# '\p', '\s', '\S' and '\P' are invalid escape sequences, and a backslash path only
# resolves to nested directories on Windows.
PseAAC_PATH = Path('data') / 'protein_data' / 'submitochondrial' / 'SM766-20' / 'PseAAC'
adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

# The four lists are indexed in parallel by later cells (same position = same
# PseAAC configuration). Path.glob yields entries in no guaranteed order, so each
# list is sorted to make that pairing deterministic.
outer_PAAC_file = sorted(PseAAC_PATH.glob('o*.csv'))
inner_PAAC_file = sorted(PseAAC_PATH.glob('inner*.csv'))
matrix_PAAC_file = sorted(PseAAC_PATH.glob('m*.csv'))
space_PAAC_file = sorted(PseAAC_PATH.glob('inter*.csv'))

In [None]:
# For each of the first 11 PseAAC file groups: build the data set, oversample it with
# ADASYN, and evaluate every configured classifier on both the raw and resampled data.
# NOTE(review): classifier_names, classifier_methods, clssifier_parameters (sic),
# evaluate and printMcc are defined outside the visible cells — confirm.
for index in range(0, 11):
    print(index, outer_PAAC_file[index], inner_PAAC_file[index], matrix_PAAC_file[index], space_PAAC_file[index])
    # Column 0 of every CSV is dropped — presumably an identifier column; verify.
    outer = pd.read_csv(outer_PAAC_file[index], header=None).values[:, 1:]
    inner = pd.read_csv(inner_PAAC_file[index], header=None).values[:, 1:]
    matrix = pd.read_csv(matrix_PAAC_file[index], header=None).values[:, 1:]
    space = pd.read_csv(space_PAAC_file[index], header=None).values[:, 1:]
    print(outer.shape, inner.shape, matrix.shape, space.shape)
    # Integer class labels: 0 = outer, 1 = inner, 2 = matrix, 3 = space.
    outer_labels = np.full(len(outer), 0)
    inner_labels = np.full(len(inner), 1)
    matrix_labels = np.full(len(matrix), 2)
    space_labels = np.full(len(space), 3)
    print(outer_labels.shape, inner_labels.shape, matrix_labels.shape, space_labels.shape)

    X = np.concatenate((outer, inner, matrix, space), axis=0)
    y = np.concatenate((outer_labels, inner_labels, matrix_labels, space_labels), axis=0)
    # NOTE(review): resampling happens before evaluate() is called; if evaluate
    # cross-validates, synthetic samples can leak across folds — confirm.
    X_res, y_res = adasyn.fit_resample(X, y)
    for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
        print('-'* 80)
        print(name)
        print('未采样：')
        cv_results = evaluate(method, parameters, X, y)
        printMcc(cv_results)
        print('采样：')
        cv_results = evaluate(method, parameters, X_res, y_res)
        printMcc(cv_results)
        print('-'* 80)
        print('\n')
    print('\n')

In [None]:
# Build the raw and ADASYN-resampled data set for a single PseAAC file group
# (position 2 of the file lists) and print the resulting shapes.
index = 2
print('-'* 80)
print(index, outer_PAAC_file[index], inner_PAAC_file[index], matrix_PAAC_file[index], space_PAAC_file[index])
# Column 0 of every CSV is dropped — presumably an identifier column; verify.
outer = pd.read_csv(outer_PAAC_file[index], header=None).values[:, 1:]
inner = pd.read_csv(inner_PAAC_file[index], header=None).values[:, 1:]
matrix = pd.read_csv(matrix_PAAC_file[index], header=None).values[:, 1:]
space = pd.read_csv(space_PAAC_file[index], header=None).values[:, 1:]
print(outer.shape, inner.shape, matrix.shape, space.shape)
# Integer class labels: 0 = outer, 1 = inner, 2 = matrix, 3 = space.
outer_labels = np.full(len(outer), 0)
inner_labels = np.full(len(inner), 1)
matrix_labels = np.full(len(matrix), 2)
space_labels = np.full(len(space), 3)
print(outer_labels.shape, inner_labels.shape, matrix_labels.shape, space_labels.shape)

X = np.concatenate((outer, inner, matrix, space), axis=0)
y = np.concatenate((outer_labels, inner_labels, matrix_labels, space_labels), axis=0)
print(X.shape, y.shape)

X_res, y_res = adasyn.fit_resample(X, y)
print(X_res.shape, y_res.shape)

In [None]:
# Display the length of the first entry of test_sequences.
len(test_sequences[0])

In [None]:
# Evaluate classification performance with AAindex features
from AAIndex import AAINDEX, AAINDEX_ADD
# CSV file passed to AAINDEX as its second argument (relative to the working directory).
AAindex_file = Path('choosed_AAindex_properties.csv')


def seq2aaindex(sequences, aaindex_file, length):
    """Encode sequences with ``AAINDEX`` and force every encoding to ``length`` items.

    Args:
        sequences: sequences handed to ``AAINDEX``.
        aaindex_file: property file handed to ``AAINDEX``.
        length: target number of items per encoding.

    Returns:
        A list with one entry per encoding returned by ``AAINDEX``: encodings of at
        least ``length`` items are truncated, shorter ones are right-padded with 0.
        Padding uses list concatenation, so the encodings are assumed to be lists.
    """
    fixed_length = []
    for encoded in AAINDEX(sequences, aaindex_file):
        if len(encoded) >= length:
            fixed_length.append(encoded[:length])
        else:
            missing = length - len(encoded)
            fixed_length.append(encoded + [0] * missing)
    return fixed_length

In [None]:
# Represent each amino-acid sequence with 32 properties; the sequence length is set
# to the mean length, 392 (12544 = 32 * 392).
# `svc` is not used in this cell's loop; a later cell passes it to cross_validate.
svc = SVC( decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced')
aaindex = seq2aaindex(sequences, AAindex_file, 12544)

X = np.array(aaindex)
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

# NOTE(review): resampling happens before evaluate() is called; if evaluate
# cross-validates, synthetic samples can leak across folds — confirm.
X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    cv_results = evaluate(method, parameters, X, y)
    print('未采样：')
    printMcc(cv_results)
    cv_results = evaluate(method, parameters, X_res, y_res)
    print('采样：')
    printMcc(cv_results)



In [None]:
# Represent each amino-acid sequence with 32 properties; the sequence length is set
# to the maximum, 120096 (= 32 * 3753).
svc = SVC( decision_function_shape='ovo' ,cache_size=2000, class_weight='balanced')
aaindex = seq2aaindex(sequences, AAindex_file, 120096)

X = np.array(aaindex)
# NOTE(review): the features come from `sequences` but the targets from `test_labels`,
# whereas the sibling cells pair `sequences` with `labels` — confirm they line up.
y = test_labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    cv_results = evaluate(method, parameters, X, y)
    print('未采样：')
    printMcc(cv_results)
    cv_results = evaluate(method, parameters, X_res, y_res)
    print('采样：')
    printMcc(cv_results)
    print()

In [None]:
# Mean sequence length. The divisor is the actual number of sequences instead of a
# hard-coded 766, so the mean stays correct if the data set size changes.
length = sum(len(seq) for seq in test_sequences)
print(length / len(test_sequences))

# Length of the longest sequence.
max_l = max(len(seq) for seq in test_sequences)
print(max_l)

In [None]:
# AAindex features with every encoding truncated/padded to 392 items.
# NOTE(review): an earlier cell uses 12544 (= 32 * 392) for the "mean length"
# setting; whether 392 here should also be multiplied by 32 depends on what
# AAINDEX returns per sequence — confirm.
aaindex = seq2aaindex(sequences, AAindex_file, 392)

X = np.array(aaindex)
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    cv_results = evaluate(method, parameters, X, y)
    print('未采样：')
    printMcc(cv_results)
    cv_results = evaluate(method, parameters, X_res, y_res)
    print('采样：')
    printMcc(cv_results)
    print()

In [None]:
# Pad to the maximum length
aaindex = seq2aaindex(sequences, AAindex_file, 3753)

X = np.array(aaindex)
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# 10-fold cross-validation of `svc` on the raw data.
# NOTE(review): `svc` comes from an earlier cell and `scoring` is not defined in the
# visible cells — confirm both are in scope.
print('未采样结果：')
cv_result = cross_validate(svc, X, y, cv=10, scoring=scoring)
printMcc(cv_result)

# Same cross-validation on the resampled data. The oversampling was done before
# splitting, so synthetic samples can end up in both train and validation folds.
print('采样后结果：')
cv_result = cross_validate(svc, X_res, y_res, cv=10, scoring=scoring)
printMcc(cv_result)

In [None]:
# Only AAC is used in this cell; DPC and TPC are used by the next two cells, which
# therefore depend on this import having been executed.
from utils.feature_extraction import AAC, DPC, TPC

# Features produced by AAC for every sequence.
X = np.array(AAC(sequences))
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    print('未采样:')
    cv_results = evaluate(method, parameters, X, y)
    printMcc(cv_results)
    print('采样:')
    cv_results = evaluate(method, parameters, X_res, y_res)
    printMcc(cv_results)
    print()

In [None]:
# Features produced by DPC for every sequence (DPC is imported in the AAC cell above).
X = np.array(DPC(sequences))
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    print('未采样:')
    cv_results = evaluate(method, parameters, X, y)
    printMcc(cv_results)
    print('采样:')
    cv_results = evaluate(method, parameters, X_res, y_res)
    printMcc(cv_results)
    print()

In [None]:
# Features produced by TPC for every sequence (TPC is imported in the AAC cell above).
X = np.array(TPC(sequences))
y = labels

adasyn = ADASYN(sampling_strategy='not majority', random_state=42)

X_res, y_res = adasyn.fit_resample(X, y)

# Evaluate every configured classifier on the raw data, then on the resampled data.
for (name, method, parameters) in zip(classifier_names, classifier_methods, clssifier_parameters):
    print(name)
    print('未采样:')
    cv_results = evaluate(method, parameters, X, y)
    printMcc(cv_results)
    print('采样:')
    cv_results = evaluate(method, parameters, X_res, y_res)
    printMcc(cv_results)
    print()