In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
# Embed text in exported PDF/PS figures as Type 42 (TrueType) fonts rather than
# matplotlib's default Type 3, so labels remain real text in vector editors.
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
# Render all figure text with the sans-serif family, preferring Arial.
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams['font.family'] = "sans-serif"
import multiprocessing as mp
import time
import glob
import re
import random
from sklearn.model_selection import train_test_split
import torch

# Put a local npspy checkout first on the import path so the `import npspy`
# below resolves to it.
# NOTE(review): this is a hard-coded, machine-specific absolute path; the
# notebook will not import npspy elsewhere unless the path is adjusted or the
# package is installed.
sys.path.insert(0, '/Data/user/panhailin/git_lab/npspy')
import npspy as nps

In [3]:
# Classification tasks: task id -> list of class labels that are trained and
# evaluated against each other in that task.
all_task_dict = dict(
    c02=['1S', '1pS'],
    c03=['1S', '1SMe', '3SMe'],
    c04=['1S', '1SAc', '3SAc'],
    c05=['1S', '1Soct', '3Soct'],
    c06=['1S', '1pS', '1SMe', '1SAc', '1Soct'],
    c07=['1S', '3SMe', '3SAc', '3Soct'],
    c08=['1I', '1L'],
    c09=['3I', '3L'],
    c10=['3dI', '3dL'],
    c11=['3I', '3dI'],
    c12=['3L', '3dL'],
    c15=['1D', '1D02'],
    c16=['1R', '1R02'],
    c17=['1Y', '1Y02'],
    c18=['1W', '1W02'],
)

def stratified_sample(df, column_name, sample_size=15000, random_state=42):
    """
    对DataFrame按指定列类别分层随机抽样
    
    参数:
        df: 输入DataFrame
        column_name: 分层依据的列名
        sample_size: 每类抽取样本数(默认15000)
        random_state: 随机种子
    
    返回:
        抽样后的新DataFrame
    """
    re_df = df.groupby(column_name, group_keys=True).apply(
        lambda x: x.sample(min(len(x), sample_size), 
                          random_state=random_state),
        include_groups=False,
    )
    re_df[re_df.index.names[0]] =  [i[0] for i in re_df.index]
    re_df.index = [i[1] for i in re_df.index]
    return re_df

def train_pipeline(train_objs, test_objs, labels, y_code_dict, all_peps, train_name='clean_data', train_sample_size=14000,
                   out_dir="../../../03.results/classification_on_clean_data/GSXGS/diff_task/clean"):
    """
    Train a classifier, evaluate it on held-out data, and save the results.

    Args:
        train_objs: training inputs forwarded to ``nps.ml.get_X_y_from_objs``
            as ``objs``.
        test_objs: test inputs forwarded the same way.
        labels: class labels forwarded to ``nps.ml.get_X_y_from_objs``.
        y_code_dict: mapping from class label to its numeric code; it is also
            inverted here to turn predicted codes back into labels.
        all_peps: class labels of this task; sets the number of output classes
            and the row/column order of the confusion matrix.
        train_name: run name passed to ``fit``/``predict``/``predict_proba``
            and used as the prefix of the written CSV files.
        train_sample_size: maximum number of training rows kept per class
            before the train/validation split.
        out_dir: directory that receives ``{train_name}_cm.csv`` and
            ``{train_name}_pred_proba.csv``; created if it does not exist.

    Returns:
        Test accuracy, computed as the confusion-matrix diagonal sum divided
        by the number of predictions.
        NOTE(review): this assumes ``nps.ml.get_cm`` returns raw counts, not a
        normalized matrix - confirm.
    """
    # Derive the code -> label mapping from the argument instead of relying on
    # a notebook-global variable that only exists if another cell ran first.
    y_to_label_dict = {code: label for label, code in y_code_dict.items()}

    # Make sure the output directory exists before the long training run, so a
    # missing folder cannot throw away a finished model's results.
    os.makedirs(out_dir, exist_ok=True)

    # Load the inputs into DataFrames (the original note describes them as
    # holding read id, X and y), cap each class, and hold out 1/8 of the
    # sampled training rows for validation, stratified on y.
    train_df = nps.ml.get_X_y_from_objs(objs=train_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    train_df = stratified_sample(train_df, 'y', sample_size=train_sample_size, random_state=42)
    train_df, valid_df = train_test_split(train_df, test_size=1/8, random_state=42, stratify=train_df['y'])
    test_df = nps.ml.get_X_y_from_objs(objs=test_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    test_df = stratified_sample(test_df, 'y', sample_size=3000, random_state=42)

    # Build the dataloaders from the DataFrames.
    batch_size = 64
    train_dl = nps.ml.construct_dataloader_from_data_df(train_df, batch_size=batch_size, augment=False)
    valid_dl = nps.ml.construct_dataloader_from_data_df(valid_df, batch_size=batch_size)
    test_dl = nps.ml.construct_dataloader_from_data_df(test_df, batch_size=batch_size, shuffle=False)

    # Train. Seeding happens after data preparation, right before model creation.
    nps.ml.seed_everything(42)
    clf = nps.ml.Trainer(lr=0.005, num_classes=len(all_peps), epochs=200, device='cuda', lr_scheduler_patience=3, label_smoothing=0.1, model_name='CNN1DL1000')
    clf.fit(train_dl, valid_dl, early_stopping_patience=30, name=train_name)

    # Predict on the test set and persist the confusion matrix.
    pred_df = clf.predict(test_dl, name=train_name, y_to_label_dict=y_to_label_dict)
    cm_df = nps.ml.get_cm(pred_df, label_order=all_peps)
    cm_df.to_csv(f"{out_dir}/{train_name}_cm.csv")
    acc = np.sum(np.diag(cm_df))/len(pred_df)
    print(f'{train_sample_size}: {acc}')

    # Persist the per-class probabilities as well.
    pred_proba_df = clf.predict_proba(test_dl, name=train_name)
    pred_proba_df.to_csv(f"{out_dir}/{train_name}_pred_proba.csv")

    return acc

In [4]:
# Train and evaluate one classifier per task in all_task_dict, then collect the
# per-task test accuracies into a single table.
acc_records = []
for task_name, task_peps in all_task_dict.items():
    torch.set_num_threads(10)
    
    all_peps = task_peps
    
    y_code_dict = nps.ml.set_y_codes_for_classes(np.array(all_peps)[:,None])
    # Inverse mapping (code -> label). Kept as a module-level name because code
    # outside this cell may look it up by name.
    y_to_label_dict = {v:k for k,v in y_code_dict.items()}

    # 80% split for training, 20% split for testing, one pkl per class label.
    train_objs = [f"../../../03.results/classification_on_clean_data/GSXGS/{pep}/{pep}_valid80_clean_obj.pkl" for pep in all_peps]
    test_objs = [f"../../../03.results/classification_on_clean_data/GSXGS/{pep}/{pep}_valid20_clean_obj.pkl" for pep in all_peps]
    labels = all_peps

    acc = train_pipeline(train_objs, test_objs, labels, y_code_dict, all_peps, train_name=f'clean_data_{task_name}', train_sample_size=14000)

    acc_records.append([task_name, acc])
    
# Name the first column with the literal 'task_name'. The loop variable of the
# same name still holds the last task id here and must not be used as a header.
acc_df = pd.DataFrame(acc_records, columns=['task_name', 'acc'])
acc_df.to_csv("../../../03.results/classification_on_clean_data/GSXGS/diff_task/clean/acc.csv")
acc_df

Model CNN1DL1000 has total parameter number: 6.21 M
Epoch   0 / 200 train_loss: 0.4862 train_acc: 0.8883 val_loss: 0.3389 val_acc: 0.9123 lr: 0.005
Epoch   1 / 200 train_loss: 0.3148 train_acc: 0.9289 val_loss: 0.3296 val_acc: 0.9155 lr: 0.005
Epoch   2 / 200 train_loss: 0.2974 train_acc: 0.9408 val_loss: 0.2990 val_acc: 0.9396 lr: 0.005
Epoch   3 / 200 train_loss: 0.2930 train_acc: 0.9424 val_loss: 0.3761 val_acc: 0.8798 lr: 0.005
Epoch   4 / 200 train_loss: 0.2859 train_acc: 0.9471 val_loss: 0.3306 val_acc: 0.9171 lr: 0.005
Epoch   5 / 200 train_loss: 0.2840 train_acc: 0.9488 val_loss: 0.3012 val_acc: 0.9407 lr: 0.005
Epoch   6 / 200 train_loss: 0.2825 train_acc: 0.9483 val_loss: 0.3263 val_acc: 0.9199 lr: 0.005
Epoch   7 / 200 train_loss: 0.2781 train_acc: 0.9496 val_loss: 0.3597 val_acc: 0.8992 lr: 0.005
Epoch   8 / 200 train_loss: 0.2745 train_acc: 0.9533 val_loss: 0.2796 val_acc: 0.9480 lr: 0.005
Epoch   9 / 200 train_loss: 0.2703 train_acc: 0.9551 val_loss: 0.3929 val_acc: 0.874

Unnamed: 0,c18,acc
0,c02,0.957833
1,c03,0.757333
2,c04,0.794333
3,c05,0.910556
4,c06,0.7568
5,c07,0.819833
6,c08,0.719167
7,c09,0.801833
8,c10,0.845667
9,c11,0.786833
