In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams['font.family'] = "sans-serif"
import multiprocessing as mp
import time
import glob
import re
import random
from sklearn.model_selection import train_test_split
import torch

sys.path.insert(0, '/Data/user/panhailin/git_lab/npspy')
import npspy as nps

# 全局配置

In [2]:
# Limit PyTorch to 10 CPU threads for this notebook.
torch.set_num_threads(10)

# Class labels used throughout the notebook (presumably peptide names:
# nine `hp1_*` entries followed by six `hp2_*` entries).
all_peps = [
    'hp1_1', 'hp1_2', 'hp1_3',
    'hp1_4', 'hp1_5', 'hp1_6',
    'hp1_7', 'hp1_8', 'hp1_9',
    'hp2_1', 'hp2_2', 'hp2_3',
    'hp2_4', 'hp2_5', 'hp2_6',
]

# The labels are handed to npspy as a column vector of shape (n_classes, 1).
label_column = np.array(all_peps)[:, np.newaxis]

# label -> integer code, and the inverse mapping integer code -> label.
y_code_dict = nps.ml.set_y_codes_for_classes(label_column)
y_to_label_dict = dict(zip(y_code_dict.values(), y_code_dict.keys()))

# Display both mappings as the cell output.
(y_code_dict, y_to_label_dict)

({np.str_('hp1_1'): 0,
  np.str_('hp1_2'): 1,
  np.str_('hp1_3'): 2,
  np.str_('hp1_4'): 3,
  np.str_('hp1_5'): 4,
  np.str_('hp1_6'): 5,
  np.str_('hp1_7'): 6,
  np.str_('hp1_8'): 7,
  np.str_('hp1_9'): 8,
  np.str_('hp2_1'): 9,
  np.str_('hp2_2'): 10,
  np.str_('hp2_3'): 11,
  np.str_('hp2_4'): 12,
  np.str_('hp2_5'): 13,
  np.str_('hp2_6'): 14},
 {0: np.str_('hp1_1'),
  1: np.str_('hp1_2'),
  2: np.str_('hp1_3'),
  3: np.str_('hp1_4'),
  4: np.str_('hp1_5'),
  5: np.str_('hp1_6'),
  6: np.str_('hp1_7'),
  7: np.str_('hp1_8'),
  8: np.str_('hp1_9'),
  9: np.str_('hp2_1'),
  10: np.str_('hp2_2'),
  11: np.str_('hp2_3'),
  12: np.str_('hp2_4'),
  13: np.str_('hp2_5'),
  14: np.str_('hp2_6')})

In [3]:
# One pickle per class label: the "*_valid80.pkl" files feed training and the
# "*_valid20.pkl" files feed testing. Both lists follow the order of `all_peps`.
train_objs = []
test_objs = []
for pep in all_peps:
    train_objs.append(f"../../../00.data/{pep}_valid80.pkl")
    test_objs.append(f"../../../00.data/{pep}_valid20.pkl")

# `labels` is the same list object as `all_peps` (an alias, not a copy).
labels = all_peps

In [4]:
def stratified_sample(df, column_name, sample_size=15000, random_state=42):
    """
    对DataFrame按指定列类别分层随机抽样
    
    参数:
        df: 输入DataFrame
        column_name: 分层依据的列名
        sample_size: 每类抽取样本数(默认15000)
        random_state: 随机种子
    
    返回:
        抽样后的新DataFrame
    """
    re_df = df.groupby(column_name, group_keys=True).apply(
        lambda x: x.sample(min(len(x), sample_size), 
                          random_state=random_state),
        include_groups=False,
    )
    re_df[re_df.index.names[0]] =  [i[0] for i in re_df.index]
    re_df.index = [i[1] for i in re_df.index]
    return re_df

In [13]:
def train_pipeline(train_objs, labels, y_code_dict, all_peps, train_name='valid_data', train_sample_size=14000, seed=42,
                   test_objs=None, test_sample_size=6000,
                   out_dir="../../../03.results/classification_on_clean_data/hp12/diff_data_size/valid"):
    """
    Train a CNN1DL1000 classifier on a capped training set and score it on the test set.

    Steps:
      1. Load the training objects, keep at most ``train_sample_size`` rows per
         class, and split off 1/8 of them (stratified by ``y``) for validation.
         The number of rows actually trained on is therefore at most 7/8 of
         ``train_sample_size`` per class, and classes with fewer rows than the
         cap are used in full.
      2. Load the test objects and keep at most ``test_sample_size`` rows per class.
      3. Fit the model with early stopping, predict on the test set, and write
         the confusion matrix and predicted probabilities to ``out_dir``.

    Args:
        train_objs: Paths of the training pickles, aligned with ``labels``.
        labels: Class label of each entry in ``train_objs`` / ``test_objs``.
        y_code_dict: Mapping from class label to integer class code.
        all_peps: Ordered class labels; sets the number of classes and the
            confusion-matrix label order.
        train_name: Run name, used for the Trainer and as output file prefix.
        train_sample_size: Per-class cap applied before the train/validation split.
        seed: Seed for sampling, the train/validation split, and training.
        test_objs: Paths of the test pickles. When ``None`` (the default), the
            module-level ``test_objs`` variable is used, which is what this
            function always did before the parameter existed.
        test_sample_size: Per-class cap applied to the test set.
        out_dir: Directory receiving ``{train_name}_cm.csv`` and
            ``{train_name}_pred_proba.csv``; created if missing.

    Returns:
        Test accuracy: the confusion-matrix diagonal sum divided by the number
        of predictions.

    Raises:
        NameError: If ``test_objs`` is not passed and no module-level
            ``test_objs`` exists.
    """
    if test_objs is None:
        # Backward compatibility: earlier versions silently read the notebook
        # global. Prefer passing `test_objs` explicitly.
        try:
            test_objs = globals()['test_objs']
        except KeyError:
            raise NameError(
                "test_objs was not passed to train_pipeline and no module-level "
                "'test_objs' is defined"
            ) from None

    # Derive the code -> label mapping from the argument instead of relying on
    # a notebook global that could be stale or missing after a kernel restart.
    y_to_label_dict = {code: label for label, code in y_code_dict.items()}

    # Create the output directory up front so that a missing directory cannot
    # make the run fail only after training has finished.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    cm_path = os.path.join(out_dir, f"{train_name}_cm.csv")
    pred_proba_path = os.path.join(out_dir, f"{train_name}_pred_proba.csv")

    # Read the pkl files into data frames (the 'y' column holds the class code).
    # NOTE(review): `down_sample_to=1000` and `att='signal'` are npspy options;
    # presumably they select the signal attribute and resample it to length
    # 1000 to match the CNN1DL1000 model - confirm against npspy.
    train_df = nps.ml.get_X_y_from_objs(objs=train_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    train_df = stratified_sample(train_df, 'y', sample_size=train_sample_size, random_state=seed)
    train_df, valid_df = train_test_split(train_df, test_size=1/8, random_state=seed, stratify=train_df['y'])
    test_df = nps.ml.get_X_y_from_objs(objs=test_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    test_df = stratified_sample(test_df, 'y', sample_size=test_sample_size, random_state=seed)

    # Build the dataloaders from the data frames.
    batch_size = 128
    train_dl = nps.ml.construct_dataloader_from_data_df(train_df, batch_size=batch_size, augment=False)
    valid_dl = nps.ml.construct_dataloader_from_data_df(valid_df, batch_size=batch_size)
    test_dl = nps.ml.construct_dataloader_from_data_df(test_df, batch_size=batch_size, shuffle=False)

    # Train. Seeding happens here, after the dataloaders are built; the
    # sampling and the split above are seeded separately via `random_state`.
    nps.ml.seed_everything(seed)
    clf = nps.ml.Trainer(lr=0.005, num_classes=len(all_peps), epochs=200, device='cuda', lr_scheduler_patience=3, label_smoothing=0.1, model_name='CNN1DL1000')
    clf.fit(train_dl, valid_dl, early_stopping_patience=30, name=train_name)

    # Predict on the test set and persist the confusion matrix.
    pred_df = clf.predict(test_dl, name=train_name, y_to_label_dict=y_to_label_dict)
    cm_df = nps.ml.get_cm(pred_df, label_order=all_peps)
    cm_df.to_csv(cm_path)

    # Accuracy = correctly classified reads (diagonal) / all predicted reads.
    acc = np.sum(np.diag(cm_df))/len(pred_df)
    print(f'{train_sample_size}: {acc}')

    pred_proba_df = clf.predict_proba(test_dl, name=train_name)
    pred_proba_df.to_csv(pred_proba_path)
    return acc


In [14]:
# Sweep over per-class training caps and collect the test accuracy of each run.
# NOTE: every run in this sweep uses seed=1, not train_pipeline's default of 42.
# Alternative size grid kept for reference: [14000, 12000, 10000, 8000, 6000, 4000]
train_sample_sizes = [25000, 20000, 15000, 10000, 5000]

acc_records = []
for train_sample_size in train_sample_sizes:
    acc = train_pipeline(
        train_objs,
        labels,
        y_code_dict,
        all_peps,
        train_name=f'valid_data_ds_{train_sample_size}',
        train_sample_size=train_sample_size,
        seed=1,
    )
    acc_records.append([train_sample_size, acc])

# One row per run: the cap that was used and the resulting test accuracy.
acc_df = pd.DataFrame(acc_records, columns=['train_sample_size', 'acc'])
acc_df.to_csv("../../../03.results/classification_on_clean_data/hp12/diff_data_size/valid/acc.csv")
acc_df

Model CNN1DL1000 has total parameter number: 6.21 M
Epoch   0 / 200 train_loss: 1.2432 train_acc: 0.7271 val_loss: 1.0308 val_acc: 0.8137 lr: 0.005
Epoch   1 / 200 train_loss: 0.9540 train_acc: 0.8509 val_loss: 1.2691 val_acc: 0.7122 lr: 0.005
Epoch   2 / 200 train_loss: 0.8799 train_acc: 0.8801 val_loss: 1.1615 val_acc: 0.7591 lr: 0.005
Epoch   3 / 200 train_loss: 0.8370 train_acc: 0.8964 val_loss: 1.2595 val_acc: 0.7271 lr: 0.005
Epoch   4 / 200 train_loss: 0.8053 train_acc: 0.9082 val_loss: 0.9356 val_acc: 0.8480 lr: 0.005
Epoch   5 / 200 train_loss: 0.7833 train_acc: 0.9168 val_loss: 0.8729 val_acc: 0.8784 lr: 0.005
Epoch   6 / 200 train_loss: 0.7631 train_acc: 0.9244 val_loss: 1.3552 val_acc: 0.6829 lr: 0.005
Epoch   7 / 200 train_loss: 0.7472 train_acc: 0.9309 val_loss: 0.8967 val_acc: 0.8620 lr: 0.005
Epoch   8 / 200 train_loss: 0.7335 train_acc: 0.9362 val_loss: 0.9961 val_acc: 0.8242 lr: 0.005
Epoch   9 / 200 train_loss: 0.7210 train_acc: 0.9407 val_loss: 0.8413 val_acc: 0.887

Unnamed: 0,train_sample_size,acc
0,25000,0.934222
1,20000,0.930289
2,15000,0.923078
3,10000,0.916567
4,5000,0.896489
