In [1]:
%load_ext autoreload
%autoreload 2
import sys
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams['pdf.fonttype'] = 42
mpl.rcParams['ps.fonttype'] = 42
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams['font.family'] = "sans-serif"
import multiprocessing as mp
import time
import glob
import re
import random
from sklearn.model_selection import train_test_split
import torch

sys.path.insert(0, '/Data/user/panhailin/git_lab/npspy')
import npspy as nps

# Global configuration

In [2]:
# Limit the number of CPU threads torch uses.
torch.set_num_threads(10)

# Residue labels grouped by class. The displayed mapping below shows that every
# label in group i receives class code i (0, 1, 2 in list order).
all_peps = [
    ['1K', '1R'],
    ['1D', '1E'],
    ['1F', '1W', '1Y', '1I', '1L', '1M', '1V', '1H', '1Q', '1A', '1G', '1S', '1C', '1P', '1T', '1N'],
]

# label -> integer class code
y_code_dict = nps.ml.set_y_codes_for_classes(all_peps)
# integer class code -> human-readable class name (same order as the groups above)
y_to_label_dict = dict(enumerate(['positive', 'negative', 'neutral']))

y_code_dict, y_to_label_dict

({'1K': 0,
  '1R': 0,
  '1D': 1,
  '1E': 1,
  '1F': 2,
  '1W': 2,
  '1Y': 2,
  '1I': 2,
  '1L': 2,
  '1M': 2,
  '1V': 2,
  '1H': 2,
  '1Q': 2,
  '1A': 2,
  '1G': 2,
  '1S': 2,
  '1C': 2,
  '1P': 2,
  '1T': 2,
  '1N': 2},
 {0: 'positive', 1: 'negative', 2: 'neutral'})

In [3]:
# Flatten the class-grouped label lists into a single flat list of labels.
# Entries that are already plain strings are kept unchanged, so re-running this
# cell is a no-op instead of splitting every label (e.g. '1K') into characters.
all_peps = [
    pep
    for group in all_peps
    for pep in ([group] if isinstance(group, str) else group)
]

In [4]:
# One label per input file; the label list is the flat list of residue names.
labels = all_peps

# Pickle paths per label: "*_valid80.pkl" feeds training, "*_valid20.pkl" feeds testing.
train_objs = [
    f"../../../00.data/GSXGS/{pep}_valid80.pkl"
    for pep in labels
]
test_objs = [
    f"../../../00.data/GSXGS/{pep}_valid20.pkl"
    for pep in labels
]

In [5]:
def stratified_sample(df, column_name, sample_size=15000, random_state=42):
    """
    Draw a per-class random sample from a DataFrame.

    Rows are grouped by ``column_name``; from every group at most
    ``sample_size`` rows are drawn without replacement. Groups smaller than
    ``sample_size`` are kept in full.

    Parameters:
        df: input DataFrame
        column_name: name of the column that defines the strata
        sample_size: maximum number of rows drawn per class (default 15000)
        random_state: random seed; the same seed is used for every group

    Returns:
        A new DataFrame holding the sampled rows, ordered by group key. The
        rows keep their original index labels (all levels, if the index is a
        MultiIndex) and ``column_name`` is placed as the last column.
    """
    # Sample each group directly instead of going through groupby.apply:
    # this keeps the original index intact and needs no index reconstruction.
    sampled_groups = [
        group.sample(min(len(group), sample_size), random_state=random_state)
        for _, group in df.groupby(column_name)
    ]

    # pd.concat refuses an empty list, so handle "no groups" explicitly.
    if sampled_groups:
        sampled = pd.concat(sampled_groups)
    else:
        sampled = df.iloc[0:0].copy()

    # Move the stratification column to the end (the layout callers already get).
    if column_name in sampled.columns:
        sampled[column_name] = sampled.pop(column_name)
    return sampled

In [6]:
def train_pipeline(train_objs, test_objs, labels, train_name='clean_data', train_sample_size=14000):
    """
    Train a 'CNN1DL1000' classifier and evaluate it on the test objects.

    Steps: load and class-balance the training data, split off 1/8 of it for
    validation, load and class-balance the test data, train with early
    stopping, then write the confusion matrix and predicted probabilities to
    CSV files named after ``train_name``.

    Reads the notebook-level globals ``y_code_dict`` (label -> class code) and
    ``y_to_label_dict`` (class code -> class name).

    Parameters:
        train_objs: training inputs forwarded to ``nps.ml.get_X_y_from_objs``
        test_objs: test inputs forwarded to ``nps.ml.get_X_y_from_objs``
        labels: labels forwarded to ``nps.ml.get_X_y_from_objs`` alongside the objects
        train_name: run name; passed to ``fit``/``predict`` and used in the CSV file names
        train_sample_size: maximum number of rows kept per class *before* the
            train/validation split

    Returns:
        Test accuracy, computed as the confusion-matrix diagonal sum divided by
        the number of predictions.
    """
    # Create the output directory up front: to_csv raises if it is missing, and
    # that would otherwise only surface after the whole training run finished.
    out_dir = "../../../04.tables/classification/GSXGS/valid"
    os.makedirs(out_dir, exist_ok=True)

    # Training data: cap each class at train_sample_size rows, then hold out 1/8 for validation.
    train_df = nps.ml.get_X_y_from_objs(objs=train_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    train_df = stratified_sample(train_df, 'y', sample_size=train_sample_size, random_state=42)
    train_df, valid_df = train_test_split(train_df, test_size=1/8, random_state=42, stratify=train_df['y'])

    # Test data: cap each class at 6000 rows.
    test_df = nps.ml.get_X_y_from_objs(objs=test_objs, labels=labels, y_code_dict=y_code_dict, down_sample_to=1000, att='signal')
    test_df = stratified_sample(test_df, 'y', sample_size=6000, random_state=42)

    batch_size = 64
    train_dl = nps.ml.construct_dataloader_from_data_df(train_df, batch_size=batch_size, augment=False)
    valid_dl = nps.ml.construct_dataloader_from_data_df(valid_df, batch_size=batch_size)
    # The test loader is not shuffled.
    test_dl = nps.ml.construct_dataloader_from_data_df(test_df, batch_size=batch_size, shuffle=False)

    # Seed right before building the model so that every run starts from the same seed.
    nps.ml.seed_everything(42)
    clf = nps.ml.Trainer(lr=0.005, num_classes=len(y_to_label_dict), epochs=200, device='cuda', lr_scheduler_patience=3, label_smoothing=0.1, model_name='CNN1DL1000')
    clf.fit(train_dl, valid_dl, early_stopping_patience=30, name=train_name)

    pred_df = clf.predict(test_dl, name=train_name, y_to_label_dict=y_to_label_dict)
    cm_df = nps.ml.get_cm(pred_df, label_order=y_to_label_dict.values())
    # NOTE(review): this accuracy assumes get_cm returns raw counts (not a
    # normalised matrix) - confirm against nps.ml.get_cm.
    acc = np.sum(np.diag(cm_df))/len(pred_df)
    print(f'{train_sample_size}: {acc}')
    cm_df.to_csv(f"{out_dir}/{train_name}_cm.csv")
    pred_proba_df = clf.predict_proba(test_dl, name=train_name)
    pred_proba_df.to_csv(f"{out_dir}/{train_name}_pred_proba.csv")

    return acc

In [7]:
# Per-class training sample sizes to compare; one full training run per size.
train_sample_sizes = [25000, 20000, 15000, 10000, 5000]

acc_rows = []
for train_sample_size in train_sample_sizes:
    acc = train_pipeline(
        train_objs,
        test_objs,
        labels,
        train_name=f'valid_data_large_group_ds_{train_sample_size}',
        train_sample_size=train_sample_size,
    )
    acc_rows.append((train_sample_size, acc))

# One row per run: the sample size and the test accuracy it reached.
acc_df = pd.DataFrame(acc_rows, columns=['train_sample_size', 'acc'])
acc_df

Model CNN1DL1000 has total parameter number: 6.21 M
Epoch   0 / 200 train_loss: 0.7464 train_acc: 0.7495 val_loss: 0.8834 val_acc: 0.6656 lr: 0.005
Epoch   1 / 200 train_loss: 0.5984 train_acc: 0.8276 val_loss: 0.9231 val_acc: 0.5738 lr: 0.005
Epoch   2 / 200 train_loss: 0.5604 train_acc: 0.8489 val_loss: 0.5408 val_acc: 0.8563 lr: 0.005
Epoch   3 / 200 train_loss: 0.5420 train_acc: 0.8600 val_loss: 0.5933 val_acc: 0.8300 lr: 0.005
Epoch   4 / 200 train_loss: 0.5204 train_acc: 0.8721 val_loss: 0.7176 val_acc: 0.7425 lr: 0.005
Epoch   5 / 200 train_loss: 0.5102 train_acc: 0.8787 val_loss: 0.7462 val_acc: 0.7301 lr: 0.005
Epoch   6 / 200 train_loss: 0.5000 train_acc: 0.8852 val_loss: 0.6668 val_acc: 0.7781 lr: 0.0025
Epoch   7 / 200 train_loss: 0.4690 train_acc: 0.9009 val_loss: 0.5791 val_acc: 0.8351 lr: 0.0025
Epoch   8 / 200 train_loss: 0.4588 train_acc: 0.9065 val_loss: 0.5807 val_acc: 0.8376 lr: 0.0025
Epoch   9 / 200 train_loss: 0.4515 train_acc: 0.9102 val_loss: 0.4959 val_acc: 0.

Unnamed: 0,train_sample_size,acc
0,25000,0.907833
1,20000,0.906667
2,15000,0.896778
3,10000,0.884944
4,5000,0.875667
