In [5]:
import pandas as pd
import numpy as np
import os
import _pickle as pickle
import time
from tqdm import tqdm
import gc
from sklearn.model_selection import StratifiedKFold   # 补

# ---------------------------------------------------------------------------
# Configuration: edit the paths here.
# Everything in this section may be adjusted to the local setup.
# ---------------------------------------------------------------------------
part = 'Part2'

# Feature files to train on (add more paths to the list as needed).
# Raw strings keep the backslash separators literal; the resulting values are
# exactly the same as before, without relying on invalid escape sequences
# such as '\P' or '\s'.
train_path_list = [r'feature\train_feature\Part2\split_by_feat\df_train_ordered_1.parquet']

label_path = r'feature\train_feature\labels\labels.parquet'  # path of the label file

# Output roots. Each one gets a sub-folder named after `part`.
# exist_ok=True creates the sub-folder even when the root already exists
# (the previous isdir(root) guard skipped creation in that case) and makes
# re-running the cell harmless.
model_path = 'model/'
os.makedirs(os.path.join(model_path, part), exist_ok=True)

id_path = 'id/'
os.makedirs(os.path.join(id_path, part), exist_ok=True)

importance_path = 'importance/'
os.makedirs(os.path.join(importance_path, part), exist_ok=True)

# Label columns; one set of models is trained per entry.
label_feat = ['label_5','label_10','label_20', 'label_40', 'label_60']



def saveDict(data, path):
    """Pickle ``data`` to the file at ``path`` using pickle protocol 4.

    Parameters
    ----------
    data : object
        Any picklable object.
    path : str
        Destination file; it is created or overwritten.
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling raises part-way through.
    with open(path, 'wb') as fh:
        pickle.dump(data, fh, protocol=4)

def fromDict(path):
    """Load and return a pickled object from ``path``, printing the load time.

    Parameters
    ----------
    path : str
        File previously written with pickle.

    Returns
    -------
    object
        The unpickled object.

    Notes
    -----
    Unpickling can execute arbitrary code; only load files from trusted
    sources.
    """
    print('loading file:', path, end=' ')
    t = time.time()
    # Close the handle deterministically instead of leaving it to the GC.
    with open(path, 'rb') as fh:
        data = pickle.loads(fh.read())
    print('Time spent:', time.time() - t)
    return data

def load_data_list(path_list):
    """Read every parquet file in ``path_list`` and stack the frames row-wise.

    A tqdm progress bar tracks the files as they are read. The frames are
    concatenated along axis 0 in the order given.
    """
    frames = [pd.read_parquet(parquet_file) for parquet_file in tqdm(path_list)]
    return pd.concat(frames, axis=0)

def get_timestamp(data):
    """Convert the ``'time'`` entry of ``data`` ('H:M:S') to seconds.

    ``data`` is anything indexable by the key ``'time'`` whose value is a
    string with exactly three colon-separated integer fields.
    """
    hours, minutes, seconds = data['time'].split(':')
    hours = int(hours)
    minutes = int(minutes)
    seconds = int(seconds)
    return hours * 3600 + minutes * 60 + seconds
    

def load_data(path):
    """Read a single parquet file and return it as a DataFrame.

    Prints a message before and after the read so progress is visible.
    """
    print('reading', path, '...')
    frame = pd.read_parquet(path)
    print('End reading')
    return frame


In [8]:
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
# LightGBM training callbacks: log evaluation results every 10 iterations and
# use early stopping with a patience of 100 rounds.
callbacks = [log_evaluation(period=10), early_stopping(stopping_rounds=100)]

In [3]:
from sklearn.metrics import f1_score

def lgb_f1_score(y_hat, data, num_class=3):
    """Macro-averaged F1 as a LightGBM custom evaluation function (``feval``).

    Parameters
    ----------
    y_hat : numpy.ndarray
        Per-class scores for every sample. Two layouts are accepted:
        a 2-D array of shape ``(n_samples, n_classes)``, or a flat 1-D array
        grouped by class (all samples of class 0, then class 1, ...).
        Which one LightGBM passes depends on its version, so both are handled.
    data : object
        Evaluation dataset; only ``data.get_label()`` is used.
    num_class : int, default 3
        Number of classes, needed only to unflatten the 1-D layout.

    Returns
    -------
    tuple
        ``('f1', score, True)`` - metric name, value, and the
        higher-is-better flag.
    """
    y_true = data.get_label()
    scores = np.asarray(y_hat)
    if scores.ndim == 1:
        # Flat class-major vector -> (n_samples, num_class).
        scores = scores.reshape(num_class, -1).T
    # A 2-D input is already (n_samples, n_classes); reshaping it as if it
    # were class-major would scramble the scores across samples.
    y_pred = np.argmax(scores, axis=-1)
    return 'f1', f1_score(y_true, y_pred, average='macro'), True

In [11]:



# Debug cap on the number of rows used from each feature file (this run uses
# only the first 100 rows). Set to None to train on the whole file.
max_rows = 100

n_fold = 3  # number of stratified cross-validation folds

# LightGBM parameters shared by every label/fold model.
params = {
    'objective': 'multiclass',
    'num_class': 3,
    'metric': 'multi_logloss',
    # 'metric': 'custom',  # use 'custom' to rely only on the custom eval function
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    # 'device': 'gpu',
    # 'gpu_platform_id': 0,
    # 'gpu_devices_id': 0
}

for index, path in enumerate(train_path_list):
    X = load_data(path)
    if max_rows is not None:
        X = X.iloc[:max_rows]

    # File stem used in every output name. Both '\\' and '/' are treated as
    # separators so the stem never contains directory components.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    print(file_name)
    print(X)
    # input_data['time_stamp'] = input_data.apply(get_timestamp, axis=1)

    # Align the labels with the feature rows through the (date, time, sym) index.
    Y = pd.read_parquet(label_path).set_index(['date', 'time', 'sym'])
    Y = Y.reindex(X.index)

    ids_folds = {}  # (label, fold) -> (train positions, validation positions)
    models = {}     # (fold, label) -> trained booster (note the different key order)

    # Train one set of fold models per label column.
    for i, label in enumerate(label_feat):
        print('Begin training', label, '...')

        kf = StratifiedKFold(n_splits=n_fold)
        importance_df = None

        # Stratify on the first label column only: the splitter needs a single
        # target column, not a multi-label frame.
        for fold, (idx_tr, idx_va) in enumerate(kf.split(X, Y.iloc[:, 0])):
            X_train = X.iloc[idx_tr]
            X_val = X.iloc[idx_va]
            Y_train = Y.iloc[idx_tr]
            Y_val = Y.iloc[idx_va]

            ids_folds[(label, fold)] = (idx_tr, idx_va)

            y_train = Y_train[label]
            y_val = Y_val[label]

            # Build the LightGBM datasets.
            train_dataset = lgb.Dataset(X_train, label=y_train)
            val_dataset = lgb.Dataset(X_val, label=y_val)

            # Fresh callbacks for every training run: some LightGBM releases
            # keep the early-stopping state (best score / best iteration)
            # inside the callback object, so sharing one instance across runs
            # can stop a later model based on an earlier model's scores.
            fold_callbacks = [log_evaluation(period=10), early_stopping(stopping_rounds=100)]

            # Train the model.
            model = lgb.train(
                params,
                train_dataset,
                valid_sets=[train_dataset, val_dataset],
                num_boost_round=10000,
                callbacks=fold_callbacks,
                feval=lgb_f1_score,
            )

            importance = model.feature_importance(importance_type='split')
            feature_name = model.feature_name()

            # Accumulate split importances over the folds.
            if importance_df is None:
                importance_df = pd.DataFrame({
                    'Feature': feature_name,
                    'Importance': importance
                })
            else:
                importance_df['Importance'] += importance
            print(importance_df)

            models[(fold, label)] = model
            print('End training', label, '...\n\n')
            del X_train, X_val, Y_train, Y_val
            gc.collect()

        # Average over the folds and save the feature importance.
        importance_df['Importance'] /= n_fold
        # Use a separate name for the file: assigning to `importance_path`
        # would turn the output root into a file path and break the save for
        # every following label.
        importance_file = os.path.join(importance_path, part, f'{file_name}_{label}_fold{n_fold}.csv')
        os.makedirs(os.path.dirname(importance_file), exist_ok=True)
        print('saving importance ...')
        importance_df.to_csv(importance_file, index=False)
        del importance_df
        gc.collect()

    del X, Y
    gc.collect()

    print('saving model ...')
    # NOTE(review): `label` here is the last entry of label_feat, although the
    # pickle holds the models of every label; the file name is kept unchanged
    # for compatibility with whatever loads it.
    model_file = os.path.join(model_path, part, f'Model_{file_name}_{label}.pkl')
    os.makedirs(os.path.dirname(model_file), exist_ok=True)
    saveDict(models, model_file)
    print('saving id_fold ...')
    id_fold_path = os.path.join(id_path, part, f'ID_{file_name}.pkl')
    os.makedirs(os.path.dirname(id_fold_path), exist_ok=True)
    saveDict(ids_folds, id_fold_path)
    del models, ids_folds
    gc.collect()


reading feature\train_feature\Part2\split_by_feat\df_train_ordered_1.parquet ...
