In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold,KFold,train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import accuracy_score
from tqdm import tqdm
# warnings.simplefilter('ignore')
import time
import lightgbm as lgb
from lightgbm import early_stopping,log_evaluation
# pip 安装3.3.2版本 lightgbm，conda无法安装
from keras.layers import MaxPooling1D,Conv1D,UpSampling1D
from keras.layers import Dense, Input
from keras.models import Model
from keras import initializers
from sklearn.metrics import f1_score

In [2]:
# 读取
data_to_split ='F:\\zzqaq\\data\\data_to_split.csv'
data_to_split = pd.read_csv(data_to_split,converters={'code':str,'year':str})
data_to_split[['board','year']] = data_to_split[['code','year']].astype(int)
# 按行业列行分组
grouped = data_to_split.groupby("industry_code")

# 空df
data  = pd.DataFrame()

# 对于每个行业，按1:1的比例随机选择"0"和"1"标签的数据
for name, group in grouped:
    group1  = group[group["label"] == 1]
    num_to_extract_0 = len(group1)
    group0 = group[group['label'] == 0].sample(n=num_to_extract_0, random_state=42)
    data = pd.concat([data, group1, group0])
    
data = data.sample(frac=1, random_state=42)

train_df, test_df = train_test_split(data, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)
train_df.head(2)

(382, 779)
(96, 779)


Unnamed: 0,code,year,board,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,...,feature_766,feature_767,label,industry_code,word_count,sentence_count,readability,pos_count,neg_count,tone
7014,603703,2020,603703,0.139735,-0.177732,0.284961,0.033765,-0.16408,-0.896397,0.213114,...,0.305694,0.177161,0,7,4723,146,32.349315,527,129,0.606707
1825,300158,2019,300158,-0.560366,0.02428,0.735965,0.12329,-0.692182,-1.308341,0.413451,...,-1.149953,-0.731047,0,7,8829,249,35.457831,1002,183,0.691139


In [3]:
# 纯文本向量
feature_text = [i for i in data_to_split.columns if
                 i not in ['code', 'label','year','board',
                           'industry_code','word_count','sentence_count','readability',
                           'pos_count','neg_count','tone']]
data_np = data[feature_text]
data_np.shape

(478, 768)

In [4]:
# AE神经网络降维
# 经比较选择将文本向量从768降至128维
def bulid_BPNNmodel(data_np):
#   三维转二维数组后拆分。 
#     batch_size = 512
    batch_size = 478
    num_batches = data_np.shape[0] // batch_size
    data_batches = np.array_split(data_np[:num_batches*batch_size], num_batches)

    # 定义自编码器模型
    input_dim = 768; encoding_dim = 128  # 降维后的维度
    input_layer = Input(shape=(input_dim,))
    print(input_layer.shape)
    
    # 编码层
    h1, h2, h3 = 64, 32, 16  #隐藏层1 2 3
    encoded1 = Dense(h1, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(input_layer)
    encoded2 = Dense(h2, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(encoded1)
    encoded3 = Dense(h3, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(encoded2)
    encoder_output = Dense(encoding_dim)(encoded3)
    # 解码层
    decoded1 = Dense(h3, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(encoder_output)
    decoded2 = Dense(h2, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(decoded1)
    decoded3 = Dense(h1, activation='relu', kernel_initializer=initializers.random_normal(stddev=0.01),bias_initializer='zeros')(decoded2)
    decoded_output = Dense(input_dim, activation='relu',kernel_initializer=initializers.random_normal(stddev=0.01), bias_initializer='zeros')(decoded3)
    autoencoder = Model(inputs=input_layer, outputs = decoded_output)
    # 编译AE神经网络
    autoencoder.compile(optimizer='adam', loss='mean_squared_error',metrics=['mse'])
    bs = 128; epochs = max(int(bs / 2), 320)
 
    encoded_outputs = []
    reconstructed_outputs= []
    for i, batch in enumerate(data_batches):
        print(f"Processing batch {i+1}/{num_batches}")  
        print(batch.shape)  # (512, 768)
        # fit()方法默认情况下在每个epoch结束时更新模型的权重参数
        history = autoencoder.fit(batch, batch, epochs=epochs, 
                            validation_split=0.2)
         # 编码数据并保存到文件
        encoder = Model(inputs=input_layer, outputs=encoder_output)
        encoded_data_batch= encoder.predict(batch)
        encoded_outputs.append(encoded_data_batch)
        with open('encoded_np.txt', 'a') as f:
            np.savetxt(f, encoded_data_batch)  # 将编码数据写入文件中
        reconstructed_data_batches = autoencoder.predict(batch)
        reconstructed_outputs.append(reconstructed_data_batches)
    encoded_data = np.concatenate(encoded_outputs, axis=0)
    # 计算重构误差
    reconstructed_data = np.concatenate(reconstructed_outputs, axis=0)
    reconstruction_error = np.mean(np.square(data_np - reconstructed_data))
    print("Reconstruction error:", reconstruction_error)
    return encoded_data,reconstruction_error
encoder_np, reconstruction_np = bulid_BPNNmodel(data_np)
encoder_np = pd.DataFrame(encoder_np)


(None, 768)
Processing batch 1/1
(478, 768)
Epoch 1/320
Epoch 2/320
Epoch 3/320
Epoch 4/320
Epoch 5/320
Epoch 6/320
Epoch 7/320
Epoch 8/320
Epoch 9/320
Epoch 10/320
Epoch 11/320
Epoch 12/320
Epoch 13/320
Epoch 14/320
Epoch 15/320
Epoch 16/320
Epoch 17/320
Epoch 18/320
Epoch 19/320
Epoch 20/320
Epoch 21/320
Epoch 22/320
Epoch 23/320
Epoch 24/320
Epoch 25/320
Epoch 26/320
Epoch 27/320
Epoch 28/320
Epoch 29/320
Epoch 30/320
Epoch 31/320
Epoch 32/320
Epoch 33/320
Epoch 34/320
Epoch 35/320
Epoch 36/320
Epoch 37/320
Epoch 38/320
Epoch 39/320
Epoch 40/320
Epoch 41/320
Epoch 42/320
Epoch 43/320
Epoch 44/320
Epoch 45/320
Epoch 46/320
Epoch 47/320
Epoch 48/320
Epoch 49/320
Epoch 50/320
Epoch 51/320
Epoch 52/320
Epoch 53/320
Epoch 54/320
Epoch 55/320
Epoch 56/320
Epoch 57/320
Epoch 58/320
Epoch 59/320
Epoch 60/320
Epoch 61/320
Epoch 62/320


Epoch 63/320
Epoch 64/320
Epoch 65/320
Epoch 66/320
Epoch 67/320
Epoch 68/320
Epoch 69/320
Epoch 70/320
Epoch 71/320
Epoch 72/320
Epoch 73/320
Epoch 74/320
Epoch 75/320
Epoch 76/320
Epoch 77/320
Epoch 78/320
Epoch 79/320
Epoch 80/320
Epoch 81/320
Epoch 82/320
Epoch 83/320
Epoch 84/320
Epoch 85/320
Epoch 86/320
Epoch 87/320
Epoch 88/320
Epoch 89/320
Epoch 90/320
Epoch 91/320
Epoch 92/320
Epoch 93/320
Epoch 94/320
Epoch 95/320
Epoch 96/320
Epoch 97/320
Epoch 98/320
Epoch 99/320
Epoch 100/320
Epoch 101/320
Epoch 102/320
Epoch 103/320
Epoch 104/320
Epoch 105/320
Epoch 106/320
Epoch 107/320
Epoch 108/320
Epoch 109/320
Epoch 110/320
Epoch 111/320
Epoch 112/320
Epoch 113/320
Epoch 114/320
Epoch 115/320
Epoch 116/320
Epoch 117/320
Epoch 118/320
Epoch 119/320
Epoch 120/320
Epoch 121/320
Epoch 122/320
Epoch 123/320
Epoch 124/320


Epoch 125/320
Epoch 126/320
Epoch 127/320
Epoch 128/320
Epoch 129/320
Epoch 130/320
Epoch 131/320
Epoch 132/320
Epoch 133/320
Epoch 134/320
Epoch 135/320
Epoch 136/320
Epoch 137/320
Epoch 138/320
Epoch 139/320
Epoch 140/320
Epoch 141/320
Epoch 142/320
Epoch 143/320
Epoch 144/320
Epoch 145/320
Epoch 146/320
Epoch 147/320
Epoch 148/320
Epoch 149/320
Epoch 150/320
Epoch 151/320
Epoch 152/320
Epoch 153/320
Epoch 154/320
Epoch 155/320
Epoch 156/320
Epoch 157/320
Epoch 158/320
Epoch 159/320
Epoch 160/320
Epoch 161/320
Epoch 162/320
Epoch 163/320
Epoch 164/320
Epoch 165/320
Epoch 166/320
Epoch 167/320
Epoch 168/320
Epoch 169/320
Epoch 170/320
Epoch 171/320
Epoch 172/320
Epoch 173/320
Epoch 174/320
Epoch 175/320
Epoch 176/320
Epoch 177/320
Epoch 178/320
Epoch 179/320
Epoch 180/320
Epoch 181/320
Epoch 182/320
Epoch 183/320
Epoch 184/320
Epoch 185/320
Epoch 186/320


Epoch 187/320
Epoch 188/320
Epoch 189/320
Epoch 190/320
Epoch 191/320
Epoch 192/320
Epoch 193/320
Epoch 194/320
Epoch 195/320
Epoch 196/320
Epoch 197/320
Epoch 198/320
Epoch 199/320
Epoch 200/320
Epoch 201/320
Epoch 202/320
Epoch 203/320
Epoch 204/320
Epoch 205/320
Epoch 206/320
Epoch 207/320
Epoch 208/320
Epoch 209/320
Epoch 210/320
Epoch 211/320
Epoch 212/320
Epoch 213/320
Epoch 214/320
Epoch 215/320
Epoch 216/320
Epoch 217/320
Epoch 218/320
Epoch 219/320
Epoch 220/320
Epoch 221/320
Epoch 222/320
Epoch 223/320
Epoch 224/320
Epoch 225/320
Epoch 226/320
Epoch 227/320
Epoch 228/320
Epoch 229/320
Epoch 230/320
Epoch 231/320
Epoch 232/320
Epoch 233/320
Epoch 234/320
Epoch 235/320
Epoch 236/320
Epoch 237/320
Epoch 238/320
Epoch 239/320
Epoch 240/320
Epoch 241/320
Epoch 242/320
Epoch 243/320
Epoch 244/320
Epoch 245/320
Epoch 246/320
Epoch 247/320


Epoch 248/320
Epoch 249/320
Epoch 250/320
Epoch 251/320
Epoch 252/320
Epoch 253/320
Epoch 254/320
Epoch 255/320
Epoch 256/320
Epoch 257/320
Epoch 258/320
Epoch 259/320
Epoch 260/320
Epoch 261/320
Epoch 262/320
Epoch 263/320
Epoch 264/320
Epoch 265/320
Epoch 266/320
Epoch 267/320
Epoch 268/320
Epoch 269/320
Epoch 270/320
Epoch 271/320
Epoch 272/320
Epoch 273/320
Epoch 274/320
Epoch 275/320
Epoch 276/320
Epoch 277/320
Epoch 278/320
Epoch 279/320
Epoch 280/320
Epoch 281/320
Epoch 282/320
Epoch 283/320
Epoch 284/320
Epoch 285/320
Epoch 286/320
Epoch 287/320
Epoch 288/320
Epoch 289/320
Epoch 290/320
Epoch 291/320
Epoch 292/320
Epoch 293/320
Epoch 294/320
Epoch 295/320
Epoch 296/320
Epoch 297/320
Epoch 298/320
Epoch 299/320
Epoch 300/320
Epoch 301/320
Epoch 302/320
Epoch 303/320
Epoch 304/320
Epoch 305/320
Epoch 306/320
Epoch 307/320
Epoch 308/320
Epoch 309/320


Epoch 310/320
Epoch 311/320
Epoch 312/320
Epoch 313/320
Epoch 314/320
Epoch 315/320
Epoch 316/320
Epoch 317/320
Epoch 318/320
Epoch 319/320
Epoch 320/320
Reconstruction error: feature_0      0.110263
feature_1      0.084156
feature_2      0.066859
feature_3      0.092640
feature_4      0.098747
                 ...   
feature_763    0.065290
feature_764    0.097029
feature_765    0.088321
feature_766    0.186521
feature_767    0.095793
Length: 768, dtype: float64


In [5]:
encoder_np.shape

(478, 128)

In [6]:
df1 = data[['code', 'label','year','board',
            'industry_code','word_count','sentence_count','readability',
            'pos_count','neg_count','tone']].reset_index(drop=True)
new_df = pd.concat([df1, encoder_np], axis=1)

train_df, test_df = train_test_split(new_df, test_size=0.2, random_state=42)

feature_names = [i for i in new_df.columns if i not in ['code', 'label']]
len(feature_names)

137

In [7]:
def lgb_model(train, target, test, k, seed, parameters):
    feats = [f for f in train.columns if f not in ['code', 'label']]
    print('Current num of features:', len(feats))

    oof_probs = np.zeros((train.shape[0],))
    output_preds = 0
    offline_score = []
    feature_importance_df = pd.DataFrame()
    parameters = parameters
    seeds = [2]
    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            train_y, test_y = target.iloc[train_index], target.iloc[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]

            dtrain = lgb.Dataset(train_X,
                                 label=train_y)
            dval = lgb.Dataset(test_X,
                               label=test_y)

            lgb_model = lgb.train(
                parameters,
                dtrain,
                num_boost_round=8000,
                valid_sets=[dval],
                callbacks=[early_stopping(100), log_evaluation(100)],
            )

            oof_probs[test_index] = lgb_model.predict(test_X[feats], num_iteration=lgb_model.best_iteration) / len(
                seeds)

            offline_score.append(lgb_model.best_score['valid_0']['auc'])
            output_preds += lgb_model.predict(test[feats],
                                              num_iteration=lgb_model.best_iteration) / folds.n_splits / len(seeds)
            print(offline_score)
            # feature importance
            fold_importance_df = pd.DataFrame()
            fold_importance_df["feature"] = feats
            fold_importance_df["importance"] = lgb_model.feature_importance(importance_type='gain')
            fold_importance_df["fold"] = i + 1
            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
    print('feature importance:')
    print(feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False).head(50))

    return output_preds, oof_probs, np.mean(offline_score), feature_importance_df

In [10]:
parameters = {
         'boosting_type': 'gbdt','objective': 'binary',
         'tree_learner':'serial','metric': 'auc',
            'max_depth' : 6,
            'num_leaves' : 15,
            'min_child_weight': 0.01,
            'min_child_samples': 19,
            'num_leaves': 40,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 16,
            'learning_rate': 0.2,
            'silent': True,
            'verbose': -1
    }

print('开始模型训练train')
lgb_preds, lgb_oof, lgb_score, feature_importance_df = lgb_model(
    train=train_df[feature_names],target=train_df['label'],
    test=test_df[feature_names], k=5,seed=42, parameters=parameters)

开始模型训练train
Current num of features: 137
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.693657
Early stopping, best iteration is:
[7]	valid_0's auc: 0.774966
[0.774966261808367]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.761134


Please use silent argument of the Dataset constructor to pass this parameter.


Early stopping, best iteration is:
[34]	valid_0's auc: 0.813765
[0.774966261808367, 0.8137651821862348]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.783241
Early stopping, best iteration is:
[10]	valid_0's auc: 0.850416
[0.774966261808367, 0.8137651821862348, 0.850415512465374]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.836565
Early stopping, best iteration is:
[11]	valid_0's auc: 0.893352
[0.774966261808367, 0.8137651821862348, 0.850415512465374, 0.8933518005540166]
Training until validation scores don't improve for 100 rounds
[100]	valid_0's auc: 0.774775
Early stopping, best iteration is:
[28]	valid_0's auc: 0.802495
[0.774966261808367, 0.8137651821862348, 0.850415512465374, 0.8933518005540166, 0.8024948024948025]
OOF-MEAN-AUC:0.826999, OOF-STD-AUC:0.041071
feature importance:
feature
board             171.268075
tone              126.124869
word_count         21.573843
pos_count          18.686862
rea

In [11]:
val_pred = lgb_oof.copy()
t0 = 0.1
v = 0.02
best_t = t0
best_f1 = 0
for step in range(1000):
    curr_t = t0 + step * v
    val_y = [1 if x >= curr_t else 0 for x in val_pred]
    curr_f1 = f1_score(train_df['label'], val_y)
    if curr_f1 > best_f1:
        best_t = curr_t
        best_f1 = curr_f1
        print('step: {}   best threshold: {}   best auc: {}'.format(step, best_t, best_f1))
print('search finish.')
label=[1 if x >= best_t else 0 for x in lgb_preds]
sum(label)
acc = accuracy_score(label, test_df['label'])
# 输出准确率
print('Accuracy:', acc)

step: 0   best threshold: 0.1   best auc: 0.6911764705882354
step: 1   best threshold: 0.12000000000000001   best auc: 0.704119850187266
step: 2   best threshold: 0.14   best auc: 0.7137404580152672
step: 3   best threshold: 0.16   best auc: 0.7181467181467182
step: 4   best threshold: 0.18   best auc: 0.7322834645669292
step: 5   best threshold: 0.2   best auc: 0.738095238095238
step: 6   best threshold: 0.22   best auc: 0.7454909819639278
step: 8   best threshold: 0.26   best auc: 0.7458333333333333
step: 9   best threshold: 0.28   best auc: 0.75
step: 10   best threshold: 0.30000000000000004   best auc: 0.755939524838013
step: 11   best threshold: 0.32   best auc: 0.756637168141593
step: 12   best threshold: 0.33999999999999997   best auc: 0.7606263982102908
step: 13   best threshold: 0.36   best auc: 0.761904761904762
step: 16   best threshold: 0.42000000000000004   best auc: 0.7637231503579951
step: 17   best threshold: 0.44000000000000006   best auc: 0.7658536585365853
search fin