In [2]:
# coding:utf-8
import ast
import os
import warnings

# Third-party imports keep their original relative order: the wildcard imports
# below can shadow one another, so reordering them could change name resolution.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import numpy as np
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.optimizers import *
import tensorflow as tf

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')



In [7]:
# Load the raw train/test files (no header row) and parse the tag-id lists.
train = pd.read_csv('./data/train.txt', header=None)
test = pd.read_csv('./data/test.txt', header=None)

train.columns = ['pid', 'label', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']
test.columns = ['pid', 'gender', 'age', 'tagid', 'time', 'province', 'city', 'model', 'make']

train['label'] = train['label'].astype(int)

# Stack train on top of test; test rows have no label, so they are filled with -1.
data = pd.concat([train, test])
data['label'] = data['label'].fillna(-1)

# `tagid` is stored as the text of a Python list literal. Parse it with
# ast.literal_eval (literals only) instead of eval(), which would execute
# arbitrary code found in the data file, then turn every tag id into a string token.
data['tagid'] = data['tagid'].apply(lambda raw: [str(tag) for tag in ast.literal_eval(raw)])

# Hyper-parameters
# embed_size           embedding dimension
# MAX_NB_WORDS         number of distinct words (tag ids) that occur in tagid
# MAX_SEQUENCE_LENGTH  length of the tagid list fed to the model
embed_size = 256   #64
MAX_NB_WORDS = 230637
MAX_SEQUENCE_LENGTH = 300   #128
# Train word2vec on the tag sequences; pre-trained models such as ELMo or BERT could be considered here.
w2v_model = Word2Vec(sentences=data['tagid'].tolist(), vector_size=embed_size, window=5, min_count=1, epochs=7)
#w2v_model.save("./w2vmodel/w2vmodel.model")
#w2v_model = Word2Vec.load("./w2vmodel/w2vmodel.model")

In [8]:
# Split the combined frame back into its train and test parts (train rows come first).
n_train_rows = train.shape[0]
X_train = data[:n_train_rows]['tagid']
X_test = data[n_train_rows:]['tagid']

# Build the vocabulary with the tf.keras Tokenizer, i.e. map every tag id to an integer code.
# NOTE(review): Keras documents that only the `num_words - 1` most frequent words are kept,
# so with MAX_NB_WORDS equal to the vocabulary size the rarest token may be dropped from
# the encoded sequences — confirm whether that is intended.
tokenizer = text.Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = sequence.pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH)
X_test = sequence.pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH)
word_index = tokenizer.word_index

# Vocabulary size plus one extra row for index 0; MAX_NB_WORDS was taken from this figure.
nb_words = len(word_index) + 1
print('Total %s word vectors.' % nb_words)

# Embedding matrix handed to the model: row `row` holds the word2vec vector of the token
# encoded as `row`; tokens unknown to word2vec keep an all-zero row.
embedding_matrix = np.zeros((nb_words, embed_size))
for token, row in word_index.items():
    try:
        embedding_vector = w2v_model.wv.get_vector(token)
    except KeyError:
        continue
    embedding_matrix[row] = embedding_vector

y_categorical = train['label'].to_numpy()

Total 230638 word vectors.


In [5]:
# Wrap the embedding matrix in a DataFrame with one named column per embedding dimension.
# The column count is read from the matrix itself: the previously hard-coded 128 no longer
# matches embed_size and makes the DataFrame constructor fail on a shape mismatch.
cols = ["tagidEmb_" + str(i + 1) for i in range(embedding_matrix.shape[1])]
dfem = pd.DataFrame(data=embedding_matrix, columns=cols)
len(dfem), len(train)

(230638, 300000)

In [10]:
def my_model():
    """Build and compile the stacked-GRU binary classifier.

    Architecture: frozen embedding initialised from ``embedding_matrix`` ->
    GRU(512, return_sequences=True) -> BatchNormalization -> Dropout(0.15) ->
    GRU(512) -> BatchNormalization -> Dropout(0.15) -> Dense(256, relu) ->
    Dense(1, sigmoid).

    Reads the module-level ``MAX_SEQUENCE_LENGTH``, ``nb_words``, ``embed_size``
    and ``embedding_matrix``.

    Returns:
        A compiled Keras ``Model`` using binary cross-entropy loss, the Adam
        optimizer (learning rate 0.0009) and the accuracy metric.
    """
    embedding_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # Word embedding layer initialised with the pre-trained word vectors.
    embedder = Embedding(nb_words,
                         embed_size,
                         input_length=MAX_SEQUENCE_LENGTH,
                         weights=[embedding_matrix],
                         trainable=False      # consider unfreezing? (original note: it overfit)
                         )
    embed = embedder(embedding_input)
    g1 = GRU(units=512, return_sequences=True)(embed)
    bn1 = BatchNormalization()(g1)
    drop1 = Dropout(0.15)(bn1)
    g2 = GRU(units=512)(drop1)
    bn2 = BatchNormalization()(g2)
    drop2 = Dropout(0.15)(bn2)
    d2 = Dense(256, activation='relu')(drop2)
    #gap1 =AveragePooling1D()(drop2)
    main_output = Dense(1, activation='sigmoid')(d2)
    model = Model(inputs=embedding_input, outputs=main_output)
    # `learning_rate` replaces the deprecated `lr` alias, which newer Keras releases reject.
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=Adam(learning_rate=0.0009),
                  metrics=['accuracy'])
    return model

# 10-fold stratified cross-validation (the fold count is set by n_splits below).
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=2048)
oof = np.zeros([len(train), 1])          # out-of-fold predictions for every training row
predictions = np.zeros([len(test), 1])   # test predictions averaged over the folds

# The checkpoint callback writes into ./model; create it so a fresh checkout does not fail.
os.makedirs('./model', exist_ok=True)

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['label'])):
    print("fold n{}".format(fold_ + 1))
    model = my_model()
    #model.load_weights('./model/DGRU74341_73508/DGRU_5.h5')
    if fold_ == 0:
        model.summary()

    # Learning-rate schedule and early stopping both watch validation accuracy.
    reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, patience=3, mode='auto',min_delta=0.0001, verbose=2)
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=5,verbose=2)
    bst_model_path = "./model/DGRU_{}.h5".format(fold_+1)
    # NOTE(review): the checkpoint keeps the weights with the best val_loss (the Keras default,
    # spelled out here), while the other two callbacks monitor val_accuracy — confirm this
    # mix of criteria is intended.
    model_checkpoint = ModelCheckpoint(bst_model_path, monitor='val_loss',
                                       save_best_only=True, save_weights_only=True)

    X_tra, X_val = X_train[trn_idx], X_train[val_idx]
    y_tra, y_val = y_categorical[trn_idx], y_categorical[val_idx]

    model.fit(X_tra, y_tra,
              validation_data=(X_val, y_val),
              epochs=128, batch_size=300, shuffle=True,
              callbacks=[early_stopping, model_checkpoint,reduce_lr])

    # Score with the checkpointed weights rather than the last-epoch weights.
    model.load_weights(bst_model_path)

    oof[val_idx] = model.predict(X_val)

    predictions += model.predict(X_test) / folds.n_splits
    print(predictions)
    del model
    # `del` alone leaves Keras' global state in place; clear it so memory does not
    # accumulate across the folds.
    tf.keras.backend.clear_session()



fold n1
Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 300)]             0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 256)          59043328  
_________________________________________________________________
gru_4 (GRU)                  (None, 300, 512)          1182720   
_________________________________________________________________
batch_normalization_4 (Batch (None, 300, 512)          2048      
_________________________________________________________________
dropout_4 (Dropout)          (None, 300, 512)          0         
_________________________________________________________________
gru_5 (GRU)                  (None, 512)               1575936   
_________________________________________________________________
batch_normalization_5 (Batch (None, 512)           

In [None]:
# Preview the first rows of the training frame.
train.head()

In [11]:
# Out-of-fold F1, rank-based: rows whose prediction rank falls in the lower half are
# labelled 0 and every other row is labelled 1.
tmp = train[['pid', 'label']].copy()
tmp['predict'] = oof
tmp['rank'] = tmp['predict'].rank()
in_lower_half = tmp['rank'] <= tmp.shape[0] * 0.5
tmp['p'] = (~in_lower_half).astype('int64')
bst_f1_tmp = f1_score(tmp['label'].values, tmp['p'].values)
print('f1_score:{}'.format(bst_f1_tmp))

f1_score:0.7432266666666667


In [12]:
# Out-of-fold F1, sort-based: order rows by descending prediction, then label every
# row whose new position is at or beyond the midpoint as 0 and the rest as 1.
tmp1 = train[['pid', 'label']].copy()
tmp1['predict'] = oof
tmp1 = tmp1.sort_values('predict', ascending=False).reset_index(drop=True)
tmp1['p'] = 1
past_midpoint = tmp1.index >= len(tmp1) * 0.5
tmp1.loc[past_midpoint, 'p'] = 0
bst_f1_tmp = f1_score(tmp1['label'].values, tmp1['p'].values)
print('f1_score:{}'.format(bst_f1_tmp))

f1_score:0.7432266666666667


In [13]:
# Build the submission: rank the test rows by averaged fold probability and label the
# top half 1, the remainder 0.
# Copy the slice so the column assignments below act on an independent frame
# (same pattern as the OOF evaluation cells) instead of a view of `test`.
submit = test[['pid']].copy()
submit['proba'] = predictions
submit.columns = ['user_id', 'proba']

submit = submit.sort_values(['proba'], ascending=False)
submit = submit.reset_index(drop=True)
submit['category_id'] = 1
submit.loc[int(len(submit) * 0.5):, 'category_id'] = 0

# Output files are tagged with the decimal digits of the out-of-fold F1 score.
score_tag = str(bst_f1_tmp).split('.')[1]
# Create the output directory so the writes do not fail on a fresh checkout.
os.makedirs('./submit', exist_ok=True)
submit[['user_id', 'proba']].to_csv('./submit/sub_proba_{}.csv'.format(score_tag), index=False)
submit[['user_id', 'category_id']].to_csv('./submit/sub_{}.csv'.format(score_tag), index=False)

In [11]:
# submit = test[['pid']]
# submit['proba'] = predictions
# submit.columns = ['user_id', 'proba']

# submit['rank'] = submit['proba'].rank()
# submit['category_id'] = 1
# submit.loc[submit['rank'] <= int(submit.shape[0] * 0.5), 'category_id'] = 0

#submit[['user_id', 'proba']].to_csv('./sub/sub_proba_{}.csv'.format(str(bst_f1_tmp).split('.')[1]), index=False)
#submit[['user_id', 'category_id']].to_csv('./sub/sub_{}.csv'.format(str(bst_f1_tmp).split('.')[1]), index=False)

0.5
   user_id       tmp     rank  category_id
0  1400001  0.470601  50171.0            1
1  1400002  0.307447  29450.0            0
2  1400003  0.363748  36186.0            0
3  1400004  0.489368  52677.0            1
4  1400005  0.071545   5443.0            0
