In [1]:
from keras import Input, Model
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from keras.layers import Dropout, Dense
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

from tiny.util import attach_device_train_label, replace_invalid_filename_char
from utils_.util_log import *

def read_result_for_ensemble(file):
    """Load the three objects one base model saved for ensembling.

    Args:
        file: Path of an HDF5 store that contains the keys ``train``,
            ``label`` and ``test``.

    Returns:
        A tuple ``(train, label, test)`` with the objects stored under
        those keys, in that order.

    Raises:
        KeyError: Raised by pandas when one of the three keys is missing.
    """
    # Open read-only inside a context manager: the handle is always released,
    # and a mistyped path fails loudly instead of creating an empty store
    # (the default mode 'a' would create the file).
    with pd.HDFStore(file, mode='r') as store:
        return store["train"], store["label"], store["test"]

def get_label_cat():
    """Return the categories of the ``sex_age`` label column.

    The frame comes from ``attach_device_train_label(None)``; its ``sex_age``
    attribute is wrapped in a ``pd.Categorical`` and that categorical's
    ``categories`` index is returned.
    """
    labelled = attach_device_train_label(None)
    sex_age_cat = pd.Categorical(labelled.sex_age)
    return sex_age_cat.categories


# Result files of the base models to be combined (the file names suggest one
# XGBoost run and one DNN run).
file_list = [
    './output/best/2.621213_2510_xgb.h5' ,
    './output/best/2.635281090037028_1569_dnn.h5' ,
]

# Read every file once, then regroup the (train, label, test) triples into
# three parallel lists with one entry per file.
loaded = [read_result_for_ensemble(path) for path in file_list]
train_list = [parts[0] for parts in loaded]
label_list = [parts[1] for parts in loaded]
test_list = [parts[2] for parts in loaded]

# Put the per-model columns side by side; train and label are index-sorted.
train = pd.concat(train_list, axis=1).sort_index()
test = pd.concat(test_list, axis=1)
# Only the first file's labels are kept.
# NOTE(review): presumably every file stores identical labels — confirm.
label = label_list[0].sort_index()

# Hold out 30% of the rows for validation; the target is the first label column.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    label.iloc[:, 0],
    test_size=0.3,
    random_state=666,
)

# Build the ensemble (stacking) model: dropout on the concatenated base-model
# outputs followed by a single softmax layer.
NUM_CLASSES = 22  # width of the softmax output and of the one-hot targets

inputs = Input((X_train.shape[1:]))
x = Dropout(0.7)(inputs)
x = Dense(NUM_CLASSES, activation='softmax')(x)
model = Model(inputs, x)


########################################
# NOTE(review): with patience=300 and epochs=2 below, early stopping can never
# trigger in this configuration.
early_stop = EarlyStopping(monitor='val_loss', verbose=1,
                           patience=300,
                           )

# Keep only the weights of the epoch with the lowest validation loss.
model_file ='./model/checkpoint/ensemble.h5'
check_best = ModelCheckpoint(filepath= model_file,
                             monitor='val_loss', verbose=1,
                             save_best_only=True, mode='min')

from keras.utils import np_utils
adam = Adam(0.0001)
model.compile(loss='categorical_crossentropy', optimizer=adam)

# One-hot encode each split once, with an explicit width. Without num_classes,
# to_categorical sizes the matrix from max(label) + 1, so a split that lacks
# the highest class would not match the NUM_CLASSES-wide softmax output.
y_train_onehot = np_utils.to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_onehot = np_utils.to_categorical(y_test, num_classes=NUM_CLASSES)

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

print(y_train_onehot.shape)

history = model.fit(X_train, y_train_onehot,
                    validation_data=(X_test, y_test_onehot),
                    callbacks=[check_best, early_stop],
                    batch_size=128,
                    epochs=2,
                    verbose=1,
                    )


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


(35000, 44) (35000,) (15000, 44) (15000,)
(35000, 22)
Train on 35000 samples, validate on 15000 samples
Epoch 1/2

Epoch 00001: val_loss improved from inf to 3.06844, saving model to ./model/checkpoint/ensemble.h5
Epoch 2/2

Epoch 00002: val_loss improved from 3.06844 to 3.04011, saving model to ./model/checkpoint/ensemble.h5


In [4]:

from keras import models
# Reload the model from the path stored in `model_file`.
model_load = models.load_model(model_file)

# Inspect what was loaded: print its type, then list its attributes.

print(type(model_load))

# Last expression of the cell, so the attribute list becomes the cell output.
dir(model_load)



<class 'keras.engine.training.Model'>


['__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_inbound_node',
 '_built',
 '_check_num_samples',
 '_check_trainable_weights_consistency',
 '_collected_trainable_weights',
 '_container_nodes',
 '_feed_input_names',
 '_feed_input_shapes',
 '_feed_inputs',
 '_feed_loss_fns',
 '_feed_output_names',
 '_feed_output_shapes',
 '_feed_outputs',
 '_feed_sample_weight_modes',
 '_feed_sample_weights',
 '_feed_targets',
 '_fit_loop',
 '_function_kwargs',
 '_get_node_attribute_at_index',
 '_inbound_nodes',
 '_internal_input_shapes',
 '_internal_output_shapes',
 '_make_predict_function',
 '_make_test_function',
 '_make_train_function',
 '_node_key',
 '_nodes_by_dep

In [None]:
# Summarise the Keras run from its validation-loss history.
val_losses = np.array(history.history['val_loss'])
best_epoch = val_losses.argmin() + 1  # argmin is 0-based, epochs are reported 1-based
best_score = val_losses.min()

# The reloaded object is a functional `Model`, which offers predict() but not
# the Sequential-only predict_proba(); the softmax output already holds the
# per-class probabilities.
sub = pd.DataFrame(model_load.predict(test), columns=get_label_cat())

# NOTE(review): assumes `test` carries a 'device' column. If it does, that
# column is also passed to predict() above — confirm whether the device id is a
# column or the index of `test`.
sub['DeviceID'] = test['device'].values
# Fix the column order of the submission file.
sub = sub[
    ['DeviceID', '1-0', '1-1', '1-2', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9', '1-10', '2-0', '2-1', '2-2',
     '2-3', '2-4', '2-5', '2-6', '2-7', '2-8', '2-9', '2-10']]

# The original f-string referenced the undefined name `best`; the validation
# score computed above is what belongs in the file name.
file = f'./sub/ensemble_{best_score}_epoch_{best_epoch}.csv'
file = replace_invalid_filename_char(file)
# NOTE(review): `logger` is presumably provided by the star import from
# utils_.util_log — confirm.
logger.info(f'sub file save to {file}')
sub = round(sub, 10)
sub.to_csv(file, index=False)



In [14]:
# Show which class ids occur in the training split and in the validation split.
for split_labels in (y_train, y_test):
    print(np.unique(split_labels))

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21]


In [16]:

# Hyper-parameters of the XGBoost model trained on the same split, collected
# in one mapping so they can be read (and tuned) in a single place.
# NOTE(review): `XGBClassifier` is not imported in any cell visible here —
# confirm it is imported elsewhere in the notebook.
xgb_params = dict(
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=22,
    n_estimators=20,
    max_depth=3,
    min_child_weight=1,
    learning_rate=0.1,
    silent=True,
    gamma=0,
    max_delta_step=0,
    subsample=1,
    colsample_bytree=1,
    colsample_bylevel=1,
    reg_alpha=1,
    reg_lambda=1,
    scale_pos_weight=1,
    seed=1,
    missing=None,
)
gbm = XGBClassifier(**xgb_params)

# Track mlogloss on the held-out split while fitting.
# NOTE(review): early_stopping_rounds (50) exceeds n_estimators (20), so early
# stopping cannot trigger with these settings.
gbm.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    early_stopping_rounds=50,
    verbose=True,
)

# Per-round evaluation history recorded during fit.
results = gbm.evals_result()

print(results)

[0]	validation_0-mlogloss:3.05177
Will train until validation_0-mlogloss hasn't improved in 50 rounds.
[1]	validation_0-mlogloss:3.01924
[2]	validation_0-mlogloss:2.99001
[3]	validation_0-mlogloss:2.96482
[4]	validation_0-mlogloss:2.94357
[5]	validation_0-mlogloss:2.92285
[6]	validation_0-mlogloss:2.90434
[7]	validation_0-mlogloss:2.88765
[8]	validation_0-mlogloss:2.87236
[9]	validation_0-mlogloss:2.8587
[10]	validation_0-mlogloss:2.84676
[11]	validation_0-mlogloss:2.8353
[12]	validation_0-mlogloss:2.82478
[13]	validation_0-mlogloss:2.81524
[14]	validation_0-mlogloss:2.80597
[15]	validation_0-mlogloss:2.79737
[16]	validation_0-mlogloss:2.79012
[17]	validation_0-mlogloss:2.78346
[18]	validation_0-mlogloss:2.77655
[19]	validation_0-mlogloss:2.77088
{'validation_0': {'mlogloss': [3.051766, 3.019241, 2.990009, 2.964824, 2.943568, 2.922853, 2.904342, 2.887646, 2.872357, 2.858696, 2.846756, 2.835304, 2.824777, 2.815245, 2.805973, 2.797369, 2.790115, 2.783456, 2.776547, 2.770876]}}


[3.051766,
 3.019241,
 2.990009,
 2.964824,
 2.943568,
 2.922853,
 2.904342,
 2.887646,
 2.872357,
 2.858696,
 2.846756,
 2.835304,
 2.824777,
 2.815245,
 2.805973,
 2.797369,
 2.790115,
 2.783456,
 2.776547,
 2.770876]

In [19]:
# Validation log-loss per boosting round, as recorded in `results`.
val_mlogloss = np.array(results['validation_0']['mlogloss'])
best_epoch = val_mlogloss.argmin() + 1  # 0-based position -> 1-based round number
best_score = val_mlogloss.min()

print(f'{best_epoch}_{best_score}')


20_2.770876
