In [5]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import xgboost as xgb

# import time
# print('sleeping')
# time.sleep(7200)
# print('sleep done =======================')

# Load the stacked features. Each pickle matched by ../features/*.pkl is
# unpacked into a (train, test) pair of arrays; the pairs are concatenated
# column-wise with np.hstack below.
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    # Any path containing '3_feat' is left out of the stack.
    if '3_feat' in feat:
        continue
    print('file path',feat)
    # NOTE: pickle.load can execute arbitrary code stored in the file, so only
    # feature files produced by trusted code should live in ../features.
    # The context manager closes the handle; the bare open() used before
    # left one descriptor open per feature file.
    with open(feat,'rb') as feat_file:
        a,b = pickle.load(feat_file)
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
# Replace NaN/inf entries with finite numbers after stacking.
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
# Fold indices computed on train_x are later applied to train_y as well, so the
# two arrays must have the same number of rows.
assert train_x.shape[0] == train_y.shape[0], \
    'train_x has {} rows but train_y has {}'.format(train_x.shape[0], train_y.shape[0])
print(train_x.shape)

file path ../features/glove_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_lstm_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_10_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_glove_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/muse_

In [6]:
from sklearn.model_selection import KFold
def simple_ens(model_name,k=3,rnd=233,lr=0.05,c_bytree=0.9,s_sample=0.9):
    """Train one XGBoost model per class on each of ``k`` folds and average the test predictions.

    The function reads the notebook-level arrays ``train_x``, ``train_y`` and
    ``test_x``; they must exist before it is called.

    Parameters
    ----------
    model_name : str
        Not used by the function body; kept so existing calls keep working.
    k : int
        Number of KFold splits.
    rnd : int
        ``random_state`` handed to KFold (shuffling is enabled).
    lr : float
        XGBoost ``eta``.
    c_bytree : float
        XGBoost ``colsample_bytree``.
    s_sample : float
        XGBoost ``subsample``.

    Returns
    -------
    test_pred : numpy.ndarray
        Array of shape ``(test_x.shape[0], train_y.shape[1])`` holding the test
        predictions averaged over the ``k`` folds.
    single_best_pred : numpy.ndarray or None
        Test predictions of the single fold with the lowest average validation
        log loss, or ``None`` if no fold scored below the initial sentinel of 100.
    """
    # Sizes come from the data instead of being hard-coded, so the function
    # keeps working when the test set or the label set changes.
    n_test = test_x.shape[0]
    n_classes = train_y.shape[1]

    kf = KFold(n_splits=k, shuffle=True, random_state=rnd)
    test_pred = np.zeros((n_test, n_classes))
    single_best = 100
    single_best_pred = None
    all_train_loss_l,all_val_loss_l = 0,0

    # The test matrix and the parameter dict are identical for every fold and
    # class, so they are built once.
    d_test = xgb.DMatrix(test_x)
    params = {
            'subsample': s_sample,
            'eta': lr,
            'max_depth': 3,
            'eval_metric':'logloss',
            #'eval_metric':'auc',
            'objective':'binary:logistic',
            'scale_pos_weight':0.9,
            'colsample_bytree':c_bytree
            }

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        val_loss_l,train_loss_l = 0,0
        # Test predictions of this fold only; used for the "single best fold" output.
        cache_test_pred = np.zeros((n_test, n_classes))

        # train for each class
        for i in range(n_classes):
            d_train = xgb.DMatrix(curr_x, curr_y[:,i])
            d_valid = xgb.DMatrix(hold_out_x, hold_out_y[:,i])
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]

            # NOTE(review): whether predict() below uses the early-stopped best
            # iteration or every trained round depends on the installed xgboost
            # version -- confirm against the version in use.
            model = xgb.train(params, d_train, 1000, watchlist,
                              early_stopping_rounds=50,
                              verbose_eval=None)
            print(i)
            try:
                # labels=[0, 1] lets log_loss score a fold that happens to
                # contain only one class instead of raising.
                curr_train_loss = log_loss(curr_y[:,i],model.predict(d_train),labels=[0, 1])
                curr_val_loss = log_loss(hold_out_y[:,i],model.predict(d_valid),labels=[0, 1])
            except ValueError as err:
                # Scoring stays best-effort, but the failure is reported rather
                # than silently swallowed; the fold average below then counts
                # this class as 0.
                print('log_loss failed for class',i,':',err)
            else:
                print(curr_train_loss,curr_val_loss)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
            curr_test_pred = model.predict(d_test)

            test_pred[:,i] += curr_test_pred
            cache_test_pred[:,i] += curr_test_pred

        # average over the classes
        train_loss_l = train_loss_l/n_classes
        val_loss_l = val_loss_l/n_classes
        print('this fold avg train',train_loss_l,'avg val',val_loss_l)

        # save best one fold result
        if val_loss_l < single_best:
            single_best = val_loss_l
            single_best_pred = cache_test_pred
            print('new single best')

        # avg k fold
        all_train_loss_l += train_loss_l/k
        all_val_loss_l += val_loss_l/k
        print('========================')
    test_pred = test_pred/k
    print('all train avg',all_train_loss_l,'all val avg',all_val_loss_l)
    return test_pred, single_best_pred

# Marker printed when the cell finishes running, i.e. after the definition above has executed.
print('done')

done


In [None]:
%%time
# Sweep over learning rate, colsample_bytree and subsample.
# Each combination trains a 5-fold ensemble and writes its own gzipped submission.
lr_grid = [0.05]
colsample_grid = [0.8]
subsample_grid = [0.7]
# Flattened grid; the ordering matches nested loops with lr outermost.
sweep = [(eta, col, sub)
         for eta in lr_grid
         for col in colsample_grid
         for sub in subsample_grid]
for lr, c1, c2 in sweep:
    xgb_res, b = simple_ens('xgb', k=5, rnd=233, lr=lr, c_bytree=c1, s_sample=c2)
    sample_submission = pd.read_csv("../input/sample_submission.csv")
    sample_submission[list_classes] = xgb_res
    # The three swept values are encoded in the output file name.
    fname = "../results/xgb_some_csv_fold5_{}_{}_{}.gz".format(lr, c1, c2)
    sample_submission.to_csv(fname, index=False, compression='gzip')
    print(sample_submission.head())
    print('save done')

# no rm, 0.0699781296574 0.0780951434931, 0.0161680844181 0.0199438522849,
# final, all train avg 0.031309680463 all val avg 0.0368863155994
# rm muse and pretrain, not good
# rm gru_v1, 0.0687157498068 0.0780051849506, 0.0152173938614 0.0200296896045
# rm tfidf, 0.072673329496 0.0787288243817, 0.0169421114344 0.0199760695638, not good
# only rm no pretrain, not good,
# test rm gru_v1, lstm_v1, all train avg 0.0313024384367 all val avg 0.0369036640753
# rm cnn2d, 0.0704654561461 0.0780916497355, 0.0170009305396 0.0200476295259
# 1st fold, this fold avg train 0.0318658486615 avg val 0.0362271742281

# adj params
# col sample by tree: 0.9 all train avg 0.031309680463 all val avg 0.0368863155994


# adj lr, colsample_bytree, sample
#   0.05  0.7  0.7 all train avg 0.031149993004 all val avg 0.0368516540068
#   0.05  0.7  0.8 all train avg 0.0311077763624 all val avg 0.036855567286
#   0.05  0.7  0.9 all train avg 0.0312078560147 all val avg 0.0368798432732
#   0.05  0.8  0.7 all train avg 0.0309175391583 all val avg 0.0368485662838
#   0.05  0.8  0.8 all train avg 0.0314143017087 all val avg 0.0368859121266
#   0.05  0.8  0.9 all train avg 0.0312955837465 all val avg 0.0368866903553
#   0.05  0.9  0.7 all train avg 0.0311044998336 all val avg 0.0368667306586
#   0.05  0.9  0.8 all train avg 0.0311835661273 all val avg 0.0368992582647
#   0.05  0.9  0.9 all train avg 0.0313585027516 all val avg 0.0368996796111
#    0.1  0.7  0.7 all train avg 0.0306404949681 all val avg 0.0370501103027
#    0.1  0.7  0.8 all train avg 0.0307017678518 all val avg 0.037048645753
#    0.1  0.7  0.9 all train avg 0.030880668966 all val avg 0.0370266615654
#    0.1  0.8  0.7 all train avg 0.0307153292658 all val avg 0.0370698994366
#    0.1  0.8  0.8 all train avg 0.0308616897847 all val avg 0.0370446456042
# large lr is worse

# adj lr, colsample_bytree, sample
#   0.03  0.8  0.7 all train avg 0.0320478053971 all val avg 0.036823783635
#   0.05  0.8  0.7 all train avg 0.0309175391583 all val avg 0.0368485662838


In [7]:
%%time
# 10-fold ensemble with lr=0.05, colsample_bytree=0.8, subsample=0.7;
# the fold-averaged predictions are written out as a gzipped submission.
submission_path = "../results/xgb_some_csv_fold10.gz"
xgb_res, b = simple_ens('xgb', k=10, rnd=233, lr=0.05, c_bytree=0.8, s_sample=0.7)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res
sample_submission.to_csv(submission_path, index=False, compression='gzip')
print(sample_submission.head())
print('save done')
# all train avg 0.0321542340346 all val avg 0.0367885979049, PUB 9862, rnd 42, lr 0.03
# all train avg 0.0318508428487 all val avg 0.0368012450966, rnd 233, lr 0.05, PUB

0
0.0715445621255 0.0777686806239
1
0.0167661675801 0.0203887910049
2
0.0355492689091 0.0419493081363
3
0.00376773833307 0.00710639406701
4
0.0497243095625 0.058259026495
5
0.0143989866965 0.0197137042688
this fold avg train 0.0319585055345 avg val 0.0375309840993
new single best
0
0.0695445140217 0.0780855780057
1
0.0167715961432 0.0195617982575
2
0.0356496258971 0.0368136593064
3
0.00363275365437 0.0075979260006
4
0.0507245961677 0.0493637442885
5
0.0146891432019 0.0168799144076
this fold avg train 0.0318353715143 avg val 0.0347171033777
new single best
0
0.0676621959332 0.0748877792594
1
0.0172348182651 0.0184432435904
2
0.0362571997654 0.0405191095317
3
0.00426062992333 0.00784190302653
4
0.0512273372962 0.054807708031
5
0.0129719426644 0.01903442907
this fold avg train 0.0316023539746 avg val 0.0359223620848
0
0.0706209282431 0.0832473963746
1
0.0160319344678 0.0201861570977
2
0.0358450478093 0.0425667601179
3
0.00438345928996 0.00658656461427
4
0.0496700902488 0.0545739360555
5
0