In [3]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb

# import time
# print('sleeping')
# time.sleep(7200)
# print('sleep done =======================')

# load feats
# Every pickle under ../features/ is unpacked as a (train, test) pair of 2-D
# arrays; files whose path contains '3_feat' or 'tfidf' are skipped.
# NOTE(review): pickle.load can execute arbitrary code -- only point this at
# feature files produced by this project.
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    if '3_feat' in feat or 'tfidf' in feat:
        continue
    print('file path',feat)
    # Context manager so the file handle is closed right after unpickling.
    with open(feat,'rb') as feat_file:
        feat_train,feat_test = pickle.load(feat_file)
    print(feat_train.shape,feat_test.shape)
    train_x.append(feat_train)
    test_x.append(feat_test)
# Stack the per-file blocks column-wise and replace NaN/inf with finite numbers.
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
# Fail early if the stacked features and the labels are not row-aligned.
assert train_x.shape[0] == train_y.shape[0], \
    'feature rows {} != label rows {}'.format(train_x.shape[0], train_y.shape[0])
print(train_x.shape)

file path ../features/fasttext_cnn2d_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v2_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn2d_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_lstm_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lgb1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_10_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_4_feat.pkl
(159571, 6) (153164, 6)
file path ../feat

In [4]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
def simple_ens(model_name,k=3,rnd=233,lr=0.05,feature_fraction=0.9,bagging_fraction=0.9,
               bag_frec=3,met='binary_logloss'):
    """Fit one LightGBM binary model per label column with stratified k-fold CV.

    Reads the notebook-level arrays ``train_x``, ``train_y`` and ``test_x``.
    For every label column, a model is trained on each fold's training part
    with early stopping monitored on the held-out part; its predictions on
    ``test_x`` are summed and finally divided by ``k``. Log-loss and ROC-AUC on
    both parts of every fold are printed, together with per-class and overall
    averages.

    Parameters
    ----------
    model_name : str
        Not used by the function; kept so existing calls keep working.
    k : int
        Number of stratified folds.
    rnd : int
        ``random_state`` of the fold splitter.
    lr : float
        LightGBM ``learning_rate``.
    feature_fraction : float
        LightGBM ``feature_fraction``.
    bagging_fraction, bag_frec
        Not used: the matching LightGBM options are not set in ``params``.
    met : str
        LightGBM ``metric``. ``'auc'`` uses 100 early-stopping rounds, any
        other value uses 50.

    Returns
    -------
    numpy.ndarray
        Shape ``(test_x.shape[0], train_y.shape[1])``: fold-averaged test
        predictions, one column per label.
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=rnd)
    # Derive sizes from the data instead of hard-coding 153164 rows / 6 labels.
    n_classes = train_y.shape[1]
    test_pred = np.zeros((test_x.shape[0], n_classes))
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0

    # Settings shared by every fold and class (loop-invariant, so built once).
    params = {
            'application': 'binary',
            #'num_leaves': 8,
            #'lambda_l1': 1,
            'lambda_l2': 1.0,
            'max_depth': 3,
            #'scale_pos_weight':0.9,
            'metric': met, # or auc
            'data_random_seed': 2,
            'learning_rate':lr,
            # 'bagging_fraction': bagging_fraction,
            # 'bagging_freq':bag_frec,
            'feature_fraction': feature_fraction,
            }
    s_round = 100 if met == 'auc' else 50

    for i in range(n_classes):
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        scored_folds = 0  # folds whose metrics could actually be computed
        for fold_cnt, (train_index, test_index) in enumerate(kf.split(train_x,train_y[:,i])):
            curr_x,curr_y = train_x[train_index],train_y[train_index,i]
            hold_out_x,hold_out_y = train_x[test_index],train_y[test_index,i]

            # train for this class on this fold
            d_train = lgb.Dataset(curr_x, curr_y)
            d_valid = lgb.Dataset(hold_out_x, hold_out_y)
            model = lgb.train(dict(params),  # pass a copy so the shared dict is never mutated
                      train_set=d_train,
                      num_boost_round=2000,
                      valid_sets=[d_train, d_valid],
                      early_stopping_rounds=s_round,
                      verbose_eval=None)
            # Predict with the early-stopped iteration explicitly; some older
            # LightGBM releases otherwise use every boosted tree. A value <= 0
            # (no best iteration recorded) means "use all iterations".
            best_it = model.best_iteration

            print(fold_cnt,'fold: ',end='')
            try:
                train_pred = model.predict(curr_x, num_iteration=best_it)
                tmp_test_pred = model.predict(hold_out_x, num_iteration=best_it)

                curr_train_loss = log_loss(curr_y,train_pred)
                curr_val_loss = log_loss(hold_out_y,tmp_test_pred)

                curr_train_auc = roc_auc_score(curr_y,train_pred)
                curr_val_auc = roc_auc_score(hold_out_y,tmp_test_pred)
            except ValueError as err:
                # Only metric failures are tolerated (e.g. a split containing a
                # single class); the fold is reported and left out of the
                # averages instead of being silently counted as zero.
                print('metrics skipped:',err)
            else:
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
                scored_folds += 1

            test_pred[:,i] += model.predict(test_x, num_iteration=best_it)

        # avg over the folds that were scored (equals k when none was skipped)
        denom = max(scored_folds, 1)
        train_loss_l = train_loss_l/denom
        val_loss_l = val_loss_l/denom
        train_auc_l = train_auc_l/denom
        val_auc_l = val_auc_l/denom
        print('this class avg train',train_loss_l,'avg val',val_loss_l)
        print('this class auc train',train_auc_l,'auc val',val_auc_l)

        # avg over classes
        all_train_loss_l += train_loss_l/n_classes
        all_val_loss_l += val_loss_l/n_classes
        all_train_auc_l += train_auc_l/n_classes
        all_val_auc_l += val_auc_l/n_classes
        print('========================')
    # Every fold contributed one test prediction per class, so divide by k.
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred

# Marker printed once the cell has run to the end, i.e. simple_ens is defined.
print('done')

done


In [5]:

# Single run: 10 stratified folds, fold seed 666, learning rate 0.05,
# feature_fraction 0.6.
lgb_res = simple_ens('lgb', k=10, rnd=666, lr=0.05, feature_fraction=0.6)

# Put the returned predictions into the label columns of the submission
# template and store the table as a gzip-compressed CSV.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = lgb_res
sample_submission.to_csv(
    "../results/lgb_log_csv_fold10_stratified.gz",
    compression='gzip',
    index=False,
)
print(sample_submission.head())
print('save done')

# add 7 base fasttext NN models
# all loss avg 0.0319116484218 0.0358161833583 all auc avg 0.994546891137 0.991249510282 PUB 9866

# rm tfidf test
# all loss avg 0.0319555637279 0.0357756335619 all auc avg 0.994512718368 0.991251471241

# change to stratified
# all loss avg 0.0316412744167 0.0357232681944 all auc avg 0.994584962017 0.991293307537 pub 9866

# change to 5 fold, add word batch, tilli, lgb feat
# all loss avg 0.031983129767200906 0.035387924581 all auc avg 0.9945427088311004 0.99188514127 PUB 9870

0 fold: ls 0.07133962781015392 0.07867731357447029 auc 0.9901785840557712 0.9867917049455398
1 fold: ls 0.07031379104401782 0.07192475190436497 auc 0.9904897297278498 0.989718339974378
2 fold: ls 0.07044510547166037 0.07619551096328572 auc 0.9904409942920555 0.9881916244919556
3 fold: ls 0.06938250039375353 0.0734897574112091 auc 0.9907808823003494 0.9888950497489449
4 fold: ls 0.0645193643981223 0.07762425176309679 auc 0.9922955400622832 0.9878692202122065
5 fold: ls 0.06766842877967498 0.07671020264662948 auc 0.9913911354583347 0.9875699057660392
6 fold: ls 0.06978165189967536 0.07834710896012417 auc 0.9906364663750916 0.98750150269179
7 fold: ls 0.0729094386718389 0.07128892390514657 auc 0.9895745321273073 0.9899826750067082
8 fold: ls 0.06773516599512525 0.07446469819249199 auc 0.9913256367087062 0.9892464183249896
9 fold: ls 0.06927103427348853 0.07820885644636913 auc 0.9907672914946568 0.9873755620354848
this class avg train 0.06933661087375109 avg val 0.07569313757671883
this cl

In [5]:
# Repeat the 10-fold run once per fold-splitter seed and write one
# gzip-compressed submission file per seed.
for r in (9, 42, 666, 2333, 2017, 2018):
    lgb_res = simple_ens('lgb', k=10, rnd=r, lr=0.05, feature_fraction=0.6)
    out_f = "../results/lgb_log_csv_fold10_stratified_rnd{}.gz".format(r)

    # Re-read the template each time so every file starts from a fresh frame.
    sample_submission = pd.read_csv("../input/sample_submission.csv")
    sample_submission[list_classes] = lgb_res
    sample_submission.to_csv(out_f, compression='gzip', index=False)

    print(sample_submission.head())
    print('save done')
    
# rnd 9 all loss avg 0.0316776831549 0.0357680948057 all auc avg 0.994637538501 0.991248050457
# rnd 42 all loss avg 0.031767752013 0.0357328049327 all auc avg 0.994570341031 0.991432648525
# rnd 666 all loss avg 0.0318820623553 0.0357432179815 all auc avg 0.994571218245 0.991539112398 PUB 9866
# rnd 2333 all loss avg 0.0318003315909 0.035767236509 all auc avg 0.994577983865 0.991500420475
# rnd 2017 all loss avg 0.0316402719659 0.0357576661933 all auc avg 0.994653325285 0.991439396357
# rnd 2018 all loss avg 0.0318714210939 0.0357494240023 all auc avg 0.994553247334 0.991441666215

0 fold: ls 0.0686201101797 0.0794303344952 auc 0.991016347103 0.986465315264
1 fold: ls 0.0670001086008 0.0792713469368 auc 0.99153500109 0.987289013193
2 fold: ls 0.0698991214456 0.0722008373659 auc 0.99063457416 0.989268370688
3 fold: ls 0.0700451029387 0.0776201333408 auc 0.990613346476 0.987199997826
4 fold: ls 0.0696584506044 0.0740838661402 auc 0.990686057353 0.988955283337
5 fold: ls 0.064768140106 0.0744467437862 auc 0.992270587779 0.989214299352
6 fold: ls 0.069487408526 0.0746101969135 auc 0.990813428748 0.98816944126
7 fold: ls 0.0706074471573 0.0779201995632 auc 0.990372870091 0.987011128351
8 fold: ls 0.0686602041564 0.0741641202932 auc 0.991020361167 0.989122885325
9 fold: ls 0.068904949978 0.0782649871536 auc 0.99097985563 0.987282787619
this class avg train 0.0687651043693 avg val 0.0762012765989
this class auc train 0.99099424296 auc val 0.987997852222
0 fold: ls 0.0167041834625 0.0191065057084 auc 0.99535173385 0.99319850614
1 fold: ls 0.0174750960995 0.0196947563071 

3 fold: ls 0.034414264336 0.037421854421 auc 0.996419213652 0.995236592418
4 fold: ls 0.0320155405969 0.0386453957241 auc 0.996962081203 0.995393409681
5 fold: ls 0.0356729810932 0.0390272613711 auc 0.996108160407 0.995201822448
6 fold: ls 0.0365249266294 0.0422090860706 auc 0.995908828551 0.99374602573
7 fold: ls 0.0340004000368 0.0397186817864 auc 0.996499957591 0.994818726291
8 fold: ls 0.0361224978512 0.0408816310811 auc 0.99601036352 0.994276894259
9 fold: ls 0.0341345725078 0.0375733025407 auc 0.996476650376 0.995397948086
this class avg train 0.0348433866953 avg val 0.0389523514679
this class auc train 0.996305180332 auc val 0.99504483773
0 fold: ls 0.00534640842443 0.00707302416963 auc 0.99817954548 0.993452755081
1 fold: ls 0.00529885923134 0.00899561553353 auc 0.998370616623 0.983454457364
2 fold: ls 0.00401269486746 0.00712250367626 auc 0.999262685303 0.99430389692
3 fold: ls 0.0047144734144 0.00563793709381 auc 0.998821901304 0.998054036499
4 fold: ls 0.00532660315635 0.005

5 fold: ls 0.0471479450083 0.0532214287679 auc 0.992797657863 0.990232299845
6 fold: ls 0.0500588056685 0.0564641601017 auc 0.991662678216 0.988940174206
7 fold: ls 0.0467136652514 0.0558244867396 auc 0.992936720931 0.989251971205
8 fold: ls 0.0513266940686 0.0532900755642 auc 0.991197151025 0.990162676287
9 fold: ls 0.0515648355983 0.0549728513341 auc 0.991097675555 0.989316052274
this class avg train 0.0496074267612 avg val 0.0541179372507
this class auc train 0.991848353779 auc val 0.989697454596
0 fold: ls 0.0151310989899 0.0172610237061 auc 0.995314508152 0.992524875605
1 fold: ls 0.0142515362292 0.018592272802 auc 0.995962677218 0.989179431234
2 fold: ls 0.0150166476656 0.0178492368463 auc 0.995353101123 0.992341035344
3 fold: ls 0.0155334030559 0.0185983350783 auc 0.994893978705 0.988174138877
4 fold: ls 0.0153116284233 0.0191693899836 auc 0.995055008767 0.988475233354
5 fold: ls 0.0154819862724 0.0170451455379 auc 0.994951471634 0.992459740424
6 fold: ls 0.0147454443793 0.01700

0 fold: ls 0.0713002624315 0.0745211933959 auc 0.990129096623 0.988552578411
1 fold: ls 0.0700230176996 0.0769229482502 auc 0.990577017869 0.987693999141
2 fold: ls 0.0671226651999 0.0743063235485 auc 0.991564980808 0.988630223367
3 fold: ls 0.0704626287644 0.0749775225588 auc 0.990467728786 0.988610653577
4 fold: ls 0.0684948920808 0.0756727906381 auc 0.991097624006 0.988742413333
5 fold: ls 0.0702684492237 0.080350797028 auc 0.990507487591 0.986199305797
6 fold: ls 0.0659937793439 0.0789634133154 auc 0.991852464011 0.986878531552
7 fold: ls 0.0700270946978 0.0776834570371 auc 0.990566359715 0.987416679258
8 fold: ls 0.0704257264783 0.0764521456008 auc 0.990463456605 0.987522350973
9 fold: ls 0.0658498602048 0.0728265957378 auc 0.991960824338 0.989627035966
this class avg train 0.0689968376125 avg val 0.0762677187111
this class auc train 0.990918704035 auc val 0.987987377137
0 fold: ls 0.0180050671276 0.0194959889691 auc 0.994169655714 0.992781127358
1 fold: ls 0.0183171389827 0.02034

3 fold: ls 0.0344202729367 0.0366884972097 auc 0.996415733252 0.995703716001
4 fold: ls 0.0329938976553 0.0373351584105 auc 0.996749302015 0.995341058949
5 fold: ls 0.0356999499202 0.0427748949979 auc 0.996094779308 0.994205239928
6 fold: ls 0.0338227045117 0.0389521266285 auc 0.996543826399 0.994898055074
7 fold: ls 0.0334576228473 0.0372640887143 auc 0.996641026217 0.995731046451
8 fold: ls 0.0353406227497 0.0409626934516 auc 0.996198989724 0.994038281424
9 fold: ls 0.0305495793496 0.0372661136498 auc 0.99728777726 0.995600778014
this class avg train 0.0342946953976 avg val 0.038958249124
this class auc train 0.996432765539 auc val 0.995041571194
0 fold: ls 0.00491689034836 0.00771127436019 auc 0.998647664909 0.994066886654
1 fold: ls 0.005444802568 0.00739306230796 auc 0.998273659875 0.988899800964
2 fold: ls 0.00512184742316 0.00676702169781 auc 0.998504061074 0.995839880578
3 fold: ls 0.00389091029882 0.00681826025854 auc 0.99935543987 0.996422360509
4 fold: ls 0.00529784387646 0.