In [3]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb

# import time
# print('sleeping')
# time.sleep(25200)
# print('sleep done =======================')

# load feats
# Every pickle under ../features is expected to hold a (train_matrix, test_matrix)
# pair; the matrices are stacked column-wise to form the stacking-level features.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file --
# only load feature files that were produced by this project.
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    # skip the feature sets excluded from this ensemble
    if '3_feat' in feat or 'tfidf' in feat:
        continue
    print('file path',feat)
    # context manager so the file handle is closed even if unpickling fails
    with open(feat,'rb') as feat_file:
        a,b = pickle.load(feat_file)
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
if not train_x:
    # np.hstack([]) would fail with an opaque message; report the real cause
    raise ValueError('no usable feature files found under ../features/*.pkl')
train_x = np.hstack(train_x)
test_x = np.hstack(test_x)
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# one binary target column per entry of list_classes
train_y = train[list_classes].values.astype('int')
print(train_x.shape)

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score
def simple_ens(model_name,k=3,rnd=233,lr=0.05,feature_fraction=0.9,bagging_fraction=0.9,
               bag_frec=3,met='binary_logloss',max_d=3):
    """Fit one LightGBM binary model per label column with stratified k-fold CV.

    Reads the notebook-level globals ``train_x``, ``train_y``, ``test_x`` and
    ``list_classes``. For every label column the folds are stratified on that
    column, a model is trained with early stopping on the held-out fold, and
    its predictions on ``test_x`` are accumulated.

    Args:
        model_name: accepted for call compatibility; not used in the body.
        k: number of stratified folds.
        rnd: random_state passed to StratifiedKFold.
        lr: LightGBM learning rate.
        feature_fraction: LightGBM feature_fraction.
        bagging_fraction: not used (the bagging entries in params are disabled).
        bag_frec: not used (the bagging entries in params are disabled).
        met: LightGBM metric; 'auc' uses 60 early-stopping rounds, anything
            else uses 30.
        max_d: LightGBM max_depth.

    Returns:
        (test_pred, cls_ls_res): ``test_pred`` has shape
        (test_x.shape[0], train_y.shape[1]) and holds the test predictions
        averaged over the k folds; ``cls_ls_res`` lists the mean validation
        log-loss of each class (NaN if no fold of that class could be scored).
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=rnd)
    # derive sizes from the loaded data instead of hard-coding 153164 x 6
    n_cls = train_y.shape[1]
    test_pred = np.zeros((test_x.shape[0], n_cls))
    cls_ls_res = [0] * n_cls
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0

    # share params (identical for every class and fold, so built once)
    params = {
            'application': 'binary',
            #'num_leaves': 8,
            'lambda_l1': 1.0,
            'lambda_l2': 1.0,
            'max_depth': max_d,
            'metric': met, # or auc
            'data_random_seed': 2,
            'learning_rate':lr,
            # 'bagging_fraction': bagging_fraction,
            # 'bagging_freq':bag_frec,
            'feature_fraction': feature_fraction,

            }
    s_round = 60 if met == 'auc' else 30

    for i in range(n_cls):
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        scored_folds = 0
        for fold_cnt, (train_index, test_index) in enumerate(kf.split(train_x,train_y[:,i])):
            # x,y for the current class only
            curr_x,curr_y = train_x[train_index],train_y[train_index,i]
            hold_out_x,hold_out_y = train_x[test_index],train_y[test_index,i]

            # train for each class
            d_train = lgb.Dataset(curr_x, curr_y)
            d_valid = lgb.Dataset(hold_out_x, hold_out_y)
            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of the pre-4.0 lgb.train API -- confirm the installed version.
            model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=2000,
                      valid_sets=[d_train, d_valid],
                      early_stopping_rounds=s_round,
                      verbose_eval=None)
            print(fold_cnt,'fold: ',end='')

            train_pred = model.predict(curr_x)
            tmp_test_pred = model.predict(hold_out_x)
            try:
                curr_train_loss = log_loss(curr_y,train_pred)
                curr_val_loss = log_loss(hold_out_y,tmp_test_pred)
                curr_train_auc = roc_auc_score(curr_y,train_pred)
                curr_val_auc = roc_auc_score(hold_out_y,tmp_test_pred)
            except ValueError as err:
                # the metrics are undefined when a split holds a single class;
                # report it instead of silently counting the fold as zero
                print('metrics unavailable for this fold:',err)
            else:
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
                scored_folds += 1
            test_pred[:,i] += model.predict(test_x)

        # avg over the folds that were actually scored
        if scored_folds:
            train_loss_l = train_loss_l/scored_folds
            val_loss_l = val_loss_l/scored_folds
            train_auc_l = train_auc_l/scored_folds
            val_auc_l = val_auc_l/scored_folds
        else:
            # NaN never compares as "better", so an unscored class cannot win a search
            train_loss_l = val_loss_l = train_auc_l = val_auc_l = float('nan')
        print(list_classes[i], lr, feature_fraction, max_d)
        print('this class avg train',train_loss_l,'avg val',val_loss_l)
        print('this class auc train',train_auc_l,'auc val',val_auc_l)
        cls_ls_res[i] = val_loss_l

        # avg over classes
        all_train_loss_l += train_loss_l/n_cls
        all_val_loss_l += val_loss_l/n_cls
        all_train_auc_l += train_auc_l/n_cls
        all_val_auc_l += val_auc_l/n_cls
        print('========================')
    # every fold contributed one test prediction per class
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred, cls_ls_res

# Progress marker: printed once the loading code and the simple_ens definition above have run.
print('done')

file path ../features/cnn_word_char_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn2d_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v2_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cudnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_gru_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_lstm_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/gbrt1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn2d_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_5_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_5_feat.pkl
(159571, 6) (153164, 6)
fi

In [4]:
# 10-fold stratified run with seed 666; tmp_ls_res keeps the per-class validation losses.
lgb_res,tmp_ls_res = simple_ens(
    'lgb',
    k=10,
    rnd=666,
    lr=0.05,
    feature_fraction=0.9,
)

# Overwrite the class columns of the sample submission with the ensemble predictions.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = lgb_res
sample_submission.to_csv(
    "../results/lgb_log_csv_fold10_stratified.gz",
    index=False,
    compression='gzip',
)
print(sample_submission.head())
print('save done')

# add 7 base fasttext NN models
# all loss avg 0.0319116484218 0.0358161833583 all auc avg 0.994546891137 0.991249510282 PUB 9866

# rm tfidf test
# all loss avg 0.0319555637279 0.0357756335619 all auc avg 0.994512718368 0.991251471241

# change to stratified
# all loss avg 0.0316412744167 0.0357232681944 all auc avg 0.994584962017 0.991293307537 pub 9866

# change to base model 5 fold, add word batch, tilli, lgb feat
# feat dim 217
# all loss avg 0.031983129767200906 0.035387924581 all auc avg 0.9945427088311004 0.99188514127 PUB 9870

# add more base models
# feat dim 247, all loss avg 0.031885221777487156 0.0353958  all auc avg 0.9945986685511962 0.9919487331094685

# change early stopping to 30, and test later part
# all loss avg 0.03208696729295824 0.035406264613808025 all auc avg 0.9945261773637045 0.991946174432887

# add fasttext lstm v1
# all loss avg 0.031977558955160815 0.03537909655655411 all auc avg 0.9945299073418443 0.9919307178575658

# add muse base model, feat dim 295, lower loss, but lower auc
# all loss avg 0.03196610276261484 0.03530962013098028 all auc avg 0.9945635742334386 0.9918682952070021

# fix pool gru fold to 5, and adj params
# all loss avg 0.03172365669814678 0.03528054768190897 all auc avg 0.9946437910139179 0.991903618166854

# updated pool gru v2
# all loss avg 0.03204678384090053 0.035272185628381025 all auc avg 0.9944980867573662 0.9919033708443966

# updated other feat, change some cnt to ratio, a bit worse
# all loss avg 0.03197308230179816 0.03527747086566205 all auc avg 0.9945654974291834 0.991902658184264

# updated pool gru v2 10 fold PUB 9870
# all loss avg 0.03188938973578164 0.035151701851945265 all auc avg 0.9945816630869728 0.9919824289801534

# rm lr, mnb feat1
# worse all loss avg 0.032097370918022436 0.03520186226431002
# all auc avg 0.9944811076726177 0.9919214297276806

# add ridge , change lr,mnb,ridge to fold 6
# all loss avg 0.03187703661408994 0.0351472518210873 all auc avg 0.9946115814252721 0.9920990121249438

# ridge, lr, mnb fold 10, feat frac 0.6
# all loss avg 0.031754244562510095 0.03510006533423048 all auc avg 0.9946626820706096 0.9921195758845225

# lgb v1 feat fold 10, 5 fold is better, change feat file back
# all loss avg 0.031846113030824186 0.03510235514884163 all auc avg 0.9946147807624219 0.9920710159886731

# feat frac 0.45 PUB 9871, HIGHER TRAIN CV AUC?
# all loss avg 0.03178085998781136 0.035091249281527216 all auc avg 0.9946391992503673 0.9921624859169091

# tilli feat 10 fold
# all loss avg 0.031769047743715494 0.035076842284761586 all auc avg 0.9946504692097619 0.9921581333743735

# update pool gru v2
# all loss avg 0.031982729360866644 0.035041219881259106 all auc avg 0.9945421522346883 0.9922172582996719

# updated rf, gbrt feat, new lstm att 5 fold feat, PUB 9870
# all loss avg 0.031907540308479226 0.034974441310714074 all auc avg 0.994571268692479 0.9922363188745803

# change feat frac to 0.6
# all loss avg 0.03182978026930192 0.03499268176882896 all auc avg 0.9946172652665185 0.9922342808393372

# change early stopping to 50, feat frac 0.6, new lstm fold 10, new gru v2
# all loss avg 0.03156190981079377 0.03503282697860017 all auc avg 0.994716804833151 0.9922054864194375

# change early stopping to 30, feat frac 0.45, new lstm fold 10, old gru v2
# all loss avg 0.031911144927610886 0.034940109753291336 all auc avg 0.9945906725858359 0.992327389697206

# add cnn word char, submission failed
# all loss avg 0.03173121587236955 0.03490815640935038 all auc avg 0.9946490329906843 0.9923210856840503

# new gru v2, add cnn word char, make train loss around 317 PUB 9871
# all loss avg 0.03172599356966326 0.03489449783051101 all auc avg 0.9946648942774441 0.9923032825244114

# add l1 lambda 1, feat frac 0.9, auc metrics, worse private score
# all loss avg 0.05233130954882184 0.054452927786201144 all auc avg 0.9923918963501944 0.9919204708273053

0 fold: ls 0.07262667098305266 0.07828527004452254 auc 0.989765181925294 0.9871551050879644
1 fold: ls 0.07455575160517923 0.07205779794971094 auc 0.9892165966191713 0.990177301398334
2 fold: ls 0.06770034443410783 0.07456530478716718 auc 0.9913176692577903 0.9891083242279445
3 fold: ls 0.06687598087605144 0.07268668991586692 auc 0.991598439070832 0.9893534449173811
4 fold: ls 0.0680157879370256 0.07619549714256563 auc 0.9912024125191861 0.9885683911977708
5 fold: ls 0.06728444951643983 0.07549045627834228 auc 0.991484014303831 0.9879834066562311
6 fold: ls 0.06833645150737794 0.07680321994825035 auc 0.9911140850498685 0.9882195309860938
7 fold: ls 0.07000853538047425 0.0689688934177122 auc 0.9906002967183486 0.9908057447877121
8 fold: ls 0.0635268524739894 0.07350598279609777 auc 0.9925733783469457 0.9896426759233456
9 fold: ls 0.07355503987774081 0.07712575683555574 auc 0.9895195551714439 0.9881616625828245
toxic 0.05 0.9 3
this class avg train 0.0692485864591439 avg val 0.0745684869

In [None]:
def special_ens(model_name,k=3,rnd=233):
    """Stratified k-fold LightGBM stacking with a separate parameter set per class.

    Reads the notebook-level globals ``train_x``, ``train_y``, ``test_x`` and
    ``list_classes``. Class ``i`` is trained with ``params_list[i]``, given as
    [learning_rate, max_depth, feature_fraction].

    Args:
        model_name: accepted for call compatibility; not used in the body.
        k: number of stratified folds.
        rnd: random_state passed to StratifiedKFold.

    Returns:
        Array of shape (test_x.shape[0], len(params_list)) with the test
        predictions averaged over the k folds.
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=rnd)

    params_list = [
        [0.05, 3,0.4], # depth should be 3
        [0.075,3,0.6],
        [0.095,3,0.6],
        [0.05, 3,0.6],
        [0.075,3,0.4],
        [0.095,3,0.4],
    ]
    # one parameter row per class; sizes follow the data instead of 153164 x 6
    n_cls = len(params_list)
    test_pred = np.zeros((test_x.shape[0], n_cls))
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0
    s_round = 50

    for i in range(n_cls):
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        scored_folds = 0
        cls_lr, cls_depth, cls_frac = params_list[i]

        # special params
        params = {
                'application': 'binary',
                #'num_leaves': 8,
                #'lambda_l1': 1,
                'lambda_l2': 1.0,
                'max_depth': cls_depth,
                'metric': 'binary_logloss', # or auc
                'data_random_seed': 2,
                'learning_rate':cls_lr,
                'feature_fraction': cls_frac,

                }
        print(params)

        for fold_cnt, (train_index, test_index) in enumerate(kf.split(train_x,train_y[:,i])):
            # x,y for the current class only
            curr_x,curr_y = train_x[train_index],train_y[train_index,i]
            hold_out_x,hold_out_y = train_x[test_index],train_y[test_index,i]

            # train for each class
            d_train = lgb.Dataset(curr_x, curr_y)
            d_valid = lgb.Dataset(hold_out_x, hold_out_y)
            # NOTE(review): early_stopping_rounds / verbose_eval are keyword
            # arguments of the pre-4.0 lgb.train API -- confirm the installed version.
            model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=2000,
                      valid_sets=[d_train, d_valid],
                      early_stopping_rounds=s_round,
                      verbose_eval=None)
            print(fold_cnt,'fold: ',end='')

            train_pred = model.predict(curr_x)
            tmp_test_pred = model.predict(hold_out_x)
            try:
                curr_train_loss = log_loss(curr_y,train_pred)
                curr_val_loss = log_loss(hold_out_y,tmp_test_pred)
                curr_train_auc = roc_auc_score(curr_y,train_pred)
                curr_val_auc = roc_auc_score(hold_out_y,tmp_test_pred)
            except ValueError as err:
                # the metrics are undefined when a split holds a single class;
                # report it instead of silently counting the fold as zero
                print('metrics unavailable for this fold:',err)
            else:
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
                scored_folds += 1
            test_pred[:,i] += model.predict(test_x)

        # avg over the folds that were actually scored
        if scored_folds:
            train_loss_l = train_loss_l/scored_folds
            val_loss_l = val_loss_l/scored_folds
            train_auc_l = train_auc_l/scored_folds
            val_auc_l = val_auc_l/scored_folds
        else:
            train_loss_l = val_loss_l = train_auc_l = val_auc_l = float('nan')
        print(list_classes[i])
        print('this class avg train',train_loss_l,'avg val',val_loss_l)
        print('this class auc train',train_auc_l,'auc val',val_auc_l)

        # avg over classes
        all_train_loss_l += train_loss_l/n_cls
        all_val_loss_l += val_loss_l/n_cls
        all_train_auc_l += train_auc_l/n_cls
        all_val_auc_l += val_auc_l/n_cls
        print('========================')
    # every fold contributed one test prediction per class
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred

# Reaching this line means special_ens has been defined.
print('done')

# 10-fold stratified run with seed 666 using the per-class parameter sets.
lgb_res = special_ens('lgb', k=10, rnd=666)

# Overwrite the class columns of the sample submission with the ensemble predictions.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = lgb_res
sample_submission.to_csv(
    "../results/lgb_log_csv_fold10_stratified_special.gz",
    index=False,
    compression='gzip',
)
print(sample_submission.head())
print('save done')

# best params changed when base models changed
# all loss avg 0.03111512578966158 0.03534320053008906 all auc avg 0.9948524214184179 0.9918450388634261

# change lr, ridge, mnb fold to 10, train loss too low ?
# all loss avg 0.030612393454975732 0.035095753229703264 all auc avg 0.9950485697696142 0.9921210772750391

# tilli feat fold 10
# all loss avg 0.03172720686364922 0.0350882288604753 all auc avg 0.9946451061433877 0.9921962852188195

# update pool gru v2
# all loss avg 0.03178018332247421 0.035037128263355345 all auc avg 0.9946071803186727 0.9922390115824994

# update lstm, s_round = 50 , new gru v2
# all loss avg 0.03141437460591465 0.035002079150415574 all auc avg 0.9947934528849732 0.9921988821755023

# change to 9865 10fold gru v2, PUB 9870
# all loss avg 0.03142825818869454 0.03493827055369443 all auc avg 0.9947738168751431 0.9922685221839775


done
{'feature_fraction': 0.4, 'application': 'binary', 'lambda_l2': 1.0, 'metric': 'binary_logloss', 'learning_rate': 0.05, 'data_random_seed': 2, 'max_depth': 3}
0 fold: ls 0.06612442029795938 0.07766324888629762 auc 0.9918567731934822 0.9872249583688942
1 fold: ls 0.0664403593273723 0.06995272624421472 auc 0.9917646961085009 0.9902594990495966
2 fold: ls 0.06826129136259998 0.07485690710173401 auc 0.9911838306121467 0.9889605541874823
3 fold: ls 0.06781354002150908 0.07239196703372557 auc 0.991349777954703 0.9894264239287804
4 fold: ls 0.06625163494405484 0.07637490464784137 auc 0.9917910366573852 0.988441829644886
5 fold: ls 0.06409771584762254 0.07517679427758434 auc 0.9924794324443418 0.9880465514424662
6 fold: ls 0.06433231128901244 0.07663567644798093 auc 0.9923869038338159 0.9882604187084084
7 fold: ls 0.06953701922119335 0.06908538269027603 auc 0.9907728711535466 0.9907141717012599
8 fold: ls 0.06599396517309118 0.0736002103113236 auc 0.9918627330062167 0.9895389535363146
9 f

In [None]:
# find best params for each column, early stopping = 30
# For every (learning rate, max depth, feature fraction) combination the full
# ensemble is re-run; per class, the test predictions of the combination with
# the lowest mean validation log-loss are kept.

n_classes = len(list_classes)
best_pred = np.zeros((test_x.shape[0], n_classes))
# start from +inf so the first finite validation loss is always accepted
# (starting from 0 meant "loss < 0" was never true and best_pred stayed all zeros)
val_ls_res = [np.inf] * n_classes
for lr in [0.095,0.075,0.05]:
    for max_d in [3]:
        for s_rate in [0.4,0.48,0.56,0.64]:
            print('learning rate',lr,'max depth',max_d,'feature fraction',s_rate)
            lgb_res,tmp_ls_res = simple_ens('lgb',k=10,rnd=666,lr=lr,
                                 feature_fraction=s_rate,bagging_fraction=0.9,
                                 bag_frec=3,met='binary_logloss',max_d=max_d)
            # check for each cls
            for i in range(n_classes):
                # find better params for this class (lower validation loss wins)
                if tmp_ls_res[i] < val_ls_res[i]:
                    val_ls_res[i] = tmp_ls_res[i]
                    best_pred[:,i] = lgb_res[:,i]
                    print('FIND BETTER PARAMS',lr,max_d,s_rate,list_classes[i])
            print('TEST PARAM DONE ------------------------------------------')

# report the best per-class validation losses (val_auc_res was never defined)
print(val_ls_res)
print(np.mean(val_ls_res))
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = best_pred
sample_submission.to_csv("../results/lgb_grid_search_fold10_stratified.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')
                
            
# best auc params (learning rate, max depth, feature fraction)
# toxic 0.05 4 0.5
# severe_toxic 0.075 3 0.5
# obscene 0.075 3 0.6
# threat 0.095 3 0.5
# insult 0.075 4 0.5
# identity_hate 0.05 3 0.5

# TEST PARAM DONE ------------------------------------------
# [0.9884873039634368, 0.9920148292218363, 0.9954573112511824, 
#  0.994769628570376, 0.9898761527824232, 0.9912880359235366]
# 0.9919822102854652 PUB 9870

# updated pool gru v2 10 fold
# TEST PARAM DONE ------------------------------------------
# [0.9887825645527965, 0.9921426168741385, 0.9955318446611215, 
#  0.9955039709421902, 0.9900536177079309, 0.9915995552784374]
# 0.9922690283361025 PUB 9869

# change to val loss

learning rate 0.095 max depth 3 feature fraction 0.4
0 fold: ls 0.06846237090011284 0.07764328091583284 auc 0.9911300108050973 0.9871456372956724
1 fold: ls 0.06723213171044759 0.06989098010442507 auc 0.9915405244061806 0.9902482192396412
2 fold: ls 0.06874061258786318 0.07475439565814274 auc 0.9910031482726716 0.9889943483169075
3 fold: ls 0.06769464391444865 0.07237637236902984 auc 0.9913613337508438 0.9894165484325141
4 fold: ls 0.0672207378112683 0.0764113723781736 auc 0.9914930997122849 0.988422020404696
5 fold: ls 0.06911278596677176 0.07570570445992433 auc 0.9909296947299122 0.9878458298965587
6 fold: ls 0.06606093078530936 0.07677310545686032 auc 0.991857284217556 0.9881428325092025
7 fold: ls 0.0713609970818049 0.06920464332797222 auc 0.9901563076566734 0.990652337201299
8 fold: ls 0.06743963081989617 0.07330889354691171 auc 0.9913926944401048 0.9895555454915826
9 fold: ls 0.07095720160435365 0.07622636321176418 auc 0.9902891628987092 0.9882017598080557
toxic 0.095 0.4 3
this 

1 fold: ls 0.018158607121400026 0.020387808432662743 auc 0.9941049431926207 0.9918939501835675
2 fold: ls 0.017503160401603785 0.020721186351445684 auc 0.994590823800739 0.9914941764780352
3 fold: ls 0.016849853684874554 0.019565526594103582 auc 0.9951433505127277 0.992801303962527
4 fold: ls 0.01800482937302796 0.02071777496890712 auc 0.9941975885064915 0.9916468856817318
5 fold: ls 0.016245737839298183 0.01979017735839065 auc 0.9956336952333791 0.9924490879746739
6 fold: ls 0.018088670450469812 0.019276437742634284 auc 0.9941415090330736 0.9928813806299501
7 fold: ls 0.01779503119117521 0.019941951925378464 auc 0.994358593852259 0.9924127780013958
8 fold: ls 0.017880622501309375 0.020834070690092528 auc 0.9942898346528969 0.9914636287520558
9 fold: ls 0.018016054450055986 0.01835805718354905 auc 0.9941707666515791 0.9937481163328917
severe_toxic 0.095 0.48 3
this class avg train 0.017642427081576924 avg val 0.020044315555814325
this class auc train 0.9944919457137807 auc val 0.992237