In [1]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import lightgbm as lgb

# import time
# print('sleeping')
# time.sleep(7200)
# print('sleep done =======================')

# load feats
# Every pickle under ../features holds a (train_block, test_block) pair.
# The blocks are collected per file and stacked column-wise into one matrix
# for train and one for test. Files whose path contains '3_feat' are skipped.
# NOTE(review): pickle.load can execute arbitrary code; only load feature
# files that were produced by this project's own pipeline.
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    if '3_feat' in feat:
        continue
    print('file path',feat)
    # context manager closes the file handle as soon as the pair is loaded
    with open(feat,'rb') as feat_file:
        a,b = pickle.load(feat_file)
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
if not train_x:
    # fail with a readable message instead of np.hstack's "need at least one array"
    raise FileNotFoundError("no usable feature pickles found under ../features/")
# NaNs are replaced by 0 (np.nan_to_num) after stacking
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
# the stacked features are indexed row-for-row against the labels downstream
assert train_x.shape[0] == train_y.shape[0], "feature rows do not match label rows"
print(train_x.shape)

file path ../features/glove_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_lstm_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_10_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_fasttext_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_attention_glove_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/muse_

In [2]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
def simple_ens(model_name,k=3,rnd=233,lr=0.05,feature_fraction=0.9,bagging_fraction=0.9,
               bag_frec=3,met='binary_logloss'):
    """Fit one LightGBM binary model per label column with K-fold CV.

    Reads the module-level arrays ``train_x``, ``train_y`` and ``test_x``.
    For every fold and every label column a booster is trained on the fold's
    training split with early stopping against the hold-out split; log-loss
    and ROC-AUC are printed for both splits and the booster's predictions on
    ``test_x`` are accumulated.

    Parameters
    ----------
    model_name : unused; kept so existing calls keep working.
    k : number of CV folds.
    rnd : ``random_state`` of the shuffled ``KFold``.
    lr : LightGBM ``learning_rate``.
    feature_fraction : LightGBM ``feature_fraction``.
    bagging_fraction, bag_frec : currently unused (the bagging entries in
        ``params`` are commented out); kept for call compatibility.
    met : LightGBM ``metric``. ``'auc'`` uses an early-stopping patience of
        100 rounds, anything else 50.

    Returns
    -------
    numpy.ndarray of shape ``(test_x.shape[0], train_y.shape[1])`` holding the
    test predictions averaged over the ``k`` folds.
    """
    # derive sizes from the data instead of hard-coding 153164 rows / 6 labels
    n_test_rows = test_x.shape[0]
    n_classes = train_y.shape[1]

    kf = KFold(n_splits=k, shuffle=True, random_state=rnd)
    test_pred = np.zeros((n_test_rows,n_classes))
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0

    # early-stopping patience depends only on the metric, so pick it once
    if met == 'auc':
        s_round = 100
    else:
        s_round = 50

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        n_scored = 0  # label columns whose metrics were computed in this fold

        # share params
        params = {
                'application': 'binary',
                #'num_leaves': 8,
                #'lambda_l1': 1,
                'lambda_l2': 1.2,
                'max_depth': 3,
                'scale_pos_weight':0.9,
                'metric': met, # or auc
                'data_random_seed': 2,
                'learning_rate':lr,
#                 'bagging_fraction': bagging_fraction,
#                 'bagging_freq':bag_frec,
                'feature_fraction': feature_fraction,

                }
        # train for each class
        for i in range(n_classes):
            d_train = lgb.Dataset(curr_x, curr_y[:,i])
            d_valid = lgb.Dataset(hold_out_x, hold_out_y[:,i])
            watchlist = [d_train, d_valid]
            # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword
            # arguments of older LightGBM releases; newer releases expect
            # callbacks instead -- confirm against the installed version.
            model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=2000,
                      valid_sets=watchlist,
                      early_stopping_rounds=s_round,
                      verbose_eval=None)
            print(i)
            try:
                train_pred = model.predict(curr_x)
                tmp_test_pred = model.predict(hold_out_x)

                curr_train_loss = log_loss(curr_y[:,i],train_pred)
                curr_val_loss = log_loss(hold_out_y[:,i],tmp_test_pred)

                curr_train_auc = roc_auc_score(curr_y[:,i],train_pred)
                curr_val_auc = roc_auc_score(hold_out_y[:,i],tmp_test_pred)
            except Exception as err:
                # Scoring stays best-effort (test predictions are still
                # accumulated below), but the failure is reported instead of
                # silently hidden, and Ctrl-C is no longer swallowed.
                print('metric computation failed for class',i,':',repr(err))
            else:
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
                n_scored += 1
            test_pred[:,i] += model.predict(test_x)

        # avg over the classes that were actually scored, so a skipped class
        # does not drag the fold average toward zero
        denom = max(n_scored,1)
        train_loss_l = train_loss_l/denom
        val_loss_l = val_loss_l/denom
        train_auc_l = train_auc_l/denom
        val_auc_l = val_auc_l/denom
        print('this fold avg train',train_loss_l,'avg val',val_loss_l)
        print('this fold auc train',train_auc_l,'auc val',val_auc_l)


        # avg k fold
        all_train_loss_l += train_loss_l/k
        all_val_loss_l += val_loss_l/k
        all_train_auc_l += train_auc_l/k
        all_val_auc_l += val_auc_l/k
        print('========================')
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred

print('done')

done


In [25]:
%%time
# adj lr, feat_frac, bag_frac
# Sweep learning rate and feature_fraction (c1) with 5-fold CV and write one
# gzip-compressed submission file per combination.
for lr in [0.05]:
    for c1 in [0.6,0.7,0.8]:
        lgb_res = simple_ens('lgb',5,233,lr,c1)
        sample_submission = pd.read_csv("../input/sample_submission.csv")
        sample_submission[list_classes] = lgb_res
        # the template needs exactly one placeholder per swept value (lr, c1);
        # an extra '{}' makes str.format raise IndexError before anything is saved
        fname = "../results/lgb_csv_fold5_{}_{}.gz".format(lr,c1)
        sample_submission.to_csv(fname, index=False, compression='gzip')
        print(fname)
        print(sample_submission.head())
        print('save done')

# lr, feat_frac, bag_frac, bag_freq=40
# 0.03   0.7   0.7       all loss avg 0.0322530666235 0.0369762464226 all auc avg 0.994271044586 0.990285134782
# 0.03   0.7   0.8       all loss avg 0.0322971080147 0.0369559723174 all auc avg 0.994334369995 0.990306416634
# 0.03   0.8   0.7       all loss avg 0.0322229511884 0.0369948009383 all auc avg 0.994234840050 0.990368331216
# 0.03   0.8   0.8       all loss avg 0.0324206645596 0.0369651316109 all auc avg 0.994257933028 0.990295813203
# 0.05   0.7   0.7       all loss avg 0.0318163169078 0.0370403091733 all auc avg 0.994337086137 0.990254302965

# bag_freq = 3
# 0.05   0.6   0.6       all loss avg 0.0329345052647 0.0368498383873 all auc avg 0.9939819323 0.990664190969
# 0.05   0.6   0.7       all loss avg 0.0326017869132 0.0368044492709 all auc avg 0.994163898916 0.990696436189
# 0.05   0.6   0.8       all loss avg 0.0324589992208 0.0368163370811 all auc avg 0.994233438434 0.990618323557
# 0.05   0.7   0.6       all loss avg 0.0331541550186 0.0368670775083 all auc avg 0.993940708877 0.990597573572
# 0.05   0.7   0.7       all loss avg 0.0325753698442 0.0368285766463 all auc avg 0.994218398119 0.990663705936
# 0.05   0.7   0.8       all loss avg 0.0325779317956 0.0368281454058 all auc avg 0.994220915054 0.990630132284
# 0.05   0.8   0.6       all loss avg 0.0333336885412 0.0368817399664 all auc avg 0.993874933751 0.990630976344
# 0.05   0.8   0.7       all loss avg 0.0329593178033 0.0368484307605 all auc avg 0.994000316129 0.99065277921
# 0.05   0.8   0.8       all loss avg 0.0328682234385 0.0368555923112 all auc avg 0.994090826609 0.990708113059

# rm bagging
# 0.05   0.6             all loss avg 0.0328499864930 0.0368782748851 all auc avg 0.994090121326 0.990573017994        

0
ls 0.0715756263364 0.0778918992813 auc 0.990044174728 0.987115901447
1
ls 0.0183406987171 0.0200192823683 auc 0.993882999672 0.992897558441
2
ls 0.0360643999099 0.0395503089266 auc 0.996025734909 0.994474435122
3
ls 0.00492291524905 0.00739562627481 auc 0.998516726113 0.995068637349
4
ls 0.0521875531828 0.0539351974535 auc 0.990922087735 0.989088400946
5
ls 0.0159414040964 0.0184446938637 auc 0.994458410342 0.987025937713
this fold avg train 0.0331720995819 avg val 0.036206168028
this fold auc train 0.99397502225 auc val 0.99094514517
0
ls 0.0717912915926 0.0793601066039 auc 0.989945429095 0.986210083352
1
ls 0.018485220253 0.0194687940123 auc 0.993962349104 0.99222646
2
ls 0.0345804837778 0.04162096936 auc 0.996309288835 0.994417816128
3
ls 0.00527103314331 0.00727455757059 auc 0.998300241738 0.988273299277
4
ls 0.0491005959018 0.0546654128942 auc 0.992157536817 0.9888137062
5
ls 0.014993066048 0.0172407853217 auc 0.995351668993 0.990100990954
this fold avg train 0.0323702817861 avg

this fold avg train 0.0330331563257 avg val 0.036652196435
this fold auc train 0.994153685759 auc val 0.989930700675
0
ls 0.0680556047657 0.080029984193 auc 0.99114490849 0.986840765025
1
ls 0.0181234446655 0.0213300968037 auc 0.99419699418 0.989918896738
2
ls 0.0385151306695 0.0413620648733 auc 0.99531766085 0.993961496061
3
ls 0.00492984443482 0.00694799787544 auc 0.998658262734 0.995307046284
4
ls 0.0521200080303 0.0568299355525 auc 0.990778877086 0.988813454577
5
ls 0.0160454423024 0.0185734675518 auc 0.994043922869 0.988469288511
this fold avg train 0.0329649124781 avg val 0.0375122578083
this fold auc train 0.994023437701 auc val 0.990551824533
0
ls 0.0718391246056 0.0783763336007 auc 0.989888200213 0.98727150672
1
ls 0.0181693759993 0.0213272951438 auc 0.994008037147 0.991458131641
2
ls 0.0376469422762 0.0400071679387 auc 0.995600998904 0.994559415005
3
ls 0.00545605317474 0.00580313720539 auc 0.998283064851 0.993717319707
4
ls 0.0489124967526 0.0547231399595 auc 0.992072765987 

In [26]:
# check bag freq, lr 0.03, feat_frac 0.7, bag_frac 0.8
# lgb_res1 = simple_ens('lgb',5,233,0.05,0.7,0.8,3)
# 10 all loss avg 0.0320961395694 0.036894249837 all auc avg 0.994414864008 0.99045512745
# lgb_res2 = simple_ens('lgb',5,233,0.03,0.7,0.8,20)
# 20 all loss avg 0.0323257411629 0.0369172677966 all auc avg 0.994317193818 0.990356748918
# 40 all loss avg 0.0322971080147 0.0369559723174 all auc avg 0.994334369995 0.990306416634

# bag freq=10, rm num_leaves, max_depth=3
# all loss avg 0.0331234951497 0.0368040058292 all auc avg 0.993925024791 0.990579738086
# bag freq=3, rm num_leaves, max_depth=3
# all loss avg 0.0326936843064 0.0367777223524 all auc avg 0.994060646688 0.990669971757

In [3]:
%%time
# 10-fold run with lr=0.05 and feature_fraction=0.6; the fold-averaged test
# predictions are written into the sample-submission template's label columns.
lgb_res = simple_ens(
    'lgb',
    k=10,
    rnd=233,
    lr=0.05,
    feature_fraction=0.6,
)

sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = lgb_res
sample_submission.to_csv(
    "../results/lgb_log_csv_fold10.gz",
    index=False,
    compression='gzip',
)

print(sample_submission.head())
print('save done')

# all loss avg 0.0335311567862 0.036750116241
# all auc avg 0.993683089291 0.990779711364 PUB 9862

# change params lr to 0.05, rnd 42
# all loss avg 0.0331165975108 0.036782889115 all auc avg 0.993888891968 0.99078987792

# rm bagging, rnd 233
# all loss avg 0.0330784234373 0.036800843461 all auc avg 0.993977317117 0.990582906649

# fix lr, mnb feat, no bagging, rnd 233
# all loss avg 0.032247976023 0.0361280397445 all auc avg 0.994391483047 0.991118473891

0
ls 0.069645024349 0.0759610723783 auc 0.990690529777 0.988526467231
1
ls 0.0182990215399 0.0206603745489 auc 0.993931716606 0.992192935877
2
ls 0.0365139244815 0.0410551587257 auc 0.995903933176 0.99395057005
3
ls 0.0046896333045 0.00722029264013 auc 0.998700062441 0.997388951806
4
ls 0.0499780229859 0.0567759176102 auc 0.991719169591 0.988069353054
5
ls 0.0158267439628 0.0196871885254 auc 0.994543601894 0.988712972331
this fold avg train 0.0324920617706 avg val 0.0368933340714
this fold auc train 0.994248168914 auc val 0.991473541725
0
ls 0.0691263984349 0.0774643865758 auc 0.990916371247 0.986770292943
1
ls 0.0180126220209 0.019269718098 auc 0.994210304914 0.993592169682
2
ls 0.0360659019934 0.0358204530166 auc 0.996055571372 0.995773699004
3
ls 0.00503408083956 0.00745776253734 auc 0.998564429872 0.992566112327
4
ls 0.0505256831793 0.0492018707967 auc 0.991572864189 0.991469856635
5
ls 0.0161379029446 0.0166401417017 auc 0.994414691582 0.986723863947
this fold avg train 0.03248376

In [28]:
# %%time
# lgb_res = simple_ens('lgb',10,42,0.03,0.8,0.7,3,'auc')
# sample_submission = pd.read_csv("../input/sample_submission.csv")
# sample_submission[list_classes] = lgb_res
# sample_submission.to_csv("../results/lgb_auc_csv_fold10.gz", index=False, compression='gzip')
# print(sample_submission.head())
# print('save done')

# all loss avg 0.0618597313284 0.0645588610375
# all auc avg 0.990260124 0.990006850107 PUB 9862