In [1]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import xgboost as xgb

# import time
# print('sleeping')
# time.sleep(1800)
# print('sleep done =======================')

# load feats
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    if '3_feat' in feat or 'tfidf' in feat:
        continue
    print('file path',feat)
    a,b = pickle.load(open(feat,'rb'))
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)
    
# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
print(train_x.shape)

file path ../features/fasttext_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_lstm_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_lstm_v1_4_feat.pkl
(159571, 6) (15

In [2]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

def simple_ens(model_name,k=3,rnd=233,lr=0.05,c_bytree=0.9,s_sample=0.9):
    kf = KFold(n_splits=k, shuffle=True, random_state=rnd)
    test_pred = np.zeros((153164,6))
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0
    
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        d_test = xgb.DMatrix(test_x)
        
        # share params
        params = {
                'subsample': s_sample,
                'eta': lr,
                'max_depth': 3,
                'eval_metric':'logloss',
                #'eval_metric':'auc',
                'objective':'binary:logistic',
                #'scale_pos_weight':0.9,
                'colsample_bytree':c_bytree
            
                }
        
        # train for each class
        for i in range(6):
            d_train = xgb.DMatrix(curr_x, curr_y[:,i])
            d_valid = xgb.DMatrix(hold_out_x, hold_out_y[:,i])
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]

            model = xgb.train(params, d_train, 1000, watchlist,
                              early_stopping_rounds=50,
                              verbose_eval=None)
            print(i)
            try:
                train_pred = model.predict(d_train)
                tmp_test_pred = model.predict(d_valid)
                
                curr_train_loss = log_loss(curr_y[:,i],train_pred)
                curr_val_loss = log_loss(hold_out_y[:,i],tmp_test_pred)
                
                curr_train_auc = roc_auc_score(curr_y[:,i],train_pred)
                curr_val_auc = roc_auc_score(hold_out_y[:,i],tmp_test_pred)
                
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
            except:
                pass
            curr_test_pred = model.predict(d_test)
            test_pred[:,i] += curr_test_pred
            
            
        # avg 6 class
        train_loss_l = train_loss_l/6
        val_loss_l = val_loss_l/6
        train_auc_l = train_auc_l/6
        val_auc_l = val_auc_l/6
        print('this fold avg train',train_loss_l,'avg val',val_loss_l)
        print('this fold auc train',train_auc_l,'auc val',val_auc_l)
        
        
        # avg k fold
        all_train_loss_l += train_loss_l/k
        all_val_loss_l += val_loss_l/k
        all_train_auc_l += train_auc_l/k
        all_val_auc_l += val_auc_l/k
        print('========================')
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred

print('done')




done


In [3]:
%%time
xgb_res = simple_ens('xgb',10,233,0.05,0.8,0.7)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res
sample_submission.to_csv("../results/xgb_adj_fold10.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')
# all train avg 0.0321542340346 all val avg 0.0367885979049, PUB 9862, rnd 42, lr 0.03
# all train avg 0.0318508428487 all val avg 0.0368012450966, rnd 233, lr 0.05, PUB unknown

# fix lr, mnb bug, rm scale_pos_weight
# all train avg 0.0304768900903 all val avg 0.0360563778853 PUB 9866

# add many base models, rm tfidf
# all loss avg 0.0307207945903 0.0357848161087 all auc avg 0.995053758756 0.991359127088

0
ls 0.0688003911772 0.074704226865 auc 0.990813728283 0.988837137781
1
ls 0.0166296795286 0.0206951787293 auc 0.995296202396 0.992008688248
2
ls 0.0338646588139 0.0400604727117 auc 0.996490287614 0.994652346205
3
ls 0.00373618224407 0.00690765057967 auc 0.999125975598 0.997928660186
4
ls 0.0495808672918 0.0563505278254 auc 0.991773050114 0.987942038823
5
ls 0.0124027476631 0.0195981944745 auc 0.996977594694 0.989235540239
this fold avg train 0.0308357544531 avg val 0.0363860418642
this fold auc train 0.995079473117 auc val 0.991767401914
0
ls 0.0690765581718 0.0773713140222 auc 0.990755527966 0.986864754851
1
ls 0.0165496768128 0.0192693533141 auc 0.995327154816 0.993383214571
2
ls 0.0348556682482 0.034944524965 auc 0.996315828998 0.996056369352
3
ls 0.00410681480011 0.00756393688232 auc 0.998832243967 0.993075987043
4
ls 0.0503002297735 0.0490632270101 auc 0.991559056988 0.991543706112
5
ls 0.0139601925387 0.0164958092669 auc 0.99595357695 0.988320073246
this fold avg train 0.0314748

In [4]:
%%time
# adj lr, colsample_bytree, sample
for lr in [0.05,0.075]:
    for c1 in [0.8,0.7,0.6]:
        for c2 in [0.7,0.6]:
            xgb_res = simple_ens('xgb',5,233,lr,c1,c2)
            sample_submission = pd.read_csv("../input/sample_submission.csv")
            sample_submission[list_classes] = xgb_res
            fname = "../results/xgb_adj_fold5_{}_{}_{}.gz".format(lr,c1,c2)
            sample_submission.to_csv(fname, index=False, compression='gzip')
            print(sample_submission.head())
            print('save done')

# no rm, 0.0699781296574 0.0780951434931, 0.0161680844181 0.0199438522849,
# final, all train avg 0.031309680463 all val avg 0.0368863155994
# rm muse and pretrain, not good
# rm gru_v1, 0.0687157498068 0.0780051849506, 0.0152173938614 0.0200296896045
# rm tfidf, 0.072673329496 0.0787288243817, 0.0169421114344 0.0199760695638, not good
# only rm no pretrain, not good,
# test rm gru_v1, lstm_v1, all train avg 0.0313024384367 all val avg 0.0369036640753
# rm cnn2d, 0.0704654561461 0.0780916497355, 0.0170009305396 0.0200476295259
# 1st fold, this fold avg train 0.0318658486615 avg val 0.0362271742281

# adj params
# col sample by tree: 0.9 all train avg 0.031309680463 all val avg 0.0368863155994


# adj lr, colsample_bytree, sample
#   0.05  0.7  0.7 all train avg 0.031149993004 all val avg 0.0368516540068
#   0.05  0.7  0.8 all train avg 0.0311077763624 all val avg 0.036855567286
#   0.05  0.7  0.9 all train avg 0.0312078560147 all val avg 0.0368798432732
#   0.05  0.8  0.7 all train avg 0.0309175391583 all val avg 0.0368485662838
#   0.05  0.8  0.8 all train avg 0.0314143017087 all val avg 0.0368859121266
#   0.05  0.8  0.9 all train avg 0.0312955837465 all val avg 0.0368866903553
#   0.05  0.9  0.7 all train avg 0.0311044998336 all val avg 0.0368667306586
#   0.05  0.9  0.8 all train avg 0.0311835661273 all val avg 0.0368992582647
#   0.05  0.9  0.9 all train avg 0.0313585027516 all val avg 0.0368996796111
#    0.1  0.7  0.7 all train avg 0.0306404949681 all val avg 0.0370501103027
#    0.1  0.7  0.8 all train avg 0.0307017678518 all val avg 0.037048645753
#    0.1  0.7  0.9 all train avg 0.030880668966 all val avg 0.0370266615654
#    0.1  0.8  0.7 all train avg 0.0307153292658 all val avg 0.0370698994366
#    0.1  0.8  0.8 all train avg 0.0308616897847 all val avg 0.0370446456042
# large lr is worse

# adj lr, colsample_bytree, sample
#   0.03  0.8  0.7 all train avg 0.0320478053971 all val avg 0.036823783635
#   0.05  0.8  0.7 all train avg 0.0309175391583 all val avg 0.0368485662838

# fix lr, mnb bug, rm scale_pos_weight
# all train avg 0.0299882438015 all val avg 0.0360842600203
# 0
# 0.0635105397429 0.0763360044684
# 1
# 0.0162476813928 0.0198345568551
# 2
# 0.0353173073883 0.0380803762678
# 3
# 0.00364481098573 0.00719515731424
# 4
# 0.0494110049729 0.0531263392106
# 5
# 0.0140568534717 0.0182191483172
# 1st fold avg train 0.0303646996591 avg val 0.0354652637389

# improve lr mnb feat (changed word, char tfidf params)
# all train avg 0.0306645437303 all val avg 0.0362385982181, worse

# add 2 feats, all train avg 0.0303858812518 all val avg 0.0360233929721

# 0.05 0.8 0.7 all loss avg 0.0301132052997 0.0358027724291 all auc avg 0.995342928355 0.991225135239
# 0.05 0.8 0.6 all loss avg 0.0298014563128 0.0357908149609 all auc avg 0.995454251084 0.991281985455
# 0.05 0.7 0.7 all loss avg 0.0298566005217 0.0357916261613 all auc avg 0.995455486421 0.991267068298
# 0.05 0.7 0.6 all loss avg 0.0304736964775 0.0357935439674 all auc avg 0.995184017599 0.991297075786
# 0.05 0.6 0.7 all loss avg 0.0301082092089 0.0357956095426 all auc avg 0.99530916461 0.991228036587
# 0.05 0.6 0.6 all loss avg 0.0301598024504 0.0358157493235 all auc avg 0.995283428759 0.991267908527


0
ls 0.0665750156779 0.0761578797365 auc 0.991626428172 0.987886995545
1
ls 0.0163873100813 0.0198899219435 auc 0.995469910995 0.99281797154
2
ls 0.0342686766459 0.0375387753073 auc 0.996433475328 0.995171177562
3
ls 0.00409166777748 0.00718869480271 auc 0.998813588977 0.996021432506
4
ls 0.0482071097314 0.0527547465068 auc 0.992332088584 0.989513439385
5
ls 0.014102401981 0.0181316152242 auc 0.99571170767 0.988437461404
this fold avg train 0.0306053636492 avg val 0.0352769389202
this fold auc train 0.995064533288 auc val 0.99164141299
0
ls 0.0674609998606 0.076736927229 auc 0.991307231248 0.987385551355
1
ls 0.0153823007163 0.0192965524941 auc 0.996406703197 0.992463249362
2
ls 0.0310401216165 0.0399914139285 auc 0.997121892584 0.994906232623
3
ls 0.00416442953175 0.00734797032905 auc 0.999132953867 0.984149203632
4
ls 0.0436058692404 0.0536283504558 auc 0.993977448829 0.989316045983
5
ls 0.0125699032046 0.016762315523 auc 0.99702254955 0.991606056722
this fold avg train 0.02903727069

0
ls 0.0648620828829 0.0770121625557 auc 0.9920419623 0.987971177276
1
ls 0.0163507488206 0.0202962117058 auc 0.995562250222 0.991602553568
2
ls 0.034290959001 0.0397462369322 auc 0.996408713352 0.99453241697
3
ls 0.00400138206974 0.0069788981694 auc 0.999078465516 0.995851999947
4
ls 0.0467626184651 0.0552638677063 auc 0.9927059462 0.98972093428
5
ls 0.0127191866095 0.0178063163112 auc 0.996780636655 0.989668161296
this fold avg train 0.0298311629748 avg val 0.0361839488968
this fold auc train 0.995429662374 auc val 0.991557873889
0
ls 0.0679972458843 0.0758108258332 auc 0.991090393626 0.988259537639
1
ls 0.0156394861139 0.0208225065684 auc 0.996011486015 0.991833524847
2
ls 0.034029352773 0.0383744679531 auc 0.996494711034 0.995313400924
3
ls 0.00289462327357 0.00591125855699 auc 0.999707246879 0.993299266603
4
ls 0.0477379059667 0.0539052156935 auc 0.992427640308 0.990005079895
5
ls 0.0137195629287 0.017960740094 auc 0.99583930867 0.993195119131
this fold avg train 0.0303363628234 a

0
ls 0.0655694227549 0.075029316998 auc 0.991812029537 0.988854043476
1
ls 0.0154181233544 0.0214299999779 auc 0.99623260343 0.990656897525
2
ls 0.0339094849149 0.0401384598942 auc 0.996502249621 0.994872865126
3
ls 0.00403462620568 0.0082312128291 auc 0.998890313633 0.993621923187
4
ls 0.0486888371432 0.0550464602165 auc 0.99202927332 0.989717566672
5
ls 0.0140438010077 0.0187380120163 auc 0.995793815671 0.987769440992
this fold avg train 0.0302773825635 avg val 0.0364355769887
this fold auc train 0.995210047535 auc val 0.990915456163
all loss avg 0.0301082092089 0.0357956095426
all auc avg 0.99530916461 0.991228036587
                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.999384      0.313022  0.978869  0.132661  0.924628   
1  0000247867823ef7  0.000374      0.000020  0.000062  0.000022  0.000044   
2  00013b17ad220c46  0.000317      0.000023  0.000202  0.000026  0.000139   
3  00017563c3f7919a  0.000087      0.000021  0.000051  0.000026 

0
ls 0.0659480664292 0.0896616715203 auc 0.991334830052 0.984396736399
1
ls 0.011365636614 0.0292182783341 auc 0.997809098543 0.989205277724
2
ls 0.0292838397293 0.0474111150983 auc 0.997309554392 0.993171652687
3
ls 0.00108575596091 0.0117842957439 auc 0.999984743869 0.98839907125
4
ls 0.0438587187553 0.0639806780687 auc 0.993517942889 0.985972075248
5
ls 0.00963964685593 0.0258900280954 auc 0.998073702553 0.984780628311
this fold avg train 0.0268636107241 avg val 0.0446576778101
this fold auc train 0.99633831205 auc val 0.98765424027
0
ls 0.0668148704501 0.089896752682 auc 0.991113161706 0.983967390914
1
ls 0.0115524855302 0.0286877927733 auc 0.997808895897 0.988921327361
2
ls 0.0287949271997 0.0507323964351 auc 0.997422160099 0.992796358294
3
ls 0.00104678666293 0.0129383824252 auc 0.999986270491 0.966244968373
4
ls 0.0446109629791 0.0642405104202 auc 0.993269181987 0.985929524562
5
ls 0.00919779579349 0.0252962894179 auc 0.998223889648 0.981026173062
this fold avg train 0.027002971

  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)
  loss = -(transformed_labels * np.log(y_pred)).sum(axis=1)


1
ls 0.0110852329951 0.0297741121803 auc 0.99797113594 0.987860314593
2
ls 0.0284831601306 0.0522454280627 auc 0.997449266605 0.991916338352
3
ls 0.00111544330953 0.0124349224073 auc 0.999965072619 0.974526258074
4
ls 0.043882326232 0.0667330890139 auc 0.993468513533 0.986124890645
5
ls 0.00926427845864 0.0275307150999 auc 0.997962400556 0.975058589072
this fold avg train nan avg val 0.0465669631467
this fold auc train 0.996384704808 auc val 0.983266213764
0
ls 0.0660304231778 0.0897520094895 auc 0.99137981012 0.984625207191
1
ls 0.0109147994024 0.0294694210582 auc 0.997947815806 0.989253273828
2
ls 0.0290199351442 0.0492604851987 auc 0.997387246677 0.992534577603
3
ls 0.00127927780145 0.00977813980592 auc 0.999945205439 0.96638173978
4
ls 0.0435554708895 0.0643282787451 auc 0.993488868585 0.98707898234
5
ls 0.00913571594984 0.0267469820793 auc 0.998113838604 0.986169641338
this fold avg train 0.0266559370609 avg val 0.0448892193961
this fold auc train 0.996377130872 auc val 0.98434057

1
ls 0.0107851947281 0.0305644904976 auc 0.99812318303 0.986706845118
2
ls 0.0287186033717 0.050194339736 auc 0.997397915984 0.99252948572
3
ls 0.000974526671925 0.0155254061487 auc 0.999979807284 0.969474080129
4
ls 0.0436709823405 0.0653016473617 auc 0.993537720367 0.986386772073
5
ls 0.00896179480279 0.0282501227712 auc 0.998346880331 0.97921172719
this fold avg train 0.0265146535971 avg val 0.0464693650706
this fold auc train 0.996431391573 auc val 0.983253467712
all loss avg nan nan
all auc avg 0.996389983363 0.985346231185
                 id     toxic  severe_toxic   obscene        threat    insult  \
0  00001cee341fdb12  0.999890  2.800859e-01  0.996744  7.826273e-02  0.878981   
1  0000247867823ef7  0.000025  4.344897e-06  0.000002  1.448181e-06  0.000008   
2  00013b17ad220c46  0.000038  5.759779e-07  0.000026  1.659680e-06  0.000013   
3  00017563c3f7919a  0.000028  1.512231e-06  0.000007  4.402440e-06  0.000003   
4  00017695ad8997eb  0.000280  4.358912e-07  0.000008  7.759