In [1]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import xgboost as xgb

# import time
# print('sleeping')
# time.sleep(1800)
# print('sleep done =======================')

# load feats
# Every pickle under ../features holds a (train_matrix, test_matrix) pair; the
# pairs are stacked column-wise into one train and one test feature matrix.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file,
# so only load feature files that were produced by this project.
train_x,test_x = [],[]
for feat in sorted(glob.glob('../features/*.pkl')):
    # skip feature files whose path contains '3_feat' or 'tfidf'
    if '3_feat' in feat or 'tfidf' in feat:
        continue
    print('file path',feat)
    # the context manager closes the handle instead of leaking one per file
    with open(feat,'rb') as feat_file:
        a,b = pickle.load(feat_file)
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
# NaNs are replaced with 0 by np.nan_to_num
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
# features and labels must describe the same rows before any CV split is made
assert train_x.shape[0] == train_y.shape[0], "train_x / train_y row count mismatch"
print(train_x.shape)

file path ../features/fasttext_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/fasttext_lstm_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn2d_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cnn_v2_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_cudnn_gru_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_gru_v1_4_feat.pkl
(159571, 6) (153164, 6)
file path ../features/glove_lstm_v1_4_feat.pkl
(159571, 6) (15

In [4]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.metrics import roc_auc_score

def simple_ens(model_name,k=3,rnd=233,lr=0.05,c_bytree=0.9,s_sample=0.9):
    """Train one XGBoost binary classifier per label column with stratified k-fold CV.

    Reads the notebook-level arrays ``train_x``, ``train_y`` and ``test_x``.
    For each label column, ``k`` boosters are trained (one per fold); their
    predictions on ``test_x`` are averaged over the folds. Per-fold, per-class
    and overall log-loss / ROC-AUC figures are printed along the way.

    Parameters
    ----------
    model_name : str
        Not used by the function; kept so existing calls keep working.
    k : int
        Number of stratified folds.
    rnd : int
        ``random_state`` of the shuffled ``StratifiedKFold``.
    lr : float
        XGBoost ``eta``.
    c_bytree : float
        XGBoost ``colsample_bytree``.
    s_sample : float
        XGBoost ``subsample``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(test_x.shape[0], train_y.shape[1])`` holding the
        fold-averaged test predictions, one column per label.

    Notes
    -----
    The hold-out fold is the last entry of the watchlist, which is the set
    xgboost uses for early stopping, so the printed validation scores come
    from the same data that chose the stopping round.
    NOTE(review): whether ``predict`` uses the best iteration or all trained
    rounds after early stopping depends on the installed xgboost version --
    confirm.
    """
    kf = StratifiedKFold(n_splits=k, shuffle=True, random_state=rnd)
    # sizes come from the data instead of hard-coded 153164 / 6
    n_classes = train_y.shape[1]
    test_pred = np.zeros((test_x.shape[0], n_classes))
    all_train_loss_l,all_val_loss_l = 0,0
    all_train_auc_l,all_val_auc_l = 0,0

    # share params (identical for every class and fold, so built once)
    params = {
            'subsample': s_sample,
            'eta': lr,
            'max_depth': 3,
            'eval_metric':'logloss',
            #'eval_metric':'auc',
            'objective':'binary:logistic',
            #'scale_pos_weight':0.9,
            'colsample_bytree':c_bytree
            }
    # the test matrix never changes, so it is also built once
    d_test = xgb.DMatrix(test_x)

    for i in range(n_classes):
        val_loss_l,train_loss_l = 0,0
        val_auc_l,train_auc_l = 0,0
        scored_folds = 0  # folds whose metrics were actually computed
        for fold_cnt, (train_index, test_index) in enumerate(kf.split(train_x,train_y[:,i])):
            # x,y for the current label column only
            curr_x,curr_y = train_x[train_index],train_y[train_index,i]
            hold_out_x,hold_out_y = train_x[test_index],train_y[test_index,i]

            d_train = xgb.DMatrix(curr_x, curr_y)
            d_valid = xgb.DMatrix(hold_out_x, hold_out_y)
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]

            model = xgb.train(params, d_train, 1000, watchlist,
                              early_stopping_rounds=50,
                              verbose_eval=None)
            print(fold_cnt,'fold: ',end='')
            try:
                train_pred = model.predict(d_train)
                tmp_test_pred = model.predict(d_valid)

                curr_train_loss = log_loss(curr_y,train_pred)
                curr_val_loss = log_loss(hold_out_y,tmp_test_pred)

                curr_train_auc = roc_auc_score(curr_y,train_pred)
                curr_val_auc = roc_auc_score(hold_out_y,tmp_test_pred)
            except Exception as err:
                # Metrics stay best-effort, but the failure is reported and the
                # fold is left out of the averages instead of counting as 0.
                print('metrics skipped:',repr(err))
            else:
                print('ls',curr_train_loss,curr_val_loss,'auc',curr_train_auc,curr_val_auc)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                val_auc_l += curr_val_auc
                train_auc_l += curr_train_auc
                scored_folds += 1
            test_pred[:,i] += model.predict(d_test)

        # avg over the folds that produced metrics (equals k when none failed)
        if scored_folds:
            train_loss_l = train_loss_l/scored_folds
            val_loss_l = val_loss_l/scored_folds
            train_auc_l = train_auc_l/scored_folds
            val_auc_l = val_auc_l/scored_folds
        else:
            train_loss_l = val_loss_l = train_auc_l = val_auc_l = float('nan')
        print('this class avg train',train_loss_l,'avg val',val_loss_l)
        print('this class auc train',train_auc_l,'auc val',val_auc_l)

        # avg over all classes
        all_train_loss_l += train_loss_l/n_classes
        all_val_loss_l += val_loss_l/n_classes
        all_train_auc_l += train_auc_l/n_classes
        all_val_auc_l += val_auc_l/n_classes
        print('========================')
    # every fold added one test prediction per class, so divide by k
    test_pred = test_pred/k
    print('all loss avg',all_train_loss_l,all_val_loss_l)
    print('all auc avg',all_train_auc_l,all_val_auc_l)
    print('=======================================================')
    return test_pred

print('done')




done


In [5]:
%%time
# Fit the 10-fold stratified XGBoost ensemble; xgb_res holds the
# fold-averaged test predictions, one column per label.
xgb_res = simple_ens(
    'xgb',
    k=10,
    rnd=233,
    lr=0.05,
    c_bytree=0.8,
    s_sample=0.6,
)

# Overwrite the label columns of the submission template with the predictions
# and write the result gzip-compressed.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res
sample_submission.to_csv(
    "../results/xgb_adj_fold10_stratified.gz",
    index=False,
    compression='gzip',
)

# Quick visual check of the written frame.
print(sample_submission.head())
print('save done')
# all train avg 0.0321542340346 all val avg 0.0367885979049, PUB 9862, rnd 42, lr 0.03
# all train avg 0.0318508428487 all val avg 0.0368012450966, rnd 233, lr 0.05, PUB unknown

# fix lr, mnb bug, rm scale_pos_weight
# all train avg 0.0304768900903 all val avg 0.0360563778853 PUB 9866

# add many base models, rm tfidf
# all loss avg 0.0307207945903 0.0357848161087 all auc avg 0.995053758756 0.991359127088

# change to stratified
# all loss avg 0.0305611243591 0.0357477317816 all auc avg 0.995083861229 0.991408476154 PUB 9866

0 fold: ls 0.068601561542 0.0737398426855 auc 0.990874280415 0.988955525838
1 fold: ls 0.0651319943182 0.0786973120727 auc 0.991944741433 0.987396737643
2 fold: ls 0.0687204616737 0.07372641548 auc 0.990860430861 0.989178449312
3 fold: ls 0.0678187672201 0.0755270718717 auc 0.991134513274 0.988437787092
4 fold: ls 0.0666269968593 0.0818243455237 auc 0.991429194698 0.986807544664
5 fold: ls 0.0697068937329 0.074512447024 auc 0.990538502129 0.987950950327
6 fold: ls 0.0664468065448 0.0744595740503 auc 0.991513225075 0.988725731868
7 fold: ls 0.0715642457541 0.0726869956477 auc 0.989974643687 0.98893180584
8 fold: ls 0.0631764780234 0.0780034398012 auc 0.9925734494 0.986683595901
9 fold: ls 0.0700498915094 0.0777344832045 auc 0.990353144628 0.987799427559
this class avg train 0.0677844097178 avg val 0.0760911927361
this class auc train 0.99111961256 auc val 0.988086755604
0 fold: ls 0.0154216342693 0.0198842452237 auc 0.996260154923 0.992501819851
1 fold: ls 0.0166421260716 0.019897779404