In [1]:
import pickle
import glob
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import xgboost as xgb

# Load the level-1 features: every pickle under ../features holds a
# (train_part, test_part) pair; the parts are stacked column-wise below.
train_x,test_x = [],[]
feat_paths = sorted(glob.glob('../features/*.pkl'))
if not feat_paths:
    # Fail early with a readable message instead of np.hstack's
    # "need at least one array to concatenate".
    raise FileNotFoundError("no feature pickles found under ../features/")
for feat in feat_paths:
#     if 'tfidf' in feat or 'lr' in feat or 'mnb' in feat:
#         continue
    print('file path',feat)
    # NOTE: pickle.load can execute arbitrary code; only load feature files
    # produced by this project. The context manager closes the handle
    # (the previous bare open() leaked one descriptor per file).
    with open(feat,'rb') as feat_file:
        a,b = pickle.load(feat_file)
    print(a.shape,b.shape)
    train_x.append(a)
    test_x.append(b)
# Replace NaN/inf left in any feature block with finite numbers.
train_x = np.nan_to_num(np.hstack(train_x))
test_x = np.nan_to_num(np.hstack(test_x))
print(train_x.shape)

# load y
train = pd.read_csv("../input/train.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values.astype('int')
# Features and labels are indexed together downstream, so they must be row-aligned.
assert train_x.shape[0] == train_y.shape[0], "feature rows do not match label rows"
print(train_x.shape)

file path ../features/cnn2d_muse_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/cnn_glove_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/cnn_gru_glove_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/cnn_muse_adj_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/cnn_muse_adj_1_feat_de_fr.pkl
(159571, 6) (153164, 6)
file path ../features/cudnn_gru_glove_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/cudnn_gru_glove_1_sample_feat.pkl
(159571, 6) (153164, 6)
file path ../features/gru_glove_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/gru_muse_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/lr_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/lstm_glove_1_feat.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat1.pkl
(159571, 6) (153164, 6)
file path ../features/mnb_feat2.pkl
(159571, 6) (153164, 6)
file path ../features/other_feat.pkl
(159571, 16) (15316

In [2]:
# Summary statistics of the six label columns (displayed as the cell's output).
train[list_classes].describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
def aug_data_ratio(x,y,ratio=0.2):
    """Append duplicates of the leading negative rows to the data set.

    A row is "negative" when the first label column, ``y[:, 0]``, equals 0.
    The first ``int(n_negative * ratio)`` such rows (in their existing order,
    no shuffling) are copied and appended after the original rows.

    Parameters:
        x: 2-D feature array, indexed by row.
        y: 2-D label array with the same row order as ``x``.
        ratio: fraction of the negative rows to duplicate.

    Returns:
        (x_out, y_out): the original rows followed by the duplicated rows.
    """
    print(x.shape)
    negatives = np.flatnonzero(y[:, 0] == 0)
    print(negatives)
    n_negative = negatives.size
    duplicated = negatives[:int(n_negative * ratio)]
    grown_x = np.concatenate((x, x[duplicated]))
    grown_y = np.concatenate((y, y[duplicated]))
    print(grown_x.shape, n_negative)
    return grown_x, grown_y

def del_data_ratio(x,y,ratio=0.8):
    """Drop the trailing share of positive rows and keep every negative row.

    Rows are split on the first label column: ``y[:, 0] == 1`` is positive,
    ``y[:, 0] == 0`` is negative. Only the first ``int(n_positive * ratio)``
    positive rows are kept (so ``ratio`` is the fraction that is *retained*);
    rows whose first label is neither 0 nor 1 are not carried over.

    Parameters:
        x: 2-D feature array, indexed by row.
        y: 2-D label array with the same row order as ``x``.
        ratio: fraction of the positive rows to keep.

    Returns:
        (x_out, y_out): kept positive rows first, then all negative rows, so
        the original row order is not preserved.
    """
    print(x.shape)
    first_label = y[:, 0]
    positives = np.flatnonzero(first_label == 1)
    negatives = np.flatnonzero(first_label == 0)
    print(positives)
    n_positive = positives.size
    kept_positives = positives[:int(n_positive * ratio)]
    reduced_x = np.concatenate((x[kept_positives], x[negatives]))
    reduced_y = np.concatenate((y[kept_positives], y[negatives]))
    print(reduced_x.shape, n_positive)
    return reduced_x, reduced_y

# add neg data (disabled alternative: duplicate a share of the negative rows)
#train_x,train_y = aug_data_ratio(train_x,train_y)
# del pos data
# NOTE: this rebinds train_x/train_y, so the cell is not idempotent: running it
# again removes a further share of the remaining positive rows. The result is
# also reordered (kept positives first, then negatives), so its rows no longer
# line up with the rows of the `train` DataFrame.
train_x,train_y = del_data_ratio(train_x,train_y)

(159571, 160)
[     6     12     16 ..., 159541 159546 159554]
(156512, 160) 15294


In [4]:
from sklearn.model_selection import KFold
def simple_ens(model_name,k=3,rnd=233):
    """Train one XGBoost binary classifier per label with k-fold CV and average the test predictions.

    Reads the notebook globals ``train_x`` (features), ``train_y`` (one 0/1
    column per label) and ``test_x`` (test features). For every fold and every
    label column a booster is trained on the fold's training part, evaluated
    on the held-out part, and used to predict ``test_x``.

    Parameters:
        model_name: unused; kept so existing calls such as
            ``simple_ens('xgb', k=5)`` keep working.
        k: number of KFold splits.
        rnd: ``random_state`` passed to KFold (shuffled splits).

    Returns:
        (test_pred, single_best_pred):
            test_pred -- array (n_test_rows, n_labels), the mean over the k
                folds of the per-fold test predictions.
            single_best_pred -- the test predictions of the single fold with
                the lowest average validation log loss.
    """
    # Derive the sizes from the data instead of hard-coding 153164 rows / 6 labels.
    n_test = test_x.shape[0]
    n_classes = train_y.shape[1]

    kf = KFold(n_splits=k, shuffle=True, random_state=rnd)
    test_pred = np.zeros((n_test, n_classes))
    single_best = np.inf
    single_best_pred = None
    all_train_loss_l,all_val_loss_l = 0,0

    # The test matrix and the parameters do not depend on the fold: build them once.
    d_test = xgb.DMatrix(test_x)

    # share params ('colsample_bytree' used to be listed twice with the same value)
    params = {
            'colsample_bytree': 0.9,
            'subsample': 0.9,
            'eta': 0.05,
            'max_depth': 3,
            'eval_metric':'logloss',
            'objective':'binary:logistic',
            'scale_pos_weight':0.9,
            'colsample_bylevel':0.9,
            }

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        val_loss_l,train_loss_l = 0,0
        scored_cnt = 0  # labels whose loss could be computed in this fold
        fold_test_pred = np.zeros((n_test, n_classes))

        # train for each class
        for i in range(n_classes):
            d_train = xgb.DMatrix(curr_x, curr_y[:,i])
            d_valid = xgb.DMatrix(hold_out_x, hold_out_y[:,i])
            watchlist = [(d_train, 'train'), (d_valid, 'valid')]

            model = xgb.train(params, d_train, 1000, watchlist,
                              early_stopping_rounds=50,
                              verbose_eval=2000)
            print(i)
            # NOTE(review): whether predict() uses the best iteration or every
            # trained round after early stopping depends on the installed
            # xgboost version -- confirm before comparing against the logged
            # "Best iteration" numbers.
            train_prob = model.predict(d_train)
            valid_prob = model.predict(d_valid)
            try:
                curr_train_loss = log_loss(curr_y[:,i],train_prob)
                curr_val_loss = log_loss(hold_out_y[:,i],valid_prob)
            except ValueError as err:
                # log_loss rejects a split that contains a single label value.
                # Report it instead of silently swallowing every exception.
                print('loss skipped for class',i,':',err)
            else:
                print(curr_train_loss,curr_val_loss)
                val_loss_l += curr_val_loss
                train_loss_l += curr_train_loss
                scored_cnt += 1
            curr_test_pred = model.predict(d_test)

            test_pred[:,i] += curr_test_pred
            fold_test_pred[:,i] = curr_test_pred

        # avg over the classes that were actually scored, so a skipped class
        # no longer drags the fold average towards zero
        train_loss_l = train_loss_l/max(scored_cnt,1)
        val_loss_l = val_loss_l/max(scored_cnt,1)
        print('this fold avg train',train_loss_l,'avg val',val_loss_l)

        # save best one fold result
        if val_loss_l < single_best:
            single_best = val_loss_l
            single_best_pred = fold_test_pred
            print('new single best')

        # avg k fold
        all_train_loss_l += train_loss_l/k
        all_val_loss_l += val_loss_l/k
        print('========================')
    test_pred = test_pred/k
    print('all train avg',all_train_loss_l,'all val avg',all_val_loss_l)
    return test_pred, single_best_pred

# Visible marker that this cell (which only defines simple_ens) ran to completion.
print('done')

done


In [5]:
# Disabled 3-fold run, kept for reference.
# NOTE(review): with these lines commented out, nothing earlier in the notebook
# defines `xgb_res`, which the next cell reads, and
# "../results/xgb_ens_sample_csv_fold3.gz" (loaded again near the end of the
# notebook) is not regenerated -- presumably both come from an earlier run; confirm.
# xgb_res,b = simple_ens('xgb',k=3)
# sample_submission = pd.read_csv("../input/sample_submission.csv")
# sample_submission[list_classes] = xgb_res
# sample_submission.to_csv("../results/xgb_ens_sample_csv_fold3.gz", index=False, compression='gzip')
# print(sample_submission.head())
# print('save done')


[0]	train-logloss:0.648585	valid-logloss:0.648662
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[537]	train-logloss:0.058003	valid-logloss:0.070881

0
0.0569477864684 0.0709223552454
[0]	train-logloss:0.64543	valid-logloss:0.645384
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[183]	train-logloss:0.014612	valid-logloss:0.016652

1
0.0136318514357 0.0166669946835
[0]	train-logloss:0.646546	valid-logloss:0.64657
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[238]	train-logloss:0.03181	valid-logloss:0.03645

2
0.0306316968461 0.0364851378257
[0]	train-logloss:0.644684	valid-logloss:0.644723
Multiple eva

In [6]:
# Write a scaled-down copy of the ensemble predictions: every value is divided by 1.2.
# NOTE(review): `xgb_res` must already exist in the kernel when this cell runs.
# The output file name says "div2" while the divisor is 1.2 -- confirm which is intended.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res/1.2
sample_submission.to_csv("../results/xgb_ens_sample_csv_fold3_div2.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')

                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.831708      0.226895  0.814608  0.111047  0.778188   
1  0000247867823ef7  0.000068      0.000020  0.000062  0.000019  0.000061   
2  00013b17ad220c46  0.000396      0.000024  0.000299  0.000030  0.000281   
3  00017563c3f7919a  0.000066      0.000030  0.000082  0.000027  0.000093   
4  00017695ad8997eb  0.001761      0.000026  0.000478  0.000037  0.000235   

   identity_hate  
0       0.399420  
1       0.000039  
2       0.000149  
3       0.000048  
4       0.000078  
save done


In [7]:
# 5-fold run: `xgb_res` is the fold-averaged test prediction, `b` the best single fold's prediction.
xgb_res,b = simple_ens('xgb',k=5)
# Fill the label columns of the sample submission with the averaged predictions and save it gzip-compressed.
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res
sample_submission.to_csv("../results/xgb_ens_sample_csv_fold5.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')


[0]	train-logloss:0.648624	valid-logloss:0.648698
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[567]	train-logloss:0.059114	valid-logloss:0.070927

0
0.0581462623283 0.0709437131355
[0]	train-logloss:0.645408	valid-logloss:0.64541
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[196]	train-logloss:0.014455	valid-logloss:0.016786

1
0.0136270849377 0.0168117832044
[0]	train-logloss:0.646534	valid-logloss:0.646514
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[396]	train-logloss:0.029886	valid-logloss:0.035317

2
0.0289965224012 0.0353767543775
[0]	train-logloss:0.644705	valid-logloss:0.644729
Multiple 

[0]	train-logloss:0.645401	valid-logloss:0.645484
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[204]	train-logloss:0.014271	valid-logloss:0.017286

1
0.0134289526644 0.0173437518977
[0]	train-logloss:0.646525	valid-logloss:0.646489
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[271]	train-logloss:0.032008	valid-logloss:0.035754

2
0.0310311683655 0.0358146133167
[0]	train-logloss:0.644696	valid-logloss:0.644727
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
Stopping. Best iteration:
[217]	train-logloss:0.003712	valid-logloss:0.006461

3
0.00315642284674 0.0065305210376
[0]	train-logloss:0.647232	valid-logloss:0.647342
Multipl

In [None]:
# 5-fold run whose averaged predictions are divided by 1.2 before saving.
# NOTE(review): this calls simple_ens with the same arguments as the unscaled
# 5-fold cell, i.e. it retrains everything; presumably the existing `xgb_res`
# could be reused -- confirm the training is deterministic first.
# The file name says "div2" while the divisor is 1.2 -- confirm which is intended.
xgb_res,b = simple_ens('xgb',k=5)
sample_submission = pd.read_csv("../input/sample_submission.csv")
sample_submission[list_classes] = xgb_res/1.2
sample_submission.to_csv("../results/xgb_ens_sample_csv_fold5_div2.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')


In [13]:
from scipy.special import logit,expit

# The logit function is defined as logit(p) = log(p/(1-p)). 
# Note that logit(0) = -inf, logit(1) = inf, and logit(p) for p<0 or p>1 yields nan.

# The expit function, also known as the logistic function, 
# is defined as expit(x) = 1/(1+exp(-x)). It is the inverse of the logit function.



def new_trans(x):
    """Lower probabilities by shifting their log-odds down by 0.5.

    ``x`` is mapped to log-odds with ``logit``, 0.5 is subtracted, and the
    result is mapped back with ``expit``. Subtracting 0.5 in log-odds space
    is the same as dividing the odds ``p / (1 - p)`` by ``exp(0.5)``.
    The endpoints 0 and 1 map to themselves (via -inf / +inf).

    Parameters:
        x: probability or array of probabilities in [0, 1].

    Returns:
        The shifted probabilities, same shape as ``x``.
    """
    shifted_log_odds = logit(x) - 0.5
    return expit(shifted_log_odds)

# Sanity-check new_trans on a few hand-picked probabilities.
# A dedicated name is used here: assigning to `test_x` would overwrite the
# global test feature matrix that simple_ens reads, silently breaking any
# later training run in the same kernel.
demo_probs = np.array([0,0.5,0.8,1])
print(new_trans(demo_probs))

# sqrt(e) == exp(0.5): the factor by which new_trans divides the odds p/(1-p).
div_num = np.sqrt(np.e)
print(div_num)
    

[ 0.          0.37754067  0.70812487  1.        ]
1.6487212707


In [10]:
# Apply new_trans to the ensemble predictions and save them as a separate submission.
# NOTE: reuses the `sample_submission` frame and `xgb_res` left in the kernel by an
# earlier cell; both must exist before this cell runs.
sample_submission[list_classes] = new_trans(xgb_res)
sample_submission.to_csv("../results/xgb_ens_sample_csv_fold5_new_trans.gz", index=False, compression='gzip')
print(sample_submission.head())
print('save done')

                 id     toxic  severe_toxic   obscene    threat    insult  \
0  00001cee341fdb12  0.996878      0.188993  0.953767  0.096143  0.893775   
1  0000247867823ef7  0.000046      0.000014  0.000038  0.000013  0.000035   
2  00013b17ad220c46  0.000346      0.000017  0.000207  0.000016  0.000181   
3  00017563c3f7919a  0.000057      0.000019  0.000052  0.000017  0.000052   
4  00017695ad8997eb  0.001306      0.000018  0.000390  0.000019  0.000148   

   identity_hate  
0       0.348785  
1       0.000030  
2       0.000149  
3       0.000031  
4       0.000072  
save done


In [11]:
# Re-load the previously saved 3-fold submission from disk (this cell does not retrain).
pre_3fold_res = pd.read_csv('../results/xgb_ens_sample_csv_fold3.gz')
print(pre_3fold_res.head())  # predictions as stored in the file

# Apply new_trans to the label columns and save the result under a new file name.
pre_3fold_res[list_classes] = new_trans(pre_3fold_res[list_classes].values)
pre_3fold_res.to_csv("../results/xgb_ens_sample_csv_fold3_new_trans.gz", index=False, compression='gzip')
print(pre_3fold_res.head())  # predictions after the transform
print('save done')

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998049,0.272274,0.977529,0.133257,0.933826,0.479305
1,0000247867823ef7,8.1e-05,2.4e-05,7.4e-05,2.3e-05,7.3e-05,4.7e-05
2,00013b17ad220c46,0.000475,2.9e-05,0.000359,3.6e-05,0.000337,0.000179
3,00017563c3f7919a,7.9e-05,3.6e-05,9.8e-05,3.2e-05,0.000112,5.8e-05
4,00017695ad8997eb,0.002113,3.1e-05,0.000573,4.5e-05,0.000282,9.4e-05
