In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle

# Read the train/test CSVs and extract the label matrix for the six classes.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
train_y = train[list_classes].values

# Missing comments become the literal string 'nan' so every row holds text.
train['comment_text'] = train['comment_text'].fillna(value='nan')
test['comment_text'] = test['comment_text'].fillna(value='nan')
print('load done')

load done


In [2]:
def simple_eval(x, y, model_f):
    """Quick hold-out check: fit one model per label column and print log losses.

    The first 80% of the rows (in their existing order, no shuffling) are used
    for fitting and the remaining rows for validation.

    Parameters
    ----------
    x : row-sliceable feature matrix (``x[:k]`` / ``x[k:]`` must work).
    y : 2-D label array, one column per label, rows aligned with ``x``.
    model_f : zero-argument callable returning a fresh estimator that exposes
        ``fit`` and ``predict_proba``.

    Returns
    -------
    None. For every label column the label name (looked up in the global
    ``list_classes``), the training log loss and the validation log loss are
    printed.
    """
    split_idx = len(y) * 4 // 5  # 80%
    # Local names deliberately differ from the notebook-level train_y to avoid
    # shadowing the global label matrix.
    fit_x, val_x = x[:split_idx], x[split_idx:]
    fit_y, val_y = y[:split_idx], y[split_idx:]
    # Use the actual number of label columns rather than a hard-coded 6.
    for i in range(y.shape[1]):
        model = model_f()
        model.fit(fit_x, fit_y[:, i])
        fit_pred = model.predict_proba(fit_x)
        val_pred = model.predict_proba(val_x)
        print(list_classes[i])
        print('train log loss', log_loss(fit_y[:, i], fit_pred))
        print('valid log loss', log_loss(val_y[:, i], val_pred))
print('done')

done


In [3]:
# TF-IDF over 1- to 3-grams with English stop words removed; the vocabulary is
# fitted on the training comments and then reused to transform the test set.
tf_vec1 = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 3),
    stop_words='english',
    sublinear_tf=True,
)
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(95851, 4318593)


In [4]:
# Reduce the first TF-IDF matrix to n_comp dense components with truncated SVD
# (fitted on train, applied to test).
n_comp = 30
svd_obj = TruncatedSVD(
    n_components=n_comp,
    algorithm='arpack',
)
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1), train_svd1.shape)

# Persist [train, test] SVD features as a single pickled list.
with open('../features/tfidf_feat1.pkl', 'wb') as fout:
    pickle.dump([train_svd1, test_svd1], fout)
print('dump done')

# Sanity-check the dense features with a default XGBoost classifier per label.
simple_eval(x=train_svd1, y=train_y, model_f=XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (95851, 30)
dump done
toxic
train log loss 0.156087285225
valid log loss 0.15844687924
severe_toxic
train log loss 0.0248242897728
valid log loss 0.0277244484351
obscene
train log loss 0.0733959670668
valid log loss 0.0813155773677
threat
train log loss 0.0131837753882
valid log loss 0.0171517542511
insult
train log loss 0.0921806540488
valid log loss 0.101992104456
identity_hate
train log loss 0.0300460165205
valid log loss 0.0336325282022
-----------


In [5]:
# Baseline: default logistic regression fitted directly on the sparse
# first TF-IDF matrix, one model per label.
simple_eval(x=train_tfidf1, y=train_y, model_f=LogisticRegression)
print('-----------')

toxic
train log loss 0.138082166744
valid log loss 0.173925010542
severe_toxic
train log loss 0.0297231055233
valid log loss 0.037684593817
obscene
train log loss 0.0802560145545
valid log loss 0.104363729341
threat
train log loss 0.0147055727416
valid log loss 0.0190503889489
insult
train log loss 0.0893909776011
valid log loss 0.116306827604
identity_hate
train log loss 0.0304432395079
valid log loss 0.0386920585633
-----------


In [6]:
# Character-level TF-IDF over 1- and 2-character n-grams; fitted on the
# training comments and then applied to the test comments.
# `stop_words` is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it would have no effect on the
# features and only triggers an "unused parameter" warning.
tf_vec2 = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    analyzer='char',
    sublinear_tf=True,
)
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(95851, 14989)


In [7]:
# Reduce the character-level TF-IDF matrix to n_comp dense components with
# truncated SVD (fitted on train, applied to test).
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
# Persist the features computed in THIS cell (train_svd2/test_svd2). Dumping
# train_svd1/test_svd1 here would make tfidf_feat2.pkl a duplicate of
# tfidf_feat1.pkl and silently drop the character-level features.
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (95851, 30)
dump done
toxic
train log loss 0.202172742131
valid log loss 0.212628835819
severe_toxic
train log loss 0.0259215834694
valid log loss 0.0321390577282
obscene
train log loss 0.112231201045
valid log loss 0.12610024022
threat
train log loss 0.0120430378248
valid log loss 0.0173528518812
insult
train log loss 0.11482623699
valid log loss 0.130960604286
identity_hate
train log loss 0.0323169132281
valid log loss 0.0402807760645
-----------


In [8]:
# Baseline: default logistic regression fitted directly on the sparse
# second TF-IDF matrix, one model per label.
simple_eval(x=train_tfidf2, y=train_y, model_f=LogisticRegression)
print('-----------')

toxic
train log loss 0.158536648617
valid log loss 0.16843244878
severe_toxic
train log loss 0.0253842077802
valid log loss 0.0289628113994
obscene
train log loss 0.0811363015801
valid log loss 0.0906727570695
threat
train log loss 0.0121889946573
valid log loss 0.0147282865467
insult
train log loss 0.092172449373
valid log loss 0.104807954174
identity_hate
train log loss 0.0271011137533
valid log loss 0.0318123294515
-----------


In [9]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Generate out-of-fold logistic-regression probabilities per label.

    For each of ``fold_cnt`` contiguous (unshuffled) folds and each label
    column, a fresh ``LogisticRegression`` is fitted on the in-fold rows. Its
    positive-class probability is stored for the held-out rows and accumulated
    for the test rows; the test predictions are averaged over the folds.

    Parameters
    ----------
    train_x : row-indexable feature matrix for the training rows.
    train_y : 2-D label array, one column per label, rows aligned with train_x.
    test_x : feature matrix for the test rows (same columns as train_x).
    fold_cnt : number of K-fold splits.
    rnd : kept for backward compatibility only. The folds are not shuffled, so
        a random seed has no effect and is not passed to ``KFold``.

    Returns
    -------
    list ``[train_pred, test_pred]`` where ``train_pred`` has shape
    ``(n_train_rows, n_labels)`` with out-of-fold probabilities and
    ``test_pred`` has shape ``(n_test_rows, n_labels)`` with fold-averaged
    probabilities.

    Side effects: prints, per fold and label, the label index followed by the
    hold-out and in-fold log losses.
    """
    # random_state only matters when shuffle=True; recent scikit-learn raises
    # ValueError when it is combined with shuffle=False, so it is omitted.
    kf = KFold(n_splits=fold_cnt, shuffle=False)
    # Size the outputs from the inputs instead of hard-coding the row counts.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = LogisticRegression()
            # fit for label i on the in-fold rows
            model.fit(curr_x, curr_y[:,i])

            # predictions for label i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            # Assign with a single indexing operation: train_pred[test_index]
            # alone would create a copy, and writing into that copy would leave
            # train_pred all zeros.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged over folds below
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file, so that a failure
# during fitting cannot leave behind an empty or truncated pickle.
lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,3)
with open('../features/lr_feat1.pkl','wb') as fout:
    pickle.dump(lr_feat1,fout)

0 0.183709322034 0.142118183325
1 0.0361012958923 0.0309334465615
2 0.102758281541 0.0850492838986
3 0.0204264004475 0.0145438654738
4 0.114756011046 0.0933248302223
5 0.0385575824299 0.0313512875189
0 0.179364883477 0.143445253302
1 0.0385623187675 0.0304851324524
2 0.105337067785 0.0839688180663
3 0.0156586948616 0.0163683836061
4 0.116403999643 0.0926314876987
5 0.0370199932371 0.031905783671
0 0.183190267469 0.142483757282
1 0.0391438009942 0.0303304573319
2 0.110917809094 0.0827475889184
3 0.0187648622205 0.0151438581718
4 0.121571856591 0.0912802260511
5 0.0402040336252 0.0307147657223


In [10]:
# Compute the features before opening the output file, so that a failure
# during fitting cannot leave behind an empty or truncated pickle.
lr_feat2 = gen_base_lr_feat(train_tfidf2,train_y,test_tfidf2,3)
with open('../features/lr_feat2.pkl','wb') as fout:
    pickle.dump(lr_feat2,fout)

0 0.167882418744 0.159722854406
1 0.0270481166489 0.0261903319928
2 0.0841428930945 0.0846953963422
3 0.0156749310934 0.0120235577822
4 0.0962323175092 0.0957883289087
5 0.0304340636466 0.027877073222
0 0.166998042835 0.160141838654
1 0.0285104675816 0.0255213095161
2 0.0893279852502 0.0822358791181
3 0.0118354610366 0.0136240587181
4 0.0992139442314 0.0942347655798
5 0.0295749890155 0.0281134282338
0 0.170509789353 0.158269923358
1 0.0298753775332 0.025152839486
2 0.0921485363036 0.0808628205458
3 0.0143592555057 0.0124768583705
4 0.106275971973 0.0910016031726
5 0.0318065756693 0.0273685559511


In [11]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Generate out-of-fold multinomial naive Bayes probabilities per label.

    For each of ``fold_cnt`` contiguous (unshuffled) folds and each label
    column, a fresh ``MultinomialNB`` is fitted on the in-fold rows. Its
    positive-class probability is stored for the held-out rows and accumulated
    for the test rows; the test predictions are averaged over the folds.

    Parameters
    ----------
    train_x : row-indexable feature matrix for the training rows.
    train_y : 2-D label array, one column per label, rows aligned with train_x.
    test_x : feature matrix for the test rows (same columns as train_x).
    fold_cnt : number of K-fold splits.
    rnd : kept for backward compatibility only. The folds are not shuffled, so
        a random seed has no effect and is not passed to ``KFold``.

    Returns
    -------
    list ``[train_pred, test_pred]`` where ``train_pred`` has shape
    ``(n_train_rows, n_labels)`` with out-of-fold probabilities and
    ``test_pred`` has shape ``(n_test_rows, n_labels)`` with fold-averaged
    probabilities.

    Side effects: prints, per fold and label, the label index followed by the
    hold-out and in-fold log losses.
    """
    # random_state only matters when shuffle=True; recent scikit-learn raises
    # ValueError when it is combined with shuffle=False, so it is omitted.
    kf = KFold(n_splits=fold_cnt, shuffle=False)
    # Size the outputs from the inputs instead of hard-coding the row counts.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = MultinomialNB()
            # fit for label i on the in-fold rows
            model.fit(curr_x, curr_y[:,i])

            # predictions for label i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            # Assign with a single indexing operation: train_pred[test_index]
            # alone would create a copy, and writing into that copy would leave
            # train_pred all zeros.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged over folds below
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file, so that a failure
# during fitting cannot leave behind an empty or truncated pickle.
_feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,3)
with open('../features/mnb_feat1.pkl','wb') as fout:
    pickle.dump(_feat1,fout)

0 0.326410504502 0.250603130512
1 0.0610047715975 0.0589797224541
2 0.206542481348 0.17832550856
3 0.030708840219 0.0226513455201
4 0.209386724709 0.179440504102
5 0.0599511952229 0.053824332119
0 0.317623748166 0.251866473661
1 0.066978783688 0.0574292325375
2 0.217078229126 0.175659921216
3 0.0225298737139 0.0256984472411
4 0.214682162434 0.178124678911
5 0.0593940596629 0.0539140209586
0 0.329041097508 0.250898738331
1 0.0699024465048 0.0567192903762
2 0.229814188687 0.173650227697
3 0.0273243375288 0.0240239322357
4 0.22895062394 0.174492372673
5 0.0633596784626 0.0528066625696


In [12]:
# Compute the features before opening the output file, so that a failure
# during fitting cannot leave behind an empty or truncated pickle.
_feat2 = gen_base_mnb_feat(train_tfidf2,train_y,test_tfidf2,3)
with open('../features/mnb_feat2.pkl','wb') as fout:
    # Dump the features built from the second TF-IDF matrix (_feat2); writing
    # _feat1 here would make mnb_feat2.pkl a duplicate of mnb_feat1.pkl.
    pickle.dump(_feat2,fout)

0 0.337717406616 0.328382416032
1 0.124539082177 0.130719536228
2 0.25860451231 0.266390921695
3 0.0873198985073 0.0690654448704
4 0.259330415761 0.263757784541
5 0.135487357405 0.130688344246
0 0.325397157453 0.331203784829
1 0.131145544727 0.131329667175
2 0.265647154974 0.266833818714
3 0.0614539930119 0.0779543269918
4 0.26302682404 0.264245655852
5 0.127908651044 0.133603396278
0 0.340992651411 0.327385454184
1 0.141490939743 0.127799755404
2 0.28609524357 0.262618662023
3 0.0753647062266 0.0729965149157
4 0.280574148454 0.260765958496
5 0.137835303626 0.130177800613
