In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import re
  

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values
train['comment_text'] = train['comment_text'].fillna('nan')
test['comment_text'] = test['comment_text'].fillna('nan')
print('load done')

load done


In [2]:
def simple_eval(x,y,model_f):
    y_cnt = len(y)
    split_idx = y_cnt * 4 // 5  # 80%
    train_x,test_x = x[:split_idx],x[split_idx:]
    train_y,test_y = y[:split_idx],y[split_idx:]
    for i in range(6):
        model = model_f()
        model.fit(train_x,train_y[:,i])
        train_pred = model.predict_proba(train_x)
        val_pred = model.predict_proba(test_x)
        print(list_classes[i])
        print('train log loss',log_loss(train_y[:,i],train_pred))
        print('valid log loss',log_loss(test_y[:,i],val_pred))
print('done')

done


In [3]:
tf_vec1 = TfidfVectorizer(sublinear_tf=True,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 1),
                            max_features=15000)
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(159571, 15000)


In [4]:
n_comp = 30
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [5]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [6]:
tf_vec2 = TfidfVectorizer(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='char',
                        ngram_range=(1, 5),
                        max_features=20000
                         )
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(159571, 20000)


In [7]:
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [8]:
simple_eval(train_tfidf2,train_y,LogisticRegression)
print('-----------')

toxic
train log loss 0.0998123893448
valid log loss 0.112494814028
severe_toxic
train log loss 0.0221921968053
valid log loss 0.0248359643588
obscene
train log loss 0.0535317480525
valid log loss 0.0605432076228
threat
train log loss 0.00907702751104
valid log loss 0.00971702054356
insult
train log loss 0.0650014251088
valid log loss 0.0741674836352
identity_hate
train log loss 0.02110441208
valid log loss 0.0264481163121
-----------


In [9]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        
        for i in range(6):
            model = LogisticRegression(C=4.0, solver='sag')
            # train and pred
            # fit for i
            model.fit(curr_x, curr_y[:,i])
            
            # prepare for i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            train_pred[test_index][:,i] = hold_out_pred[:,1]
            
            # prepare test
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

with open('../features/lr_feat1.pkl','wb') as fout:
    lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,3,rnd=3)
    pickle.dump(lr_feat1,fout)


0 0.106869747783 0.0765935418469
1 0.0263652997404 0.0178789749717
2 0.0614181887651 0.0396472928721
3 0.00972236584927 0.00514685887537
4 0.075266301171 0.0521699030141
5 0.0244159962879 0.0162123495819
0 0.10657921818 0.0767299325569
1 0.0267222155878 0.0178033427337
2 0.0586618223194 0.0412339861803
3 0.00945594330306 0.00514799394355
4 0.0753552088324 0.0527258089475
5 0.0248968924598 0.0158238339973
0 0.10675209651 0.076658387922
1 0.0267690293319 0.017750692984
2 0.0595751698665 0.0407993925959
3 0.00843140716315 0.00553344660806
4 0.0741067501394 0.0530074662521
5 0.0262785261672 0.0156536020456


In [10]:
try:
    del tf_vec2,tf_vec1,svd_obj
except:
    pass
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
print(type(train_tfidf2),train_tfidf2.shape)
comb_train = csr_matrix(hstack((train_tfidf2,train_tfidf1)))
print(type(comb_train),comb_train.shape)
comb_test = csr_matrix(hstack((test_tfidf2,test_tfidf1)))

<class 'scipy.sparse.csr.csr_matrix'> (159571, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (159571, 35000)


In [11]:
with open('../features/lr_feat2.pkl','wb') as fout:
    lr_feat2 = gen_base_lr_feat(comb_train,
                                train_y,
                                comb_test,3,rnd=11)
    pickle.dump(lr_feat2,fout)

0 0.099639656659 0.054628393736
1 0.0249534202316 0.0127154611361
2 0.0564826678753 0.0268183287149




3 0.00938601415621 0.00325840500521
4 0.0706398569588 0.0382229957783
5 0.0228506563076 0.0104947463601
0 0.09861080818 0.0550626182905
1 0.0260374885791 0.0124551870458
2 0.0536054008344 0.0282680901347
3 0.00894000712716 0.00332682671695
4 0.072252116926 0.0380555492809
5 0.0236037952282 0.0102168299594
0 0.0993468768689 0.0547179527111
1 0.0257323118392 0.0123429917305
2 0.0546772528595 0.0278125910116
3 0.00791669641523 0.00360682022169
4 0.0706789480791 0.0384201282555
5 0.0240889645387 0.0102627641676


In [12]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        
        for i in range(6):
            model = MultinomialNB(alpha=0.2)
            # train and pred
            # fit for i
            model.fit(curr_x, curr_y[:,i])
            
            # prepare for i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            train_pred[test_index][:,i] = hold_out_pred[:,1]
            
            # prepare test
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

with open('../features/mnb_feat1.pkl','wb') as fout:
    _feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,3)
    pickle.dump(_feat1,fout)

0 0.135064637786 0.119426880993
1 0.0280498515944 0.022540605607
2 0.0822088496144 0.070409846418
3 0.0174418668927 0.0114750738812
4 0.0859010193469 0.0748272828438
5 0.0292018964185 0.0244956319078
0 0.136402005569 0.119277291188
1 0.0278692392234 0.0225375090818
2 0.0831849543181 0.0708722878025
3 0.0140445220646 0.0120016718168
4 0.0884407651275 0.0742108440618
5 0.0303573563701 0.0239013261819
0 0.135258956974 0.119395227808
1 0.0278011896406 0.0227454464303
2 0.0829899448268 0.0708068473162
3 0.0142391699087 0.0119526680429
4 0.087647809208 0.0748190278716
5 0.0322763447175 0.0235071268866


In [13]:
with open('../features/mnb_feat2.pkl','wb') as fout:
    _feat2 = gen_base_mnb_feat(comb_train,
                                train_y,
                                comb_test,3,rnd=29)
    pickle.dump(_feat2,fout)

0 0.186172501784 0.17796216859
1 0.138356732746 0.139951829086
2 0.156714015863 0.143592378619
3 0.0372164529446 0.0324088537377
4 0.180579797164 0.167767305069
5 0.121608716466 0.117026055627
0 0.19673424959 0.176980401368
1 0.152391556595 0.138566231187
2 0.149583943395 0.143122222898
3 0.0420097037746 0.0370923417515
4 0.176807723136 0.165516545156
5 0.1301960023 0.11808540472
0 0.196710879649 0.178708104766
1 0.141026969453 0.14040481386
2 0.158906866955 0.144656850901
3 0.0454890042183 0.0409103477265
4 0.181096771759 0.16944895783
5 0.117908591271 0.112924617458
