In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import re
  

# Label column names; the columns of train_y follow this order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Raw train / test tables.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Label matrix with one column per entry of list_classes.
train_y = train[list_classes].values

# Missing comments become the literal string 'nan'.
train = train.fillna({'comment_text': 'nan'})
test = test.fillna({'comment_text': 'nan'})
print('load done')

load done


In [2]:
def simple_eval(x,y,model_f):
    """Quick hold-out check: fit one model per label column and print log losses.

    The first 80% of the rows (in their existing order) are used for fitting
    and the remaining 20% for validation; no shuffling is performed.

    Parameters
    ----------
    x : sliceable feature matrix (rows are samples).
    y : 2-D label array; columns 0..5 are evaluated one at a time.
    model_f : zero-argument callable returning an object with ``fit`` and
        ``predict_proba``.

    Returns
    -------
    None. For every label the name (from the module-level ``list_classes``)
    and the train / validation log loss are printed.
    """
    cut = len(y) * 4 // 5  # 80%
    fit_x, held_x = x[:cut], x[cut:]
    fit_y, held_y = y[:cut], y[cut:]
    for col in range(6):
        clf = model_f()
        clf.fit(fit_x, fit_y[:, col])
        fit_proba = clf.predict_proba(fit_x)
        held_proba = clf.predict_proba(held_x)
        print(list_classes[col])
        print('train log loss', log_loss(fit_y[:, col], fit_proba))
        print('valid log loss', log_loss(held_y[:, col], held_proba))
print('done')

done


In [3]:
# Word-level TF-IDF: unigrams only, English stop words removed,
# vocabulary capped at 20000 terms, sublinear term-frequency scaling.
tf_vec1 = TfidfVectorizer(
    ngram_range=(1,1),
    token_pattern=r'\w{1,}',
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=20000,
)
# Vocabulary / IDF weights are learned on train only and reused for test.
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(159571, 20000)


In [4]:
# Number of SVD components kept for each TF-IDF matrix.
n_comp = 30

# Project the word-level TF-IDF matrix onto n_comp dense components;
# the decomposition is fitted on train and applied unchanged to test.
svd_obj = TruncatedSVD(
    algorithm='arpack',
    n_components=n_comp,
)
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
# Persist [train, test] as a two-element list.
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [5]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [6]:
# Character-level TF-IDF over 1- to 4-grams, vocabulary capped at 20000.
# NOTE: stop_words is intentionally not passed. scikit-learn applies stop
# words only when analyzer == 'word', so the previous stop_words='english'
# had no effect here (and newer releases warn about the unused parameter).
tf_vec2 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1,4),
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=20000,
)
# Vocabulary / IDF weights are learned on train only and reused for test.
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(159571, 20000)


In [7]:
# Same n_comp-component reduction as for the word features, now applied to
# the character-level TF-IDF matrix (fit on train, reuse for test).
svd_obj = TruncatedSVD(
    algorithm='arpack',
    n_components=n_comp,
)
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
# Persist [train, test] as a two-element list.
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [8]:
# Hold-out sanity check of logistic regression on the char-level TF-IDF features.
simple_eval(
    x=train_tfidf2,
    y=train_y,
    model_f=LogisticRegression,
)
print('-----------')

toxic
train log loss 0.097541815835
valid log loss 0.110883987182
severe_toxic
train log loss 0.0219622082081
valid log loss 0.0248464967118
obscene
train log loss 0.0523280928271
valid log loss 0.0597151221297
threat
train log loss 0.00907409432779
valid log loss 0.0097982739249
insult
train log loss 0.0640320514502
valid log loss 0.0732435923241
identity_hate
train log loss 0.0208597151247
valid log loss 0.0262118259185
-----------


In [10]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Build out-of-fold logistic-regression probabilities for stacking.

    For every fold and every label column a fresh
    ``LogisticRegression(C=4.0, solver='sag')`` is fitted on the in-fold rows.
    Its positive-class probability is stored for the held-out rows, and its
    test-set probability is accumulated and finally averaged over the folds.

    Parameters
    ----------
    train_x : row-indexable feature matrix (dense or sparse) for training.
    train_y : 2-D label array, one column per class, aligned with ``train_x``.
    test_x : feature matrix for the test set.
    fold_cnt : number of contiguous, unshuffled folds.
    rnd : unused; kept so existing callers keep working. It used to be
        forwarded as ``random_state`` to ``KFold``, which has no effect when
        ``shuffle=False``.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes ``(n_train, n_classes)`` and
        ``(n_test, n_classes)``.
    """
    # Unshuffled folds are deterministic, so no random_state is passed:
    # scikit-learn ignores it without shuffling and newer releases reject it.
    kf = KFold(n_splits=fold_cnt, shuffle=False)

    # Size the outputs from the inputs instead of hard-coding dataset sizes.
    n_classes = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_classes))
    test_pred = np.zeros((test_x.shape[0], n_classes))

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_classes):
            model = LogisticRegression(C=4.0, solver='sag')
            # fit for label i on the in-fold rows
            model.fit(curr_x, curr_y[:,i])

            # hold-out / in-fold log loss for monitoring
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))

            # Must be a single indexing operation: train_pred[test_index][:, i]
            # would assign into a temporary copy and leave train_pred all zeros.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged after the last fold
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file: opening with 'wb'
# truncates it immediately, so a failure during the (long) fit would
# otherwise leave an empty pickle behind.
lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,3,rnd=3)
with open('../features/lr_feat1.pkl','wb') as fout:
    pickle.dump(lr_feat1,fout)


0 0.112162928933 0.0772868537356
1 0.0275421343383 0.0177474643214
2 0.0618346678727 0.0385870107629
3 0.0107685962438 0.00555280091532
4 0.0794985925881 0.0532711245892
5 0.0250175069737 0.0158466171446
0 0.111863022997 0.0775423846985
1 0.0272079451912 0.0178824176308
2 0.0589719851043 0.0400635568916
3 0.0105118509577 0.00556538323423
4 0.0792451375962 0.0539148679389
5 0.0254378081868 0.0155320882088
0 0.111737584895 0.0774509017363
1 0.0270029129652 0.0179156085829
2 0.0603379689662 0.0394354345225
3 0.00909578620804 0.00609592294031
4 0.077722820262 0.0542104890624
5 0.0264166247953 0.0154944735549


In [11]:
# Release the fitted vectorizers / SVD object; only the transformed matrices
# are needed from here on.
try:
    del tf_vec2,tf_vec1,svd_obj
except NameError:
    # Already removed (e.g. the cell was re-run). Only this case is tolerated;
    # a bare except would also hide KeyboardInterrupt and real errors.
    pass
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
print(type(train_tfidf2),train_tfidf2.shape)
# Combined features: char n-gram columns first, then word columns.
comb_train = csr_matrix(hstack((train_tfidf2,train_tfidf1)))
print(type(comb_train),comb_train.shape)
comb_test = csr_matrix(hstack((test_tfidf2,test_tfidf1)))

<class 'scipy.sparse.csr.csr_matrix'> (159571, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (159571, 40000)


In [12]:
# Compute the features before opening the output file: opening with 'wb'
# truncates it immediately, so a failure during the (long) fit would
# otherwise leave an empty pickle behind.
lr_feat2 = gen_base_lr_feat(comb_train,
                            train_y,
                            comb_test,3,rnd=11)
with open('../features/lr_feat2.pkl','wb') as fout:
    pickle.dump(lr_feat2,fout)

0 0.0992753116269 0.0509009814659
1 0.0250997174808 0.0120476879635
2 0.0559167648641 0.0249552065923




3 0.00946535659641 0.0031664655426
4 0.0707790065213 0.0360462550169
5 0.0228416402317 0.00994383327158
0 0.0982116370781 0.0514176614187
1 0.0258702392808 0.0118901480609
2 0.0528490594263 0.0263142293788
3 0.00906345637225 0.00325813530532
4 0.0719653150783 0.0358532461133
5 0.0235059199659 0.00968826654981
0 0.0987321530065 0.051220227543
1 0.0255597546453 0.0117850323748
2 0.0540558517289 0.0257791146956
3 0.00798051846313 0.00352866295317
4 0.070156678508 0.0362656929407
5 0.0237900454711 0.00978369268026


In [13]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Build out-of-fold multinomial naive Bayes probabilities for stacking.

    For every fold and every label column a fresh ``MultinomialNB(alpha=0.2)``
    is fitted on the in-fold rows. Its positive-class probability is stored
    for the held-out rows, and its test-set probability is accumulated and
    finally averaged over the folds.

    Parameters
    ----------
    train_x : row-indexable feature matrix (dense or sparse) for training.
    train_y : 2-D label array, one column per class, aligned with ``train_x``.
    test_x : feature matrix for the test set.
    fold_cnt : number of contiguous, unshuffled folds.
    rnd : unused; kept so existing callers keep working. It used to be
        forwarded as ``random_state`` to ``KFold``, which has no effect when
        ``shuffle=False``.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes ``(n_train, n_classes)`` and
        ``(n_test, n_classes)``.
    """
    # Unshuffled folds are deterministic, so no random_state is passed:
    # scikit-learn ignores it without shuffling and newer releases reject it.
    kf = KFold(n_splits=fold_cnt, shuffle=False)

    # Size the outputs from the inputs instead of hard-coding dataset sizes.
    n_classes = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_classes))
    test_pred = np.zeros((test_x.shape[0], n_classes))

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_classes):
            model = MultinomialNB(alpha=0.2)
            # fit for label i on the in-fold rows
            model.fit(curr_x, curr_y[:,i])

            # hold-out / in-fold log loss for monitoring
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))

            # Must be a single indexing operation: train_pred[test_index][:, i]
            # would assign into a temporary copy and leave train_pred all zeros.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged after the last fold
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file: opening with 'wb'
# truncates it immediately, so a failure during fitting would otherwise
# leave an empty pickle behind.
_feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,3)
with open('../features/mnb_feat1.pkl','wb') as fout:
    pickle.dump(_feat1,fout)

0 0.134838087987 0.114813889975
1 0.0290536599381 0.0219682691696
2 0.0809643154916 0.0671168945423
3 0.0171299136645 0.0109046123392
4 0.0861348563346 0.0726614396987
5 0.0293209998066 0.0232365891808
0 0.136720689342 0.114593195684
1 0.0283361708217 0.0222039457422
2 0.0830842038615 0.0672832938731
3 0.0141937554341 0.0113706076216
4 0.0896873062395 0.0718669918728
5 0.0301157664279 0.0227412117302
0 0.135560647951 0.114813144008
1 0.0283633193014 0.0224347666845
2 0.0831101256533 0.0671101179289
3 0.0140281749492 0.0114971282078
4 0.0892222960358 0.0723001805665
5 0.0323889903141 0.0223151980501


In [14]:
# Compute the features before opening the output file: opening with 'wb'
# truncates it immediately, so a failure during fitting would otherwise
# leave an empty pickle behind.
_feat2 = gen_base_mnb_feat(comb_train,
                           train_y,
                           comb_test,3,rnd=29)
with open('../features/mnb_feat2.pkl','wb') as fout:
    pickle.dump(_feat2,fout)

0 0.163815422018 0.150855394436
1 0.0970833485844 0.095349318838
2 0.134126022259 0.118827119824
3 0.0239551423336 0.0167431901075
4 0.151859877946 0.137775093853
5 0.0834846344072 0.0765224013328
0 0.173644757287 0.149794204233
1 0.107957363014 0.0952815484169
2 0.12886654133 0.118355840651
3 0.0243055508628 0.019107162246
4 0.150209871985 0.13551310893
5 0.0902276189779 0.0770138150816
0 0.173611755998 0.151461598431
1 0.0978086407079 0.0956687321528
2 0.136004584257 0.119273035877
3 0.0260453470303 0.0202408504824
4 0.15408210874 0.138587483728
5 0.0797546572766 0.0732659613781
