In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import re
  

train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train[list_classes].values
train['comment_text'] = train['comment_text'].fillna('nan')
test['comment_text'] = test['comment_text'].fillna('nan')
print('load done')

load done


In [2]:
tf_vec1 = TfidfVectorizer(sublinear_tf=True,
                            strip_accents='unicode',
                            analyzer='word',
                            token_pattern=r'\w{1,}',
                            ngram_range=(1, 1),
                            max_features=15000)
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(159571, 15000)


In [3]:
n_comp = 30
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [4]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [5]:
tf_vec2 = TfidfVectorizer(sublinear_tf=True,
                        strip_accents='unicode',
                        analyzer='char',
                        ngram_range=(1, 5),
                        max_features=20000
                         )
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(159571, 20000)


In [6]:
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [7]:
test_m = LogisticRegression()
test_m.fit(train_tfidf1,train_y[:,0])
res = test_m.predict_proba(train_tfidf1)
print(res[:20])

[[  9.93791465e-01   6.20853525e-03]
 [  9.80035544e-01   1.99644562e-02]
 [  9.78047041e-01   2.19529591e-02]
 [  9.98319365e-01   1.68063457e-03]
 [  9.29023406e-01   7.09765939e-02]
 [  9.93445038e-01   6.55496179e-03]
 [  7.24703146e-02   9.27529685e-01]
 [  9.41548506e-01   5.84514941e-02]
 [  9.68103493e-01   3.18965074e-02]
 [  9.72104652e-01   2.78953476e-02]
 [  9.99899362e-01   1.00637849e-04]
 [  9.44155988e-01   5.58440124e-02]
 [  9.45490459e-01   5.45095412e-02]
 [  9.77286823e-01   2.27131766e-02]
 [  9.65986600e-01   3.40134002e-02]
 [  9.39576365e-01   6.04236353e-02]
 [  8.59454509e-01   1.40545491e-01]
 [  9.97199600e-01   2.80039959e-03]
 [  9.95466470e-01   4.53352967e-03]
 [  9.87808196e-01   1.21918045e-02]]


In [8]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        
        for i in range(6):
            model = LogisticRegression(solver='sag')
            # train and pred
            # fit for i
            model.fit(curr_x, curr_y[:,i])
            
            # prepare for i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            #print(hold_out_pred[:10])
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            train_pred[test_index,i] = list(hold_out_pred[:,1].flatten())
            print('value', np.sum(train_pred[:,i]))
            
            # prepare test
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    #print(train_pred[:10])
    return [train_pred, test_pred]

with open('../features/lr_feat1.pkl','wb') as fout:
    lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,3,rnd=3)
    pickle.dump(lr_feat1,fout)


0 0.11637841408 0.101470700629
value 5073.67347141
1 0.0268814841569 0.0235481118326
value 527.776488528
2 0.0654412597451 0.0551604984025
value 2797.97919431
3 0.0116117976142 0.00877895441786
value 147.773767381
4 0.0780919918992 0.0672385638957
value 2626.93713451
5 0.0268349189501 0.0237007081884
value 462.355246289
0 0.115710924201 0.101490724417
value 10089.3685007
1 0.0271496944531 0.023369197004
value 1046.27653183
2 0.064373301468 0.0564090474947
value 5553.39412903
3 0.0104784711549 0.00880192538104
value 308.332413507
4 0.078593140831 0.0675666935586
value 5177.39871435
5 0.0267509200719 0.0233423897831
value 930.882672714
0 0.115965060406 0.101505392757
value 15105.6093417
1 0.0272824824149 0.0232743753467
value 1567.99001146
2 0.0646862020847 0.0560259446838
value 8315.94218415
3 0.00986511031698 0.00923503781117
value 463.357855318
4 0.0769433286852 0.0679915506206
value 7749.74417713
5 0.0285649653593 0.0229594146864
value 1370.07957545


In [9]:
try:
    del tf_vec2,tf_vec1,svd_obj
except:
    pass
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
print(type(train_tfidf2),train_tfidf2.shape)
comb_train = csr_matrix(hstack((train_tfidf2,train_tfidf1)))
print(type(comb_train),comb_train.shape)
comb_test = csr_matrix(hstack((test_tfidf2,test_tfidf1)))

<class 'scipy.sparse.csr.csr_matrix'> (159571, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (159571, 35000)


In [10]:
with open('../features/lr_feat2.pkl','wb') as fout:
    lr_feat2 = gen_base_lr_feat(comb_train,
                                train_y,
                                comb_test,3,rnd=11)
    pickle.dump(lr_feat2,fout)

# pre first fold
# 0 0.102241368889 0.0801105597102
# value 5060.86889559
# 1 0.0246173591039 0.0192587631549
# value 526.726637916
# 2 0.056389939991 0.0419326132626
# value 2788.22846853
# 3 0.0102994295764 0.00665408777849
# value 148.649550485
# 4 0.0701977896968 0.0548491407929
# value 2614.47061102
# 5 0.0234527531648 0.0180436061952
# value 462.585903392
# ===========this fold done

0 0.102239095102 0.0800917061043
value 5059.15848823
1 0.0246358045399 0.0192438107933
value 524.575923679
2 0.0563597855779 0.0418893867255
value 2785.90493243
3 0.0102722342089 0.00660976936988
value 145.555880004
4 0.0701651831552 0.0548051232883
value 2612.02234374
5 0.023418342449 0.017993678117
value 459.831726417
0 0.101350251021 0.0804011873994
value 10034.6520956
1 0.0252152057843 0.0189389976528
value 1038.586226
2 0.0550991442584 0.0432158878634
value 5504.46980357
3 0.00934594512366 0.00666309066274
value 301.799106406
4 0.0714712989318 0.0546115190144
value 5141.74630297
5 0.0238734233266 0.0175308764241
value 914.550030731
0 0.101944315215 0.0801712996994
value 15042.1865961
1 0.0252180298612 0.0188807253836
value 1550.77476294
2 0.0557122351597 0.0427630890005
value 8252.99794711
3 0.00871845630187 0.00707120488699
value 447.497172205
4 0.0697444725026 0.0550089569535
value 7691.56770793
5 0.0247529883182 0.0175928125275
value 1344.11624097


In [11]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    kf = KFold(n_splits=fold_cnt, shuffle=False, random_state=233*rnd)
    train_pred, test_pred = np.zeros((159571,6)),np.zeros((153164,6))
    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]
        
        for i in range(6):
            model = MultinomialNB(alpha=0.2)
            # train and pred
            # fit for i
            model.fit(curr_x, curr_y[:,i])
            
            # prepare for i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            train_pred[test_index,i] = hold_out_pred[:,1]
            
            # prepare test
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    #print(train_pred[:10])
    return [train_pred, test_pred]

with open('../features/mnb_feat1.pkl','wb') as fout:
    _feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,3)
    pickle.dump(_feat1,fout)

0 0.135064637786 0.119426880993
1 0.0280498515944 0.022540605607
2 0.0822088496144 0.070409846418
3 0.0174418668927 0.0114750738812
4 0.0859010193469 0.0748272828438
5 0.0292018964185 0.0244956319078
0 0.136402005569 0.119277291188
1 0.0278692392234 0.0225375090818
2 0.0831849543181 0.0708722878025
3 0.0140445220646 0.0120016718168
4 0.0884407651275 0.0742108440618
5 0.0303573563701 0.0239013261819
0 0.135258956974 0.119395227808
1 0.0278011896406 0.0227454464303
2 0.0829899448268 0.0708068473162
3 0.0142391699087 0.0119526680429
4 0.087647809208 0.0748190278716
5 0.0322763447175 0.0235071268866


In [12]:
with open('../features/mnb_feat2.pkl','wb') as fout:
    _feat2 = gen_base_mnb_feat(comb_train,
                                train_y,
                                comb_test,3,rnd=29)
    pickle.dump(_feat2,fout)

0 0.186172501784 0.17796216859
1 0.138356732746 0.139951829086
2 0.156714015863 0.143592378619
3 0.0372164529446 0.0324088537377
4 0.180579797164 0.167767305069
5 0.121608716466 0.117026055627
0 0.19673424959 0.176980401368
1 0.152391556595 0.138566231187
2 0.149583943395 0.143122222898
3 0.0420097037746 0.0370923417515
4 0.176807723136 0.165516545156
5 0.1301960023 0.11808540472
0 0.196710879649 0.178708104766
1 0.141026969453 0.14040481386
2 0.158906866955 0.144656850901
3 0.0454890042183 0.0409103477265
4 0.181096771759 0.16944895783
5 0.117908591271 0.112924617458
