In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, Ridge
import pickle
import re
  

# Label columns, in the order used for every per-label loop below.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read the raw competition files.
train, test = (pd.read_csv(_csv_path) for _csv_path in ("../input/train.csv", "../input/test.csv"))

# Label matrix: one column per entry of list_classes.
train_y = train[list_classes].values

# Missing comments become the literal string 'nan' so the vectorizers always get text.
for _frame in (train, test):
    _frame['comment_text'] = _frame['comment_text'].fillna('nan')
print('load done')

load done


In [2]:
# Word-level TF-IDF: unigrams only, vocabulary capped at 15000 terms.
tf_vec1 = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=15000,
    strip_accents='unicode',
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned from the training comments only;
# the test comments are projected with the already-fitted vectorizer.
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(159571, 15000)


In [3]:
# Reduce the word-level TF-IDF matrix to n_comp dense components and cache them.
n_comp = 30
# A fixed random_state keeps the saved components reproducible across kernel
# restarts; scikit-learn uses it to seed the solver's random initialisation.
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=0)
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
# Persist [train, test] as a two-element list.
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [4]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [5]:
# Character-level TF-IDF: 1- to 5-character n-grams, vocabulary capped at 20000.
tf_vec2 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)

# Fit on the training comments only, then apply the same mapping to test.
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(159571, 20000)


In [6]:
# Reduce the char-level TF-IDF matrix to n_comp dense components and cache them.
# A fixed random_state keeps the saved components reproducible across kernel
# restarts; scikit-learn uses it to seed the solver's random initialisation.
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=0)
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
# Persist [train, test] as a two-element list.
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [7]:
# Smoke test: fit a default logistic regression on the first label column and
# look at the first 20 probability rows. These are predictions on the rows the
# model was fitted on, so they only show that the pipeline runs end to end.
test_m = LogisticRegression().fit(train_tfidf1, train_y[:, 0])
res = test_m.predict_proba(train_tfidf1)
print(res[:20])

[[9.93791465e-01 6.20853525e-03]
 [9.80035544e-01 1.99644562e-02]
 [9.78047041e-01 2.19529591e-02]
 [9.98319365e-01 1.68063457e-03]
 [9.29023406e-01 7.09765939e-02]
 [9.93445038e-01 6.55496179e-03]
 [7.24703146e-02 9.27529685e-01]
 [9.41548506e-01 5.84514941e-02]
 [9.68103493e-01 3.18965074e-02]
 [9.72104652e-01 2.78953476e-02]
 [9.99899362e-01 1.00637849e-04]
 [9.44155988e-01 5.58440124e-02]
 [9.45490459e-01 5.45095412e-02]
 [9.77286823e-01 2.27131766e-02]
 [9.65986600e-01 3.40134002e-02]
 [9.39576365e-01 6.04236353e-02]
 [8.59454509e-01 1.40545491e-01]
 [9.97199600e-01 2.80039959e-03]
 [9.95466470e-01 4.53352967e-03]
 [9.87808196e-01 1.21918045e-02]]


In [20]:
# Number of cross-validation folds passed as `fold_cnt` to the gen_base_*_feat calls below.
FOLD_CNT = 10

from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1,shuffle=False):
    """Build out-of-fold logistic-regression predictions, one column per label.

    For every fold and every column of ``train_y`` a
    ``LogisticRegression(solver='sag')`` is fitted on the in-fold rows. Its
    probability for class index 1 is written to the held-out rows of the
    train prediction matrix, and its probability on ``test_x`` is averaged
    over the folds. Hold-out and in-fold log-loss are printed per label.

    Parameters
    ----------
    train_x : matrix with ``.shape`` that supports row selection by an integer
        index array (the callers in this notebook pass scipy CSR matrices).
    train_y : 2-D array of shape (n_train, n_labels); each column is assumed
        to be a binary target, since column 1 of ``predict_proba`` is taken
        as the prediction.
    test_x : matrix with the same columns as ``train_x``.
    fold_cnt : number of KFold splits.
    rnd : seed multiplier; ``233 * rnd`` seeds the ``sag`` solver and, when
        ``shuffle`` is true, the fold assignment.
    shuffle : whether KFold shuffles rows before splitting. The default
        (False) keeps the contiguous folds this function has always produced.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes (n_train, n_labels) and
        (n_test, n_labels).
    """
    seed = 233 * rnd
    # random_state is only meaningful together with shuffle=True: older
    # scikit-learn silently ignored it otherwise, current releases raise.
    kf = KFold(n_splits=fold_cnt, shuffle=shuffle,
               random_state=seed if shuffle else None)

    # Size the outputs from the inputs instead of hard-coding one dataset.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            # 'sag' is a stochastic solver; seed it so reruns give the same features.
            model = LogisticRegression(solver='sag', random_state=seed)
            model.fit(curr_x, curr_y[:,i])

            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))

            # Out-of-fold prediction for the rows this model never saw.
            train_pred[test_index,i] = hold_out_pred[:,1]

            # Accumulate test predictions; averaged over folds after the loop.
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]



# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the long fit would otherwise
# leave an empty pickle and destroy any previously saved result.
lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,FOLD_CNT,rnd=3)
with open('../features/lr_feat1.pkl','wb') as fout:
    pickle.dump(lr_feat1,fout)


0 0.10979960347266333 0.09896561694173542
1 0.02720006099187827 0.023151659144558513
2 0.062137294769084014 0.05427196128945869
3 0.011000500431473978 0.00847332471862434
4 0.07881512428087477 0.0663323385926536
5 0.025017944762569293 0.022851846136550197
0 0.11462564116130843 0.09850167256894128
1 0.027545732809584063 0.02311993734314232
2 0.06573899590521984 0.05380418583673979
3 0.01018402448560268 0.00853293946760875
4 0.07615336435068107 0.06652579799403885
5 0.026961266985451943 0.02267936083061755
0 0.11197384131136078 0.09891050977666685
1 0.0255033002013384 0.023307466432269896
2 0.06152059168598452 0.05433685600214215
3 0.010495899840595454 0.008544557243353379
4 0.07622853510080775 0.06646158983346043
5 0.026943229044574846 0.02267943178521963
0 0.11316112813087548 0.09858795845238326
1 0.025025442075033565 0.0233568464012065
2 0.06605445371493615 0.05387568493517784
3 0.012082146072289222 0.008394025002832753
4 0.07809637356843185 0.06639877891426613
5 0.0254159558578662 0.

In [21]:
# Release the fitted vectorizers and SVD object; only their output matrices are
# used from here on. Each name is dropped on its own so that one already-missing
# name (e.g. when this cell is re-run) does not stop the others from being freed,
# and no catch-all `except` is needed.
for _stale_name in ('tf_vec2', 'tf_vec1', 'svd_obj'):
    globals().pop(_stale_name, None)
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
print(type(train_tfidf2),train_tfidf2.shape)
# Column-wise concatenation: char n-gram features first, then word features.
# Asking hstack for CSR directly avoids building an intermediate COO copy.
comb_train = hstack((train_tfidf2,train_tfidf1), format='csr')
print(type(comb_train),comb_train.shape)
comb_test = hstack((test_tfidf2,test_tfidf1), format='csr')

<class 'scipy.sparse.csr.csr_matrix'> (159571, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (159571, 35000)


In [22]:
# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the long fit would otherwise
# leave an empty pickle and destroy any previously saved result.
lr_feat2 = gen_base_lr_feat(comb_train,
                            train_y,
                            comb_test,FOLD_CNT,rnd=11)
with open('../features/lr_feat2.pkl','wb') as fout:
    pickle.dump(lr_feat2,fout)

# pre first fold
# 0 0.102241368889 0.0801105597102
# value 5060.86889559
# 1 0.0246173591039 0.0192587631549
# value 526.726637916
# 2 0.056389939991 0.0419326132626
# value 2788.22846853
# 3 0.0102994295764 0.00665408777849
# value 148.649550485
# 4 0.0701977896968 0.0548491407929
# value 2614.47061102
# 5 0.0234527531648 0.0180436061952
# value 462.585903392
# ===========this fold done

0 0.098774754641582 0.07915151355358084
1 0.025184978556446778 0.019041449745457722
2 0.05369561018472291 0.042121817279004936
3 0.009941324935575676 0.006501461388060503
4 0.0715878844738845 0.054559269860794526
5 0.021965914807135157 0.01762462728546486
0 0.10164050167789963 0.0789221390183588
1 0.025140557773119618 0.01906653365602601
2 0.058247756744541926 0.04163096902367481
3 0.00910309727472782 0.00655957286087448
4 0.06962510456143833 0.054722222963508635
5 0.023441650962694533 0.017468953731112593
0 0.09692995202090067 0.07945979069668047
1 0.023333988370940195 0.019201990765704505
2 0.05343418355482056 0.0421398917379652
3 0.009145482384380652 0.006555421262305305
4 0.06835634614720004 0.05475767677335491
5 0.02373366752738558 0.017469133496867083
0 0.09941483109188719 0.07907929350847934
1 0.023589621237725435 0.019200296095447187
2 0.05751688497984832 0.04180958726787386
3 0.010651367480790597 0.006473163132128603
4 0.0710322653825429 0.05458966595367894
5 0.022182813860883

In [23]:
def gen_base_ridge_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1,shuffle=False):
    """Build out-of-fold Ridge-regression predictions, one column per label.

    For every fold and every column of ``train_y`` a ``Ridge(alpha=20)`` is
    fitted on the in-fold rows. Its raw regression output is written to the
    held-out rows of the train prediction matrix, and its output on
    ``test_x`` is averaged over the folds. Hold-out and in-fold log-loss are
    printed per label.

    Parameters
    ----------
    train_x : matrix with ``.shape`` that supports row selection by an integer
        index array (the callers in this notebook pass scipy CSR matrices).
    train_y : 2-D array of shape (n_train, n_labels).
    test_x : matrix with the same columns as ``train_x``.
    fold_cnt : number of KFold splits.
    rnd : seed multiplier; ``233 * rnd`` seeds the fold assignment when
        ``shuffle`` is true. The Ridge model itself always uses
        ``random_state=0``.
    shuffle : whether KFold shuffles rows before splitting. The default
        (False) keeps the contiguous folds this function has always produced.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes (n_train, n_labels) and
        (n_test, n_labels). Values are unclipped regression outputs and may
        fall outside [0, 1].
    """
    # random_state is only meaningful together with shuffle=True: older
    # scikit-learn silently ignored it otherwise, current releases raise.
    kf = KFold(n_splits=fold_cnt, shuffle=shuffle,
               random_state=233*rnd if shuffle else None)

    # Size the outputs from the inputs instead of hard-coding one dataset.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            # `normalize=False` is omitted: it was the default while the
            # argument existed and newer scikit-learn no longer accepts it.
            model = Ridge(alpha=20, copy_X=True, fit_intercept=True,
                          solver='auto',max_iter=100, random_state=0,  tol=0.0025)
            model.fit(curr_x, curr_y[:,i])

            hold_out_pred = model.predict(hold_out_x)
            curr_train_pred = model.predict(curr_x)
            # Ridge outputs are not probabilities; bound them to [0, 1] for the
            # metric only. The stored features below stay unclipped.
            print(i,
                  log_loss(hold_out_y[:,i],np.clip(hold_out_pred, 0, 1)),
                  log_loss(curr_y[:,i],np.clip(curr_train_pred, 0, 1)))

            # Out-of-fold prediction for the rows this model never saw.
            train_pred[test_index,i] = np.ravel(hold_out_pred)

            # Accumulate test predictions; averaged over folds after the loop.
            test_pred[:,i] += model.predict(test_x)
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the fit would otherwise leave
# an empty pickle and destroy any previously saved result.
ridge_feat1 = gen_base_ridge_feat(train_tfidf1,
                                  train_y,
                                  test_tfidf1,FOLD_CNT,rnd=12)
# Legacy alias: this result has historically been bound to `lr_feat2`, which
# overwrites the logistic-regression features of that name in the kernel.
# Kept in case a later cell reads it. TODO: remove once confirmed unused.
lr_feat2 = ridge_feat1
with open('../features/ridge_feat1.pkl','wb') as fout:
    pickle.dump(ridge_feat1,fout)

0 0.14000874149101825 0.13726698257617057
1 0.027039264357669908 0.025546707668226757
2 0.07576018563509507 0.07267267852502338
3 0.012620128711821988 0.01073524095226189
4 0.0911816888526827 0.08786882783932934
5 0.02985499650055174 0.02550985028338122
0 0.15096689362664178 0.1355043751792196
1 0.028625319694508863 0.0257714497472438
2 0.08066704493352796 0.07223642062251684
3 0.011640303024249971 0.010842597880888687
4 0.08897328515177895 0.08860494073219534
5 0.02812587189372931 0.02517364119950403
0 0.1463837220930897 0.13681853513288492
1 0.026494556639596354 0.025823140746131883
2 0.07342534035611584 0.073192173008876
3 0.012409808874573101 0.010766908181262925
4 0.1035122550752089 0.08731226634627312
5 0.031925065273412295 0.02524823240584563
0 0.14319738317461286 0.13684222416699998
1 0.0290169345949462 0.025812154217138467
2 0.07965439058034172 0.07267193867552764
3 0.013188241618561535 0.010691839698499562
4 0.09648346398990881 0.08752292051257121
5 0.0306076825742491 0.02538

In [24]:
# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the fit would otherwise leave
# an empty pickle and destroy any previously saved result.
ridge_feat2 = gen_base_ridge_feat(comb_train,
                                  train_y,
                                  comb_test,FOLD_CNT,rnd=13)
# Legacy alias: this result has historically been bound to `lr_feat2`, which
# overwrites the logistic-regression features of that name in the kernel.
# Kept in case a later cell reads it. TODO: remove once confirmed unused.
lr_feat2 = ridge_feat2
with open('../features/ridge_feat2.pkl','wb') as fout:
    pickle.dump(ridge_feat2,fout)

0 0.12766545335805593 0.12019685935902497
1 0.026333119128341766 0.02412952952017802
2 0.06982059838266509 0.06318029713445303
3 0.01451683467285073 0.010575348934990517
4 0.08344636652038491 0.07809041784500809
5 0.025253268302457565 0.023793735242276726
0 0.13277470093030377 0.11965661063855332
1 0.027086558910919305 0.024064818992668092
2 0.07187207312496083 0.06288285427325421
3 0.011663159760932316 0.0106756764623322
4 0.08519207827218431 0.07841832828609735
5 0.02869365862950276 0.02356019997576742
0 0.12896769710761247 0.11994322779821029
1 0.025594092456950997 0.02424890448177865
2 0.06562210433278969 0.06381398336896206
3 0.012396826235858222 0.010596002748575462
4 0.08758378519552666 0.07802941812901802
5 0.028694756482673372 0.02360413976241201
0 0.12906358394053946 0.12025535739182375
1 0.02668143334094101 0.02436632180867595
2 0.06944874181472484 0.06347882870750327
3 0.013398298886361076 0.010516752085265552
4 0.08660896425485548 0.07800299386938382
5 0.02748046749629305 

In [25]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1,shuffle=False):
    """Build out-of-fold multinomial naive Bayes predictions, one column per label.

    For every fold and every column of ``train_y`` a
    ``MultinomialNB(alpha=0.2)`` is fitted on the in-fold rows. Its
    probability for class index 1 is written to the held-out rows of the
    train prediction matrix, and its probability on ``test_x`` is averaged
    over the folds. Hold-out and in-fold log-loss are printed per label.

    Parameters
    ----------
    train_x : matrix with ``.shape`` that supports row selection by an integer
        index array (the callers in this notebook pass scipy CSR matrices).
    train_y : 2-D array of shape (n_train, n_labels); each column is assumed
        to be a binary target, since column 1 of ``predict_proba`` is taken
        as the prediction.
    test_x : matrix with the same columns as ``train_x``.
    fold_cnt : number of KFold splits.
    rnd : seed multiplier; ``233 * rnd`` seeds the fold assignment when
        ``shuffle`` is true and is otherwise unused.
    shuffle : whether KFold shuffles rows before splitting. The default
        (False) keeps the contiguous folds this function has always produced.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes (n_train, n_labels) and
        (n_test, n_labels).
    """
    # random_state is only meaningful together with shuffle=True: older
    # scikit-learn silently ignored it otherwise, current releases raise.
    kf = KFold(n_splits=fold_cnt, shuffle=shuffle,
               random_state=233*rnd if shuffle else None)

    # Size the outputs from the inputs instead of hard-coding one dataset.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = MultinomialNB(alpha=0.2)
            model.fit(curr_x, curr_y[:,i])

            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))

            # Out-of-fold prediction for the rows this model never saw.
            train_pred[test_index,i] = hold_out_pred[:,1]

            # Accumulate test predictions; averaged over folds after the loop.
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the fit would otherwise leave
# an empty pickle and destroy any previously saved result.
_feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,FOLD_CNT,14)
with open('../features/mnb_feat1.pkl','wb') as fout:
    pickle.dump(_feat1,fout)

0 0.1330099922427433 0.12170319887372258
1 0.028719400861075443 0.02280552717201566
2 0.08214139701490712 0.0721296801925734
3 0.016395133322408238 0.010812460779388558
4 0.08760471341758294 0.07592536566660288
5 0.027171480985197925 0.024136984319296093
0 0.13772739502580397 0.1212815395524188
1 0.02785324623852354 0.022901787058968576
2 0.08350851629293067 0.07189968450295545
3 0.014222048983739213 0.010957064303060265
4 0.08631714222056791 0.07599747028976188
5 0.02923318243026409 0.02393033098439545
0 0.13298852807266384 0.12174018749289935
1 0.02641081149043995 0.02302963439868193
2 0.08193647329052725 0.07200549803651564
3 0.015097505379192433 0.010936328584793749
4 0.08495575496383799 0.07602224670377068
5 0.030562218543322493 0.023916684791236154
0 0.1359435686421793 0.12137143833862038
1 0.02595354305711452 0.02314508946316274
2 0.08344310670049598 0.07194748115816076
3 0.015684088425917283 0.010869324395291235
4 0.08751045373520845 0.07594682442146707
5 0.02880110457885136 0.

In [26]:
# Run the cross-validation first and only then open the output file: opening
# in 'wb' mode truncates it, so a failure during the fit would otherwise leave
# an empty pickle and destroy any previously saved result.
_feat2 = gen_base_mnb_feat(comb_train,
                           train_y,
                           comb_test,FOLD_CNT,rnd=29)
with open('../features/mnb_feat2.pkl','wb') as fout:
    pickle.dump(_feat2,fout)

0 0.19731660264656714 0.18268046705154126
1 0.14989905794235941 0.15485050679642454
2 0.1575766031779702 0.14978958095199033
3 0.051991706448463786 0.050191623037050694
4 0.1855847855234005 0.17363575702386003
5 0.13598174708131358 0.13299884246536345
0 0.1860139725736997 0.1828774805692798
1 0.1563333659423143 0.1543739861339933
2 0.1657865967822671 0.14871742855262157
3 0.056347666829339935 0.05194673620797601
4 0.18239981035870842 0.17347864016214082
5 0.13826603080138156 0.13140407100644655
0 0.1861851453509003 0.18312172808532262
1 0.16008520688190342 0.15423081490311122
2 0.15941769658884944 0.14903132745760123
3 0.055211567188075766 0.04737280931294338
4 0.19061930820581233 0.173002319159281
5 0.1448061414594751 0.13201805704461247
0 0.19688134562145795 0.18244254972501828
1 0.1608863222383025 0.15558662653961414
2 0.15950919012100376 0.14875808214274655
3 0.049858494405430095 0.05069668161050368
4 0.17698072837523554 0.17371441762429693
5 0.13115400210667677 0.13384611590022025