In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle
import re
  

# Read the raw train/test CSVs from ../input.
train, test = pd.read_csv("../input/train.csv"), pd.read_csv("../input/test.csv")

# The six binary label columns; train_y holds them as a 2-D array in this column order.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = train.loc[:, list_classes].values

# Missing comments become the literal string 'nan' so the vectorizers always receive text.
train = train.fillna({'comment_text': 'nan'})
test = test.fillna({'comment_text': 'nan'})
print('load done')

load done


In [2]:
# Word-level TF-IDF: unigrams only, vocabulary capped at 15000 features.
# The vectorizer is fitted on the training comments and then applied as-is to the test comments.
tf_vec1 = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 1),
    max_features=15000,
    strip_accents='unicode',
    sublinear_tf=True,
)
train_tfidf1 = tf_vec1.fit_transform(train.comment_text.values)
test_tfidf1 = tf_vec1.transform(test.comment_text.values)

print(train_tfidf1.shape)

(159571, 15000)


In [3]:
# Reduce the word-level TF-IDF matrix to n_comp dense components and persist
# the [train, test] pair for later stacking.
n_comp = 30
# A fixed random_state keeps the decomposition (and therefore the pickled
# features) reproducible across kernel restarts; without it each run can
# produce slightly different components.
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=233)
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [4]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [5]:
# Character-level TF-IDF: character n-grams of length 1 to 5, capped at 20000 features.
# Fitted on the training comments, then applied unchanged to the test comments.
tf_vec2 = TfidfVectorizer(
    analyzer='char',
    ngram_range=(1, 5),
    max_features=20000,
    strip_accents='unicode',
    sublinear_tf=True,
)
train_tfidf2 = tf_vec2.fit_transform(train.comment_text.values)
test_tfidf2 = tf_vec2.transform(test.comment_text.values)
print(train_tfidf2.shape)

(159571, 20000)


In [6]:
# Reduce the char-level TF-IDF matrix to n_comp dense components and persist
# the [train, test] pair for later stacking.
# A fixed random_state keeps the decomposition (and therefore the pickled
# features) reproducible across kernel restarts; without it each run can
# produce slightly different components.
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=233)
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (159571, 30)
dump done
-----------


In [7]:
# Quick sanity check: fit a default logistic regression on the first label
# column ("toxic", index 0 of list_classes) and look at the in-sample
# class probabilities for the first 20 comments.
test_m = LogisticRegression().fit(train_tfidf1, train_y[:, 0])
res = test_m.predict_proba(train_tfidf1)
print(res[:20])

[[  9.93791465e-01   6.20853525e-03]
 [  9.80035544e-01   1.99644562e-02]
 [  9.78047041e-01   2.19529591e-02]
 [  9.98319365e-01   1.68063457e-03]
 [  9.29023406e-01   7.09765939e-02]
 [  9.93445038e-01   6.55496179e-03]
 [  7.24703146e-02   9.27529685e-01]
 [  9.41548506e-01   5.84514941e-02]
 [  9.68103493e-01   3.18965074e-02]
 [  9.72104652e-01   2.78953476e-02]
 [  9.99899362e-01   1.00637849e-04]
 [  9.44155988e-01   5.58440124e-02]
 [  9.45490459e-01   5.45095412e-02]
 [  9.77286823e-01   2.27131766e-02]
 [  9.65986600e-01   3.40134002e-02]
 [  9.39576365e-01   6.04236353e-02]
 [  8.59454509e-01   1.40545491e-01]
 [  9.97199600e-01   2.80039959e-03]
 [  9.95466470e-01   4.53352967e-03]
 [  9.87808196e-01   1.21918045e-02]]


In [8]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1,shuffle=False):
    """Build out-of-fold logistic-regression predictions for use as stacking features.

    For every fold and every label column, a ``LogisticRegression(solver='sag')``
    is fitted on the in-fold rows. Its positive-class probability is written into
    the hold-out rows of the train prediction matrix and accumulated into the
    test prediction matrix, which is averaged over the folds at the end.

    Parameters
    ----------
    train_x : row-indexable feature matrix (dense or sparse) with ``.shape``
    train_y : 2-D array of binary labels, one column per label
    test_x : feature matrix with the same columns as ``train_x``
    fold_cnt : number of K-fold splits
    rnd : seed multiplier; the effective seed is ``233 * rnd``. It seeds the
        solver, and the fold assignment too when ``shuffle`` is True.
    shuffle : whether to shuffle rows before splitting. Defaults to False,
        which keeps the original contiguous folds.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes ``(n_train, n_labels)`` and
        ``(n_test, n_labels)``.
    """
    seed = 233 * rnd
    # random_state is only meaningful when shuffling; recent scikit-learn raises
    # ValueError if it is set while shuffle=False.
    kf = KFold(n_splits=fold_cnt, shuffle=shuffle, random_state=seed if shuffle else None)

    # Size the outputs from the inputs instead of hard-coding the dataset dimensions.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        # in-fold / hold-out partitions for this split
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            # 'sag' is stochastic; seeding it makes repeated runs comparable.
            model = LogisticRegression(solver='sag', random_state=seed)
            model.fit(curr_x, curr_y[:,i])

            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            # Passing labels keeps log_loss well-defined even if a fold's
            # hold-out rows happen to contain a single class.
            print(i,
                  log_loss(hold_out_y[:,i],hold_out_pred,labels=model.classes_),
                  log_loss(curr_y[:,i],curr_train_pred,labels=model.classes_))

            # column 1 is the probability of the positive class
            train_pred[test_index,i] = hold_out_pred[:,1]
            print('value', np.sum(train_pred[:,i]))

            # accumulate test predictions; averaged over folds below
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# 5-fold logistic-regression stacking features on the word-level TF-IDF matrix,
# pickled as [train_pred, test_pred].
with open('../features/lr_feat1.pkl', 'wb') as fout:
    lr_feat1 = gen_base_lr_feat(
        train_tfidf1,
        train_y,
        test_tfidf1,
        fold_cnt=5,
        rnd=3,
    )
    pickle.dump(lr_feat1, fout)


0 0.113500936706 0.0997201092397
value 3070.23470695
1 0.027854372592 0.0230948716027
value 317.312680365
2 0.065036301593 0.0542706814096
value 1682.08147824
3 0.0108139809824 0.00861309387776
value 88.9334318053
4 0.077999343874 0.0666086895257
value 1572.06659916
5 0.0264100739872 0.0230796532461
value 281.054554301
0 0.11439927879 0.099658336191
value 6083.80900341
1 0.0253598449757 0.0235929350643
value 631.932505451
2 0.0645951141862 0.0544715772649
value 3338.62690625
3 0.0115849282023 0.00854300985482
value 183.746771638
4 0.0777074218487 0.0666025749871
value 3120.98202178
5 0.0268064066618 0.0229512961132
value 550.891430024
0 0.112739435006 0.0999176214636
value 9095.78106546
1 0.0282918074964 0.0229714119392
value 933.470925428
2 0.0615502443829 0.0554807114092
value 5002.63757765
3 0.0104156330202 0.00856162549939
value 276.584465503
4 0.0764465228709 0.0671635609432
value 4663.0051079
5 0.0249729952424 0.0231745660625
value 830.044642046
0 0.113641711347 0.0997819002181
v

In [9]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

# Drop the fitted vectorizers / SVD object to free memory. Each name is removed
# independently so a missing one (e.g. when this cell is re-run) neither aborts
# the cleanup of the others nor requires swallowing unrelated exceptions.
for _name in ('tf_vec2', 'tf_vec1', 'svd_obj'):
    globals().pop(_name, None)
del _name

print(type(train_tfidf2),train_tfidf2.shape)
# Column-wise concatenation: char-level features first, then word-level features.
# Asking hstack for CSR directly avoids building an intermediate COO matrix.
comb_train = hstack((train_tfidf2,train_tfidf1), format='csr')
print(type(comb_train),comb_train.shape)
comb_test = hstack((test_tfidf2,test_tfidf1), format='csr')

<class 'scipy.sparse.csr.csr_matrix'> (159571, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (159571, 35000)


In [10]:
# 5-fold logistic-regression stacking features on the combined char+word
# TF-IDF matrix, pickled as [train_pred, test_pred].
with open('../features/lr_feat2.pkl', 'wb') as fout:
    lr_feat2 = gen_base_lr_feat(comb_train, train_y, comb_test, fold_cnt=5, rnd=11)
    pickle.dump(lr_feat2, fout)

# pre first fold
# 0 0.102241368889 0.0801105597102
# value 5060.86889559
# 1 0.0246173591039 0.0192587631549
# value 526.726637916
# 2 0.056389939991 0.0419326132626
# value 2788.22846853
# 3 0.0102994295764 0.00665408777849
# value 148.649550485
# 4 0.0701977896968 0.0548491407929
# value 2614.47061102
# 5 0.0234527531648 0.0180436061952
# value 462.585903392
# ===========this fold done

0 0.100909333581 0.0792918252609
value 3055.79384114
1 0.0256903892666 0.0189550739087
value 311.623999626
2 0.0567181414043 0.0416830494684
value 1673.41401602
3 0.00973549825632 0.00655675910027
value 85.9183936085
4 0.0710422582748 0.0545085947835
value 1560.10435426
5 0.0230226349176 0.0176918303664
value 279.129508035
0 0.0995092612664 0.0797156589489
value 6067.76645727
1 0.0234275681011 0.0193487649282
value 626.256200702
2 0.0557114316353 0.0419808369082
value 3322.6244022
3 0.0100271951409 0.00654935882583
value 182.565448673
4 0.0700607930895 0.0545855634658
value 3103.41796912
5 0.0236178503061 0.017558769134
value 547.272097833
0 0.0992456314304 0.0796425627958
value 9060.08304168
1 0.0262452664987 0.0188093657365
value 927.816878596
2 0.0525593342791 0.0429490025332
value 4966.84040274
3 0.00931181420069 0.00655116696845
value 272.682226478
4 0.0695901096694 0.0548669630779
value 4634.37512834
5 0.0224933798031 0.0176295318077
value 818.027040023
0 0.0998843314895 0.079677

In [11]:
from sklearn.naive_bayes import MultinomialNB


def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1,shuffle=False):
    """Build out-of-fold multinomial naive Bayes predictions for use as stacking features.

    For every fold and every label column, a ``MultinomialNB(alpha=0.2)`` is
    fitted on the in-fold rows. Its positive-class probability is written into
    the hold-out rows of the train prediction matrix and accumulated into the
    test prediction matrix, which is averaged over the folds at the end.

    Parameters
    ----------
    train_x : row-indexable feature matrix (dense or sparse) with ``.shape``
    train_y : 2-D array of binary labels, one column per label
    test_x : feature matrix with the same columns as ``train_x``
    fold_cnt : number of K-fold splits
    rnd : seed multiplier; the fold seed is ``233 * rnd`` and is only used
        when ``shuffle`` is True.
    shuffle : whether to shuffle rows before splitting. Defaults to False,
        which keeps the original contiguous folds.

    Returns
    -------
    list
        ``[train_pred, test_pred]`` with shapes ``(n_train, n_labels)`` and
        ``(n_test, n_labels)``.
    """
    # random_state is only meaningful when shuffling; recent scikit-learn raises
    # ValueError if it is set while shuffle=False.
    kf = KFold(n_splits=fold_cnt, shuffle=shuffle, random_state=233*rnd if shuffle else None)

    # Size the outputs from the inputs instead of hard-coding the dataset dimensions.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        # in-fold / hold-out partitions for this split
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = MultinomialNB(alpha=0.2)
            model.fit(curr_x, curr_y[:,i])

            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            # log: label index, hold-out log loss, in-fold log loss
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            # column 1 is the probability of the positive class
            train_pred[test_index,i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged over folds below
            test_pred[:,i] += model.predict_proba(test_x)[:,1]
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# 5-fold naive Bayes stacking features on the word-level TF-IDF matrix,
# pickled as [train_pred, test_pred]. rnd is left at its default.
with open('../features/mnb_feat1.pkl', 'wb') as fout:
    _feat1 = gen_base_mnb_feat(
        train_tfidf1,
        train_y,
        test_tfidf1,
        fold_cnt=5,
    )
    pickle.dump(_feat1, fout)

0 0.135560194604 0.120561737325
1 0.0285371306379 0.0225454062507
2 0.0828095122556 0.0713272756591
3 0.0159182001159 0.0111174353636
4 0.0868404344073 0.075368922452
5 0.0284919697058 0.0241334099091
0 0.134681292656 0.120682101916
1 0.0262086478194 0.023074774629
2 0.082826070923 0.0712227363663
3 0.0160457490121 0.0111798574498
4 0.086302657742 0.0753838108457
5 0.0302236033733 0.0239395433961
0 0.134863559081 0.120979069189
1 0.0300368652994 0.0223379177587
2 0.0820201599427 0.071903712272
3 0.0132530406987 0.0113145750311
4 0.0876037696284 0.0753846702295
5 0.0284297870312 0.0240878173892
0 0.136029159734 0.120535369571
1 0.0273814556392 0.0227063001572
2 0.0838548806382 0.0713355021028
3 0.0132462985369 0.0113342075519
4 0.0859986921885 0.0757095382946
5 0.03100268325 0.0237016180376
0 0.134140192152 0.120918320737
1 0.0263620168356 0.0231601929502
2 0.0807105921087 0.0720705335873
3 0.0133729149718 0.0114124332105
4 0.0879463804368 0.075507778038
5 0.0323003703458 0.023572042588

In [12]:
# 5-fold naive Bayes stacking features on the combined char+word TF-IDF
# matrix, pickled as [train_pred, test_pred].
with open('../features/mnb_feat2.pkl', 'wb') as fout:
    _feat2 = gen_base_mnb_feat(comb_train, train_y, comb_test, fold_cnt=5, rnd=29)
    pickle.dump(_feat2, fout)

0 0.190397770417 0.180842323448
1 0.147501960311 0.148863045449
2 0.160540684057 0.147071636004
3 0.0482392675088 0.0449910588575
4 0.182377847089 0.171305140377
5 0.130924882777 0.125953991006
0 0.190274425559 0.180812565856
1 0.155720714552 0.149726896419
2 0.157758170819 0.146289166808
3 0.0445484575551 0.0407295580431
4 0.181941901975 0.170843030741
5 0.13309407156 0.127351070285
0 0.196184953701 0.181175591931
1 0.156670586422 0.147950345781
2 0.149266858669 0.148003993169
3 0.0516411361469 0.0457793850572
4 0.179008681533 0.170796406869
5 0.137809170834 0.127432185804
0 0.201146962246 0.180772669862
1 0.154966644169 0.147148695958
2 0.163447989425 0.146932593976
3 0.0553869180086 0.0484173499772
4 0.184527404689 0.171530667595
5 0.135948761598 0.125357156666
0 0.193657966576 0.181765482802
1 0.150211085784 0.151211454609
2 0.155403110581 0.148273353903
3 0.0476132332553 0.0464999292976
4 0.181570932743 0.17260405909
5 0.124960369999 0.124681675638
