In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
import pickle

# Label columns of the multi-label target; their order defines the columns of train_y.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Read both splits from the input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

# Label matrix as an array, one column per entry of list_classes.
train_y = train[list_classes].values

# Missing comments become the literal string 'nan' in both frames.
train['comment_text'] = train['comment_text'].fillna(value='nan')
test['comment_text'] = test['comment_text'].fillna(value='nan')
print('load done')

load done


In [2]:
def simple_eval(x,y,model_f,class_names=None):
    """Quick hold-out check: fit one model per label column and print log losses.

    The first 80% of the rows (in their given order, no shuffling) are used
    for fitting and the remaining 20% for validation.

    Parameters
    ----------
    x : row-sliceable feature matrix (dense or sparse), one row per sample.
    y : 2-D label array of shape (n_samples, n_labels).
    model_f : zero-argument callable returning a fresh estimator that
        provides ``fit`` and ``predict_proba``.
    class_names : optional sequence of label names used for printing.
        Defaults to the notebook-level ``list_classes``.

    Returns
    -------
    None. Results are printed only.
    """
    if class_names is None:
        class_names = list_classes

    split_idx = len(y) * 4 // 5  # 80%
    fit_x, val_x = x[:split_idx], x[split_idx:]
    fit_y, val_y = y[:split_idx], y[split_idx:]

    # One independent binary model per label column; the column count comes
    # from y itself instead of being fixed at 6.
    for col in range(y.shape[1]):
        model = model_f()
        model.fit(fit_x, fit_y[:, col])
        fit_proba = model.predict_proba(fit_x)
        val_proba = model.predict_proba(val_x)
        print(class_names[col])
        print('train log loss', log_loss(fit_y[:, col], fit_proba))
        print('valid log loss', log_loss(val_y[:, col], val_proba))
print('done')

done


In [4]:
# Word-level TF-IDF: unigrams only, English stop words removed, vocabulary
# capped at 20000 terms, sublinear term-frequency scaling.
tf_vec1 = TfidfVectorizer(
    ngram_range=(1,1),
    token_pattern=r'\w{1,}',
    stop_words='english',
    lowercase=True,
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=20000,
)
# The vocabulary is fitted on the training comments only and reused for test.
train_tfidf1 = tf_vec1.fit_transform(train['comment_text'].values)
test_tfidf1 = tf_vec1.transform(test['comment_text'].values)

print(train_tfidf1.shape)

(95851, 20000)


In [5]:
# Number of SVD components kept for the dense TF-IDF features.
n_comp = 30
# A fixed random_state is passed so repeated runs use the same seed for any
# randomness inside the solver (NOTE(review): how much the 'arpack' path
# depends on it varies by scikit-learn version).
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=233)
# Fit the projection on the training matrix only, then apply it to test.
train_svd1 = svd_obj.fit_transform(train_tfidf1)
test_svd1 = svd_obj.transform(test_tfidf1)

print(type(train_svd1),train_svd1.shape)
# Persist [train, test] reduced features for later stages.
with open('../features/tfidf_feat1.pkl','wb') as fout:
    pickle.dump([train_svd1,test_svd1],fout)
print('dump done')
# Optional sanity check, disabled by default:
#simple_eval(train_svd1,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (95851, 30)
dump done
-----------


In [6]:
#simple_eval(train_tfidf1,train_y,LogisticRegression)
#print('-----------')

In [7]:
# Character-level TF-IDF over 1- to 4-character n-grams, vocabulary capped
# at 20000 features. No stop_words argument is passed: the stop-word list is
# only applied by the word analyzer, so with analyzer='char' it had no effect.
tf_vec2 = TfidfVectorizer(lowercase=True,ngram_range=(1,4),
                          strip_accents='unicode',
                          analyzer='char',sublinear_tf=True,
                          max_features=20000
                         )
# The vocabulary is fitted on the training comments only and reused for test.
train_tfidf2 = tf_vec2.fit_transform(train['comment_text'].values)
test_tfidf2 = tf_vec2.transform(test['comment_text'].values)
print(train_tfidf2.shape)

(95851, 20000)


In [8]:
# Same reduction as for the word-level features, applied to the char-level
# TF-IDF matrix. A fixed random_state is passed so repeated runs use the same
# seed for any randomness inside the solver (NOTE(review): how much the
# 'arpack' path depends on it varies by scikit-learn version).
svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack', random_state=233)
# Fit the projection on the training matrix only, then apply it to test.
train_svd2 = svd_obj.fit_transform(train_tfidf2)
test_svd2 = svd_obj.transform(test_tfidf2)

print(type(train_svd2),train_svd2.shape)
# Persist [train, test] reduced features for later stages.
with open('../features/tfidf_feat2.pkl','wb') as fout:
    pickle.dump([train_svd2,test_svd2],fout)
print('dump done')
# Optional sanity check, disabled by default:
#simple_eval(train_svd2,train_y,XGBClassifier)
print('-----------')

<class 'numpy.ndarray'> (95851, 30)
dump done
-----------


In [9]:
# Hold-out check of a default LogisticRegression on the char-level TF-IDF matrix.
simple_eval(
    x=train_tfidf2,
    y=train_y,
    model_f=LogisticRegression,
)
print('-----------')

toxic
train log loss 0.102668476965
valid log loss 0.118305630169
severe_toxic
train log loss 0.022411922314
valid log loss 0.0267855165921
obscene
train log loss 0.0551266350843
valid log loss 0.0666293033904
threat
train log loss 0.0100382343539
valid log loss 0.0129778949002
insult
train log loss 0.0657594819378
valid log loss 0.0811999457175
identity_hate
train log loss 0.0216025589557
valid log loss 0.0273140165199
-----------


In [10]:
from sklearn.model_selection import KFold
def gen_base_lr_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Build out-of-fold logistic-regression probability features.

    For each fold of an unshuffled K-fold split and each label column, a
    ``LogisticRegression(C=4.0, solver='sag')`` is fitted on the in-fold
    rows. Its positive-class probability is written into the held-out rows
    of the train feature matrix and accumulated for the test rows; the test
    accumulation is averaged over the folds at the end.

    Parameters
    ----------
    train_x : feature matrix (dense or sparse) indexable by an integer
        row-index array, one row per training sample.
    train_y : 2-D label array of shape (n_train, n_labels).
    test_x : feature matrix for the test samples.
    fold_cnt : number of folds.
    rnd : integer multiplier for the seed (``233 * rnd``) given to the
        stochastic 'sag' solver. The fold split itself is not shuffled and
        therefore does not depend on it.

    Returns
    -------
    list ``[train_pred, test_pred]`` of float arrays with shapes
    (n_train, n_labels) and (n_test, n_labels).
    """
    seed = 233 * rnd
    # shuffle=False gives contiguous, deterministic folds. random_state is
    # not passed here because it has no effect without shuffling, and newer
    # scikit-learn releases raise an error for that combination.
    kf = KFold(n_splits=fold_cnt, shuffle=False)

    # Output sizes follow the inputs rather than fixed dataset dimensions.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = LogisticRegression(C=4.0, solver='sag', random_state=seed)
            # fit for label i on this fold
            model.fit(curr_x, curr_y[:,i])

            # evaluate label i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            # Index rows and column in one step: train_pred[test_index][:, i]
            # would assign into a temporary copy and leave train_pred at zero.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged after the fold loop
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file: opening with 'wb'
# truncates it, so a failure during fitting would otherwise leave an empty
# pickle behind.
lr_feat1 = gen_base_lr_feat(train_tfidf1,train_y,test_tfidf1,3,rnd=3)
with open('../features/lr_feat1.pkl','wb') as fout:
    pickle.dump(lr_feat1,fout)


0 0.119380694824 0.0746850700205
1 0.0274234604134 0.0177959414703
2 0.0607416599157 0.0403743840836
3 0.0134692027362 0.00583995249244
4 0.0800887331554 0.0537194073094
5 0.0264754666052 0.0152924881785
0 0.115220868579 0.0760982045533
1 0.0288516442682 0.0173982641861
2 0.0635477352893 0.0390987948236
3 0.00923871874123 0.0069111234668
4 0.0816040309818 0.0526277565744
5 0.0248571972657 0.0155287935524
0 0.116801974208 0.0760718797437
1 0.0288413520517 0.0174308477028
2 0.0662603023561 0.038419435077
3 0.0114051707333 0.00644193432172
4 0.0850341080131 0.0521715309297
5 0.0276622324554 0.0148449221083


In [16]:
# Drop the fitted vectorizers and the SVD object; only the transformed
# matrices are used from here on.
try:
    del tf_vec2,tf_vec1,svd_obj
except NameError:
    # Already deleted, e.g. when this cell is run a second time.
    pass
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
print(type(train_tfidf2),train_tfidf2.shape)
# Concatenate char-level and word-level TF-IDF columns side by side and
# convert to CSR, which supports the row indexing used by the fold loops.
comb_train = csr_matrix(hstack((train_tfidf2,train_tfidf1)))
print(type(comb_train),comb_train.shape)
comb_test = csr_matrix(hstack((test_tfidf2,test_tfidf1)))

<class 'scipy.sparse.csr.csr_matrix'> (95851, 20000)
<class 'scipy.sparse.csr.csr_matrix'> (95851, 40000)


In [17]:
# Compute the features before opening the output file: opening with 'wb'
# truncates it, so a failure during fitting would otherwise leave an empty
# pickle behind.
lr_feat2 = gen_base_lr_feat(comb_train,
                            train_y,
                            comb_test,3,rnd=11)
with open('../features/lr_feat2.pkl','wb') as fout:
    pickle.dump(lr_feat2,fout)

0 0.104134998628 0.0478121390271
1 0.0251872981136 0.0112555860909
2 0.0528032659613 0.0254522321099




3 0.0116355559205 0.00330471061956
4 0.0708317222992 0.0346417522155
5 0.023723862077 0.00947155042997
0 0.101401116398 0.0486003144496
1 0.0270874049698 0.0108921556735
2 0.0570462990754 0.0244360983602
3 0.00798403923112 0.00393517985065
4 0.0730095129567 0.0339649417963
5 0.0227642005701 0.00942174225657
0 0.103618997585 0.0483479923041
1 0.0265124405488 0.0111355137278
2 0.05932751422 0.0238983647454
3 0.00981952136598 0.00357960972976
4 0.0778400073864 0.0328498576888
5 0.0255244757828 0.00901477624363


In [18]:
from sklearn.naive_bayes import MultinomialNB
def gen_base_mnb_feat(train_x,train_y,test_x,fold_cnt=3,rnd=1):
    """Build out-of-fold multinomial naive Bayes probability features.

    For each fold of an unshuffled K-fold split and each label column, a
    ``MultinomialNB(alpha=0.2)`` is fitted on the in-fold rows. Its
    positive-class probability is written into the held-out rows of the
    train feature matrix and accumulated for the test rows; the test
    accumulation is averaged over the folds at the end.

    Parameters
    ----------
    train_x : feature matrix (dense or sparse) indexable by an integer
        row-index array, one row per training sample.
    train_y : 2-D label array of shape (n_train, n_labels).
    test_x : feature matrix for the test samples.
    fold_cnt : number of folds.
    rnd : accepted for backward compatibility with existing callers. It is
        not used, because the fold split is not shuffled.

    Returns
    -------
    list ``[train_pred, test_pred]`` of float arrays with shapes
    (n_train, n_labels) and (n_test, n_labels).
    """
    # shuffle=False gives contiguous, deterministic folds. random_state is
    # not passed here because it has no effect without shuffling, and newer
    # scikit-learn releases raise an error for that combination.
    kf = KFold(n_splits=fold_cnt, shuffle=False)

    # Output sizes follow the inputs rather than fixed dataset dimensions.
    n_labels = train_y.shape[1]
    train_pred = np.zeros((train_x.shape[0], n_labels))
    test_pred = np.zeros((test_x.shape[0], n_labels))

    for train_index, test_index in kf.split(train_x):
        # x,y
        curr_x,curr_y = train_x[train_index],train_y[train_index]
        hold_out_x,hold_out_y = train_x[test_index],train_y[test_index]

        for i in range(n_labels):
            model = MultinomialNB(alpha=0.2)
            # fit for label i on this fold
            model.fit(curr_x, curr_y[:,i])

            # evaluate label i on this fold
            hold_out_pred = model.predict_proba(hold_out_x)
            curr_train_pred = model.predict_proba(curr_x)
            print(i,log_loss(hold_out_y[:,i],hold_out_pred),log_loss(curr_y[:,i],curr_train_pred))
            # Index rows and column in one step: train_pred[test_index][:, i]
            # would assign into a temporary copy and leave train_pred at zero.
            train_pred[test_index, i] = hold_out_pred[:,1]

            # accumulate test predictions; averaged after the fold loop
            y_test = model.predict_proba(test_x)[:,1]
            test_pred[:,i] += y_test
        print('===========this fold done')
    test_pred = test_pred / fold_cnt
    return [train_pred, test_pred]

# Compute the features before opening the output file: opening with 'wb'
# truncates it, so a failure during fitting would otherwise leave an empty
# pickle behind.
_feat1 = gen_base_mnb_feat(train_tfidf1,train_y,test_tfidf1,3)
with open('../features/mnb_feat1.pkl','wb') as fout:
    pickle.dump(_feat1,fout)

0 0.141056929622 0.108651103131
1 0.0296058661402 0.02355056603
2 0.0811652113554 0.0666857558162
3 0.0205765279316 0.0121435583698
4 0.0882265251768 0.0713652720406
5 0.0322594811266 0.0236270422449
0 0.137666189062 0.110316129543
1 0.0325684311321 0.0229291386099
2 0.0863003733897 0.0645677933659
3 0.0139678059491 0.0137292755743
4 0.0906829840528 0.0700061843421
5 0.0329099885628 0.0231661090294
0 0.138532079466 0.11024644516
1 0.0323311218192 0.0230591485073
2 0.0893742591071 0.0635197885879
3 0.0179541554089 0.0128313608333
4 0.0968412780798 0.0684965722229
5 0.0338471869764 0.0229417046308


In [19]:
# Compute the features before opening the output file: opening with 'wb'
# truncates it, so a failure during fitting would otherwise leave an empty
# pickle behind.
_feat2 = gen_base_mnb_feat(comb_train,
                           train_y,
                           comb_test,3,rnd=29)
with open('../features/mnb_feat2.pkl','wb') as fout:
    pickle.dump(_feat2,fout)

0 0.17013950897 0.137766542843
1 0.0795185056839 0.0708648635884
2 0.123236395282 0.108441934387
3 0.0300327330205 0.0146157794155
4 0.140281672143 0.126474120042
5 0.0645588569353 0.0519272156845
0 0.162476923761 0.140672833354
1 0.0758198359313 0.0697682943836
2 0.127067794103 0.106291154116
3 0.0192439807651 0.0170760865145
4 0.140768228894 0.126127014893
5 0.063597455964 0.0519235790907
0 0.168494053822 0.13893619362
1 0.0779116635651 0.067094604248
2 0.127780207434 0.104351880275
3 0.0243567855136 0.0160279797648
4 0.15544891496 0.120734900875
5 0.064355638381 0.0496653106203
