In [27]:
import numpy as np

In [28]:
path_train='dataset/train.csv'
path_test='dataset/test.csv'

In [29]:
import pandas as pd
train = pd.read_csv(path_train)
test = pd.read_csv(path_test)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
train_x, val_x, train_y, val_y = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=2)

In [32]:
import re, string
symbols = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return symbols.sub(r' \1 ', s).split()

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
transform_function = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])



In [34]:
comments_train = transform_function.transform(train_x)
comments_val = transform_function.transform(val_x)
comments_test = transform_function.transform(test['comment_text'])



In [35]:
train_x = pd.DataFrame(train_x)
val_x = pd.DataFrame(val_x)

In [36]:
combined = [train_x, val_x, test]

In [37]:
col = ['total_length', 'capitals', 'caps_vs_length','num_exclamation_marks', 'num_question_marks', 'num_punctuation','num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique','num_smilies']

In [38]:
for data in combined:
    data['total_length'] = data['comment_text'].apply(len)
    data['capitals'] = data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    data['num_exclamation_marks'] = data['comment_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['comment_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    data['num_words'] = data['comment_text'].apply(lambda x: len(x.split()))
    data['num_unique_words'] = data['comment_text'].apply(lambda x: len(set(w for w in x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['num_words']
    data['num_smilies'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))

In [39]:
import scipy
train_x = scipy.sparse.csr_matrix(train_x[col].values)
val_x = scipy.sparse.csr_matrix(val_x[col].values)
test = scipy.sparse.csr_matrix(test[col].values)

In [40]:
comments_train = scipy.sparse.hstack([train_x.tocsr(),comments_train.tocsr()])
comments_val = scipy.sparse.hstack([val_x,comments_val])
comments_test = scipy.sparse.hstack([test,comments_test])

In [41]:
import xgboost as xgb

In [42]:
def run(train_X, train_y, test_X, test_y=None, feature_names=None):
    dic = {}
    dic['objective'] = 'binary:logistic'
    dic['eta'] = 0.1
    dic['max_depth'] = 6
    dic['silent'] = 1
    dic['eval_metric'] = 'auc'
    dic['min_child_weight'] = 1
    dic['subsample'] = 0.7
    dic['colsample_bytree'] = 0.7
    num = 100
    list_dic = list(dic.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X, label=test_y)

    model = xgb.train(list_dic, xgtrain, num, [ (xgtrain,'train'), (xgtest, 'test') ], early_stopping_rounds=10)

    return model 

In [47]:
import gc
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = run(comments_train, train_y[j], comments_val,val_y[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)
    model.save_model(f"{j}_xgboost.json")
    gc.collect()

fit toxic
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69918	test-auc:0.69861
[1]	train-auc:0.71597	test-auc:0.71605
[2]	train-auc:0.75598	test-auc:0.75801
[3]	train-auc:0.76397	test-auc:0.76431
[4]	train-auc:0.76690	test-auc:0.76740
[5]	train-auc:0.82699	test-auc:0.82094
[6]	train-auc:0.85755	test-auc:0.84995
[7]	train-auc:0.85698	test-auc:0.85013
[8]	train-auc:0.85867	test-auc:0.85116
[9]	train-auc:0.86382	test-auc:0.85564
[10]	train-auc:0.87147	test-auc:0.86218
[11]	train-auc:0.87592	test-auc:0.86619
[12]	train-auc:0.88396	test-auc:0.87416
[13]	train-auc:0.88761	test-auc:0.87731
[14]	train-auc:0.89162	test-auc:0.88209
[15]	train-auc:0.89253	test-auc:0.88239
[16]	train-auc:0.89931	test-auc:0.89070
[17]	train-auc:0.90224	test-auc:0.89

[86]	train-auc:0.99383	test-auc:0.98551
[87]	train-auc:0.99389	test-auc:0.98551
[88]	train-auc:0.99394	test-auc:0.98542
[89]	train-auc:0.99397	test-auc:0.98551
[90]	train-auc:0.99401	test-auc:0.98552
[91]	train-auc:0.99404	test-auc:0.98551
[92]	train-auc:0.99411	test-auc:0.98549
[93]	train-auc:0.99418	test-auc:0.98559
[94]	train-auc:0.99425	test-auc:0.98547
[95]	train-auc:0.99430	test-auc:0.98551
[96]	train-auc:0.99436	test-auc:0.98548
[97]	train-auc:0.99440	test-auc:0.98552
[98]	train-auc:0.99443	test-auc:0.98548
[99]	train-auc:0.99447	test-auc:0.98555
fit obscene
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.79909	test-auc:0.80105
[1]	train-auc:0.81261	test-auc:0.81233
[2]	train-auc:0.81436	test-auc:0.81385
[3]	train-auc:0.81498	test-

[72]	train-auc:0.99565	test-auc:0.97689
[73]	train-auc:0.99577	test-auc:0.97661
[74]	train-auc:0.99575	test-auc:0.97675
[75]	train-auc:0.99586	test-auc:0.97735
[76]	train-auc:0.99629	test-auc:0.97792
[77]	train-auc:0.99672	test-auc:0.97764
[78]	train-auc:0.99683	test-auc:0.97716
[79]	train-auc:0.99709	test-auc:0.97825
[80]	train-auc:0.99722	test-auc:0.97825
[81]	train-auc:0.99737	test-auc:0.97850
[82]	train-auc:0.99740	test-auc:0.97847
[83]	train-auc:0.99745	test-auc:0.97844
[84]	train-auc:0.99752	test-auc:0.97827
[85]	train-auc:0.99759	test-auc:0.97848
[86]	train-auc:0.99764	test-auc:0.97843
[87]	train-auc:0.99764	test-auc:0.97883
[88]	train-auc:0.99763	test-auc:0.97870
[89]	train-auc:0.99781	test-auc:0.97862
[90]	train-auc:0.99792	test-auc:0.97847
[91]	train-auc:0.99796	test-auc:0.97878
[92]	train-auc:0.99801	test-auc:0.97852
[93]	train-auc:0.99803	test-auc:0.97835
[94]	train-auc:0.99805	test-auc:0.97853
[95]	train-auc:0.99823	test-auc:0.97770
[96]	train-auc:0.99830	test-auc:0.97783


[61]	train-auc:0.98028	test-auc:0.94631
[62]	train-auc:0.98095	test-auc:0.94700
[63]	train-auc:0.98142	test-auc:0.94783
[64]	train-auc:0.98178	test-auc:0.94784
[65]	train-auc:0.98200	test-auc:0.94863
[66]	train-auc:0.98236	test-auc:0.94845
[67]	train-auc:0.98289	test-auc:0.95022
[68]	train-auc:0.98325	test-auc:0.95153
[69]	train-auc:0.98343	test-auc:0.95118
[70]	train-auc:0.98344	test-auc:0.95151
[71]	train-auc:0.98353	test-auc:0.95223
[72]	train-auc:0.98358	test-auc:0.95241
[73]	train-auc:0.98377	test-auc:0.95254
[74]	train-auc:0.98392	test-auc:0.95271
[75]	train-auc:0.98403	test-auc:0.95311
[76]	train-auc:0.98404	test-auc:0.95271
[77]	train-auc:0.98445	test-auc:0.95284
[78]	train-auc:0.98495	test-auc:0.95306
[79]	train-auc:0.98498	test-auc:0.95365
[80]	train-auc:0.98508	test-auc:0.95386
[81]	train-auc:0.98537	test-auc:0.95413
[82]	train-auc:0.98555	test-auc:0.95439
[83]	train-auc:0.98570	test-auc:0.95440
[84]	train-auc:0.98620	test-auc:0.95499
[85]	train-auc:0.98654	test-auc:0.95482


In [48]:
labels=pd.read_csv('dataset/test_labels.csv')
labels=np.array(labels.iloc[:,1:])
sum_labels=np.sum(labels,axis=1)
idx=sum_labels>=0

In [49]:
preds_consider=preds[idx]
labels_consider= labels[idx]
preds_consider.shape,labels_consider.shape

((63978, 6), (63978, 6))

In [50]:
from sklearn.metrics import roc_auc_score
scores=[]
for i in range(6):
  scores.append(roc_auc_score(labels_consider[:,i],preds_consider[:,i]))
np.mean(scores)

0.9639784442207091

In [53]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = run(comments_train, train_y[j], comments_val,val_y[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)

fit toxic
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69918	test-auc:0.69861
[1]	train-auc:0.71597	test-auc:0.71605
[2]	train-auc:0.75598	test-auc:0.75801
[3]	train-auc:0.76397	test-auc:0.76431
[4]	train-auc:0.76690	test-auc:0.76740
[5]	train-auc:0.82699	test-auc:0.82094
[6]	train-auc:0.85755	test-auc:0.84995
[7]	train-auc:0.85698	test-auc:0.85013
[8]	train-auc:0.85867	test-auc:0.85116
[9]	train-auc:0.86382	test-auc:0.85564
[10]	train-auc:0.87147	test-auc:0.86218
[11]	train-auc:0.87592	test-auc:0.86619
[12]	train-auc:0.88396	test-auc:0.87416
[13]	train-auc:0.88761	test-auc:0.87731
[14]	train-auc:0.89162	test-auc:0.88209
[15]	train-auc:0.89253	test-auc:0.88239
[16]	train-auc:0.89931	test-auc:0.89070
[17]	train-auc:0.90224	test-auc:0.89



fit severe_toxic
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.82168	test-auc:0.82789
[1]	train-auc:0.86087	test-auc:0.86567
[2]	train-auc:0.89183	test-auc:0.89292
[3]	train-auc:0.90699	test-auc:0.90721
[4]	train-auc:0.90909	test-auc:0.90863
[5]	train-auc:0.92527	test-auc:0.92512
[6]	train-auc:0.93603	test-auc:0.93043
[7]	train-auc:0.93885	test-auc:0.93479
[8]	train-auc:0.94392	test-auc:0.94437
[9]	train-auc:0.94563	test-auc:0.94873
[10]	train-auc:0.94608	test-auc:0.94896
[11]	train-auc:0.94651	test-auc:0.94910
[12]	train-auc:0.94659	test-auc:0.95054
[13]	train-auc:0.94670	test-auc:0.95058
[14]	train-auc:0.94716	test-auc:0.95047
[15]	train-auc:0.94766	test-auc:0.95186
[16]	train-auc:0.94845	test-auc:0.95598
[17]	train-auc:0.94931	test-a

[86]	train-auc:0.98631	test-auc:0.97609
[87]	train-auc:0.98636	test-auc:0.97645
[88]	train-auc:0.98651	test-auc:0.97655
[89]	train-auc:0.98663	test-auc:0.97682
[90]	train-auc:0.98668	test-auc:0.97692
[91]	train-auc:0.98683	test-auc:0.97704
[92]	train-auc:0.98696	test-auc:0.97709
[93]	train-auc:0.98711	test-auc:0.97715
[94]	train-auc:0.98734	test-auc:0.97725
[95]	train-auc:0.98752	test-auc:0.97718
[96]	train-auc:0.98755	test-auc:0.97722
[97]	train-auc:0.98769	test-auc:0.97746
[98]	train-auc:0.98786	test-auc:0.97759
[99]	train-auc:0.98804	test-auc:0.97777
fit threat
Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69659	test-auc:0.73251
[1]	train-auc:0.73755	test-auc:0.74882
[2]	train-auc:0.74528	test-auc:0.74884
[3]	train-auc:0.75549	test-a

[75]	train-auc:0.97506	test-auc:0.96217
[76]	train-auc:0.97543	test-auc:0.96235
[77]	train-auc:0.97585	test-auc:0.96276
[78]	train-auc:0.97620	test-auc:0.96321
[79]	train-auc:0.97629	test-auc:0.96335
[80]	train-auc:0.97647	test-auc:0.96371
[81]	train-auc:0.97671	test-auc:0.96399
[82]	train-auc:0.97695	test-auc:0.96421
[83]	train-auc:0.97714	test-auc:0.96420
[84]	train-auc:0.97758	test-auc:0.96438
[85]	train-auc:0.97786	test-auc:0.96466
[86]	train-auc:0.97796	test-auc:0.96475
[87]	train-auc:0.97818	test-auc:0.96491
[88]	train-auc:0.97832	test-auc:0.96505
[89]	train-auc:0.97845	test-auc:0.96524
[90]	train-auc:0.97857	test-auc:0.96536
[91]	train-auc:0.97886	test-auc:0.96545
[92]	train-auc:0.97898	test-auc:0.96563
[93]	train-auc:0.97919	test-auc:0.96578
[94]	train-auc:0.97941	test-auc:0.96588
[95]	train-auc:0.97958	test-auc:0.96601
[96]	train-auc:0.97978	test-auc:0.96607
[97]	train-auc:0.97989	test-auc:0.96616
[98]	train-auc:0.98012	test-auc:0.96640
[99]	train-auc:0.98012	test-auc:0.96654
