In [3]:
import numpy as np
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
train_x, val_x, train_y, val_y = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=2)

In [6]:
import re, string
symbols = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return symbols.sub(r' \1 ', s).split()

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
transform_function = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])



In [8]:
comments_train = transform_function.transform(train_x)
comments_val = transform_function.transform(val_x)
comments_test = transform_function.transform(test['comment_text'])

In [9]:
train_x = pd.DataFrame(train_x)
val_x = pd.DataFrame(val_x)

In [10]:
combined = [train_x, val_x, test]

In [11]:
col = ['total_length', 'capitals', 'caps_vs_length','num_exclamation_marks', 'num_question_marks', 'num_punctuation','num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique','num_smilies']

In [12]:
for data in combined:
    data['total_length'] = data['comment_text'].apply(len)
    data['capitals'] = data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    data['num_exclamation_marks'] = data['comment_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['comment_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    data['num_words'] = data['comment_text'].apply(lambda x: len(x.split()))
    data['num_unique_words'] = data['comment_text'].apply(lambda x: len(set(w for w in x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['num_words']
    data['num_smilies'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))

In [13]:
import scipy
train_x = scipy.sparse.csr_matrix(train_x[col].values)
val_x = scipy.sparse.csr_matrix(val_x[col].values)
test = scipy.sparse.csr_matrix(test[col].values)

In [14]:
comments_train = scipy.sparse.hstack([train_x.tocsr(),comments_train.tocsr()])
comments_val = scipy.sparse.hstack([val_x,comments_val])
comments_test = scipy.sparse.hstack([test,comments_test])

In [15]:
import xgboost as xgb
import gc

In [16]:
def XGBoost(train_X, train_y, test_X, test_y=None, feature_names=None):
    dic = {}
    #dic['objective'] = 'binary:logistic'
    #dic['eta'] = 0.1
    #dic['max_depth'] = 6
    #dic['silent'] = 1
    #dic['eval_metric'] = 'auc'
    #dic['min_child_weight'] = 1
    #dic['subsample'] = 0.7
    #dic['colsample_bytree'] = 0.7
    num = 100
    list_dic = list(dic.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X, label=test_y)

    model = xgb.train(list_dic, xgtrain, num, [ (xgtrain,'train'), (xgtest, 'test') ], early_stopping_rounds=10)

    return model 

In [17]:
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = XGBoost(comments_train, train_y[j], comments_val,val_y[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
[0]	train-rmse:0.38988	test-rmse:0.39070
[1]	train-rmse:0.32039	test-rmse:0.32197
[2]	train-rmse:0.27913	test-rmse:0.28141
[3]	train-rmse:0.25346	test-rmse:0.25658
[4]	train-rmse:0.23821	test-rmse:0.24219
[5]	train-rmse:0.22921	test-rmse:0.23340
[6]	train-rmse:0.22276	test-rmse:0.22791
[7]	train-rmse:0.21823	test-rmse:0.22421
[8]	train-rmse:0.21471	test-rmse:0.22140
[9]	train-rmse:0.21133	test-rmse:0.21877
[10]	train-rmse:0.20917	test-rmse:0.21638
[11]	train-rmse:0.20718	test-rmse:0.21502
[12]	train-rmse:0.20521	test-rmse:0.21340
[13]	train-rmse:0.20342	test-rmse:0.21194
[14]	train-rmse:0.20175	test-rmse:0.21074
[15]	train-rmse:0.20012	test-rmse:0.20997
[16]	train-rmse:0.19867	test-rmse:0.20914
[17]	train-rmse:0.19697	test-rmse:0.20814
[18]	train-rmse:0.19565	test-rmse:0.20756
[19]	train-rmse:0.19455	test-rmse:0.20678
[20]	train-rmse:0.19346	test-rmse:0.20617
[21]	train-rmse:0.19190	test-rmse:0.20524
[22]	train-rmse:0.19099	test-rmse:0.20451
[23]	train-rmse:0.19012	test-rmse:



fit severe_toxic
[0]	train-rmse:0.35502	test-rmse:0.35564
[1]	train-rmse:0.25520	test-rmse:0.25672
[2]	train-rmse:0.18739	test-rmse:0.19029
[3]	train-rmse:0.14234	test-rmse:0.14709
[4]	train-rmse:0.11349	test-rmse:0.12024
[5]	train-rmse:0.09575	test-rmse:0.10469
[6]	train-rmse:0.08531	test-rmse:0.09601
[7]	train-rmse:0.07944	test-rmse:0.09146
[8]	train-rmse:0.07578	test-rmse:0.08926
[9]	train-rmse:0.07351	test-rmse:0.08801
[10]	train-rmse:0.07222	test-rmse:0.08732
[11]	train-rmse:0.07138	test-rmse:0.08697
[12]	train-rmse:0.07082	test-rmse:0.08685
[13]	train-rmse:0.07034	test-rmse:0.08679
[14]	train-rmse:0.06978	test-rmse:0.08672
[15]	train-rmse:0.06899	test-rmse:0.08635
[16]	train-rmse:0.06861	test-rmse:0.08633
[17]	train-rmse:0.06837	test-rmse:0.08633
[18]	train-rmse:0.06809	test-rmse:0.08631
[19]	train-rmse:0.06769	test-rmse:0.08635
[20]	train-rmse:0.06735	test-rmse:0.08641
[21]	train-rmse:0.06714	test-rmse:0.08639
[22]	train-rmse:0.06691	test-rmse:0.08639
[23]	train-rmse:0.06673	tes

[43]	train-rmse:0.12599	test-rmse:0.14807
[44]	train-rmse:0.12565	test-rmse:0.14801
[45]	train-rmse:0.12525	test-rmse:0.14796
[46]	train-rmse:0.12497	test-rmse:0.14788
[47]	train-rmse:0.12461	test-rmse:0.14779
[48]	train-rmse:0.12424	test-rmse:0.14779
[49]	train-rmse:0.12381	test-rmse:0.14774
[50]	train-rmse:0.12350	test-rmse:0.14762
[51]	train-rmse:0.12322	test-rmse:0.14752
[52]	train-rmse:0.12298	test-rmse:0.14740
[53]	train-rmse:0.12272	test-rmse:0.14737
[54]	train-rmse:0.12239	test-rmse:0.14729
[55]	train-rmse:0.12211	test-rmse:0.14729
[56]	train-rmse:0.12183	test-rmse:0.14734
[57]	train-rmse:0.12163	test-rmse:0.14723
[58]	train-rmse:0.12134	test-rmse:0.14721
[59]	train-rmse:0.12110	test-rmse:0.14724
[60]	train-rmse:0.12082	test-rmse:0.14720
[61]	train-rmse:0.12061	test-rmse:0.14709
[62]	train-rmse:0.12025	test-rmse:0.14704
[63]	train-rmse:0.12008	test-rmse:0.14700
[64]	train-rmse:0.11988	test-rmse:0.14699
[65]	train-rmse:0.11971	test-rmse:0.14708
[66]	train-rmse:0.11949	test-rmse:

In [18]:
##慢慢慢
labels=pd.read_csv('test_labels.csv')
labels=np.array(labels.iloc[:,1:])
sum_labels=np.sum(labels,axis=1)
idx=sum_labels>=0

In [19]:
preds_consider=preds[idx]
labels_consider= labels[idx]
preds_consider.shape,labels_consider.shape

((63978, 6), (63978, 6))

In [20]:
from sklearn.metrics import roc_auc_score
scores=[]
for i in range(6):
    scores.append(roc_auc_score(labels_consider[:,i],preds_consider[:,i]))
np.mean(scores)

0.9413375603198052