In [42]:
import numpy as np
import pandas as pd
train = pd.read_csv("train.csv")
test = pd.read_csv("bili_test.csv")

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
train_x, val_x, train_y, val_y = train_test_split(train['comment_text'],train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']], test_size=0.2, random_state=2)

In [45]:
import re, string
symbols = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return symbols.sub(r' \1 ', s).split()

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
transform_function = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
               min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
               smooth_idf=1, sublinear_tf=1).fit(train['comment_text'])



In [47]:
comments_train = transform_function.transform(train_x)
comments_val = transform_function.transform(val_x)
comments_test = transform_function.transform(test['comment_text'])

In [48]:
train_x = pd.DataFrame(train_x)
val_x = pd.DataFrame(val_x)

In [49]:
combined = [train_x, val_x, test]

In [50]:
col = ['total_length', 'capitals', 'caps_vs_length','num_exclamation_marks', 'num_question_marks', 'num_punctuation','num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique','num_smilies']

In [51]:
for data in combined:
    data['total_length'] = data['comment_text'].apply(len)
    data['capitals'] = data['comment_text'].apply(lambda x: sum(1 for c in x if c.isupper()))
    data['caps_vs_length'] = data.apply(lambda row: float(row['capitals'])/float(row['total_length']),
                                axis=1)
    data['num_exclamation_marks'] = data['comment_text'].apply(lambda x: x.count('!'))
    data['num_question_marks'] = data['comment_text'].apply(lambda x: x.count('?'))
    data['num_punctuation'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '.,;:'))
    data['num_symbols'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in '*&$%'))
    data['num_words'] = data['comment_text'].apply(lambda x: len(x.split()))
    data['num_unique_words'] = data['comment_text'].apply(lambda x: len(set(w for w in x.split())))
    data['words_vs_unique'] = data['num_unique_words'] / data['num_words']
    data['num_smilies'] = data['comment_text'].apply(lambda x: sum(x.count(w) for w in (':-)', ':)', ';-)', ';)')))

In [52]:
import scipy
train_x = scipy.sparse.csr_matrix(train_x[col].values)
val_x = scipy.sparse.csr_matrix(val_x[col].values)
test = scipy.sparse.csr_matrix(test[col].values)

In [53]:
comments_train = scipy.sparse.hstack([train_x.tocsr(),comments_train.tocsr()])
comments_val = scipy.sparse.hstack([val_x,comments_val])
comments_test = scipy.sparse.hstack([test,comments_test])

In [54]:
import xgboost as xgb
import gc

In [55]:
def run(train_X, train_y, test_X, test_y=None, feature_names=None):
    dic = {}
    dic['objective'] = 'binary:logistic'
    dic['eta'] = 0.1
    dic['max_depth'] = 6
    dic['silent'] = 1
    dic['eval_metric'] = 'auc'
    dic['min_child_weight'] = 1
    dic['subsample'] = 0.7
    dic['colsample_bytree'] = 0.7
    num = 100
    list_dic = list(dic.items())

    xgtrain = xgb.DMatrix(train_X, label=train_y)
    xgtest = xgb.DMatrix(test_X, label=test_y)

    model = xgb.train(list_dic, xgtrain, num, [ (xgtrain,'train'), (xgtest, 'test') ], early_stopping_rounds=10)

    return model 

In [56]:
##这个也不好跑
col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
preds = np.zeros((test.shape[0], len(col)))

for i, j in enumerate(col):
    print('fit '+j)
    model = run(comments_train, train_y[j], comments_val,val_y[j])
    preds[:,i] = model.predict(xgb.DMatrix(comments_test), ntree_limit = model.best_ntree_limit)
    gc.collect()

fit toxic
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.69918	test-auc:0.69861
[1]	train-auc:0.71597	test-auc:0.71605
[2]	train-auc:0.75598	test-auc:0.75801
[3]	train-auc:0.76397	test-auc:0.76431
[4]	train-auc:0.76690	test-auc:0.76740
[5]	train-auc:0.82699	test-auc:0.82094
[6]	train-auc:0.85755	test-auc:0.84995
[7]	train-auc:0.85698	test-auc:0.85013
[8]	train-auc:0.85867	test-auc:0.85116
[9]	train-auc:0.86382	test-auc:0.85564
[10]	train-auc:0.87147	test-auc:0.86218
[11]	train-auc:0.87592	test-auc:0.86619
[12]	train-auc:0.88396	test-auc:0.87416
[13]	train-auc:0.88761	test-auc:0.87731
[14]	train-auc:0.89162	test-auc:0.88209
[15]	train-auc:0.89253	test-auc:0.88239
[16]	train-auc:0.89931	test-auc:0.89070
[17]	tr



fit severe_toxic
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.82168	test-auc:0.82789
[1]	train-auc:0.86087	test-auc:0.86567
[2]	train-auc:0.89183	test-auc:0.89292
[3]	train-auc:0.90699	test-auc:0.90721
[4]	train-auc:0.90909	test-auc:0.90863
[5]	train-auc:0.92527	test-auc:0.92512
[6]	train-auc:0.93603	test-auc:0.93043
[7]	train-auc:0.93885	test-auc:0.93479
[8]	train-auc:0.94392	test-auc:0.94437
[9]	train-auc:0.94563	test-auc:0.94873
[10]	train-auc:0.94608	test-auc:0.94896
[11]	train-auc:0.94651	test-auc:0.94910
[12]	train-auc:0.94659	test-auc:0.95054
[13]	train-auc:0.94670	test-auc:0.95058
[14]	train-auc:0.94716	test-auc:0.95047
[15]	train-auc:0.94766	test-auc:0.95186
[16]	train-auc:0.94845	test-auc:0.95598


[86]	train-auc:0.98631	test-auc:0.97609
[87]	train-auc:0.98636	test-auc:0.97645
[88]	train-auc:0.98651	test-auc:0.97655
[89]	train-auc:0.98663	test-auc:0.97682
[90]	train-auc:0.98668	test-auc:0.97692
[91]	train-auc:0.98683	test-auc:0.97704
[92]	train-auc:0.98696	test-auc:0.97709
[93]	train-auc:0.98711	test-auc:0.97715
[94]	train-auc:0.98734	test-auc:0.97725
[95]	train-auc:0.98752	test-auc:0.97718
[96]	train-auc:0.98755	test-auc:0.97722
[97]	train-auc:0.98769	test-auc:0.97746
[98]	train-auc:0.98786	test-auc:0.97759
[99]	train-auc:0.98804	test-auc:0.97777
fit threat
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.69659	test-auc:0.73251
[1]	train-auc:0.73755	test-auc:0.74882
[2]	train-auc:0.74528	test-auc:0.74884

[74]	train-auc:0.97481	test-auc:0.96206
[75]	train-auc:0.97506	test-auc:0.96217
[76]	train-auc:0.97543	test-auc:0.96235
[77]	train-auc:0.97585	test-auc:0.96276
[78]	train-auc:0.97620	test-auc:0.96321
[79]	train-auc:0.97629	test-auc:0.96335
[80]	train-auc:0.97647	test-auc:0.96371
[81]	train-auc:0.97671	test-auc:0.96399
[82]	train-auc:0.97695	test-auc:0.96421
[83]	train-auc:0.97714	test-auc:0.96420
[84]	train-auc:0.97758	test-auc:0.96438
[85]	train-auc:0.97786	test-auc:0.96466
[86]	train-auc:0.97796	test-auc:0.96475
[87]	train-auc:0.97818	test-auc:0.96491
[88]	train-auc:0.97832	test-auc:0.96505
[89]	train-auc:0.97845	test-auc:0.96524
[90]	train-auc:0.97857	test-auc:0.96536
[91]	train-auc:0.97886	test-auc:0.96545
[92]	train-auc:0.97898	test-auc:0.96563
[93]	train-auc:0.97919	test-auc:0.96578
[94]	train-auc:0.97941	test-auc:0.96588
[95]	train-auc:0.97958	test-auc:0.96601
[96]	train-auc:0.97978	test-auc:0.96607
[97]	train-auc:0.97989	test-auc:0.96616
[98]	train-auc:0.98012	test-auc:0.96640


In [73]:
##慢慢慢
labels=pd.read_csv('bili_test_label.csv')
labels=np.array(labels.iloc[:,1:])
sum_labels=np.sum(labels,axis=1)
idx=sum_labels>=0

In [74]:
preds_consider=preds[idx]
labels_consider= labels[idx]
preds_consider.shape,labels_consider.shape

((100, 6), (100, 6))

In [75]:
preds_output=pd.DataFrame(preds_consider)
preds_output

Unnamed: 0,0,1,2,3,4,5
0,0.058796,0.004888,0.021741,0.000921,0.024539,0.005419
1,0.742393,0.035874,0.940924,0.000454,0.309874,0.006232
2,0.121828,0.007203,0.037679,0.001133,0.035876,0.006386
3,0.059197,0.001628,0.016351,0.000846,0.013170,0.003497
4,0.103977,0.006066,0.031953,0.001133,0.035427,0.006543
...,...,...,...,...,...,...
95,0.055072,0.001483,0.011606,0.000712,0.012714,0.003028
96,0.028111,0.001634,0.010181,0.000774,0.011977,0.002618
97,0.061814,0.004361,0.022250,0.000921,0.022836,0.004960
98,0.026953,0.000888,0.011009,0.000417,0.009987,0.002093


In [76]:
preds_output.tail(10)

Unnamed: 0,0,1,2,3,4,5
90,0.056815,0.002836,0.020984,0.001009,0.024091,0.005074
91,0.059738,0.003667,0.024451,0.000888,0.025636,0.005367
92,0.103977,0.006066,0.031953,0.001133,0.035427,0.006543
93,0.086181,0.002883,0.028202,0.001251,0.033409,0.0061
94,0.032829,0.001784,0.012792,0.00067,0.012361,0.003595
95,0.055072,0.001483,0.011606,0.000712,0.012714,0.003028
96,0.028111,0.001634,0.010181,0.000774,0.011977,0.002618
97,0.061814,0.004361,0.02225,0.000921,0.022836,0.00496
98,0.026953,0.000888,0.011009,0.000417,0.009987,0.002093
99,0.149896,0.002675,0.028619,0.001337,0.019782,0.005681


In [77]:
preds_consider=preds[idx]
labels_consider= labels[idx]
preds_consider.shape,labels_consider.shape

((100, 6), (100, 6))