In [1]:
import pandas as pd
import numpy as np
import collections
from mxnet import nd, gluon, init, autograd
from mxnet.gluon import nn, data as gdata, loss as gloss
from mxnet.contrib import text
import os

In [2]:
# Folder holding the competition files.
# NOTE(review): the unpacking below assumes the folder contains exactly four
# entries whose sorted order is (ignored file, test set, test labels, train set)
# -- confirm against the actual directory contents.
dir_path = './jigsaw-toxic-comment-classification-challenge/'
_, test_file, test_label_file, train_file = (
    os.path.join(dir_path, entry) for entry in sorted(os.listdir(dir_path))
)

In [3]:
# Load the three CSV files (train set with labels, test set, test labels),
# reading them in that order.
train_data_labels, test_data, test_label = (
    pd.read_csv(csv_path)
    for csv_path in (train_file, test_file, test_label_file)
)

In [4]:
# Names of the six target labels, one binary classifier per label.
labels_name = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]
# 1-based position of every label in labels_name.
labels_index = list(range(1, len(labels_name) + 1))

In [5]:
# Keep only the id and text columns of the training set.
train_data = train_data_labels.loc[:, ['id', 'comment_text']]
# Stack the training rows and the test rows so that the vocabulary is built
# from the comments of both sets.
train_test_data = pd.concat([train_data, test_data], axis=0)

In [6]:
# Tokenization function

def get_tok(data):
    """Split every text in ``data`` into lower-cased tokens.

    Each text is stripped of leading/trailing whitespace, newlines are turned
    into spaces, and the result is split on single space characters. Because
    the split is on exactly one space, consecutive spaces yield empty-string
    tokens.

    Args:
        data: iterable of strings.

    Returns:
        A list with one list of lower-cased tokens per input text.
    """
    tokenized = []
    for sentence in data:
        flattened = sentence.strip().replace('\n', ' ')
        tokenized.append([word.lower() for word in flattened.split(' ')])
    return tokenized

In [7]:
# Vocabulary-building function

def get_voc(data):
    """Build a vocabulary from the token frequencies of ``data``.

    Args:
        data: iterable of strings; tokenized with ``get_tok``.

    Returns:
        A ``text.vocab.Vocabulary`` constructed from the token counter.
    """
    token_counts = collections.Counter()
    for tokens in get_tok(data):
        token_counts.update(tokens)
    return text.vocab.Vocabulary(counter=token_counts)

In [8]:
# Build the word indices that are fed to the embedding layer

def get_features(data, voc, max_len=100, pad_index=0):
    """Convert texts into fixed-length lists of vocabulary indices.

    Every text is tokenized with ``get_tok``, mapped to indices through
    ``voc.to_indices`` and then truncated or right-padded to ``max_len``.

    Args:
        data: iterable of strings.
        voc: vocabulary object exposing ``to_indices(tokens)``.
        max_len: length of every returned index list (default 100, the value
            that was previously hard-coded).
        pad_index: index appended to sequences shorter than ``max_len``
            (default 0, the value that was previously hard-coded).
            NOTE(review): in mxnet's Vocabulary index 0 is documented as the
            unknown token, so padding presumably shares that index -- confirm.

    Returns:
        A list with one list of ``max_len`` integers per input text.
    """
    def padding(v):
        # For long sequences the repeat count is negative, which yields an
        # empty list, so only the truncation takes effect.
        return v[:max_len] + [pad_index] * (max_len - len(v))

    return [padding(voc.to_indices(s)) for s in get_tok(data)]

In [9]:
# Build the vocabulary from the comments of the combined train + test frame.
voc = get_voc(train_test_data['comment_text'])

In [10]:
# Turn every test comment into a fixed-length list of vocabulary indices.
test_features = get_features(test_data['comment_text'], voc)

In [12]:
# Network design. Although the task looks like a six-class problem, it is really
# a separate binary classification for each of the six labels, so the network
# below is built as a binary classifier and used once per label.
# The architecture follows the textCNN idea.

class TextCNN(nn.Block):
    """Text CNN producing two output scores per input sequence.

    Layers: an embedding, one ReLU ``Conv1D`` per (kernel size, channel count)
    pair, global max pooling over each convolution output, dropout (0.3),
    a 6-unit ReLU dense layer, dropout (0.6) and a final 2-unit dense layer.

    Args:
        voc: vocabulary; only ``len(voc)`` is used, as the embedding input size.
        w2v_size: embedding vector size.
        k_sizes: kernel sizes of the convolutions.
        n_channels: output channels of the convolutions, paired with
            ``k_sizes`` through ``zip``.
        **kwargs: forwarded to ``nn.Block``.
    """

    def __init__(self, voc, w2v_size, k_sizes, n_channels, **kwargs):
        super(TextCNN, self).__init__(**kwargs)
        self.embedding = nn.Embedding(len(voc), w2v_size)
        # One convolution per (kernel size, channel count) pair; Sequential is
        # used only as a container, the layers are applied side by side.
        self.convs = nn.Sequential()
        for kernel, channels in zip(k_sizes, n_channels):
            self.convs.add(nn.Conv1D(channels, kernel, activation='relu'))
        self.pooling = nn.GlobalMaxPool1D()
        self.dropout_1 = nn.Dropout(0.3)
        self.dense_h = nn.Dense(6, activation='relu')
        self.dropout_2 = nn.Dropout(0.6)
        self.dense = nn.Dense(2)

    def forward(self, inputs):
        """Compute the two output scores for a batch of index sequences.

        Assumes ``inputs`` is a (batch, sequence length) array of token
        indices -- TODO confirm against callers.
        """
        # Swap the last two axes of the embedding output so that the embedding
        # dimension becomes the channel axis fed to the 1-D convolutions.
        channels_first = self.embedding(inputs).transpose((0, 2, 1))
        pooled = []
        for conv in self.convs:
            pooled.append(nd.flatten(self.pooling(conv(channels_first))))
        # Join the pooled features of all convolutions along the feature axis.
        features = nd.concat(*pooled, dim=1)
        hidden = self.dense_h(self.dropout_1(features))
        return self.dense(self.dropout_2(hidden))

In [62]:
def softmax(X):
    """Row-wise softmax of a 2-D array of scores.

    The maximum of each row is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps ``exp`` from overflowing to
    ``inf`` (which would turn the row into NaN) when scores are large.

    Args:
        X: 2-D array-like supporting ``max``/``sum`` with ``axis`` and
            ``keepdims``, ``exp`` and broadcasting ``-`` and ``/``
            (e.g. an mxnet NDArray).

    Returns:
        An array of the same shape whose rows sum to 1.
    """
    shifted = X - X.max(axis=1, keepdims=True)
    # Exponentiate once and reuse it for numerator and denominator.
    exps = shifted.exp()
    return exps / exps.sum(axis=1, keepdims=True)

In [63]:
# Architecture hyper-parameters passed to TextCNN.
# NOTE(review): presumably the same values that were used when the saved
# parameter files were produced -- confirm against the training notebook.
w2v_size, k_sizes, n_channels = 10, [3, 4, 5], [10, 10, 10]

# The input array is identical for all six models, so build it once instead of
# once per loop iteration.
test_features_ndarrray = nd.array(test_features)

# Module-level namespace. Results are published under dynamic names
# (test_pre_<label> and test_<label>_as) because later cells read them directly.
in_vars = globals()
for i in labels_index:
    label = labels_name[i - 1]
    filename = './model_results/model_params_{0}'.format(label)
    net = TextCNN(voc, w2v_size, k_sizes, n_channels)
    net.load_parameters(filename)
    # Raw two-column network output for this label.
    scores = net(test_features_ndarrray)
    in_vars['test_pre_{0}'.format(label)] = scores
    # Row-wise softmax of the raw output.
    in_vars['test_{0}_as'.format(label)] = softmax(scores)

In [86]:
# For every label, take column 1 of its softmax output (test_<label>_as) as a
# numpy array, keyed by the label name.
results_dict = {
    label: globals()['test_{0}_as'.format(label)][:, 1].asnumpy()
    for label in labels_name
}

In [87]:
# One column per label, one row per test comment.
pre_results = pd.DataFrame(data=results_dict)

In [88]:
# Put the test ids in front of the per-label columns (column-wise concat).
# NOTE(review): assumes test_data and pre_results share the same row index so
# rows line up -- confirm.
results = pd.concat((test_data['id'], pre_results), axis=1)

In [89]:
# Preview the first rows of the per-label columns.
pre_results.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1.0,0.04276186,0.9975051,0.02534585,0.9977515,0.01338618
1,0.000197,4.31731e-07,4.801004e-07,4.495851e-09,1.207244e-07,0.0001087413
2,0.007691,2.167242e-09,0.0004309055,2.002184e-15,5.544279e-06,7.535175e-05
3,0.000725,3.244744e-06,0.001518609,3.6952519999999997e-19,5.522445e-06,1.973828e-08
4,0.002934,0.0002535286,7.926438e-06,9.129067e-12,0.004348285,7.048269e-05


In [90]:
# Write the submission file; the DataFrame index is not written.
results.to_csv('submission.csv', index=False)

In [54]:
import mxnet

In [60]:
# Sanity check: row-wise softmax of the raw 'toxic' model output, computed with
# the notebook's softmax helper instead of repeating the formula inline.
softmax(test_pre_toxic)


[[1.7059134e-11 1.0000000e+00]
 [9.9980253e-01 1.9746160e-04]
 [9.9230886e-01 7.6911170e-03]
 ...
 [9.9984646e-01 1.5349554e-04]
 [9.9998218e-01 1.7859442e-05]
 [2.4421385e-02 9.7557861e-01]]
<NDArray 153164x2 @cpu(0)>

In [85]:
# Sanity check: column 1 of the 'toxic' softmax output as a numpy array.
test_toxic_as[:, 1].asnumpy()

array([1.0000000e+00, 1.9746160e-04, 7.6911170e-03, ..., 1.5349554e-04,
       1.7859442e-05, 9.7557861e-01], dtype=float32)

In [91]:
# Preview the first rows of the submission table (id plus per-label columns).
results.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,1.0,0.04276186,0.9975051,0.02534585,0.9977515,0.01338618
1,0000247867823ef7,0.000197,4.31731e-07,4.801004e-07,4.495851e-09,1.207244e-07,0.0001087413
2,00013b17ad220c46,0.007691,2.167242e-09,0.0004309055,2.002184e-15,5.544279e-06,7.535175e-05
3,00017563c3f7919a,0.000725,3.244744e-06,0.001518609,3.6952519999999997e-19,5.522445e-06,1.973828e-08
4,00017695ad8997eb,0.002934,0.0002535286,7.926438e-06,9.129067e-12,0.004348285,7.048269e-05
