### 1.加载各种库

In [1]:
from keras import backend as K
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
from capsule import *
import jieba

Using TensorFlow backend.


### 2.导入停用词、词向量

In [3]:
# jieba's parallel mode is not available on Windows, so it is left disabled.
# jieba.enable_parallel(4)

# Reset the Keras backend so state from an earlier run of this notebook
# does not leak into the models built below.
K.clear_session()

remove_stop_words = False

train_file = '/home/carnd/BDCI/multi-topic/train.csv'
test_file = '/home/carnd/BDCI/multi-topic/test_public.csv'

# Load the stop-word list (one entry per line). The context manager makes
# sure the file handle is closed again.
with open('/home/carnd/BDCI/multi-topic/stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}

# Load the pre-trained word vectors into a {token: vector} dict.
embeddings_index = {}
EMBEDDING_DIM = 300
embfile = '/home/carnd/BDCI/multi-topic/sgns.baidubaike.bigram-char'

with open(embfile, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # A usable line holds a token followed by EMBEDDING_DIM numbers.
        # Shorter lines (for example a "<count> <dim>" header, if the file has
        # one) would otherwise be stored under the empty token with a vector
        # of the wrong length.
        if len(values) <= EMBEDDING_DIM:
            continue
        # Everything before the last EMBEDDING_DIM fields belongs to the token.
        word = ''.join(values[:-EMBEDDING_DIM])
        try:
            coefs = np.asarray(values[-EMBEDDING_DIM:], dtype='float32')
        except ValueError:
            # Non-numeric vector component: skip this line only, instead of
            # silently swallowing every possible error.
            continue
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 635793 word vectors.


### 3.数据预处理
#### 3.1
    - 去除停用词
    - 将数据用dict类型表示
        - key = 评论
        - value = {'主题+情感值'}
    - 构造feature集和label集

In [4]:
# Read the raw training and test tables.
train_df = pd.read_csv(train_file, encoding='utf-8')
test_df = pd.read_csv(test_file, encoding='utf-8')
# A label is the subject immediately followed by the sentiment value.
train_df['label'] = train_df['subject'].str.cat(train_df['sentiment_value'].astype(str))


# Pick the text normaliser once, then apply it to both tables.
# NOTE(review): the stop-word variant compares whitespace-separated chunks
# against stop_words; text without spaces is a single chunk, so it probably
# removes very little -- confirm the intent before enabling it.
if remove_stop_words:
    def _normalize(text):
        return ''.join([tok for tok in text.strip().split() if tok not in stop_words])
else:
    def _normalize(text):
        return ''.join(text.strip().split())

train_df['content'] = train_df.content.map(_normalize)
test_df['content'] = test_df.content.map(_normalize)

# Group the labels by comment text: {comment: set of labels}.
train_dict = {}
for content, label in zip(train_df['content'], train_df['label']):
    train_dict.setdefault(content, set()).add(label)

In [5]:
# Spot-check the label set stored for one known comment.
sample_comment = '一直92，偶尔出去了不了解当地油品加95(97)。5万公里从没遇到问题，省油，动力也充足，加95也没感觉有啥不同。'
print(train_dict[sample_comment])

{'动力1', '油耗1'}


In [7]:
# Split the {comment: label set} mapping into two parallel lists:
# the features (comment texts) and their label sets, in insertion order.
conts = list(train_dict.keys())
labels = list(train_dict.values())

In [8]:
# Show ten consecutive comments followed by their label sets.
preview = slice(4090, 4100)
print(conts[preview])
print(labels[preview])

['楼主好执着，终于入手xt。我开了半年多感觉动力还是差点，40万以内买suv动力也就这意思了。', '新一代鹰眼2.0的会不会有，还有后排出风口', '确实可以从中控看，挺好用的', '车身刚性应该是提高了，现款的森总感觉薄薄的，开关门没有德系车的厚重感', '加个尾排，会对车性能有影响？？', '你测过百公里加速时间吗。我反复测s#模式，最好一次7.6秒，一般都是在8秒左右。', '是否有保留2.0L发动机？', '换回原厂cd用手机导航', '性能车就不能谈上，2.5后劲不是很好', '这个挺好的啊，我把导航换了，换成了这个']
[{'动力0'}, {'空间0'}, {'配置0'}, {'外观-1'}, {'动力0', '操控0'}, {'动力0'}, {'动力0'}, {'配置0'}, {'动力0', '操控0'}, {'配置0'}]


### 3.数据预处理
#### 3.2
    - 对数据标签进行one-hot编码
      这里的label数量是：10种主题*3种情感 = 30个
    - 调用jieba分词对评论分词
    - 构建词典
    - 构建词汇的embedding矩阵
    - 对数据进行padding


In [11]:
# Multi-hot encode the label sets (one column per distinct label).
mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(labels)

# Segment the training and test comments into words with jieba.
content_list = [jieba.lcut(str(c)) for c in conts]
test_content_list = [jieba.lcut(c) for c in test_df.content.astype(str).values]

# Vocabulary: every token that occurs in either the training or the test set.
word_set = {word for row in content_list + test_content_list for word in row}
print(len(word_set))

# Map each word to an integer id. The vocabulary is sorted first because the
# iteration order of a set of strings differs between interpreter runs; with a
# sorted vocabulary the ids (and everything derived from them) are reproducible.
# Ids start at 1, so row 0 of the embedding matrix stays all-zero
# (presumably the id produced by padding -- verify against pad_sequences).
word2index = {w: i + 1 for i, w in enumerate(sorted(word_set))}
seqs = [[word2index[w] for w in l] for l in content_list]
seqs_dev = [[word2index[w] for w in l] for l in test_content_list]

# Embedding matrix: row i holds the pre-trained vector of the word with id i;
# words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((len(word2index) + 1, EMBEDDING_DIM))
for word, i in word2index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

max_features = len(word_set) + 1

19881


In [12]:
def get_padding_data(maxlen=100):
    """Bring the training and test index sequences to a common length.

    Reads the notebook-level ``seqs`` and ``seqs_dev`` lists and passes each
    one through ``sequence.pad_sequences``.

    Args:
        maxlen: Target sequence length handed to ``pad_sequences``.

    Returns:
        Tuple ``(x_train, x_dev)`` with the padded training and test data.
    """
    padded_train, padded_dev = (
        sequence.pad_sequences(batch, maxlen=maxlen) for batch in (seqs, seqs_dev)
    )
    return padded_train, padded_dev

### 4.构建网络模型
    - 输入层
    - embedding层 -> SpatialDropout层 -> 双向GRU -> capsule层 -> Flatten层 -> Dropout层 -> 全连接层(sigmoid)
    - 输出层

In [13]:
def get_capsule_model(num_classes=None):
    """Build and compile the Bi-GRU + Capsule multi-label classifier.

    Layer stack: frozen pre-trained Embedding -> SpatialDropout1D ->
    Bidirectional GRU (returning sequences) -> Capsule -> Flatten -> Dropout ->
    sigmoid Dense.

    Besides its argument, the function reads these notebook-level names:
    ``maxlen``, ``word2index``, ``EMBEDDING_DIM``, ``embedding_matrix`` and the
    hyper-parameters ``rate_drop_dense``, ``gru_len``, ``dropout_p``,
    ``Num_capsule``, ``Dim_capsule`` and ``Routings`` (not defined in this
    notebook; presumably provided by ``from capsule import *`` -- verify).

    Args:
        num_classes: Width of the sigmoid output layer. When omitted it is
            taken from the number of columns of the global ``y_train`` so the
            output always matches the label matrix (30 for the recorded run)
            instead of relying on a hard-coded value.

    Returns:
        A compiled Keras ``Model`` (binary cross-entropy loss, Adam optimizer,
        accuracy metric).
    """
    if num_classes is None:
        num_classes = y_train.shape[1]

    input1 = Input(shape=(maxlen,))
    # Pre-trained vectors are used as-is (trainable=False).
    embed_layer = Embedding(len(word2index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=maxlen,
                            trainable=False)(input1)

    embed_layer = SpatialDropout1D(rate_drop_dense)(embed_layer)

    # return_sequences=True: the capsule layer consumes the whole sequence of
    # GRU outputs rather than only the last one.
    x = Bidirectional(
        GRU(gru_len, activation='relu', dropout=dropout_p, recurrent_dropout=dropout_p, return_sequences=True))(
        embed_layer)

    capsule = Capsule(num_capsule=Num_capsule, dim_capsule=Dim_capsule, routings=Routings,
                      share_weights=True)(x)

    # output_capsule = Lambda(lambda x: K.sqrt(K.sum(K.square(x), 2)))(capsule)

    capsule = Flatten()(capsule)
    capsule = Dropout(dropout_p)(capsule)

    # One independent sigmoid per label: a comment may carry several labels.
    output = Dense(num_classes, activation='sigmoid')(capsule)

    model = Model(inputs=input1, outputs=output)

    model.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'])

    return model

### 5. 训练

In [54]:
maxlen = 100
X_train, X_dev = get_padding_data(maxlen)
print(X_train.shape, X_dev.shape, y_train.shape)

# Hyper-parameter search recipe (kept for reference, not executed here):
# fit a single model with batch_size=30, epochs=50 and validation_split=0.1,
# using ModelCheckpoint("weights_base.best.hdf5", monitor='val_loss', verbose=1,
# save_best_only=True, mode='min') and
# EarlyStopping(monitor="val_loss", mode="min", patience=2) as callbacks.

batchsize_train = 64
epochs = 15
batchsize_test = 1024
n_models = 5  # number of independently trained models to average later
first_model_results = []

for i in range(n_models):
    # Every get_capsule_model() call adds a fresh set of layers to the backend;
    # resetting it first keeps memory use flat across iterations. Only the
    # numpy predictions of earlier models are kept, so nothing still in use
    # is invalidated.
    K.clear_session()

    model = get_capsule_model()
    model.fit(X_train, y_train, batch_size=batchsize_train, epochs=epochs)
    first_model_results.append(model.predict(X_dev, batch_size=batchsize_test))

(8290, 100) (2364, 100) (8290, 30)
Epoch 1/1
Epoch 1/1
 128/8290 [..............................] - ETA: 8:15 - loss: 0.6081 - acc: 0.7099 

KeyboardInterrupt: 

In [57]:
# Expected layout: (number of trained models, test samples, labels).
print(np.asarray(first_model_results).shape)

(1, 2364, 30)


### 6.预测
    - 如果预测最大值 < 0.5 则取最大值对应的分类做单分类
    - 如果预测最大值 ≥ 0.5 则按照预测值取整做分类

In [70]:
# Average the per-model probabilities: (n_models, n_samples, n_labels)
# -> (n_samples, n_labels).
pred4 = np.average(first_model_results, axis=0)

# Turn probabilities into 0/1 label indicators:
#   * every label with probability >= 0.5 is predicted;
#   * a row in which no label reaches 0.5 falls back to its single most
#     probable label, so every sample ends up with at least one label.
# Thresholding explicitly (instead of int(round(p))) also covers a probability
# of exactly 0.5, which round() maps to 0 and which could leave a row with no
# label at all.
tmp = (pred4 >= 0.5).astype(int)
unlabeled_rows = np.flatnonzero(~tmp.any(axis=1))
tmp[unlabeled_rows, pred4[unlabeled_rows].argmax(axis=1)] = 1

### 7.将数据还原为标准格式

In [71]:
# Map the 0/1 indicator rows back to tuples of label strings.
res = mlb.inverse_transform(tmp)
cids = []
subjs = []
sent_vals = []

# Emit one output row per (content_id, predicted label) pair.
for c, r in zip(test_df.content_id, res):
    for t in r:
        # A label is "<subject><sentiment>". Test for the "-1" suffix itself
        # rather than for a '-' anywhere in the label, so a subject that
        # contains a hyphen is not mistaken for a negative sentiment.
        if t.endswith('-1'):
            sent_val = -1
            subj = t[:-2]
        else:
            # Non-negative sentiments are a single trailing digit.
            sent_val = int(t[-1])
            subj = t[:-1]
        cids.append(c)
        subjs.append(subj)
        sent_vals.append(sent_val)


# The sentiment word is not predicted; every row gets the same placeholder.
res_df = pd.DataFrame({'content_id': cids, 'subject': subjs, 'sentiment_value': sent_vals,
                       'sentiment_word': ['一般'] * len(cids)})

# Fix the column order of the submission file.
columns = ['content_id', 'subject', 'sentiment_value', 'sentiment_word']

res_df = res_df.reindex(columns=columns)

res_df.to_csv('submit_capsule_word.csv', encoding='utf-8', index=False)