In [60]:
import pandas as pd

def load_lcqmc():
    '''LCQMC文本匹配数据集
    '''
    train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    return train, valid, test


In [61]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [62]:
train,valid,test = load_lcqmc()

In [63]:
import jieba
def cut_by_jieba(sentence):
    return " ".join(jieba.lcut(sentence))

In [64]:
train["query1_seg"] = train["query1"].progress_apply(cut_by_jieba)
train["query2_seg"] = train["query2"].progress_apply(cut_by_jieba)
valid["query1_seg"] = valid["query1"].progress_apply(cut_by_jieba)
valid["query2_seg"] = valid["query2"].progress_apply(cut_by_jieba)
test["query1_seg"] = test["query1"].progress_apply(cut_by_jieba)
test["query2_seg"] = test["query2"].progress_apply(cut_by_jieba)

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [65]:
train

Unnamed: 0,query1,query2,label,query1_seg,query2_seg
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1,喜欢 打篮球 的 男生 喜欢 什么样 的 女生,爱 打篮球 的 男生 喜欢 什么样 的 女生
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1,我 手机 丢 了 ， 我想 换个 手机,我想 买个 新手机 ， 求 推荐
2,大家觉得她好看吗,大家觉得跑男好看吗？,0,大家 觉得 她 好看 吗,大家 觉得 跑 男 好看 吗 ？
3,求秋色之空漫画全集,求秋色之空全集漫画,1,求 秋色 之空 漫画 全集,求 秋色 之空 全集 漫画
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0,晚上 睡觉 带 着 耳机 听 音乐 有 什么 害处 吗 ？,孕妇 可以 戴 耳机 听 音乐 吗 ?
...,...,...,...,...,...
238761,女孩子说我是你的汤是什么意思,男孩给女孩说你的眼是海什么意思,0,女孩子 说 我 是 你 的 汤 是 什么 意思,男孩 给 女孩 说 你 的 眼 是 海 什么 意思
238762,求重生之老公请接招全文,求重生之老公请接招>全文,1,求 重生 之 老公 请 接招 全文,求 重生 之 老公 请 接招 > 全文
238763,求小说电子书，,求《甄嬛》小说电子书！,0,求 小说 电子书 ，,求 《 甄 嬛 》 小说 电子书 ！
238764,杭州有什么好玩的地方？,杭州有什么好玩的地方求推,1,杭州 有 什么 好玩 的 地方 ？,杭州 有 什么 好玩 的 地方 求 推


In [66]:
import gensim
import torch
import torch.nn as nn
import numpy as np
model = gensim.models.Word2Vec.load('models/word2vec.model')
weights = torch.FloatTensor(model.wv.vectors)
unk_idx = len(model.wv.key_to_index)
pad_idx = unk_idx + 1
model.wv.key_to_index['<UNK>'] = unk_idx
model.wv.key_to_index['<PAD>'] = pad_idx
model.wv.index_to_key.extend(['<UNK>', '<PAD>'])
weights = torch.FloatTensor(np.concatenate((model.wv.vectors, np.zeros((2, 100))), axis=0))

In [67]:
from torch.utils.data import DataLoader, Dataset

class MyDataset(Dataset):
    def __init__(self, model, df, train=True):
        self.model = model
        self.df = df
        self.train = train
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        text1 = str(self.df.iloc[idx]['query1_seg']).split()
        text2 = str(self.df.iloc[idx]['query2_seg']).split()
        out1 = [self.model.wv.key_to_index.get(t1, unk_idx) for t1 in text1]
        out2 = [self.model.wv.key_to_index.get(t2, unk_idx) for t2 in text2]
        len1 = min(len(text1), max_len)
        len2 = min(len(text2), max_len)
        if len(out1) > max_len:
            out1 = out1[:max_len]
        else:
            out1 += [pad_idx] * (max_len - len(out1))
        if len(out2) > max_len:
            out2 = out2[:max_len]
        else:
            out2 += [pad_idx] * (max_len - len(out2))
            
        if self.train:
            return torch.tensor(out1), torch.tensor(out2), len1, len2, torch.tensor(self.df.iloc[idx]['label'])
        
        return torch.tensor(out1), torch.tensor(out2), len1, len2

In [68]:
batch_size = 64
max_len = 25

In [69]:
train_ds = MyDataset(model, train)
valid_ds = MyDataset(model, valid)
test_ds = MyDataset(model, test, train=False)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [70]:
config = {'embedding_size': 100,
          'num_layers': 2,
          'hidden_size': 64,
          'max_len': 25,
          'vocab_size':len(model.wv.key_to_index)}

In [71]:
from datetime import datetime
def printbar():
    nowtime = datetime.now()
    print('========' * 8 + nowtime.strftime('%H:%M:%S'))

In [72]:
class SiamGRU(nn.Module):
    def __init__(self, config):
        super(SiamGRU, self).__init__()
#         self.embedding = nn.Embedding(config['vocab_size'], 
#                                       config['embedding_size'], 
#                                       padding_idx=pad_idx)
        self.embedding = nn.Embedding.from_pretrained(weights,
                                                 freeze=False,
                                                 padding_idx=pad_idx)
        
        self.gru = nn.GRU(config['embedding_size'], 
                          config['hidden_size'],
                          num_layers=config['num_layers'], 
                          bidirectional=True,
                          batch_first=True, 
                          dropout=0.5)
    
        self.clf = nn.Sequential(nn.Linear(8 * config['hidden_size'], config['hidden_size']),
                                 nn.ReLU(inplace=True),
                                 nn.Dropout(0.5),
                                 nn.Linear(config['hidden_size'], 2))
        
    def cross_layer(self, x1, x2):
        f1 = torch.mul(x1, x2)
        f2 = torch.abs(x1 - x2)
        return torch.cat([x1, x2, f1, f2], axis=1)
    
    def forward_once(self, x, text_len):
        embed = self.embedding(x)
        output, _ = self.gru(embed) # batch_size, seq_len, 2 * hidden_size
        avg_out = torch.mean(output, 1)
        return avg_out
        
    def forward(self, x1, x2, text_len1, text_len2):
        out1 = self.forward_once(x1, text_len1)
        out2 = self.forward_once(x2, text_len2)
        cross_features = self.cross_layer(out1, out2)
        out = self.clf(cross_features)
        return out

In [79]:
def train(net, epochs, loss_fn, train_dl, valid_dl, optimizer,device):
    net.train()
    printbar()
    print('Start Training...')
    total_train_loss = []
    total_valid_loss = []
    for epoch in range(epochs):
        epoch_train_loss = 0
        count = 0
        for x1, x2, len1, len2, y in tqdm(train_dl):
            optimizer.zero_grad()
            x1 = x1.to(device)
            x2 = x2.to(device)
            len1 = len1.to(device)
            len2 = len2.to(device)
            y = y.to(device)
            y_out = net(x1, x2, len1, len2)
            loss = loss_fn(y_out, y)
            loss.backward()
            optimizer.step()
            epoch_train_loss += loss.item()
            y_pred = torch.max(y_out, 1)[1]
            count += (y_pred.cpu().numpy() == y.cpu().numpy()).mean()
        train_acc = count / len(train_dl)
        train_loss = epoch_train_loss / len(train_dl)
        valid_loss, valid_acc = evaluate(net, loss_fn, valid_dl, device)
        printbar()
        print(f'Epoch: {epoch+1}, train loss: {train_loss:.4f}, valid loss: {valid_loss:.4f}, train_acc: {train_acc:.4f}, valid_acc: {valid_acc:.4f}')
        total_train_loss.append(train_loss)
        total_valid_loss.append(valid_loss)
    return total_train_loss, total_valid_loss

def evaluate(net, loss_fn, valid_dl, device):
    net.eval()
    valid_loss = 0
    count = 0
    with torch.no_grad():
        for x1, x2, len1, len2, y in tqdm(valid_dl):
            x1 = x1.to(device)
            x2 = x2.to(device)
            len1 = len1.to(device)
            len2 = len2.to(device)
            y = y.to(device)
            y_out = net(x1, x2, len1, len2)
            loss = loss_fn(y_out, y)
            valid_loss += loss.item()
            y_pred = torch.max(y_out, 1)[1]
            count += (y_pred.cpu().numpy() == y.cpu().numpy()).mean()
    net.train()
    return valid_loss / len(valid_dl), count / len(valid_dl)

def predict(net, test_dl, device):
    y_pred = []
    for x1, x2, len1, len2 in test_dl:
        x1 = x1.to(device)
        x2 = x2.to(device)
        len1 = len1.to(device)
        len2 = len2.to(device)
        y_out = net(x1, x2, len1, len2)
        y_pred.extend(torch.max(y_out, 1)[1].tolist())
    return y_pred

In [78]:
epochs = 10
device = torch.device('cuda:0' if torch.cuda.is_available() else "cpu")
loss_fn = nn.CrossEntropyLoss()
siamgru = SiamGRU(config)
siamgru.to(device)
optimizer = torch.optim.Adam(siamgru.parameters(), lr=1e-3)
train_loss, valid_loss = train(siamgru, epochs, loss_fn, train_dl, valid_dl, optimizer, device)

Start Training...


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 1, train loss: 0.4089, valid loss: 0.5729, train_acc: 0.8154, valid_acc: 0.7505


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 2, train loss: 0.2867, valid loss: 0.5687, train_acc: 0.8823, valid_acc: 0.7870


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 3, train loss: 0.2369, valid loss: 0.5121, train_acc: 0.9052, valid_acc: 0.7983


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 4, train loss: 0.2052, valid loss: 0.5798, train_acc: 0.9192, valid_acc: 0.8036


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 5, train loss: 0.1808, valid loss: 0.5826, train_acc: 0.9296, valid_acc: 0.8131


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 6, train loss: 0.1619, valid loss: 0.5790, train_acc: 0.9373, valid_acc: 0.8125


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 7, train loss: 0.1467, valid loss: 0.6999, train_acc: 0.9435, valid_acc: 0.8161


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 8, train loss: 0.1329, valid loss: 0.6551, train_acc: 0.9496, valid_acc: 0.8159


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 9, train loss: 0.1204, valid loss: 0.8124, train_acc: 0.9540, valid_acc: 0.8156


  0%|          | 0/3731 [00:00<?, ?it/s]

  0%|          | 0/138 [00:00<?, ?it/s]

Epoch: 10, train loss: 0.1105, valid loss: 0.8623, train_acc: 0.9580, valid_acc: 0.8163


In [80]:
test_preds = predict(siamgru, test_dl, device)

In [82]:
torch.save(siamgru.state_dict, 'models/siamgru.model')

In [83]:
test["siamgru_prediction"] = test_preds

In [84]:
test

Unnamed: 0,query1,query2,label,query1_seg,query2_seg,siamgru_prediction
0,谁有狂三这张高清的,这张高清图，谁有,0,谁 有 狂 三 这张 高清 的,这张 高清 图 ， 谁 有,0
1,英雄联盟什么英雄最好,英雄联盟最好英雄是什么,1,英雄 联盟 什么 英雄 最好,英雄 联盟 最好 英雄 是 什么,1
2,这是什么意思，被蹭网吗,我也是醉了，这是什么意思,0,这是 什么 意思 ， 被 蹭 网 吗,我 也 是 醉 了 ， 这是 什么 意思,0
3,现在有什么动画片好看呢？,现在有什么好看的动画片吗？,1,现在 有 什么 动画片 好看 呢 ？,现在 有 什么 好看 的 动画片 吗 ？,1
4,请问晶达电子厂现在的工资待遇怎么样要求有哪些,三星电子厂工资待遇怎么样啊,0,请问 晶达 电子厂 现在 的 工资待遇 怎么样 要求 有 哪些,三星电子 厂 工资待遇 怎么样 啊,0
...,...,...,...,...,...,...
12495,微店怎么开？怎么做代理？,微店怎样代理,1,微店 怎么 开 ？ 怎么 做 代理 ？,微店 怎样 代理,1
12496,小学科学三年级上,小学三年级科学,0,小学 科学 三年级 上,小学 三年级 科学,0
12497,冬眠是什么意思？,冬眠的意思是什么,1,冬眠 是 什么 意思 ？,冬眠 的 意思 是 什么,1
12498,天猫有假货吗,天猫卖假货吗,0,天猫 有 假货 吗,天猫 卖 假货 吗,1


In [85]:
siamgru_accuracy = len(test[test["label"]==test["siamgru_prediction"]])/len(test)
print("孪生网络的accuracy是:{}".format(siamgru_accuracy))

孪生网络的accuracy是:0.81896
