In [1]:
import pandas as pd

def load_lcqmc():
    '''LCQMC文本匹配数据集
    '''
    train = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.train.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    valid = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.valid.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    test = pd.read_csv('https://mirror.coggle.club/dataset/LCQMC.test.data.zip', 
            sep='\t', names=['query1', 'query2', 'label'])

    return train, valid, test

In [2]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
train,valid,test = load_lcqmc()

In [4]:
test

Unnamed: 0,query1,query2,label
0,谁有狂三这张高清的,这张高清图，谁有,0
1,英雄联盟什么英雄最好,英雄联盟最好英雄是什么,1
2,这是什么意思，被蹭网吗,我也是醉了，这是什么意思,0
3,现在有什么动画片好看呢？,现在有什么好看的动画片吗？,1
4,请问晶达电子厂现在的工资待遇怎么样要求有哪些,三星电子厂工资待遇怎么样啊,0
...,...,...,...
12495,微店怎么开？怎么做代理？,微店怎样代理,1
12496,小学科学三年级上,小学三年级科学,0
12497,冬眠是什么意思？,冬眠的意思是什么,1
12498,天猫有假货吗,天猫卖假货吗,0


In [7]:
#分词
import jieba
train_corpus = []
valid_corpus = []

for i in range(0,len(train)):
    query1 = train.iloc[i]["query1"]
    query2 = train.iloc[i]["query2"]
    train_corpus.append(jieba.lcut(query1))
    train_corpus.append(jieba.lcut(query2))
for i in range(0,len(valid)):
    query1 = valid.iloc[i]["query1"]
    query2 = valid.iloc[i]["query2"]
    valid_corpus.append(jieba.lcut(query1))
    valid_corpus.append(jieba.lcut(query2))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\muma\AppData\Local\Temp\jieba.cache
Loading model cost 0.532 seconds.
Prefix dict has been built successfully.


In [8]:
def cut_by_jieba(sentence):
    return " ".join(jieba.lcut(sentence))

In [9]:
train["query1_seg"] = train["query1"].progress_apply(cut_by_jieba)
train["query2_seg"] = train["query2"].progress_apply(cut_by_jieba)
valid["query1_seg"] = valid["query1"].progress_apply(cut_by_jieba)
valid["query2_seg"] = valid["query2"].progress_apply(cut_by_jieba)
test["query1_seg"] = test["query1"].progress_apply(cut_by_jieba)
test["query2_seg"] = test["query2"].progress_apply(cut_by_jieba)

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [9]:
#训练word2vec
from gensim.models import word2vec
model = word2vec.Word2Vec(train_corpus+valid_corpus, vector_size=100, window=5, min_count=5, workers=4)
model.save('models//word2vec.model')

In [10]:
#训练tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(analyzer='char')
corpus = []
for item in train_corpus:
    corpus.append(" ".join(item))
for item in valid_corpus:
    corpus.append(" ".join(item))
vectorizer.fit_transform(corpus)

<495136x5029 sparse matrix of type '<class 'numpy.float64'>'
	with 5698648 stored elements in Compressed Sparse Row format>

In [11]:
corpus_words = vectorizer.get_feature_names()



In [10]:
import numpy as np

In [12]:
from gensim.models import Word2Vec
model = Word2Vec.load("models//word2vec.model")

In [20]:
words = model.wv.key_to_index
We = []
for word in words:
    We.append(model.wv[word])
We = np.array(We)

In [28]:
seg_fq = {}
N = 0
for datum in train_corpus+valid_corpus:
    for seg in datum:
        if seg in seg_fq:
            seg_fq[seg] += 1
        else:
            seg_fq[seg] = 1
        N += 1

In [37]:
def getWordWeight(seg_fq, a=1e-3):
    if a <=0: # when the parameter makes no sense, use unweighted
        a = 1.0
    seg2weight = {}
    for key, value in seg_fq.items():
        seg2weight[key] = a / (a + value/N)
    return seg2weight

In [43]:
def getWeight(words, word2weight):
    weight4ind = {}
    for word, ind in words.items():
        if word in word2weight:
            weight4ind[ind] = word2weight[word]
        else:
            weight4ind[ind] = 1.0
    return weight4ind

In [44]:
def lookupIDX(words,w):
    w = w.lower()
    if len(w) > 1 and w[0] == '#':
        w = w.replace("#","")
    if w in words:
        return words[w]
    elif 'UUUNKKK' in words:
        return words['UUUNKKK']
    else:
        return len(words) - 1

In [45]:
def getSeq(p1,words):
    p1 = p1.split()
    X1 = []
    for i in p1:
        X1.append(lookupIDX(words,i))
    return X1

In [46]:
def prepare_data(list_of_seqs):
    lengths = [len(s) for s in list_of_seqs]
    n_samples = len(list_of_seqs)
    maxlen = np.max(lengths)
    x = np.zeros((n_samples, maxlen)).astype('int32')
    x_mask = np.zeros((n_samples, maxlen)).astype('float32')
    for idx, s in enumerate(list_of_seqs):
        x[idx, :lengths[idx]] = s
        x_mask[idx, :lengths[idx]] = 1.
    x_mask = np.asarray(x_mask, dtype='float32')
    return x, x_mask

In [59]:
def sentences2idx(sentences, words):
    """
    Given a list of sentences, output array of word indices that can be fed into the algorithms.
    :param sentences: a list of sentences
    :param words: a dictionary, words['str'] is the indices of the word 'str'
    :return: x1, m1. x1[i, :] is the word indices in sentence i, m1[i,:] is the mask for sentence i (0 means no word at the location)
    """
    seq1 = []
    for i in sentences:
        seq1.append(getSeq(i,words))
    x1,m1 = prepare_data(seq1)
    return x1, m1

In [48]:
def getWeight(words, seg2weight):
    weight4ind = {}
    for word, ind in words.items():
        if word in seg2weight:
            weight4ind[ind] = seg2weight[word]
        else:
            weight4ind[ind] = 1.0
    return weight4ind

In [63]:
def seq2weight(seq, mask, weight4ind):
    weight = np.zeros(seq.shape).astype('float32')
    for i in range(seq.shape[0]):
        for j in range(seq.shape[1]):
            if mask[i,j] > 0 and seq[i,j] >= 0:
                weight[i,j] = weight4ind[seq[i,j]]
    weight = np.asarray(weight, dtype='float32')
    return weight

In [71]:
import numpy as np
from sklearn.decomposition import TruncatedSVD


def get_weighted_average(We, x, w):
    """
    Compute the weighted average vectors
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in sentence i
    :param w: w[i, :] are the weights for the words in sentence i
    :return: emb[i, :] are the weighted average vector for sentence i
    """
    n_samples = x.shape[0]
    emb = np.zeros((n_samples, We.shape[1]))
    for i in range(n_samples):
        emb[i,:] = w[i,:].dot(We[x[i,:],:]) / np.count_nonzero(w[i,:])
    return emb

def compute_pc(X,npc=1):
    """
    Compute the principal components. DO NOT MAKE THE DATA ZERO MEAN!
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: component_[i,:] is the i-th pc
    """
    svd = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    svd.fit(X)
    return svd.components_

def remove_pc(X, npc=1):
    """
    Remove the projection on the principal components
    :param X: X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: XX[i, :] is the data point after removing its projection
    """
    pc = compute_pc(X, npc)
    if npc==1:
        XX = X - X.dot(pc.transpose()) * pc
    else:
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


def SIF_embedding(We, x, w, params):
    """
    Compute the scores between pairs of sentences using weighted average + removing the projection on the first principal component
    :param We: We[i,:] is the vector for word i
    :param x: x[i, :] are the indices of the words in the i-th sentence
    :param w: w[i, :] are the weights for the words in the i-th sentence
    :param params.rmpc: if >0, remove the projections of the sentence embeddings to their first principal component
    :return: emb, emb[i, :] is the embedding for sentence i
    """
    emb = get_weighted_average(We, x, w)
#     if  params.rmpc > 0:
    emb = remove_pc(emb, params)
    return emb

In [50]:
seg2weight = getWordWeight(seg_fq, 1e-3)
weight4ind = getWeight(words, seg2weight)

In [13]:
#计算Mean-Pooling
def mean_pooling(model, sentence):
    embeddings = []
    for word in sentence.split():
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            continue
    if len(embeddings) == 0:
        return np.zeros((100,))
    else:
        return np.mean(embeddings, axis=0)

In [14]:
#计算Max-Pooling
def max_pooling(model, sentence):
    embeddings = []
    for word in sentence.split():
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            continue
    if len(embeddings) == 0:
        return np.zeros((100,))
    else:
        return np.max(embeddings, axis=0)

In [39]:
#计算tfidf-pooling
def tfidf_pooling(model, sentence, vec, vocab):
    tfidf = vec.transform([sentence])
    embeddings = []
    weights = []
    for idx, weight in zip(tfidf.indices, tfidf.data):
        if idx >= len(vocab):
            continue
        word = vocab[idx]
        try:
            embeddings.append(model.wv[word])
        except KeyError:
            continue
        weights.append(weight)
    if len(embeddings) == 0:
        return np.zeros((100,))
    return np.average(embeddings, weights=weights, axis=0)

In [69]:
#计算SIF-pooling
def sif_pooing(model,sentence):
    x, m = sentences2idx([sentence], words)
    w = seq2weight(x, m, weight4ind)
    embedding = SIF_embedding(We, x, w, 1)
    return embedding[0]

In [24]:
tfidf = vectorizer.transform(["喜欢 打篮球 的 男生 喜欢 什么样 的 女生"])

In [25]:
tfidf.indices, tfidf.data

(array([3361, 3084, 3003, 2991, 2939, 2386, 2273, 1826, 1223,  997,  383,
         323,    0], dtype=int32),
 array([0.32743866, 0.17890163, 0.20029903, 0.35083583, 0.27177707,
        0.45845405, 0.15997743, 0.21489497, 0.18084487, 0.45577642,
        0.08932919, 0.07201654, 0.29690745]))

In [16]:
train['mean_pooling1'] = train['query1_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))
train['mean_pooling2'] = train['query2_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))
valid['mean_pooling1'] = valid['query1_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))
valid['mean_pooling2'] = valid['query2_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))
test['mean_pooling1'] = test['query1_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))
test['mean_pooling2'] = test['query2_seg'].progress_apply(lambda sentence: mean_pooling(model, sentence))

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [17]:
train['max_pooling1'] = train['query1_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))
train['max_pooling2'] = train['query2_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))
valid['max_pooling1'] = valid['query1_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))
valid['max_pooling2'] = valid['query2_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))
test['max_pooling1'] = test['query1_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))
test['max_pooling2'] = test['query2_seg'].progress_apply(lambda sentence: max_pooling(model, sentence))

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [41]:
train['tfidf_pooling1'] = train['query1_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))
train['tfidf_pooling2'] = train['query2_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))
valid['tfidf_pooling1'] = valid['query1_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))
valid['tfidf_pooling2'] = valid['query2_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))
test['tfidf_pooling1'] = test['query1_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))
test['tfidf_pooling2'] = test['query2_seg'].progress_apply(lambda sentence: tfidf_pooling(model, sentence, vectorizer, corpus_words))

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [72]:
train['sif_pooling1'] = train['query1_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))
train['sif_pooling2'] = train['query2_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))
valid['sif_pooling1'] = valid['query1_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))
valid['sif_pooling2'] = valid['query2_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))
test['sif_pooling1'] = test['query1_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))
test['sif_pooling2'] = test['query2_seg'].progress_apply(lambda sentence: sif_pooing(model, sentence))

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/238766 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/8802 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

In [75]:
from scipy.spatial.distance import cosine
def cos_sim(emb1,emb2):
    return 1- cosine(emb1, emb2)

In [49]:
#使用test数据进行分类，分类依据是train+valid中label为1和0的比例，即将test的句子对的cos_sim由高到低排序，根据train+valid中1的比例，将test中的样本划分为1和0，再计算accuracy
#max_pooling
test["cos_sim_maxpooling"] = [cos_sim(emb1,emb2) for emb1,emb2 in zip(test["max_pooling1"],test["max_pooling2"])]
#mean_pooling
test["cos_sim_meanpooling"] = [cos_sim(emb1,emb2) for emb1,emb2 in zip(test["mean_pooling1"],test["mean_pooling2"])]
#tfidf_pooling
test["cos_sim_tfidfpooling"] = [cos_sim(emb1,emb2) for emb1,emb2 in zip(test["tfidf_pooling1"],test["tfidf_pooling2"])]

  dist = 1.0 - uv / np.sqrt(uu * vv)


In [76]:
#sif_pooling
test["cos_sim_sifpooling"] = [cos_sim(emb1,emb2) for emb1,emb2 in zip(test["sif_pooling1"],test["sif_pooling2"])]

In [77]:
#train+valid中1的比列
label_1_percent = (len(train[train["label"] == 1])+len(valid[valid["label"] == 1]))/(len(train)+len(valid))
print("label为1的占比:",label_1_percent)

label为1的占比: 0.5775221353325147


In [53]:
cos_sim_maxpooling_sort = test.sort_values(by="cos_sim_maxpooling")
cos_sim_meanpooling_sort = test.sort_values(by="cos_sim_meanpooling")
cos_sim_tfidfpooling_sort = test.sort_values(by="cos_sim_tfidfpooling")

In [78]:
cos_sim_sifpooling_sort = test.sort_values(by="cos_sim_sifpooling")

In [55]:
#maxpooling分类效果
predict_by_maxpooling_cossim_1 = cos_sim_maxpooling_sort[int(len(test)*(1-label_1_percent)):]
predict_by_maxpooling_cossim_0 = cos_sim_maxpooling_sort[:int(len(test)*(1-label_1_percent))]
accuracy_from_maxpooling_cossim = 100*(len(predict_by_maxpooling_cossim_1[predict_by_maxpooling_cossim_1["label"] == 1]) + len(predict_by_maxpooling_cossim_0[predict_by_maxpooling_cossim_0["label"] == 0]))/len(test)
print("通过maxpooling的accuacy:{}%".format(accuracy_from_maxpooling_cossim))

通过maxpooling的accuacy:63.408%


In [57]:
#meanpooling分类效果
predict_by_meanpooling_cossim_1 = cos_sim_meanpooling_sort[int(len(test)*(1-label_1_percent)):]
predict_by_meanpooling_cossim_0 = cos_sim_meanpooling_sort[:int(len(test)*(1-label_1_percent))]
accuracy_from_meanpooling_cossim = 100*(len(predict_by_meanpooling_cossim_1[predict_by_meanpooling_cossim_1["label"] == 1]) + len(predict_by_meanpooling_cossim_0[predict_by_meanpooling_cossim_0["label"] == 0]))/len(test)
print("通过meanpooling的accuacy:{}%".format(accuracy_from_meanpooling_cossim))

通过meanpooling的accuacy:66.688%


In [58]:
#tfidfpooling分类效果
predict_by_tfidfpooling_cossim_1 = cos_sim_tfidfpooling_sort[int(len(test)*(1-label_1_percent)):]
predict_by_tfidfpooling_cossim_0 = cos_sim_tfidfpooling_sort[:int(len(test)*(1-label_1_percent))]
accuracy_from_meanpooling_cossim = 100*(len(predict_by_tfidfpooling_cossim_1[predict_by_tfidfpooling_cossim_1["label"] == 1]) + len(predict_by_tfidfpooling_cossim_0[predict_by_tfidfpooling_cossim_0["label"] == 0]))/len(test)
print("通过tfidfpooling的accuacy:{}%".format(accuracy_from_meanpooling_cossim))

通过tfidfpooling的accuacy:68.784%


In [80]:
#sifpooling分类效果
predict_by_sifpooling_cossim_1 = cos_sim_sifpooling_sort[int(len(test)*(1-label_1_percent)):]
predict_by_sifpooling_cossim_0 = cos_sim_sifpooling_sort[:int(len(test)*(1-label_1_percent))]
accuracy_from_meanpooling_cossim = 100*(len(predict_by_sifpooling_cossim_1[predict_by_sifpooling_cossim_1["label"] == 1]) + len(predict_by_sifpooling_cossim_0[predict_by_sifpooling_cossim_0["label"] == 0]))/len(test)
print("通过sifpooling的accuacy:{}%".format(accuracy_from_meanpooling_cossim))

通过sifpooling的accuacy:48.896%


In [82]:
#SIF效果很差，应该有问题，后面需要再进行检查