In [1]:
#任务1：数据集读取
import pandas as pd

def load_lcqmc():
    """Download the LCQMC text-matching dataset and return its three splits.

    Every split is read with ``pd.read_csv`` from the Coggle mirror as a
    tab-separated file, with the column names ``query1``, ``query2`` and
    ``label`` supplied explicitly.

    Returns:
        tuple: ``(train, valid, test)`` DataFrames, in that order.
    """
    # Order matters: it defines the order of the returned tuple.
    split_urls = (
        'https://mirror.coggle.club/dataset/LCQMC.train.data.zip',
        'https://mirror.coggle.club/dataset/LCQMC.valid.data.zip',
        'https://mirror.coggle.club/dataset/LCQMC.test.data.zip',
    )
    column_names = ['query1', 'query2', 'label']

    train, valid, test = (
        pd.read_csv(url, sep='\t', names=column_names) for url in split_urls
    )
    return train, valid, test

In [2]:
# Fetch the three LCQMC splits once; the feature cells below add columns to these frames in place.
train, valid, test = load_lcqmc()

In [3]:
# Task 2: text data analysis
# Step 1: summary statistics of the per-row len() of query1 in the training split
train["query1"].map(len).describe()

count    238766.000000
mean         10.668177
std           4.087534
min           2.000000
25%           8.000000
50%          10.000000
75%          12.000000
max          49.000000
Name: query1, dtype: float64

In [4]:
# Summary statistics of the per-row len() of query2 in the training split
train["query2"].map(len).describe()

count    238766.000000
mean         11.209586
std           4.813823
min           2.000000
25%           8.000000
50%          10.000000
75%          13.000000
max         131.000000
Name: query2, dtype: float64

In [5]:
# Summary statistics of the per-row len() of query1 in the validation split
valid["query1"].map(len).describe()

count    8802.000000
mean       12.411497
std         3.490952
min         4.000000
25%        10.000000
50%        12.000000
75%        14.000000
max        36.000000
Name: query1, dtype: float64

In [6]:
# Summary statistics of the per-row len() of query2 in the validation split
valid["query2"].map(len).describe()

count    8802.000000
mean       12.575324
std         3.722241
min         4.000000
25%        10.000000
50%        12.000000
75%        14.000000
max        36.000000
Name: query2, dtype: float64

In [7]:
# Summary statistics of the per-row len() of query1 in the test split
test["query1"].map(len).describe()

count    12500.000000
mean         9.616080
std          2.549024
min          4.000000
25%          8.000000
50%          9.000000
75%         11.000000
max         26.000000
Name: query1, dtype: float64

In [8]:
# Summary statistics of the per-row len() of query2 in the test split
test["query2"].map(len).describe()

count    12500.000000
mean         9.818480
std          2.778299
min          4.000000
25%          8.000000
50%          9.000000
75%         11.000000
max         27.000000
Name: query2, dtype: float64

In [9]:
#步骤2
import jieba
def cut_by_jieba_len(sentence):
    """Return the number of tokens ``jieba.cut`` yields for ``sentence``."""
    return sum(1 for _token in jieba.cut(sentence))
def cut_by_jieba(sentence):
    """Segment ``sentence`` with ``jieba.cut`` and return the tokens as a list."""
    tokens = [token for token in jieba.cut(sentence)]
    return tokens

In [12]:
# Task 3: text similarity (statistical features)
# Step 1: jieba token count of each query and the absolute difference, for every split
for frame in (train, valid, test):
    frame["query1_seg_len"] = frame["query1"].apply(cut_by_jieba_len)
    frame["query2_seg_len"] = frame["query2"].apply(cut_by_jieba_len)
    frame["seg_len_diff"] = (frame["query1_seg_len"] - frame["query2_seg_len"]).abs()

In [13]:
# Store the jieba token list of each query, for every split.
for frame in (train, valid, test):
    frame["query1_seg"] = frame["query1"].apply(cut_by_jieba)
    frame["query2_seg"] = frame["query2"].apply(cut_by_jieba)

In [14]:
# Text length: len() of each query and the absolute difference, for every split
for frame in (train, valid, test):
    frame["query1_text_len"] = frame["query1"].map(len)
    frame["query2_text_len"] = frame["query2"].map(len)
    frame["text_len_diff"] = (frame["query1_text_len"] - frame["query2_text_len"]).abs()

In [15]:
# Longest common substring
def longestCommonSubstr(word1: str, word2: str) -> int:
    """Return the length of the longest contiguous substring shared by both inputs.

    Classic dynamic programme: the cell for (i, j) holds the length of the
    common substring that ends at ``word1[i - 1]`` and ``word2[j - 1]``.
    Only the previous row is needed to fill the current one, so two rolling
    rows are kept instead of the full (m + 1) x (n + 1) table.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        Length of the longest common substring; 0 when either string is
        empty or the strings share no character.
    """
    if not word1 or not word2:
        return 0

    # The result is symmetric in its arguments; keep the shorter string on
    # the inner axis so the rolling rows are as small as possible.
    if len(word2) > len(word1):
        word1, word2 = word2, word1

    width = len(word2) + 1
    max_len = 0
    previous_row = [0] * width
    for char1 in word1:
        current_row = [0] * width
        for j, char2 in enumerate(word2, start=1):
            if char1 == char2:
                # Extend the match that ended one character earlier in both strings.
                run_length = previous_row[j - 1] + 1
                current_row[j] = run_length
                if run_length > max_len:
                    max_len = run_length
        previous_row = current_row
    return max_len

In [16]:
# Length of the longest common substring between query1 and query2
def gen_lcs_len(df):
    """Compute the longest-common-substring length for every row of ``df``.

    The two columns are iterated directly instead of building a row object
    with ``df.iloc[i]`` twice per row, which is far cheaper on large frames
    and yields the same values in the same order.

    Args:
        df: DataFrame with ``query1`` and ``query2`` columns.

    Returns:
        list: one integer per row, in row order.
    """
    return [
        longestCommonSubstr(first_query, second_query)
        for first_query, second_query in zip(df["query1"], df["query2"])
    ]

In [17]:
# One list of LCS lengths per split, computed in train / valid / test order.
(
    train_query1_query2_lcs_len,
    valid_query1_query2_lcs_len,
    test_query1_query2_lcs_len,
) = (gen_lcs_len(frame) for frame in (train, valid, test))

In [20]:
# Attach each precomputed LCS-length list to its frame as the "lcs_len" column.
for frame, lcs_lengths in (
    (train, train_query1_query2_lcs_len),
    (valid, valid_query1_query2_lcs_len),
    (test, test_query1_query2_lcs_len),
):
    frame["lcs_len"] = lcs_lengths

In [21]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_diff_q1_q2 = []
# NOTE(review): analyzer='char' builds features from individual characters
# (the joining spaces included), so the jieba segmentation does not influence
# the vocabulary — confirm this is intended rather than a word-level analyzer.
vectorizer = TfidfVectorizer(analyzer='char')
# Fit corpus: query1 then query2 of every train row, followed by every valid
# row. Iterating the columns directly avoids materialising a row object with
# iloc twice per row. The test split is deliberately not part of the fit.
corpus = [
    " ".join(tokens)
    for frame in (train, valid)
    for token_pair in zip(frame["query1_seg"], frame["query2_seg"])
    for tokens in token_pair
]
# The returned matrix is not stored; only the fitted vectorizer is used later.
vectorizer.fit_transform(corpus)
# for i in range(len(train)):
#     corpus = [" ".join(train.iloc[i]["query1_seg"])," ".join(train.iloc[i]["query2_seg"])]
#     vectorizer = TfidfVectorizer(analyzer='char')
#     vectorizer.fit_transform(corpus)
#     q1_tfidf = vectorizer.transform([corpus[0]]).todense().tolist()
#     q2_tfidf = vectorizer.transform([corpus[1]]).todense().tolist()
#     diff = 0
#     for j in range(len(q1_tfidf[0])):
#         diff += abs(q1_tfidf[0][j] - q2_tfidf[0][j])
#     tfidf_diff_q1_q2.append(diff)
# train["query_tfidf_diff"] = tfidf_diff_q1_q2

<495136x5029 sparse matrix of type '<class 'numpy.float64'>'
	with 5698648 stored elements in Compressed Sparse Row format>

In [23]:
def seg_merge(segs):
    """Join an iterable of string tokens into one space-separated string."""
    separator = " "
    return separator.join(segs)
# Space-joined token strings for both queries of the test split.
query1_tokens, query2_tokens = test["query1_seg"], test["query2_seg"]
test["query1_seg_str"] = query1_tokens.apply(seg_merge)
test["query2_seg_str"] = query2_tokens.apply(seg_merge)

In [28]:
# seg_len_diff: difference in token count between query1 and query2; text_len_diff: difference in text length between query1 and query2; lcs_len: length of the longest common substring of query1 & query2; query_tfidf_diff: TF-IDF difference between query1 and query2
def tfidf_trans(sent):
    """Encode one sentence string with the module-level fitted ``vectorizer``.

    Returns whatever ``vectorizer.transform`` produces for a one-element
    list containing ``sent``.
    """
    return vectorizer.transform([sent])
# Encode every test query on its own; each cell of the new columns holds the object returned by tfidf_trans.
test["tfidf_query1"] = test["query1_seg_str"].apply(tfidf_trans)
test["tfidf_query2"] = test["query2_seg_str"].apply(tfidf_trans)

In [36]:
def tfidf_diff(tfidf1,tfidf2):
    """Return the sum of absolute element-wise differences of two TF-IDF vectors.

    The difference is computed on the sparse representation, so the vectors
    are never densified and no Python-level loop over the vocabulary runs.

    Args:
        tfidf1: Single-row scipy sparse matrix (as produced by
            ``vectorizer.transform`` on a one-element list).
        tfidf2: Single-row scipy sparse matrix with the same number of columns.

    Returns:
        float: the L1 distance between the two rows.
    """
    return float(abs(tfidf1 - tfidf2).sum())
# Per-row TF-IDF difference between the two encoded queries of each test pair.
test["tfidf_diff"] = [tfidf_diff(tfidf1,tfidf2) for tfidf1,tfidf2 in zip(test["tfidf_query1"],test["tfidf_query2"])]

In [39]:
# Display the test frame together with the feature columns added above.
test

Unnamed: 0,query1,query2,label,query1_seg_len,query2_seg_len,seg_len_diff,query1_seg,query2_seg,query1_text_len,query2_text_len,text_len_diff,lcs_len,query1_seg_str,query2_seg_str,tfidf_query1,tfidf_query2,tfidf_diff
0,谁有狂三这张高清的,这张高清图，谁有,0,7,6,1,"[谁, 有, 狂, 三, 这张, 高清, 的]","[这张, 高清, 图, ，, 谁, 有]",9,8,1,4,谁 有 狂 三 这张 高清 的,这张 高清 图 ， 谁 有,"(0, 4860)\t0.3155538548024277\n (0, 4370)\t...","(0, 4993)\t0.24950117119590645\n (0, 4860)\...",1.764077
1,英雄联盟什么英雄最好,英雄联盟最好英雄是什么,1,5,6,1,"[英雄, 联盟, 什么, 英雄, 最好]","[英雄, 联盟, 最好, 英雄, 是, 什么]",10,11,1,4,英雄 联盟 什么 英雄 最好,英雄 联盟 最好 英雄 是 什么,"(0, 4685)\t0.6341336871346732\n (0, 3762)\t...","(0, 4685)\t0.6219577374979803\n (0, 3762)\t...",0.218062
2,这是什么意思，被蹭网吗,我也是醉了，这是什么意思,0,8,9,1,"[这是, 什么, 意思, ，, 被, 蹭, 网, 吗]","[我, 也, 是, 醉, 了, ，, 这是, 什么, 意思]",11,12,1,6,这是 什么 意思 ， 被 蹭 网 吗,我 也 是 醉 了 ， 这是 什么 意思,"(0, 4993)\t0.19970582911291246\n (0, 4370)\...","(0, 4993)\t0.19780367700000576\n (0, 4486)\...",2.939979
3,现在有什么动画片好看呢？,现在有什么好看的动画片吗？,1,7,8,1,"[现在, 有, 什么, 动画片, 好看, 呢, ？]","[现在, 有, 什么, 好看, 的, 动画片, 吗, ？]",12,13,1,5,现在 有 什么 动画片 好看 呢 ？,现在 有 什么 好看 的 动画片 吗 ？,"(0, 5012)\t0.1563597287866881\n (0, 3121)\t...","(0, 5012)\t0.1542445910019394\n (0, 3121)\t...",0.796405
4,请问晶达电子厂现在的工资待遇怎么样要求有哪些,三星电子厂工资待遇怎么样啊,0,10,5,5,"[请问, 晶达, 电子厂, 现在, 的, 工资待遇, 怎么样, 要求, 有, 哪些]","[三星电子, 厂, 工资待遇, 怎么样, 啊]",22,13,9,7,请问 晶达 电子厂 现在 的 工资待遇 怎么样 要求 有 哪些,三星电子 厂 工资待遇 怎么样 啊,"(0, 4611)\t0.17840086557400103\n (0, 4417)\...","(0, 4417)\t0.40107835370982037\n (0, 4239)\...",3.706871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12495,微店怎么开？怎么做代理？,微店怎样代理,1,8,3,5,"[微店, 怎么, 开, ？, 怎么, 做, 代理, ？]","[微店, 怎样, 代理]",12,6,6,3,微店 怎么 开 ？ 怎么 做 代理 ？,微店 怎样 代理,"(0, 5012)\t0.28106634095656613\n (0, 2941)\...","(0, 2941)\t0.4363370955283356\n (0, 2273)\t...",2.238277
12496,小学科学三年级上,小学三年级科学,0,4,3,1,"[小学, 科学, 三年级, 上]","[小学, 三年级, 科学]",8,7,1,4,小学 科学 三年级 上,小学 三年级 科学,"(0, 3417)\t0.38201281165493967\n (0, 3258)\...","(0, 3417)\t0.39961068973268976\n (0, 3258)\...",0.418174
12497,冬眠是什么意思？,冬眠的意思是什么,1,5,5,0,"[冬眠, 是, 什么, 意思, ？]","[冬眠, 的, 意思, 是, 什么]",8,8,0,3,冬眠 是 什么 意思 ？,冬眠 的 意思 是 什么,"(0, 5012)\t0.16975306997426018\n (0, 3124)\...","(0, 3124)\t0.6111585108638913\n (0, 3084)\t...",0.330381
12498,天猫有假货吗,天猫卖假货吗,0,4,4,0,"[天猫, 有, 假货, 吗]","[天猫, 卖, 假货, 吗]",6,6,0,3,天猫 有 假货 吗,天猫 卖 假货 吗,"(0, 4218)\t0.4932369449623591\n (0, 2894)\t...","(0, 4218)\t0.4605829689687227\n (0, 2894)\t...",0.813065


In [38]:
# Step 2
# Fraction (0-1, despite the name) of label == 1 pairs across train + valid;
# the cells below use it as the share of test rows to predict as positive.
positive_pairs = len(train[train["label"] == 1]) + len(valid[valid["label"] == 1])
labelled_pairs = len(train) + len(valid)
label_1_percent = positive_pairs / labelled_pairs
print("label为1的占比:",label_1_percent)

label为1的占比: 0.5775221353325147


In [40]:
# 1. Classify by text-length difference (smaller is better)
# NOTE(review): sort_values defaults to kind='quicksort', which is not a stable
# sort, so rows with equal text_len_diff end up in arbitrary relative order and
# the cut-off applied afterwards may split a run of ties — confirm whether
# kind='stable' is wanted for reproducibility.
text_len_diff_sort = test.sort_values(by="text_len_diff")

In [41]:
# The first label_1_percent share of the sorted rows is predicted 1, the rest 0.
cutoff = int(len(test)*label_1_percent)
predict_by_text_len_diff_1 = text_len_diff_sort.iloc[:cutoff]
predict_by_text_len_diff_0 = text_len_diff_sort.iloc[cutoff:]

In [42]:
# Accuracy in percent: rows predicted 1 that are labelled 1, plus rows predicted 0 that are labelled 0.
correct_positive = len(predict_by_text_len_diff_1[predict_by_text_len_diff_1["label"] == 1])
correct_negative = len(predict_by_text_len_diff_0[predict_by_text_len_diff_0["label"] == 0])
accuracy_from_text_len_diff = 100*(correct_positive + correct_negative)/len(test)
print("通过文本长度的差别的accuacy:{}%".format(accuracy_from_text_len_diff))

通过文本长度的差别的accuacy:55.4%


In [43]:
# 2. Classify by token-count difference (smaller is better)
# NOTE(review): the default sort kind is not stable, so the relative order of
# rows with equal seg_len_diff is arbitrary — confirm whether kind='stable' is wanted.
seg_len_diff_sort = test.sort_values(by="seg_len_diff")

In [44]:
# The first label_1_percent share of the sorted rows is predicted 1, the rest 0.
cutoff = int(len(test)*label_1_percent)
predict_by_seg_len_diff_1 = seg_len_diff_sort.iloc[:cutoff]
predict_by_seg_len_diff_0 = seg_len_diff_sort.iloc[cutoff:]
# Accuracy in percent: correct positives plus correct negatives over all test rows.
correct_positive = len(predict_by_seg_len_diff_1[predict_by_seg_len_diff_1["label"] == 1])
correct_negative = len(predict_by_seg_len_diff_0[predict_by_seg_len_diff_0["label"] == 0])
accuracy_from_seg_len_diff = 100*(correct_positive + correct_negative)/len(test)
print("通过文本单词个数差别的accuacy:{}%".format(accuracy_from_seg_len_diff))

通过文本单词个数差别的accuacy:54.776%


In [45]:
# 3. Classify by longest-common-substring length (larger is better)
# NOTE(review): the default sort kind is not stable, so the relative order of
# rows with equal lcs_len is arbitrary — confirm whether kind='stable' is wanted.
lcs_len_sort = test.sort_values(by="lcs_len")

In [46]:
# Sorted ascending, so the rows with the shortest common substring come first:
# the first (1 - label_1_percent) share is predicted 0, the remainder 1.
negative_count = int(len(test)*(1-label_1_percent))
predict_by_lcs_len_diff_0 = lcs_len_sort[:negative_count]
predict_by_lcs_len_diff_1 = lcs_len_sort[negative_count:]
accuracy_from_lcs_len_diff = 100*(len(predict_by_lcs_len_diff_1[predict_by_lcs_len_diff_1["label"] == 1]) + len(predict_by_lcs_len_diff_0[predict_by_lcs_len_diff_0["label"] == 0]))/len(test)
# The label previously repeated the token-count wording; it now names the
# metric this cell actually evaluates (longest common substring length).
print("通过文本最长公用字符串长度的accuacy:{}%".format(accuracy_from_lcs_len_diff))

通过文本单词个数差别的accuacy:61.184%


In [47]:
# 4. Classify by TF-IDF encoding difference (smaller is better)
query_tfidf_diff_sort = test.sort_values(by="tfidf_diff")
# The first label_1_percent share of the sorted rows is predicted 1, the rest 0.
cutoff = int(len(test)*label_1_percent)
predict_by_tfidf_diff_1 = query_tfidf_diff_sort.iloc[:cutoff]
predict_by_tfidf_diff_0 = query_tfidf_diff_sort.iloc[cutoff:]
# Accuracy in percent: correct positives plus correct negatives over all test rows.
correct_positive = len(predict_by_tfidf_diff_1[predict_by_tfidf_diff_1["label"] == 1])
correct_negative = len(predict_by_tfidf_diff_0[predict_by_tfidf_diff_0["label"] == 0])
accuracy_from_tfidf_diff = 100*(correct_positive + correct_negative)/len(test)
print("通过TFIDF编码相似度分类的accuacy:{}%".format(accuracy_from_tfidf_diff))

通过TFIDF编码相似度分类的accuacy:73.16%


In [None]:
#通过任务3的分析可知，tfidf作为分类指标的效果是最好的

In [None]:
#任务4
#TODO