In [1]:
#任务1：数据集读取
import pandas as pd

def load_lcqmc():
    """Load the LCQMC text-matching dataset.

    Each split is read with ``pd.read_csv`` from its zip URL as a
    tab-separated file; the column names ``query1``, ``query2`` and ``label``
    are supplied explicitly.

    Returns:
        tuple: ``(train, valid, test)`` DataFrames, in that order.
    """
    columns = ['query1', 'query2', 'label']
    split_urls = (
        'https://mirror.coggle.club/dataset/LCQMC.train.data.zip',
        'https://mirror.coggle.club/dataset/LCQMC.valid.data.zip',
        'https://mirror.coggle.club/dataset/LCQMC.test.data.zip',
    )
    # Same read settings for every split; only the URL differs.
    train, valid, test = (
        pd.read_csv(url, sep='\t', names=columns) for url in split_urls
    )
    return train, valid, test

In [2]:
# Fetch the three LCQMC splits; load_lcqmc returns them as (train, valid, test).
train, valid, test = load_lcqmc()

In [3]:
# Task 2: text data analysis
# Step 1
# Summary statistics of len() applied to every query1 value of the training split.
train["query1"].map(len).describe()

count    238766.000000
mean         10.668177
std           4.087534
min           2.000000
25%           8.000000
50%          10.000000
75%          12.000000
max          49.000000
Name: query1, dtype: float64

In [4]:
# Summary statistics of len() applied to every query2 value of the training split.
train["query2"].map(len).describe()

count    238766.000000
mean         11.209586
std           4.813823
min           2.000000
25%           8.000000
50%          10.000000
75%          13.000000
max         131.000000
Name: query2, dtype: float64

In [5]:
# Summary statistics of len() applied to every query1 value of the validation split.
valid["query1"].map(len).describe()

count    8802.000000
mean       12.411497
std         3.490952
min         4.000000
25%        10.000000
50%        12.000000
75%        14.000000
max        36.000000
Name: query1, dtype: float64

In [6]:
# Summary statistics of len() applied to every query2 value of the validation split.
valid["query2"].map(len).describe()

count    8802.000000
mean       12.575324
std         3.722241
min         4.000000
25%        10.000000
50%        12.000000
75%        14.000000
max        36.000000
Name: query2, dtype: float64

In [7]:
# Summary statistics of len() applied to every query1 value of the test split.
test["query1"].map(len).describe()

count    12500.000000
mean         9.616080
std          2.549024
min          4.000000
25%          8.000000
50%          9.000000
75%         11.000000
max         26.000000
Name: query1, dtype: float64

In [8]:
# Summary statistics of len() applied to every query2 value of the test split.
test["query2"].map(len).describe()

count    12500.000000
mean         9.818480
std          2.778299
min          4.000000
25%          8.000000
50%          9.000000
75%         11.000000
max         27.000000
Name: query2, dtype: float64

In [9]:
#步骤2
import jieba
def cut_by_jieba_len(sentence):
    """Return how many tokens ``jieba.cut`` yields for ``sentence``."""
    # Count the tokens as they are produced instead of materialising a list.
    return sum(1 for _ in jieba.cut(sentence))
def cut_by_jieba(sentence):
    """Segment ``sentence`` with ``jieba.cut`` and return the tokens as a list."""
    tokens = [token for token in jieba.cut(sentence)]
    return tokens

In [10]:
# Task 3: text similarity (statistical features)
# Step 1
# Token count of each query, then the absolute gap between the two counts.
train["query1_seg_len"] = train["query1"].map(cut_by_jieba_len)
train["query2_seg_len"] = train["query2"].map(cut_by_jieba_len)
train["seg_len_diff"] = (train["query1_seg_len"] - train["query2_seg_len"]).abs()

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\muma\AppData\Local\Temp\jieba.cache
Loading model cost 0.514 seconds.
Prefix dict has been built successfully.


In [11]:
# Keep the jieba token list of each query as its own column.
train["query1_seg"] = train["query1"].map(cut_by_jieba)
train["query2_seg"] = train["query2"].map(cut_by_jieba)

In [12]:
# Text length: len() of each query, then the absolute gap between the two.
train["query1_text_len"] = train["query1"].apply(len)
train["query2_text_len"] = train["query2"].apply(len)
train["text_len_diff"] = (train["query1_text_len"] - train["query2_text_len"]).abs()

In [13]:
#最长公共子串
def longestCommonSubstr(word1: str, word2: str) -> int:
    """Return the length of the longest common contiguous substring.

    Args:
        word1: First string.
        word2: Second string.

    Returns:
        Length of the longest run of characters that occurs contiguously in
        both strings; 0 when either string is empty or nothing is shared.
    """
    width = len(word2) + 1
    # prev[j] holds the length of the common run ending at the previous
    # character of word1 and at word2[j - 1]. Only the previous row is ever
    # read, so two rows replace the full (m + 1) x (n + 1) table.
    prev = [0] * width
    max_len = 0
    for ch1 in word1:
        curr = [0] * width
        for j, ch2 in enumerate(word2, start=1):
            if ch1 == ch2:
                # Extend the run that ended on the diagonal neighbour.
                curr[j] = prev[j - 1] + 1
                if curr[j] > max_len:
                    max_len = curr[j]
        prev = curr
    return max_len

In [14]:
# Longest-common-substring length between query1 and query2, one value per row.
# Walking the two columns with zip avoids building a row Series through
# train.iloc[i] twice per iteration.
query1_query2_lcs_len = [
    longestCommonSubstr(q1, q2)
    for q1, q2 in zip(train["query1"], train["query2"])
]

In [15]:
# Store the per-row longest-common-substring lengths as a feature column.
train["lcs_len"] = query1_query2_lcs_len

In [16]:
#tfidf
from sklearn.feature_extraction.text import TfidfVectorizer
# Per-row TF-IDF distance between the two queries.
# A character-level TfidfVectorizer is fitted on just the pair (each query's
# tokens joined with spaces) and the sum of absolute differences between the
# two resulting row vectors is stored.
# NOTE(review): with analyzer='char' the joining spaces are presumably counted
# as characters too, so the segmentation leaks into this feature -- confirm
# that this is intended.
vectorizer = TfidfVectorizer(analyzer='char')
tfidf_diff_q1_q2 = []
for q1_seg, q2_seg in zip(train["query1_seg"], train["query2_seg"]):
    corpus = [" ".join(q1_seg), " ".join(q2_seg)]
    # fit_transform re-learns the vocabulary for this pair and already returns
    # the TF-IDF rows of both documents, so no extra transform() calls or
    # dense conversion are needed.
    pair_tfidf = vectorizer.fit_transform(corpus)
    diff = abs(pair_tfidf[0] - pair_tfidf[1]).sum()
    tfidf_diff_q1_q2.append(float(diff))
train["query_tfidf_diff"] = tfidf_diff_q1_q2

In [17]:
# seg_len_diff is the difference in token counts between query1 and query2,
# text_len_diff is the difference in text length, lcs_len is the length of the
# longest common substring of query1 and query2, and query_tfidf_diff is the
# TF-IDF difference between query1 and query2.
train

Unnamed: 0,query1,query2,label,query1_seg_len,query2_seg_len,seg_len_diff,query1_seg,query2_seg,query1_text_len,query2_text_len,text_len_diff,lcs_len,query_tfidf_diff
0,喜欢打篮球的男生喜欢什么样的女生,爱打篮球的男生喜欢什么样的女生,1,8,8,0,"[喜欢, 打篮球, 的, 男生, 喜欢, 什么样, 的, 女生]","[爱, 打篮球, 的, 男生, 喜欢, 什么样, 的, 女生]",16,15,1,14,0.460537
1,我手机丢了，我想换个手机,我想买个新手机，求推荐,1,8,6,2,"[我, 手机, 丢, 了, ，, 我想, 换个, 手机]","[我想, 买个, 新手机, ，, 求, 推荐]",12,11,1,2,2.017176
2,大家觉得她好看吗,大家觉得跑男好看吗？,0,5,7,2,"[大家, 觉得, 她, 好看, 吗]","[大家, 觉得, 跑, 男, 好看, 吗, ？]",8,10,2,4,1.341360
3,求秋色之空漫画全集,求秋色之空全集漫画,1,5,5,0,"[求, 秋色, 之空, 漫画, 全集]","[求, 秋色, 之空, 全集, 漫画]",9,9,0,5,0.000000
4,晚上睡觉带着耳机听音乐有什么害处吗？,孕妇可以戴耳机听音乐吗?,0,12,8,4,"[晚上, 睡觉, 带, 着, 耳机, 听, 音乐, 有, 什么, 害处, 吗, ？]","[孕妇, 可以, 戴, 耳机, 听, 音乐, 吗, ?]",18,12,6,5,2.690207
...,...,...,...,...,...,...,...,...,...,...,...,...,...
238761,女孩子说我是你的汤是什么意思,男孩给女孩说你的眼是海什么意思,0,10,11,1,"[女孩子, 说, 我, 是, 你, 的, 汤, 是, 什么, 意思]","[男孩, 给, 女孩, 说, 你, 的, 眼, 是, 海, 什么, 意思]",14,15,1,4,1.205921
238762,求重生之老公请接招全文,求重生之老公请接招>全文,1,7,8,1,"[求, 重生, 之, 老公, 请, 接招, 全文]","[求, 重生, 之, 老公, 请, 接招, >, 全文]",11,12,1,9,0.399753
238763,求小说电子书，,求《甄嬛》小说电子书！,0,4,8,4,"[求, 小说, 电子书, ，]","[求, 《, 甄, 嬛, 》, 小说, 电子书, ！]",7,11,4,5,2.065871
238764,杭州有什么好玩的地方？,杭州有什么好玩的地方求推,1,7,8,1,"[杭州, 有, 什么, 好玩, 的, 地方, ？]","[杭州, 有, 什么, 好玩, 的, 地方, 求, 推]",11,12,1,10,0.756582


In [18]:
# Step 2
# Class balance: row counts per label and the share of label-1 rows.
n_label_1 = len(train[train["label"] == 1])
n_label_0 = len(train[train["label"] == 0])
print(n_label_1, n_label_0)
label_1_percent = n_label_1 / len(train)
print("label为1的占比:", label_1_percent)

138574 100192
label为1的占比: 0.5803757653937328


In [19]:
# 1. Classify by text-length difference (smaller is better)
# Order the rows by text_len_diff so the most similar lengths come first.
text_len_diff_sort = train.sort_values(by="text_len_diff")

In [20]:
# The leading label_1_percent share of the sorted rows is predicted as label 1,
# the remaining rows as label 0.
text_len_cutoff = int(len(train)*label_1_percent)
predict_by_text_len_diff_1 = text_len_diff_sort.iloc[:text_len_cutoff]
predict_by_text_len_diff_0 = text_len_diff_sort.iloc[text_len_cutoff:]

In [21]:
# Accuracy = rows whose true label equals the predicted bucket, as a percentage.
text_len_hits_1 = len(predict_by_text_len_diff_1[predict_by_text_len_diff_1["label"] == 1])
text_len_hits_0 = len(predict_by_text_len_diff_0[predict_by_text_len_diff_0["label"] == 0])
accuracy_from_text_len_diff = 100*(text_len_hits_1 + text_len_hits_0)/len(train)
print("通过文本长度的差别的accuacy:{}%".format(accuracy_from_text_len_diff))

通过文本长度的差别的accuacy:61.77428947170033%


In [22]:
# 2. Classify by token-count difference (smaller is better)
# Order the rows by seg_len_diff so the most similar token counts come first.
seg_len_diff_sort = train.sort_values(by="seg_len_diff")

In [23]:
# The leading label_1_percent share of the sorted rows is predicted as label 1,
# the remaining rows as label 0; accuracy is the percentage of matching labels.
seg_len_cutoff = int(len(train)*label_1_percent)
predict_by_seg_len_diff_1 = seg_len_diff_sort.iloc[:seg_len_cutoff]
predict_by_seg_len_diff_0 = seg_len_diff_sort.iloc[seg_len_cutoff:]
seg_len_hits_1 = len(predict_by_seg_len_diff_1[predict_by_seg_len_diff_1["label"] == 1])
seg_len_hits_0 = len(predict_by_seg_len_diff_0[predict_by_seg_len_diff_0["label"] == 0])
accuracy_from_seg_len_diff = 100*(seg_len_hits_1 + seg_len_hits_0)/len(train)
print("通过文本单词个数差别的accuacy:{}%".format(accuracy_from_seg_len_diff))

通过文本单词个数差别的accuacy:57.88596366316812%


In [24]:
# 3. Classify by longest-common-substring length (larger is better)
# Order the rows by lcs_len; the shortest common substrings come first.
lcs_len_sort = train.sort_values(by="lcs_len")

In [25]:
# lcs_len_sort is ordered by lcs_len, so the leading (1 - label_1_percent)
# share of rows is predicted as label 0 and the rest as label 1.
lcs_cutoff = int(len(train)*(1-label_1_percent))
predict_by_lcs_len_diff_0 = lcs_len_sort.iloc[:lcs_cutoff]
predict_by_lcs_len_diff_1 = lcs_len_sort.iloc[lcs_cutoff:]
lcs_hits_1 = len(predict_by_lcs_len_diff_1[predict_by_lcs_len_diff_1["label"] == 1])
lcs_hits_0 = len(predict_by_lcs_len_diff_0[predict_by_lcs_len_diff_0["label"] == 0])
accuracy_from_lcs_len_diff = 100*(lcs_hits_1 + lcs_hits_0)/len(train)
# The message names the LCS feature; it previously repeated the token-count
# wording of the preceding experiment.
print("通过文本最长公用字符串长度的accuacy:{}%".format(accuracy_from_lcs_len_diff))

通过文本单词个数差别的accuacy:65.00046070211002%


In [27]:
# 4. Classify by TF-IDF difference (smaller is better)
# The leading label_1_percent share of the sorted rows is predicted as label 1,
# the remaining rows as label 0; accuracy is the percentage of matching labels.
query_tfidf_diff_sort = train.sort_values("query_tfidf_diff")
tfidf_cutoff = int(len(train)*label_1_percent)
predict_by_tfidf_diff_1 = query_tfidf_diff_sort.iloc[:tfidf_cutoff]
predict_by_tfidf_diff_0 = query_tfidf_diff_sort.iloc[tfidf_cutoff:]
tfidf_hits_1 = len(predict_by_tfidf_diff_1[predict_by_tfidf_diff_1["label"] == 1])
tfidf_hits_0 = len(predict_by_tfidf_diff_0[predict_by_tfidf_diff_0["label"] == 0])
accuracy_from_tfidf_diff = 100*(tfidf_hits_1 + tfidf_hits_0)/len(train)
print("通过TFIDF编码相似度分类的accuacy:{}%".format(accuracy_from_tfidf_diff))

通过TFIDF编码相似度分类的accuacy:76.76721141201008%


In [None]:
#通过任务3的分析可知，tfidf作为分类指标的效果是最好的

In [None]:
#任务4
#TODO