In [1]:
import pandas as pd
import os
import distance  
import Levenshtein
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from numba import jit
from sklearn import metrics
from sklearn.model_selection import KFold

In [2]:
# !pip install distance
!pip install python-Levenshtein textdistance



## 加载数据

In [3]:
train=pd.read_csv('data/train.csv',sep='\t',header=None)
train.columns=['q1','q2','label']
test=pd.read_csv('data/test.csv',sep='\t',header=None)
test.columns=['q1','q2']
test['label']=1
sample_submit=pd.read_csv('data/sample_submit.csv')

In [4]:
train.head()

Unnamed: 0,q1,q2,label
0,有哪些女明星被潜规则啦,哪些女明星被潜规则了,1
1,怎么支付宝绑定银行卡？,银行卡怎么绑定支付宝,1
2,请问这部电视剧叫什么名字,请问谁知道这部电视剧叫什么名字,1
3,泰囧完整版下载,エウテルペ完整版下载,0
4,在沧州市区哪家卖的盐焗鸡好吃？,沧州饭店哪家便宜又好吃又实惠,0


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q1      5000 non-null   object
 1   q2      5000 non-null   object
 2   label   5000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [6]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   q1      5000 non-null   object
 1   q2      5000 non-null   object
 2   label   5000 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 117.3+ KB


In [7]:
train['label'].value_counts(normalize=True)

1    0.5784
0    0.4216
Name: label, dtype: float64

In [8]:
data=pd.concat([train,test],axis=0).reset_index(drop=True)
train_size=len(train)

## 特征工程

### 1 基础特征

In [9]:
# 文本长度特征
data['q1_len']=data['q1'].astype(str).map(len)
data['q2_len']=data['q2'].astype(str).map(len)

In [10]:
data['q1_len'].describe()

count    10000.000000
mean        10.658400
std          4.019095
min          3.000000
25%          8.000000
50%         10.000000
75%         12.000000
max         49.000000
Name: q1_len, dtype: float64

In [11]:
# 长度差特征：差/比例
data['q1q2_len_diff']=data['q1_len']-data['q2_len']
data['q1q2_len_diff_abs']=np.abs(data['q1_len']-data['q2_len'])
data['q1q2_rate']=data['q1_len']/data['q2_len']
data['q2q1_rate']=data['q2_len']/data['q1_len']


In [12]:
## 特殊符号特征
data['q1_end_special']=data['q1'].str.endswith('？').astype(int)
data['q2_end_special']=data['q2'].str.endswith('？').astype(int)


## 2 共现字特征


In [13]:
data['comm_q1q2char_nums']=data.apply(lambda  row:len(set(row['q1'])&set(row['q2'])),axis=1)

In [14]:
# 共现字位置
def char_match_pos(q1, q2, pos_i):
    q1 = list(q1)
    q2 = list(q2)

    if pos_i < len(q1):
        q2_len = min(len(q2), 25)  # q2_len只匹配前25个字
        for pos_j in range(q2_len):
            if q1[pos_i] == q2[pos_j]:
                q_pos = pos_j + 1  # 如果匹配上了 记录匹配的位置
                break
            elif pos_j == q2_len - 1:
                q_pos = 0  # 如果没有匹配上 赋值为0
    else:
        q_pos = -1  # 如果后续长度不存在 赋值为-1

    return q_pos


for pos_i in range(8):
    data['q1_pos_' + str(pos_i + 1)] = data.apply(
        lambda row: char_match_pos(row['q1'], row['q2'], pos_i), axis=1).astype(np.int8)

In [15]:
# 这里也可以用结巴分词，改成“词”粒度的

## 3 距离特征

In [16]:
print("===========距离特征 =============")
sim_func_dict = {"jaccard": distance.jaccard,
                 "sorensen": distance.sorensen,
                 "levenshtein": distance.levenshtein,
                 "ratio": Levenshtein.ratio
                 }

for sim_func in tqdm(sim_func_dict, desc="距离特征"):
    data[sim_func] = data.apply(lambda row: sim_func_dict[sim_func](row["q1"],row["q2"]), axis=1)
    qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]

    for qt_len in qt:
        if qt_len[0] == 3 and sim_func == "levenshtein":
            pass
        else:
            data[sim_func + '_q' + str(qt_len[0]) + '_t' + str(qt_len[1])] = data.apply(
                lambda row: sim_func_dict[sim_func](row["q1"][:qt_len[0]],
                                                    row["q2"][:qt_len[1]]),
                axis=1)


距离特征:   0%|                                                                                                                                                                                                                                             | 0/4 [00:00<?, ?it/s]



距离特征: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:07<00:00,  1.80s/it]


## 4 文本向量匹配特征

In [17]:
W2V_SIZE=100

In [18]:
import os
import gensim
import jieba
import numpy as np
from gensim.models import KeyedVectors
from gensim.models import word2vec

In [19]:
data['q1_words_list']=data['q1'].apply(lambda x:[w for w in jieba.cut(x) if w])
data['q2_words_list']=data['q2'].apply(lambda x:[w for w in jieba.cut(x) if w])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\yanqiang\AppData\Local\Temp\jieba.cache
Loading model cost 0.572 seconds.
Prefix dict has been built successfully.


In [20]:
# sentences=[]
# for sent in (data['q1']+data['q2']):
#     sentences.append([w for w in jieba.cut(sent) if w])
sentences=data['q1_words_list'].values.tolist()+data['q2_words_list'].values.tolist()
len(sentences)

20000

In [21]:
# sentences

In [22]:
if not os.path.exists('models'):
    os.mkdir('models')
w2v_model = word2vec.Word2Vec(sentences,
                                  size=W2V_SIZE, window=10, min_count=1, workers=4,iter=10, 
                                  sg=1)
w2v_model.save('models/' + 'word2vec.model')
w2v_model.wv.save_word2vec_format('models/' + 'word2vec.txt', binary=False)

In [23]:
len(w2v_model.wv.index2word)

11027

In [24]:
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
    minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
    seuclidean, sqeuclidean

from tqdm import tqdm

tqdm.pandas()

# 计算词向量的相似度
def get_w2v(query, title, num):
    q = np.zeros(W2V_SIZE)
    count = 0
    for w in query:
        if w in w2v_model.wv:
            q += w2v_model.wv[w]
            count += 1
    if count == 0:
        query_vec = q
    query_vec = (q / count).tolist()

    t = np.zeros(W2V_SIZE)
    count = 0
    for w in title:
        if w in w2v_model.wv:
            t += w2v_model.wv[w]
            count += 1
    if count == 0:
        title_vec = q
    title_vec = (t / count).tolist()

    if num == 1:
        try:
            vec_cosine = cosine(query_vec, title_vec)
            return vec_cosine
        except Exception as e:
            return 0
    if num == 2:
        try:
            vec_canberra = canberra(query_vec, title_vec) / len(query_vec)
            return vec_canberra
        except Exception as e:
            return 0
    if num == 3:
        try:
            vec_cityblock = cityblock(query_vec, title_vec) / len(query_vec)
            return vec_cityblock
        except Exception as e:
            return 0
    if num == 4:
        try:
            vec_euclidean = euclidean(query_vec, title_vec)
            return vec_euclidean
        except Exception as e:
            return 0
    if num == 5:
        try:
            vec_braycurtis = braycurtis(query_vec, title_vec)
            return vec_braycurtis
        except Exception as e:
            return 0
    if num == 6:
        try:
            vec_minkowski = minkowski(query_vec, title_vec)
            return vec_minkowski
        except Exception as e:
            return 0
    if num == 7:
        try:
            vec_correlation = correlation(query_vec, title_vec)
            return vec_correlation
        except Exception as e:
            return 0

    if num == 8:
        try:
            vec_chebyshev = chebyshev(query_vec, title_vec)
            return vec_chebyshev
        except Exception as e:
            return 0

    if num == 9:
        try:
            vec_jensenshannon = jensenshannon(query_vec, title_vec)
            return vec_jensenshannon
        except Exception as e:
            return 0

    if num == 10:
        try:
            vec_mahalanobis = mahalanobis(query_vec, title_vec)
            return vec_mahalanobis
        except Exception as e:
            return 0

    if num == 11:
        try:
            vec_seuclidean = seuclidean(query_vec, title_vec)
            return vec_seuclidean
        except Exception as e:
            return 0
    if num == 12:
        try:
            vec_sqeuclidean = sqeuclidean(query_vec, title_vec)
            return vec_sqeuclidean
        except Exception as e:
            return 0
# 词向量的相似度特征
data['vec_cosine'] = data.progress_apply(lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 1),
                                         axis=1)
data['vec_canberra'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 2), axis=1)
data['vec_cityblock'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 3), axis=1)
data['vec_euclidean'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 4), axis=1)
data['vec_braycurtis'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 5), axis=1)
data['vec_minkowski'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 6), axis=1)
data['vec_correlation'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 7), axis=1)

data['vec_chebyshev'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 8), axis=1)
data['vec_jensenshannon'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 9), axis=1)
data['vec_mahalanobis'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 10), axis=1)
data['vec_seuclidean'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 11), axis=1)
data['vec_sqeuclidean'] = data.progress_apply(
    lambda index: get_w2v(index['q1_words_list'], index['q2_words_list'], 12), axis=1)

data['vec_cosine'] = data['vec_cosine'].astype('float32')
data['vec_canberra'] = data['vec_canberra'].astype('float32')
data['vec_cityblock'] = data['vec_cityblock'].astype('float32')
data['vec_euclidean'] = data['vec_euclidean'].astype('float32')
data['vec_braycurtis'] = data['vec_braycurtis'].astype('float32')
data['vec_correlation'] = data['vec_correlation'].astype('float32')

  from pandas import Panel
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 9291.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 9221.30it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 11409.65it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

## 5 向量特征

In [25]:
def w2v_sent2vec(words):
    """计算句子的平均word2vec向量, sentences是一个句子, 句向量最后会归一化"""

    M = []
    for word in words:
        try:
            M.append(w2v_model.wv[word])
        except KeyError:  # 不在词典里
            continue

    M = np.array(M)
    v = M.sum(axis=0)
    return (v / np.sqrt((v ** 2).sum())).astype(np.float32).tolist()


fea_names = ['q1_vec_{}'.format(i) for i in range(W2V_SIZE)]
data[fea_names] = data.progress_apply(lambda row: w2v_sent2vec(row['q1_words_list']), result_type='expand', axis=1)

fea_names = ['q2_vec_{}'.format(i) for i in range(W2V_SIZE)]
data[fea_names] = data.progress_apply(lambda row: w2v_sent2vec(row['q2_words_list']), result_type='expand', axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 10810.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 10891.13it/s]



#### tfidf 向量特征

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
n_componets=16

def train_tfidf_model():
    tfidf_vectorizer=TfidfVectorizer(min_df=5,ngram_range=(1,2),max_features=100000)
    tfidf_array=tfidf_vectorizer.fit_transform([" ".join(sent) for sent in sentences])
    svd = TruncatedSVD(n_components=n_componets, n_iter=7, random_state=42)
    tfidf_array=svd.fit_transform(tfidf_array)
    return tfidf_vectorizer,svd
tfidf_vectorizer,svd=train_tfidf_model()

# data['q1_words_list'].values.tolist()+data['q2_words_list'].values.tolist()
q1_tfidf_array=tfidf_vectorizer.transform([" ".join(sent) for sent in data['q1_words_list']])
q1_tfidf_array=svd.transform(q1_tfidf_array)
# q1_tfidf_df=pd.DataFrame(q1_tfidf_array,columns=[f'q1_tfidf_vec_{i}' for i in range(n_componets)])

q2_tfidf_array=tfidf_vectorizer.transform([" ".join(sent) for sent in data['q2_words_list']])
q2_tfidf_array=svd.transform(q2_tfidf_array)
# q2_tfidf_df=pd.DataFrame(q2_tfidf_array,columns=[f'q2_tfidf_vec_{i}' for i in range(n_componets)])
# data=pd.concat([data,q1_tfidf_df,q2_tfidf_df],axis=1)

In [27]:
def get_tfidf_sim(row,num):
#     print(row.name)
    query_vec = q1_tfidf_array[row.name]
    title_vec = q2_tfidf_array[row.name]

    if num == 1:
        try:
            vec_cosine = cosine(query_vec, title_vec)
            return vec_cosine
        except Exception as e:
            return 0
    if num == 2:
        try:
            vec_canberra = canberra(query_vec, title_vec) / len(query_vec)
            return vec_canberra
        except Exception as e:
            return 0
    if num == 3:
        try:
            vec_cityblock = cityblock(query_vec, title_vec) / len(query_vec)
            return vec_cityblock
        except Exception as e:
            return 0
    if num == 4:
        try:
            vec_euclidean = euclidean(query_vec, title_vec)
            return vec_euclidean
        except Exception as e:
            return 0
    if num == 5:
        try:
            vec_braycurtis = braycurtis(query_vec, title_vec)
            return vec_braycurtis
        except Exception as e:
            return 0
    if num == 6:
        try:
            vec_minkowski = minkowski(query_vec, title_vec)
            return vec_minkowski
        except Exception as e:
            return 0
    if num == 7:
        try:
            vec_correlation = correlation(query_vec, title_vec)
            return vec_correlation
        except Exception as e:
            return 0

    if num == 8:
        try:
            vec_chebyshev = chebyshev(query_vec, title_vec)
            return vec_chebyshev
        except Exception as e:
            return 0

    if num == 9:
        try:
            vec_jensenshannon = jensenshannon(query_vec, title_vec)
            return vec_jensenshannon
        except Exception as e:
            return 0

    if num == 10:
        try:
            vec_mahalanobis = mahalanobis(query_vec, title_vec)
            return vec_mahalanobis
        except Exception as e:
            return 0

    if num == 11:
        try:
            vec_seuclidean = seuclidean(query_vec, title_vec)
            return vec_seuclidean
        except Exception as e:
            return 0
    if num == 12:
        try:
            vec_sqeuclidean = sqeuclidean(query_vec, title_vec)
            return vec_sqeuclidean
        except Exception as e:
            return 0

In [28]:
data['tfidf_vec_cosine'] = data.apply(lambda index: get_tfidf_sim(index, 1),axis=1)
data['tfidf_vec_canberra'] = data.apply(lambda index: get_tfidf_sim(index, 2),axis=1)
data['tfidf_vec_cityblock'] = data.apply(lambda index: get_tfidf_sim(index, 3),axis=1)
data['tfidf_vec_euclidean'] = data.apply(lambda index: get_tfidf_sim(index, 4),axis=1)
data['tfidf_vec_braycurtis'] = data.apply(lambda index: get_tfidf_sim(index, 5),axis=1)
data['tfidf_vec_minkowski'] = data.apply(lambda index: get_tfidf_sim(index, 6),axis=1)
data['tfidf_vec_correlation'] = data.apply(lambda index: get_tfidf_sim(index, 7),axis=1)

data['tfidf_vec_chebyshev'] = data.apply(lambda index: get_tfidf_sim(index, 8),axis=1)
data['tfidf_sb_vec_jensenshannon'] = data.apply(lambda index: get_tfidf_sim(index, 9),axis=1)
data['tfidf_vec_mahalanobis'] = data.apply(lambda index: get_tfidf_sim(index, 10),axis=1)
data['tfidf_vec_seuclidean'] = data.apply(lambda index: get_tfidf_sim(index, 11),axis=1)
data['tfidf_vec_sqeuclidean'] = data.apply(lambda index: get_tfidf_sim(index, 12),axis=1)



  dist = 1.0 - uv / np.sqrt(uu * vv)
  return l1_diff.sum() / l1_sum.sum()
  p = p / np.sum(p, axis=0)
  q = q / np.sum(q, axis=0)


## 6 词粒度特征

In [29]:
data['query']=data['q1_words_list'].apply(lambda x:" ".join(x))
data['title']=data['q2_words_list'].apply(lambda x:" ".join(x))

In [30]:
import pandas as pd
import numpy as np
import random
import math
import time
import gc
import os
import csv
import json
from itertools import chain
from tqdm import tqdm
def get_wordshare(row):
    query = row['query'].strip().split()
    title = row['title'].strip().split()

    query_len, title_len = len(query), len(title)

    query_words = {}
    title_words = {}
    for word in query:  # query
        query_words[word] = query_words.get(word, 0) + 1
    for word in title:  # title
        title_words[word] = title_words.get(word, 0) + 1
    share_term = set(query_words.keys()) & set(title_words.keys())

    # -------------------- WordMatchShare --------------
    n_shared_word_in_query = sum([query_words[w] for w in share_term])
    n_shared_word_in_title = sum([title_words[w] for w in share_term])

    WordMatchShare = (n_shared_word_in_query + n_shared_word_in_title) / (
            query_len + title_len)
    WordMatchShare_query = n_shared_word_in_query / query_len
    WordMatchShare_title = n_shared_word_in_title / title_len
    return query_len, title_len, WordMatchShare, WordMatchShare_query, WordMatchShare_title


fea_names = ['query_length', 'title_length', 'WordMatchShare', 'WordMatchShare_query',
             'WordMatchShare_title'  # 这四个根据前面的计算得到
             ]
data[fea_names] = data.progress_apply(lambda row: get_wordshare(row), result_type='expand', axis=1)


# 长度差特征
def get_lendiff(all_data):
    all_data['LengthDiff'] = all_data['query_length'] - all_data['title_length']
    all_data['LengthDiffRate'] = np.amin(all_data[['query_length', 'title_length']].values, axis=1) / np.amax(
        all_data[['query_length', 'title_length']].values, axis=1)

    all_data['LengthRatio_qt'] = all_data['query_length'] / all_data['title_length']
    all_data['LengthRatio_tq'] = all_data['title_length'] / all_data['query_length']
    return all_data


data = get_lendiff(data)


# tfidf


def get_tfidfwordshare(row, idf):
    query = row['query'].strip().split()
    title = row['title'].strip().split()

    query_words = {}
    title_words = {}
    for word in query:  # query
        query_words[word] = query_words.get(word, 0) + 1
    for word in title:  # title
        title_words[word] = title_words.get(word, 0) + 1
    share_term = set(query_words.keys()) & set(title_words.keys())

    # -------------------- WordMatchShare --------------
    sum_shared_word_in_query = sum([query_words[w] * idf.get(w, 0) for w in share_term])
    sum_shared_word_in_title = sum([title_words[w] * idf.get(w, 0) for w in share_term])
    sum_query_tol = sum(query_words[w] * idf.get(w, 0) for w in query_words)
    sum_title_tol = sum(title_words[w] * idf.get(w, 0) for w in title_words)
    sum_tol = sum_query_tol + sum_title_tol

    TFIDFWordMatchShare = (sum_shared_word_in_query + sum_shared_word_in_title) / sum_tol
    TFIDFWordMatchShare_query = sum_shared_word_in_query / sum_query_tol
    TFIDFWordMatchShare_title = sum_shared_word_in_title / sum_title_tol
    return TFIDFWordMatchShare, TFIDFWordMatchShare_query, TFIDFWordMatchShare_title


def get_tfidffea(data):
    doc_set = set()
    for title in tqdm(data['title']):
        doc_set.add(title)
    for query in tqdm(data['query']):
        doc_set.add(query)

    idf = {}
    doc_len = len(doc_set)
    print("一共有%d个unique文档." % doc_len)
    for doc in tqdm(doc_set):
        for word in set(doc.split()):
            idf[word] = idf.get(word, 0) + 1

    for word in idf:
        idf[word] = np.log(doc_len / (idf[word] + 1.)) + 1

    fea_names = ['TFIDFWordMatchShare', 'TFIDFWordMatchShare_query', 'TFIDFWordMatchShare_title']

    data[fea_names] = data.progress_apply(lambda row: get_tfidfwordshare(row, idf), result_type='expand', axis=1)
    return data


data = get_tfidffea(data)

## NgramJaccard特征

from nltk.util import bigrams
from nltk.util import ngrams
from nltk.util import everygrams


def retrieve_ngrams(txt, n):
    return [txt[i:i + n] for i in range(len(txt) - (n - 1))]


def get_ngrams(wordlist, n):
    ngrams = []
    for i in range(len(wordlist) - (n - 1)):
        ngrams.append(wordlist[i:i + n])
    return ngrams


import textdistance

fea_names = ['NgramJaccardCoef_1', 'NgramJaccardCoef_2', 'NgramJaccardCoef_3', 'NgramJaccardCoef_4']


def jaccard_coef(aa, bb):
    countUp = 0
    countDown = 0

    # Calculate how many elements in ListB are not in ListA
    for i in range(0, len(aa)):
        if aa[i] not in bb:
            countDown = countDown + 1

    # Calculate the numbers of common element
    for i in range(0, len(bb)):
        if bb[i] not in aa:
            countUp = countUp + 1

    # String_1 intersect String_2
    Numerator = len(aa) - countDown

    # String_1 union String_2
    Denominator = len(aa) + countUp

    jaccard = Numerator * 1.0 / (Denominator + 1)
    return jaccard


def get_ngramjaccard(row):
    query = row['query'].strip().split()
    title = row['title'].strip().split()

    query_ngrams = get_ngrams(query, 1)
    title_ngrams = get_ngrams(title, 1)
    NgramJaccardCoef_1 = jaccard_coef(query_ngrams,
                                      title_ngrams)
    query_ngrams = get_ngrams(query, 2)
    title_ngrams = get_ngrams(title, 2)
    NgramJaccardCoef_2 = jaccard_coef(query_ngrams,
                                      title_ngrams)

    query_ngrams = get_ngrams(query, 3)
    title_ngrams = get_ngrams(title, 3)
    NgramJaccardCoef_3 = jaccard_coef(query_ngrams,
                                      title_ngrams)
    query_ngrams = get_ngrams(query, 4)
    title_ngrams = get_ngrams(title, 4)
    NgramJaccardCoef_4 = jaccard_coef(query_ngrams,
                                      title_ngrams)
    return NgramJaccardCoef_1, NgramJaccardCoef_2, NgramJaccardCoef_3, NgramJaccardCoef_4


data[fea_names] = data.progress_apply(lambda row: get_ngramjaccard(row), result_type='expand', axis=1)


# 1.4 sequencematch相关
def lcsubstr_lens(s1, s2):  # 计算最长子串长度
    m = [[0 for i in range(len(s2) + 1)]
         for j in range(len(s1) + 1)]  # 生成0矩阵，为方便后续计算，比字符串长度多了一列
    mmax = 0  # 最长匹配的长度
    p = 0  # 最长匹配对应在s1中的最后一位
    for i in range(len(s1)):
        for j in range(len(s2)):
            if s1[i] == s2[j]:
                m[i + 1][j + 1] = m[i][j] + 1
                if m[i + 1][j + 1] > mmax:
                    mmax = m[i + 1][j + 1]
                    p = i + 1
    return mmax


def lcseque_lens(s1, s2):  # 计算最长子序列长度
    # 生成字符串长度加1的0矩阵，m用来保存对应位置匹配的结果
    m = [[0 for x in range(len(s2) + 1)] for y in range(len(s1) + 1)]
    # d用来记录转移方向
    d = [[None for x in range(len(s2) + 1)] for y in range(len(s1) + 1)]
    for p1 in range(len(s1)):
        for p2 in range(len(s2)):
            if s1[p1] == s2[p2]:  # 字符匹配成功，则该位置的值为左上方的值加1
                m[p1 + 1][p2 + 1] = m[p1][p2] + 1
                d[p1 + 1][p2 + 1] = 'ok'
            elif m[p1 + 1][p2] > m[p1][p2 + 1]:  # 左值大于上值，则该位置的值为左值，并标记回溯时的方向
                m[p1 + 1][p2 + 1] = m[p1 + 1][p2]
                d[p1 + 1][p2 + 1] = 'left'
            else:  # 上值大于左值，则该位置的值为上值，并标记方向up
                m[p1 + 1][p2 + 1] = m[p1][p2 + 1]
                d[p1 + 1][p2 + 1] = 'up'
    (p1, p2) = (len(s1), len(s2))
    s = []
    while m[p1][p2]:  # 不为None时
        c = d[p1][p2]
        if c == 'ok':  # 匹配成功，插入该字符，并向左上角找下一个
            s.append(s1[p1 - 1])
            p1 -= 1
            p2 -= 1
        if c == 'left':  # 根据标记，向左找下一个
            p2 -= 1
        if c == 'up':  # 根据标记，向上找下一个
            p1 -= 1
    return len(s)


from difflib import SequenceMatcher


def get_sequencematch(row):
    query = row['query'].strip().split()
    title = row['title'].strip().split()

    lcsubstr_len = lcsubstr_lens(query, title)
    lcseque_len = lcseque_lens(query, title)
    sq = SequenceMatcher(a=query, b=title)
    match = sq.find_longest_match(0, len(query), 0, len(title))
    longest_match_size = match.size
    longest_match_ratio = match.size / min(len(query), len(title))
    return lcsubstr_len, lcseque_len, longest_match_size, longest_match_ratio


fea_names = ["lcsubstr_len", "lcseque_len", "longest_match_size", "longest_match_ratio"]
data[fea_names] = data.progress_apply(lambda row: get_sequencematch(row), result_type='expand', axis=1)

# Fuzzy特征
from fuzzywuzzy import fuzz


def get_fuzzz(row):
    query_str = row['query'].strip()
    query = query_str.split()
    title_str = row['title'].strip()
    title = title_str.split()

    fuzz_qratio = fuzz.QRatio(query_str, title_str)
    fuzz_WRatio = fuzz.WRatio(query_str, title_str)
    fuzz_partial_ratio = fuzz.partial_ratio(query_str, title_str)

    fuzz_partial_token_set_ratio = fuzz.partial_token_set_ratio(query_str, title_str)
    fuzz_partial_token_sort_ratio = fuzz.partial_token_sort_ratio(query_str, title_str)
    fuzz_token_set_ratio = fuzz.token_set_ratio(query_str, title_str)
    fuzz_token_sort_ratio = fuzz.token_sort_ratio(query_str, title_str)

    return fuzz_qratio, fuzz_WRatio, fuzz_partial_ratio, fuzz_partial_token_set_ratio, fuzz_partial_token_sort_ratio, \
           fuzz_token_set_ratio, fuzz_token_sort_ratio


fea_names = ["fuzz_qratio", "fuzz_WRatio", "fuzz_partial_ratio", "fuzz_partial_token_set_ratio",
             "fuzz_partial_token_sort_ratio",
             "fuzz_token_set_ratio",
             "fuzz_token_sort_ratio"]
print(len(fea_names))
data[fea_names] = data.progress_apply(lambda row: get_fuzzz(row), result_type='expand', axis=1)


# 熵相关

def get_entropy(row):
    query_str = row['query'].strip()
    query = query_str.split()
    title_str = row['title'].strip()
    title = title_str.split()

    query_words = {}
    title_words = {}
    query_title_words = {}
    for word in query:  # query
        query_words[word] = query_words.get(word, 0) + 1
        query_title_words[word] = query_title_words.get(word, 0) + 1
    for word in title:  # title
        title_words[word] = title_words.get(word, 0) + 1
        query_title_words[word] = query_title_words.get(word, 0) + 1

    n_query_tol = sum(query_words.values())
    n_title_tol = sum(title_words.values())
    n_query_title_tol = sum(query_title_words.values())
    query_Entropy = abs(sum(map(lambda x: x / n_query_tol * \
                                          math.log(x / n_query_tol, 2),
                                query_words.values())))
    title_Entropy = abs(sum(map(lambda x: x / n_title_tol * \
                                          math.log(x / n_title_tol, 2),
                                title_words.values())))
    query_title_Entropy = abs(sum(map(lambda x: x / n_query_title_tol * \
                                                math.log(x / n_query_title_tol, 2),
                                      query_title_words.values())))

    query_title_words_share = {}
    for word in query_words:
        if word in title_words:
            query_title_words_share[word] = query_title_words_share.get(
                word, 0) + query_words[word]
    for word in title_words:
        if word in query_words:
            query_title_words_share[word] = query_title_words_share.get(
                word, 0) + title_words[word]

    WordMatchShare_Entropy = abs(sum(map(lambda x: x / n_query_title_tol * \
                                                   math.log(x / n_query_title_tol, 2),
                                         query_title_words_share.values())))
    return query_Entropy, title_Entropy, query_title_Entropy, WordMatchShare_Entropy


fea_names = ["query_Entropy", "title_Entropy", "query_title_Entropy", "WordMatchShare_Entropy"]
data[fea_names] = data.progress_apply(lambda row: get_entropy(row), result_type='expand', axis=1)


## 补充特征
def get_ngram_rp_query_in_title(query, title):
    query = list(query.strip().split())
    title = list(title.strip().split())
    query_2gram = []
    for i in range(len(query) - 1):
        query_2gram.append(query[i] + query[i + 1])
    query.extend(query_2gram)

    title_2gram = []
    for i in range(len(title) - 1):
        title_2gram.append(title[i] + title[i + 1])
    title.extend(title_2gram)

    len_query = len(query)
    len_title = len(title)
    len_common = len(set(query) & set(title))

    recall = len_common / (len_query + 0.001)
    precision = len_common / (len_title + 0.001)
    acc = len_common / (len_query + len_title - len_common)
    return [recall, precision, acc]


def get_prf(row):
    query_set = set(row['query'].strip().split())
    title_set = set(row['title'].strip().split())
    common_words_len = len(query_set & title_set)
    query_len = len(row['query'].strip().split())
    title_len = len(row['title'].strip().split())
    recall, precision, acc = get_ngram_rp_query_in_title(row['query'], row['title'])

    jaccard_similarity = common_words_len / len(query_set | title_set)
    qt_coword_query_ratio = common_words_len / query_len
    qt_coword_title_ratio = common_words_len / title_len
    qt_len_mean = (query_len + title_len) / 2.0
    qt_common_word_acc = common_words_len / (query_len + title_len - common_words_len)
    ngram_query_title_precision = precision
    ngram_query_title_recall = recall
    ngram_query_title_acc = acc

    return jaccard_similarity, qt_coword_query_ratio, qt_coword_title_ratio, qt_len_mean, \
           qt_common_word_acc, ngram_query_title_precision, ngram_query_title_recall, ngram_query_title_acc



fea_names = ["jaccard_similarity", "qt_coword_query_ratio", "qt_coword_title_ratio",
             "qt_len_mean", "qt_common_word_acc",
             "ngram_query_title_precision", "ngram_query_title_recall", "ngram_query_title_acc"]
data[fea_names] = data.progress_apply(lambda row: get_prf(row), result_type='expand', axis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 14688.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 3302341.55it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 2505557.95it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

一共有19325个unique文档.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 13225.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 12243.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:01<00:00, 8330.86it/s]
  1%|███▏                                                                                                                                                                      

7


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4442.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 13286.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:00<00:00, 12480.83it/s]


##  SentenceTransformer

In [31]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

query_embedding = model.encode(data['q1'],show_progress_bar=False)
passage_embedding = model.encode(data['q2'],show_progress_bar=False)
# q1_sbert_df=pd.DataFrame(query_embedding,columns=[f'q1_sber_vec_{i}' for i in range(512)])
# q2_sbert_df=pd.DataFrame(passage_embedding,columns=[f'q2_sber_vec_{i}' for i in range(512)])
# data=pd.concat([data,q1_sbert_df,q2_sbert_df],axis=1)

In [32]:
def get_sbert_sim(row,num):
#     print(row.name)
    query_vec = query_embedding[row.name]
    title_vec = passage_embedding[row.name]

    if num == 1:
        try:
            vec_cosine = cosine(query_vec, title_vec)
            return vec_cosine
        except Exception as e:
            return 0
    if num == 2:
        try:
            vec_canberra = canberra(query_vec, title_vec) / len(query_vec)
            return vec_canberra
        except Exception as e:
            return 0
    if num == 3:
        try:
            vec_cityblock = cityblock(query_vec, title_vec) / len(query_vec)
            return vec_cityblock
        except Exception as e:
            return 0
    if num == 4:
        try:
            vec_euclidean = euclidean(query_vec, title_vec)
            return vec_euclidean
        except Exception as e:
            return 0
    if num == 5:
        try:
            vec_braycurtis = braycurtis(query_vec, title_vec)
            return vec_braycurtis
        except Exception as e:
            return 0
    if num == 6:
        try:
            vec_minkowski = minkowski(query_vec, title_vec)
            return vec_minkowski
        except Exception as e:
            return 0
    if num == 7:
        try:
            vec_correlation = correlation(query_vec, title_vec)
            return vec_correlation
        except Exception as e:
            return 0

    if num == 8:
        try:
            vec_chebyshev = chebyshev(query_vec, title_vec)
            return vec_chebyshev
        except Exception as e:
            return 0

    if num == 9:
        try:
            vec_jensenshannon = jensenshannon(query_vec, title_vec)
            return vec_jensenshannon
        except Exception as e:
            return 0

    if num == 10:
        try:
            vec_mahalanobis = mahalanobis(query_vec, title_vec)
            return vec_mahalanobis
        except Exception as e:
            return 0

    if num == 11:
        try:
            vec_seuclidean = seuclidean(query_vec, title_vec)
            return vec_seuclidean
        except Exception as e:
            return 0
    if num == 12:
        try:
            vec_sqeuclidean = sqeuclidean(query_vec, title_vec)
            return vec_sqeuclidean
        except Exception as e:
            return 0

In [33]:
data['sb_vec_cosine'] = data.apply(lambda index: get_sbert_sim(index, 1),axis=1)
data['sb_vec_canberra'] = data.apply(lambda index: get_sbert_sim(index, 2),axis=1)
data['sb_sb_sb_vec_cityblock'] = data.apply(lambda index: get_sbert_sim(index, 3),axis=1)
data['sb_sb_vec_euclidean'] = data.apply(lambda index: get_sbert_sim(index, 4),axis=1)
data['sb_vec_braycurtis'] = data.apply(lambda index: get_sbert_sim(index, 5),axis=1)
data['sb_vec_minkowski'] = data.apply(lambda index: get_sbert_sim(index, 6),axis=1)
data['sb_vec_correlation'] = data.apply(lambda index: get_sbert_sim(index, 7),axis=1)

data['sb_vec_chebyshev'] = data.apply(lambda index: get_sbert_sim(index, 8),axis=1)
data['sb_sb_vec_jensenshannon'] = data.apply(lambda index: get_sbert_sim(index, 9),axis=1)
data['sb_vec_mahalanobis'] = data.apply(lambda index: get_sbert_sim(index, 10),axis=1)
data['sb_vec_seuclidean'] = data.apply(lambda index: get_sbert_sim(index, 11),axis=1)
data['sb_vec_sqeuclidean'] = data.apply(lambda index: get_sbert_sim(index, 12),axis=1)



### simpletransformers预测结果

In [34]:
import torch
torch.cuda.is_available()


True

In [35]:
import os
import pandas as pd
import numpy as np
from scipy.special import softmax
tmp_train=[]
tmp_test=[]
for file in os.listdir('result_kflods_pairs'):
#     print(file)
    if file.endswith('npy') and 'test' in file:
        probs=np.load('result_kflods_pairs/'+file)
        probs=softmax(probs,axis=1)[:,1]
        tmp_test.append(probs)
    if file.endswith('npy') and 'train' in file:
        probs=np.load('result_kflods_pairs/'+file)
        probs=softmax(probs,axis=1)[:,1]
        tmp_train.append(probs)


In [36]:
# train_prob_df=pd.DataFrame(np.hstack(tmp_train),columns=[f'smbert_prob_{i}' for i in range(18)])
# test_prob_df=pd.DataFrame(np.hstack(tmp_test),columns=[f'smbert_prob_{i}' for i in range(18)])
# prob_df=pd.concat([train_prob_df,test_prob_df],axis=0).reset_index(drop=True)
# prob_df

train_prob_df=pd.DataFrame(np.vstack(tmp_train).T,columns=[f'smbert_prob_{i}' for i in range(9)])
test_prob_df=pd.DataFrame(np.vstack(tmp_test).T,columns=[f'smbert_prob_{i}' for i in range(9)])
prob_df=pd.concat([train_prob_df,test_prob_df],axis=0).reset_index(drop=True)
prob_df

Unnamed: 0,smbert_prob_0,smbert_prob_1,smbert_prob_2,smbert_prob_3,smbert_prob_4,smbert_prob_5,smbert_prob_6,smbert_prob_7,smbert_prob_8
0,0.721030,0.740094,0.743169,0.705813,0.753971,0.770251,0.768171,0.742659,0.786138
1,0.768305,0.726227,0.718618,0.748502,0.734214,0.757467,0.774990,0.732668,0.727966
2,0.753795,0.747497,0.716905,0.723207,0.715409,0.728038,0.793189,0.733517,0.742522
3,0.428676,0.279509,0.329308,0.381003,0.340693,0.213485,0.429684,0.316582,0.276741
4,0.253364,0.273256,0.243242,0.249783,0.258263,0.234219,0.215183,0.302043,0.201762
...,...,...,...,...,...,...,...,...,...
9995,0.113250,0.056813,0.117426,0.200263,0.098060,0.669763,0.037557,0.179920,0.278770
9996,0.043114,0.011754,0.119420,0.017799,0.163244,0.947966,0.009549,0.160152,0.109565
9997,0.006674,0.008922,0.007845,0.005696,0.008939,0.132476,0.001972,0.007586,0.003270
9998,0.002903,0.006609,0.003428,0.005069,0.005045,0.002088,0.000410,0.005311,0.002019


In [37]:
data=pd.concat([data,prob_df],axis=1)

In [38]:
data.columns

Index(['q1', 'q2', 'label', 'q1_len', 'q2_len', 'q1q2_len_diff',
       'q1q2_len_diff_abs', 'q1q2_rate', 'q2q1_rate', 'q1_end_special',
       ...
       'sb_vec_sqeuclidean', 'smbert_prob_0', 'smbert_prob_1', 'smbert_prob_2',
       'smbert_prob_3', 'smbert_prob_4', 'smbert_prob_5', 'smbert_prob_6',
       'smbert_prob_7', 'smbert_prob_8'],
      dtype='object', length=342)

## 5 模型训练

In [39]:
no_feas=['q1','q2','label','q1_words_list','q2_words_list','query','title']
features=[col for col in data.columns if col not in no_feas]

train,test=data[:train_size],data[train_size:]
len(features)


335

In [40]:
X = train[features] # 训练集输入
y = train['label'] # 训练集标签
X_test = test[features] # 测试集输入

In [41]:
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

In [42]:
n_fold = 10
folds = KFold(n_splits=n_fold, shuffle=True,random_state=1314)

In [43]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
#     'learning_rate': 0.1,

    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,  
    'lambda_l2': 0.001,  # 越小l2正则程度越高
    'min_gain_to_split': 0.2,
}
feat_imp_df = pd.DataFrame({'feat': features, 'imp': 0}) 
oof = np.zeros(len(X))
prediction = np.zeros(len(X_test))
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    model = model = lgb.LGBMClassifier(**params, n_estimators=50000, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='binary_logloss',
              verbose=50, early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    feat_imp_df['imp'] += model.feature_importances_ / 5
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred
prediction /= n_fold

Training until validation scores don't improve for 200 rounds
[50]	training's binary_logloss: 0.196014	valid_1's binary_logloss: 0.232631
[100]	training's binary_logloss: 0.167501	valid_1's binary_logloss: 0.23384
[150]	training's binary_logloss: 0.146128	valid_1's binary_logloss: 0.233459
[200]	training's binary_logloss: 0.12881	valid_1's binary_logloss: 0.235922
Early stopping, best iteration is:
[40]	training's binary_logloss: 0.203593	valid_1's binary_logloss: 0.23001
Training until validation scores don't improve for 200 rounds
[50]	training's binary_logloss: 0.198043	valid_1's binary_logloss: 0.210849
[100]	training's binary_logloss: 0.168977	valid_1's binary_logloss: 0.213345
[150]	training's binary_logloss: 0.148068	valid_1's binary_logloss: 0.217567
[200]	training's binary_logloss: 0.131482	valid_1's binary_logloss: 0.220158
[250]	training's binary_logloss: 0.116575	valid_1's binary_logloss: 0.224799
Early stopping, best iteration is:
[55]	training's binary_logloss: 0.194567	v

In [44]:
feat_imp_df.sort_values(by='imp',ascending=False)[:30]

Unnamed: 0,feat,imp
332,smbert_prob_6,69.6
331,smbert_prob_5,50.0
327,smbert_prob_1,48.2
334,smbert_prob_8,46.2
329,smbert_prob_3,21.6
330,smbert_prob_4,17.6
333,smbert_prob_7,12.8
13,q1_pos_5,9.8
317,sb_sb_vec_euclidean,7.4
120,q1_vec_57,6.8


In [45]:
from sklearn.metrics import accuracy_score
y_pred = (oof > 0.5).astype(int)
# score=accuracy_score(np.round(abs(oof)) ,train['label'].values)
score=accuracy_score(y_pred ,train['label'].values)

score

0.909

In [46]:
sub_pred = (prediction > 0.5).astype(int)
sample_submit['label']=sub_pred

In [47]:
sample_submit[['label']].to_csv('lgb.csv',index=None)

In [48]:
sample_submit['label'].value_counts()

1    2929
0    2071
Name: label, dtype: int64

In [49]:
# # 优化：
# 0 base 839 8406
# 1.word2vec iter=5改为iter=10 8406 8508
# 2.加了tfidf之后 0.8496 0.848 降分
# 3. 词粒度特征  0.8628
# 4. 直接添加词向量特征 0.8588 0.8608 降分
# 5.sb_sim 0.8708
# 6.tfidf_sim 0.8756 0.8752
# 7.加入bert prob 0.9096 0.9076
# 8.bert 0.907
# 9.bert单概率输出以及去除word2vec向量特征 0.9104
# 10. reg改成cls 0.9112 9078
