In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [2]:
%cd "/content/drive/MyDrive/Text Ranking"

/content/drive/MyDrive/Text Ranking


In [3]:
import jieba
import re
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from scipy.linalg import norm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [4]:
def process_unknown_data(filename):
    origin_data = open(filename).readlines()
    split_data = [line.replace("\n","").split("\t") for line in origin_data]
    pandas_data = pd.DataFrame(data=split_data)
    pandas_data.to_csv("datasets/query_data.csv",index=False,header=None)
    return pandas_data

In [5]:
#process_data = process_unknown_data('datasets/all.merge.samples')

In [6]:
data = pd.read_csv("query_data.csv")

In [7]:
data.head()

Unnamed: 0,label,qid,title
0,5,g5高速,G5京昆高速，瓦厂坪大桥路段山体险情，建议大家推迟出行！
1,5,g5高速,G5京昆高速雅西段拖乌山突降暴雪 部分路段积雪深达1米
2,5,g5高速,12月起，G5京昆高速开始“冬管”，这些地方需特别注意！
3,4,g5高速,G5O沪渝高速部分路段进行对接施工，这些车辆全天禁止通行！
4,5,g5高速,G5京昆高速因大雪、路面结冰，继续交通管制！


In [8]:
def clean_data(sentence):
    clean_punc_text = re.sub(r'[^\w\s]','',sentence).replace(" ","").strip()
    clean_punc_num_text = re.sub(r'[0-9]+','',clean_punc_text)
    clean_punc_num_eng_text = re.sub(r'[a-zA-Z]+', '', clean_punc_num_text) 
    return clean_punc_num_eng_text

data["qid"] = data["qid"].map(clean_data)
data["title"] = data["title"].map(clean_data)
data = data.drop_duplicates()
data = data.dropna()

In [9]:
data.head()

Unnamed: 0,label,qid,title
0,5,高速,京昆高速瓦厂坪大桥路段山体险情建议大家推迟出行
1,5,高速,京昆高速雅西段拖乌山突降暴雪部分路段积雪深达米
2,5,高速,月起京昆高速开始冬管这些地方需特别注意
3,4,高速,沪渝高速部分路段进行对接施工这些车辆全天禁止通行
4,5,高速,京昆高速因大雪路面结冰继续交通管制


In [10]:
def jieba_tokenize(sentence):
    return " ".join([word for word in jieba.cut(sentence)])

data["qid"] = data["qid"].apply(jieba_tokenize)
data["title"] = data["title"].apply(jieba_tokenize)
data.head()

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.831 seconds.
Prefix dict has been built successfully.


Unnamed: 0,label,qid,title
0,5,高速,京昆 高速 瓦厂 坪 大桥路 段 山体 险情 建议 大家 推迟 出行
1,5,高速,京昆 高速 雅 西段 拖 乌山 突降 暴雪 部分 路段 积雪 深达 米
2,5,高速,月 起 京昆 高速 开始 冬管 这些 地方 需 特别 注意
3,4,高速,沪 渝 高速 部分 路段 进行 对接 施工 这些 车辆 全天 禁止通行
4,5,高速,京昆 高速 因 大雪 路面 结冰 继续 交通管制


In [11]:
data['qid'] = data['qid'].replace('', np.nan)
data["title"] = data["title"].replace('', np.nan)
data.dropna(subset=['qid'],inplace=True)
data.dropna(subset=["title"],inplace=True)

In [12]:
def Levenshtein_Similarity(str1,str2):
    str2 = "".join([word for word in str2.split(" ") if word in str1])
    return Levenshtein.ratio(str1,str2)

In [16]:
levenshtein_score = data.apply(lambda row: Levenshtein_Similarity(row["qid"],row["title"]),axis=1)
levenshtein_corr = data["label"].corr(levenshtein_score)
levenshtein_corr

0.26324240399581395

In [14]:
def Jaccard_Cosine_Similarity(str1, str2):
    s2 = " ".join([ch for ch in str2 if ch in str1])
    s1 = " ".join([ch for ch in str1])
    corpus = [s1, s2]
    cv = CountVectorizer(tokenizer=lambda s: s.split())
    vectors = cv.fit_transform(corpus).toarray()
    numerator = np.sum(np.min(vectors, axis=0))
    denominator = np.sum(np.max(vectors, axis=0))
    return (1.0*numerator/denominator, np.dot(vectors[0],vectors[1])/(norm(vectors[0])*norm(vectors[1])))

In [17]:
jaccard_cos_score = data.apply(lambda row: Jaccard_Cosine_Similarity(row["qid"],row["title"]),axis=1)
jaccard_score, cos_score = jaccard_cos_score.apply(lambda x: x[0]), jaccard_cos_score.apply(lambda x: x[1])
jaccard_corr = data["label"].corr(jaccard_score)
cos_corr = data["label"].corr(cos_score)
print("jaccard_corr:", jaccard_corr)
print("cos_corr:", cos_corr)

jaccard_corr: 0.28510176443706226
cos_corr: 0.2856767051853966


In [18]:
def TFIDF_Similarity(str1, str2):
    s2 = " ".join([ch for ch in str2 if ch in str1])
    s1 = " ".join([ch for ch in str1])
    cv = TfidfVectorizer(tokenizer=lambda s: s.split())
    corpus = [s1, s2]
    vectors = cv.fit_transform(corpus).toarray()
    return np.dot(vectors[0], vectors[1]) / (norm(vectors[0]) * norm(vectors[1]))

In [19]:
tf_idf_score = data.apply(lambda row: TFIDF_Similarity(row["qid"],row["title"]),axis=1)
tf_idf_corr = data["label"].corr(tf_idf_score)
tf_idf_corr

0.2955434344434728

In [20]:
data["figure1"] = levenshtein_score
data["figure2"] = jaccard_score
data["figure3"] = cos_score
data["figure4"] = tf_idf_score

In [29]:
feature_data = data[["label","qid","figure1","figure2","figure3","figure4"]]

In [30]:
feature_data.head()

Unnamed: 0,label,qid,figure1,figure2,figure3,figure4
0,5,高速,1.0,1.0,1.0,1.0
1,5,高速,1.0,1.0,1.0,1.0
2,5,高速,1.0,1.0,1.0,1.0
3,4,高速,1.0,1.0,1.0,1.0
4,5,高速,1.0,1.0,1.0,1.0


In [31]:
Encoder = LabelEncoder()
feature_data["qid"] = Encoder.fit_transform(feature_data["qid"])

In [32]:
data_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_data = None
test_data = None
for train_index, test_index in data_split.split(feature_data, feature_data["label"]):
    train_data = feature_data.iloc[train_index,:]
    test_data = feature_data.iloc[test_index,:]
print("shape of train_data: ", train_data.shape)
print("shape of test_data: ", test_data.shape)

shape of train_data:  (324212, 6)
shape of test_data:  (81053, 6)


In [33]:
drop_train_data = train_data.dropna()
drop_test_data = test_data.dropna()

In [34]:
sort_train_data = drop_train_data.sort_values(by=['qid'])
sort_test_data = drop_test_data.sort_values(by=['qid'])

In [35]:
train_list = sort_train_data.values.tolist()
test_list = sort_test_data.values.tolist()

In [36]:
process_train_list = [" ".join([str(int(data[0])),"qid:"+str(int(data[1])),"1:"+str(data[2]),"2:"+str(data[3]),"3:"+str(data[4]),"4:"+str(data[5])]) for data in train_list]
process_test_list = [" ".join([str(int(data[0])),"qid:"+str(int(data[1])),"1:"+str(data[2]),"2:"+str(data[3]),"3:"+str(data[4]),"4:"+str(data[5])]) for data in test_list]

In [37]:
with open("query_train.txt", "w") as outfile:
    for unit_data in process_train_list:
        outfile.write(unit_data)
        outfile.write('\n') 

In [38]:
with open("query_test.txt", "w") as outfile:
    for unit_data in process_test_list:
        outfile.write(unit_data)
        outfile.write('\n') 