BOW检索评估

In [1]:
import pandas as pd

# 读取Excel文件
df = pd.read_excel("Label.xlsx")

# 定义词频检索函数
def dense_retrieval(keywords, dataframe, top_n=5):
    keywords = [keyword.strip().lower() for keyword in keywords if isinstance(keyword, str)]
    relevance_scores = []
    
    for index, row in dataframe.iterrows():
        title = str(row[1]).lower()
        score = sum(keyword in title for keyword in keywords)
        relevance_scores.append({"索引": index, "得分": score})
    
    relevance_scores = pd.DataFrame(relevance_scores)
    relevance_scores = relevance_scores.sort_values(by="得分", ascending=False)
    top_indices = relevance_scores.head(top_n)["索引"].tolist()
    return dataframe.loc[top_indices][df.columns[1]].values

# 评估检索方法的准确率
def evaluate_retrieval(dataframe, num_rows=500):
    correct_count = 0
    total_count = 0
    
    # 只处理前num_rows条数据
    limited_dataframe = dataframe.head(num_rows)
    
    for idx, row in limited_dataframe.iterrows():
        if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
            continue
        
        keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
        keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
        keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
        keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
        keywords = keywords_5 + keywords_6 + keywords_11 + keywords_14
        
        true_title = row[1]
        retrieved_titles = dense_retrieval(keywords, dataframe)
        
        if true_title in retrieved_titles:
            correct_count += 1
        
        total_count += 1
    
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

# 计算平均准确率
accuracy = evaluate_retrieval(df, num_rows=500)
print(f"Average accuracy: {accuracy * 100:.2f}%")



  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  title = str(row[1]).lower()
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  title = str(row[1]).lower()
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6])

Average accuracy: 46.40%


TF-IDF检索评估

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 读取Excel文件
df = pd.read_excel("Label.xlsx")

# 定义TF-IDF检索函数
def tfidf_retrieval(keywords, dataframe, top_n=5):
    # 将关键词转换为小写，并删除空白字符
    keywords = [keyword.strip().lower() for keyword in keywords if isinstance(keyword, str)]
    
    # 将DataFrame中的文本数据转换为列表
    corpus = [str(row[1]).lower() for _, row in dataframe.iterrows()]
    
    # 初始化TF-IDF向量化器
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    
    # 对查询进行向量化
    query = " ".join(keywords)
    query_vector = vectorizer.transform([query])
    
    # 计算查询与文档的余弦相似度
    similarity = cosine_similarity(query_vector, X)
    similarity = similarity.flatten()  # 将结果展平为一维数组
    
    # 获取相似度最高的top_n行的索引
    top_indices = similarity.argsort()[-top_n:][::-1]
    
    # 返回相关度最高的几行的第一列内容
    return dataframe.loc[top_indices][df.columns[1]].values

# 评估检索方法的准确率
def evaluate_retrieval(dataframe, num_rows=500):
    correct_count = 0
    total_count = 0
    
    # 只处理前num_rows条数据
    limited_dataframe = dataframe.head(num_rows)
    
    for idx, row in limited_dataframe.iterrows():
        if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
            continue
        
        keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
        keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
        keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
        keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
        keywords = keywords_5 + keywords_6 + keywords_11 + keywords_14
        #keywords=keywords_5+keywords_11
        
        true_title = row[1]
        retrieved_titles = tfidf_retrieval(keywords, dataframe)
        
        if true_title in retrieved_titles:
            correct_count += 1
        
        total_count += 1
    
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

# 计算平均准确率
accuracy = evaluate_retrieval(df, num_rows=500)
print(f"Average accuracy: {accuracy * 100:.2f}%")


  corpus = [(str(row[1])).lower() for _, row in limited_dataframe.iterrows()]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywor

Average accuracy: 82.80%


  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = s

BM25检索评估

In [3]:
import pandas as pd
from rank_bm25 import BM25Okapi
import numpy as np

# 读取Excel文件
df = pd.read_excel("Label.xlsx")

# 定义BM25检索函数
def bm25_retrieval(keywords, dataframe, bm25, top_n=5):
    keywords = [keyword.strip().lower() for keyword in keywords if isinstance(keyword, str)]
    query = " ".join(keywords)
    query_tokens = query.split()
    
    scores = bm25.get_scores(query_tokens)
    top_indices = np.argsort(scores)[-top_n:][::-1]
    
    return dataframe.loc[top_indices][df.columns[1]].values

# 评估检索方法的准确率
def evaluate_retrieval_bm25(dataframe, num_rows=500):
    correct_count = 0
    total_count = 0
    
    # 只处理前num_rows条数据
    limited_dataframe = dataframe.head(num_rows)
    
    # 预处理文本数据
    corpus = [(str(row[1])).lower() for _, row in limited_dataframe.iterrows()]
    tokenized_corpus = [doc.split() for doc in corpus]
    bm25 = BM25Okapi(tokenized_corpus)
    
    for idx, row in limited_dataframe.iterrows():
        if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
            continue
        
        keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
        keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
        keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
        keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
        keywords = keywords_5 + keywords_6 + keywords_11 + keywords_14
        #keywords=keywords_5+ keywords_11
        
        true_title = row[1]
        retrieved_titles = bm25_retrieval(keywords, dataframe, bm25)
        #print(true_title)
        #print(retrieved_titles)
        if true_title in retrieved_titles:
            correct_count += 1
        
        total_count += 1
    
    accuracy = correct_count / total_count if total_count > 0 else 0
    return accuracy

# 计算平均准确率
accuracy = evaluate_retrieval_bm25(df, num_rows=500)
print(f"Average accuracy: {accuracy * 100:.2f}%")


  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  corpus = [str(row[1]).lower() for _, row in dataframe.iterrows()]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(row[5]).split(",") if not pd.isna(row[5]) else []
  keywords_6 = str(row[6]).split(",") if not pd.isna(row[6]) else []
  keywords_11 = str(row[11]).split(",") if not pd.isna(row[11]) else []
  keywords_14 = str(row[14]).split(",") if not pd.isna(row[14]) else []
  true_title = row[1]
  corpus = [str(row[1]).lower() for _, row in dataframe.iterrows()]
  if pd.isna(row[5]) and pd.isna(row[6]) and pd.isna(row[14]):  # 跳过所有关键词都为空的行
  keywords_5 = str(

Average accuracy: 80.00%
