In [None]:
import pandas as pd
import json
from gensim.models import Word2Vec
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the word -> index mapping
def load_word2idx(file_path):
    """Read a headerless two-column CSV and return a ``{word: index}`` dict.

    Args:
        file_path: Path to a CSV whose first column is the word and whose
            second column is its index. The file must not have a header row.

    Returns:
        dict mapping each word (as ``str``) to the value in the index column.
    """
    df = pd.read_csv(
        file_path,
        header=None,
        names=['word', 'index'],
        # Vocabulary entries such as "null", "NA" or "nan" are real words;
        # without this flag pandas converts them to float NaN and the word
        # can no longer be looked up.
        keep_default_na=False,
        # Keep purely numeric tokens as strings so they match str.split() output.
        dtype={'word': str},
    )
    return df.set_index('word')['index'].to_dict()

# Load a JSON data file
def load_json(file_path):
    """Parse the JSON document stored at ``file_path`` and return it.

    Args:
        file_path: Path to a JSON file.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces for the file).
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # mis-decodes non-ASCII text on systems whose locale is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Prepare text data as sequences of vocabulary indices
def prepare_indexed_text(data, word2idx):
    """Convert every text in ``data`` into a list of vocabulary indices.

    Args:
        data: dict whose values are either a dict holding the text under the
            ``'claim_text'`` key, or the text itself as a plain string.
        word2idx: dict mapping a word to its index.

    Returns:
        A list with one list of indices per value of ``data``, in iteration
        order. Texts are split on whitespace; words missing from ``word2idx``
        get the index of ``'<UNK>'`` (or 4 when ``'<UNK>'`` is absent too).
    """
    # The fallback is the same for every word, so resolve it once.
    unk_index = word2idx.get('<UNK>', 4)

    indexed_data = []
    for item in data.values():
        # Check the type explicitly: `'claim_text' in item` on a plain string
        # is a substring test and would then fail on item['claim_text'].
        if isinstance(item, dict) and 'claim_text' in item:
            text = item['claim_text']
        else:
            text = item
        indexed_data.append([word2idx.get(word, unk_index) for word in text.split()])
    return indexed_data

# Train the Word2Vec model
def train_word2vec(sentences, vector_size=100, window=5):
    """Fit and return a skip-gram Word2Vec model on ``sentences``.

    Args:
        sentences: Iterable of token sequences passed straight to ``Word2Vec``.
        vector_size: Dimensionality of the learned embeddings.
        window: Context window size handed to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` instance.
    """
    return Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        sg=1,  # skip-gram architecture
    )

# Convert an indexed text into a single vector
def text_to_vector(text_indices, model):
    """Average the embeddings of the known tokens in ``text_indices``.

    Args:
        text_indices: Sequence of tokens (vocabulary indices) for one text.
        model: Object exposing ``model.wv`` with ``key_to_index``,
            ``vector_size`` and item lookup (e.g. a trained ``Word2Vec``).

    Returns:
        1-D array of length ``model.wv.vector_size``: the element-wise mean of
        the vectors of all tokens present in ``model.wv.key_to_index``, or an
        all-zero vector when none of the tokens is known (or the text is empty).
    """
    keyed_vectors = model.wv
    known_vectors = [
        keyed_vectors[index]
        for index in text_indices
        if index in keyed_vectors.key_to_index
    ]
    if not known_vectors:
        # Keep the output shape stable so downstream similarity code can
        # still stack/compare vectors instead of receiving a NaN scalar.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    # Unknown tokens are already filtered out above, so no `where=` mask is
    # needed; a (k,)-shaped mask would not broadcast against (k, dim) data.
    return np.mean(known_vectors, axis=0)

word2idx = load_word2idx('data/preprocessing_result.csv')
train_claims = load_json('data/train-claims.json')
evidences = load_json('data/evidence.json')

# Show the vocabulary size and a few entries; printing the whole mapping
# would flood the cell output.
print('word2idx:', len(word2idx), 'entries; sample:', list(word2idx.items())[:5])


# Prepare the index sequences for claims and evidences
indexed_claims = prepare_indexed_text(train_claims, word2idx)
# prepare_indexed_text yields one sequence per value, in the dict's own
# iteration order, so pairing with the keys rebuilds the id -> sequence map.
indexed_evidences = dict(zip(evidences.keys(), prepare_indexed_text(evidences, word2idx)))

# Train the Word2Vec model on claims followed by evidences
all_texts = [*indexed_claims, *indexed_evidences.values()]
w2v_model = train_word2vec(all_texts)



In [None]:
# Rebuild the index sequences (same computation as in the first cell, so this
# cell is only needed when re-running indexing without retraining).
indexed_claims = prepare_indexed_text(train_claims, word2idx)
indexed_evidences = dict(zip(evidences.keys(), prepare_indexed_text(evidences, word2idx)))



In [None]:
# Preview a handful of entries; printing the full mapping floods the output.
print(len(indexed_evidences), 'evidences indexed')
print(dict(list(indexed_evidences.items())[:5]))


In [None]:
# Convert claims and evidences into averaged embedding vectors
claim_vectors = {claim_id: text_to_vector(claim, w2v_model) for claim_id, claim in zip(train_claims.keys(), indexed_claims)}
evidence_vectors = {id: text_to_vector(evidence, w2v_model) for id, evidence in indexed_evidences.items()}

# Number of most similar evidences kept per claim
TOP_K = 5

# Score every evidence against each claim and keep the TOP_K most similar ones.
# The evidence vectors are stacked once so each claim needs a single
# cosine_similarity call instead of one call per (claim, evidence) pair.
evidence_ids = list(evidence_vectors.keys())
top_evidences = {claim_id: [] for claim_id in claim_vectors}
if evidence_ids:
    evidence_matrix = np.vstack(list(evidence_vectors.values()))
    for claim_id, claim_vector in claim_vectors.items():
        similarities = cosine_similarity([claim_vector], evidence_matrix)[0]
        # A stable descending sort keeps tied evidences in insertion order,
        # matching sorted(..., reverse=True) on the original dict items.
        best_positions = np.argsort(-similarities, kind='stable')[:TOP_K]
        top_evidences[claim_id] = [
            (evidence_ids[position], similarities[position])
            for position in best_positions
        ]

print(top_evidences)
