### 显示nom(s)的Polysemy score、在原文件的第几个review出现、对应的word embedding

#### 问题：筛选review和定位token时都是按子串匹配目标词的，所以比如目标词为beau的时候，不含独立单词beau但含beaufitude的review也会被找出（不过beau不是作业要求的名词/动词）

In [4]:
import pandas as pd
import torch
from transformers import CamembertTokenizer, CamembertModel
import numpy as np
import re

def preprocess_and_embed(texts, ids, word, tokenizer, model, batch_size=8):
    """Collect the contextual embedding of ``word`` from each text.

    Texts are encoded in batches of ``batch_size``. For every text, the first
    token that contains ``word`` as a substring is located and the model's
    last-layer hidden state at that position is kept.

    Args:
        texts: Sequence of strings to encode.
        ids: Sequence of identifiers aligned index-by-index with ``texts``.
        word: Target string; matched as a substring of individual tokens, so
            longer tokens that merely contain ``word`` also match.
        tokenizer: Object providing ``tokenize(text)`` and a batch-encoding
            call that returns at least ``attention_mask``.
        model: Callable accepting the encoded batch as keyword arguments and
            returning an object with a ``last_hidden_state`` tensor.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A pair ``(word_embeddings, review_embeddings)``: a list of NumPy
        vectors, and a parallel list of ``{'id': ..., 'embedding': ...}``
        dicts. Texts in which no usable position is found are skipped.
    """
    word_embeddings = []
    review_embeddings = []  # per-review records: source id + embedding

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]
        batch_ids = ids[start:start + batch_size]
        inputs = tokenizer(batch_texts, padding=True, truncation=True, max_length=512, return_tensors="pt")

        # Position of the first token containing the target word. The +1
        # assumes the encoded sequence starts with exactly one special token
        # (e.g. CamemBERT's <s>) that tokenize() does not emit.
        word_ids = [
            next((pos + 1 for pos, token in enumerate(tokenizer.tokenize(text)) if word in token), None)
            for text in batch_texts
        ]

        # Count of real (non-padding) positions in each encoded row.
        seq_lengths = inputs["attention_mask"].sum(dim=1).tolist()

        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state

            for row, word_id in enumerate(word_ids):
                if word_id is None:
                    continue
                # tokenize() is not truncated but the encoded batch is, so a
                # late occurrence can point past the kept tokens. The last
                # real position is assumed to be the closing special token,
                # hence the extra -1.
                if word_id >= seq_lengths[row] - 1:
                    continue
                embedding = hidden_states[row, word_id].numpy()
                word_embeddings.append(embedding)
                review_embeddings.append({'id': batch_ids[row], 'embedding': embedding})

    return word_embeddings, review_embeddings

def analyze_word(word, df, sample_size=100):
    """Compute a polysemy score for one word from sampled reviews.

    Reviews whose text contains ``word`` (case-insensitive, literal substring)
    are sampled with a fixed seed, lower-cased, stripped of punctuation and
    passed to ``preprocess_and_embed``. The score is the standard deviation of
    the pairwise cosine similarities between the collected embeddings.

    Args:
        word: Target word to analyse.
        df: DataFrame with a ``'review'`` column; its first column is used as
            the review identifier.
        sample_size: Maximum number of matching reviews to embed.

    Returns:
        A dict with keys ``'word'``, ``'occurrences'``, ``'polysemy_score'``
        and ``'review_embeddings'``, or ``None`` when no review matches or
        fewer than two embeddings could be extracted.
    """
    # NOTE: the pretrained tokenizer and model are loaded on every call.
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    model = CamembertModel.from_pretrained('camembert-base')
    model.eval()

    relevant_texts = df[df['review'].str.contains(word, case=False, na=False, regex=False)]
    if relevant_texts.empty:
        print(f"No occurrences found for word: {word}")
        return None

    # Fixed seed keeps the sampled subset reproducible across runs.
    relevant_texts = relevant_texts.sample(n=min(len(relevant_texts), sample_size), random_state=42)

    ids = relevant_texts.iloc[:, 0].values  # first column holds the review id
    # Lower-case and replace punctuation with spaces before tokenising.
    processed_texts = [re.sub(r'[^\w\s]', ' ', str(text).lower()).strip() for text in relevant_texts['review']]

    # The texts were lower-cased above, so the needle must be lower-cased too;
    # otherwise a capitalised target word would never match any token even
    # though the case-insensitive review filter accepted it.
    needle = word.lower()
    word_embeddings, review_embeddings = preprocess_and_embed(processed_texts, ids, needle, tokenizer, model)

    # At least one pair of embeddings is needed for a similarity spread.
    if len(word_embeddings) < 2:
        return None

    # Cosine similarity matrix from row-normalised embeddings.
    embeddings_array = np.array(word_embeddings)
    norm = np.linalg.norm(embeddings_array, axis=1, keepdims=True)
    unit_vectors = embeddings_array / norm
    similarity_matrix = np.dot(unit_vectors, unit_vectors.T)

    # Only the strict upper triangle: each unordered pair once, no diagonal.
    polysemy_score = np.std(similarity_matrix[np.triu_indices(len(similarity_matrix), k=1)])

    return {
        'word': word,
        'occurrences': len(word_embeddings),
        'polysemy_score': polysemy_score,
        'review_embeddings': review_embeddings  # per-review target-word embedding with its id
    }

def quick_polysemy_analysis(file_path, target_words, sample_size=100):
    """Run the polysemy analysis for several words and print a report.

    Args:
        file_path: Path to a header-less CSV whose columns are read as
            ``id``, ``film-url``, ``review`` and ``polarity``.
        target_words: Iterable of words to analyse.
        sample_size: Maximum number of reviews sampled per word.

    Returns:
        List of result dicts from ``analyze_word``, one per word for which a
        score could be computed (words yielding ``None`` are omitted).
    """
    df = pd.read_csv(file_path, header=None, names=['id', 'film-url', 'review', 'polarity'])

    # Analyse each word exactly once: every call reloads the model and runs
    # inference, so calling it again just to filter would double the cost.
    results = []
    for word in target_words:
        result = analyze_word(word, df, sample_size)
        if result is not None:
            results.append(result)

    for result in results:
        print(f"Word: {result['word']}")
        print(f"Occurrences: {result['occurrences']}")
        print(f"Polysemy score: {result['polysemy_score']:.4f}\n")

        # Print each review's id with the start of its target-word embedding.
        for review in result['review_embeddings']:
            print(f"Review ID {review['id']}: {review['embedding'][:5]}...")  # first 5 values only, to keep output short

    return results

# Usage example: analyse the word 'histoire' in test.csv, sampling at most
# 20 matching reviews, and print the report.
file_path = 'test.csv'
target_words = ['histoire']
results = quick_polysemy_analysis(file_path, target_words, sample_size=20)


Word: histoire
Occurrences: 20
Polysemy score: 0.0814

Review ID 5268.0: [ 0.09102798  0.16917096 -0.0473657   0.08430825 -0.02601945]...
Review ID 16712.0: [ 0.08545251 -0.19848987  0.1817712   0.09085849 -0.03145574]...
Review ID 14784.0: [ 0.00460607  0.38902065  0.07689978  0.0377639  -0.07112113]...
Review ID 8330.0: [ 0.04240461  0.29707024  0.04431971 -0.05684883 -0.10839397]...
Review ID 10590.0: [ 0.04102797 -0.01848955  0.1679564   0.12878132 -0.10983775]...
Review ID 10994.0: [ 0.05358908  0.08450424  0.25529757 -0.0101518  -0.16198653]...
Review ID 13438.0: [ 0.04295095  0.07399024  0.07357954  0.02631695 -0.08958986]...
Review ID 3650.0: [ 0.09568378  0.07725815  0.13869886  0.12635812 -0.07432344]...
Review ID 19938.0: [ 0.01281912  0.34325552  0.05590718 -0.01954528 -0.01867477]...
Review ID 1302.0: [ 0.03350243  0.07806947 -0.03638034 -0.00755402 -0.07943187]...
Review ID 10896.0: [ 0.03729968  0.30961126 -0.13613316  0.02256236 -0.08108057]...
Review ID 4412.0: [-0.047