### 简单显示多个nom在corpus里的Polysemy score

In [15]:
import pandas as pd
import torch
from transformers import CamembertTokenizer, CamembertModel
import numpy as np
import re


In [16]:
def preprocess_text(text):
    """Return a normalized copy of ``text`` for matching.

    The input is coerced to ``str`` and lower-cased, every character that is
    neither a word character nor whitespace is replaced by a space, runs of
    whitespace are collapsed to a single space, and the ends are trimmed.
    """
    lowered = str(text).lower()
    # Punctuation becomes a space (not removed) so "l'opéra" -> "l opéra".
    without_punctuation = re.sub(r'[^\w\s]', ' ', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()


In [17]:
def get_word_embeddings(texts, word, tokenizer, model, batch_size=8):
    """Collect one contextual embedding of ``word`` per text that contains it.

    Texts are encoded in batches of ``batch_size`` (padded, truncated to 512
    positions). For each text, the first token whose string contains ``word``
    is located and the model's last-layer hidden state at that position is
    kept.

    Args:
        texts: Sequence of strings to search.
        word: Substring looked for inside individual token strings. Matching
            is case-sensitive and per token, so a word that the tokenizer
            splits into several pieces is not found, and longer tokens that
            merely contain ``word`` do match.
        tokenizer: Callable tokenizer that also exposes ``tokenize(text)`` and
            returns an ``attention_mask`` when called on a batch.
        model: Callable returning an object with ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A list of 1-D NumPy arrays, one per text in which ``word`` was found
        inside the part of the text that survived truncation. Texts without a
        usable match contribute nothing.
    """
    word_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch_texts = texts[start:start + batch_size]

        # Encode the batch
        inputs = tokenizer(batch_texts,
                           padding=True,
                           truncation=True,
                           max_length=512,
                           return_tensors="pt")

        # Count of non-padding positions in each encoded row.
        real_lengths = inputs["attention_mask"].sum(dim=1).tolist()

        # Locate the target word in every text of the batch
        word_positions = []
        for text, real_length in zip(batch_texts, real_lengths):
            # Only tokens that survived truncation have a hidden state, so the
            # search is limited to them; otherwise a late match would index
            # past the end of the encoded sequence. Assumes the tokenizer adds
            # exactly one special token at each end (as the original +1
            # offset already assumed for the leading one).
            kept = max(int(real_length) - 2, 0)
            tokens = tokenizer.tokenize(text)[:kept]
            word_positions.append(next(
                # +1 skips the leading special token
                (pos + 1 for pos, token in enumerate(tokens) if word in token),
                None,
            ))

        # Run the model without tracking gradients
        with torch.no_grad():
            hidden_states = model(**inputs).last_hidden_state

        # Keep the hidden state at each located position
        for row, position in enumerate(word_positions):
            if position is not None:
                word_embeddings.append(
                    hidden_states[row, position].detach().cpu().numpy()
                )

    return word_embeddings


In [18]:
# (tokenizer, model) pairs keyed by checkpoint name, so analysing several
# words in a row does not reload the weights for every word.
_CAMEMBERT_CACHE = {}


def _load_camembert(model_name='camembert-base'):
    """Return a cached ``(tokenizer, model)`` pair, loading it on first use."""
    if model_name not in _CAMEMBERT_CACHE:
        tokenizer = CamembertTokenizer.from_pretrained(model_name)
        model = CamembertModel.from_pretrained(model_name)
        model.eval()
        _CAMEMBERT_CACHE[model_name] = (tokenizer, model)
    return _CAMEMBERT_CACHE[model_name]


def analyze_single_word(word, df, sample_size=100):
    """Estimate how polysemous ``word`` is across the reviews in ``df``.

    Rows whose ``'review'`` text contains ``word`` (case-insensitive, literal
    match) are selected, down-sampled to at most ``sample_size`` rows with a
    fixed seed, normalized with ``preprocess_text`` and fed to
    ``get_word_embeddings``. The score is the standard deviation of the
    pairwise cosine similarities between the collected embeddings.

    Args:
        word: Target word.
        df: DataFrame with a ``'review'`` column of texts.
        sample_size: Maximum number of reviews to analyse.

    Returns:
        A dict with keys ``'word'``, ``'occurrences'`` (number of embeddings
        actually collected) and ``'polysemy_score'``; or ``None`` when no
        review contains the word or fewer than two embeddings were found.
    """
    # Keep only the reviews that mention the target word
    relevant_texts = df[df['review'].str.contains(word, case=False, na=False, regex=False)]

    if len(relevant_texts) == 0:
        print(f"No occurrences found for word: {word}")
        return None

    # Too many reviews: draw a reproducible random sample
    if len(relevant_texts) > sample_size:
        relevant_texts = relevant_texts.sample(n=sample_size, random_state=42)

    print(f"\nAnalyzing word '{word}' in {len(relevant_texts)} contexts...")

    # Normalize the texts
    processed_texts = [preprocess_text(text) for text in relevant_texts['review']]

    # Load the model only once we know there is something to analyse
    tokenizer, model = _load_camembert()

    # The texts were lower-cased above, so the token search must use the
    # lower-cased word too; otherwise a capitalised target never matches even
    # though the case-insensitive filter selected its reviews.
    word_embeddings = get_word_embeddings(processed_texts, word.lower(), tokenizer, model)

    # At least two embeddings are needed to form one similarity pair
    if len(word_embeddings) < 2:
        return None

    embeddings_array = np.array(word_embeddings)

    # Cosine similarity matrix; a zero-length vector is left as zeros instead
    # of producing NaN through a division by zero.
    norm = np.linalg.norm(embeddings_array, axis=1, keepdims=True)
    norm = np.where(norm == 0, 1.0, norm)
    normalized_embeddings = embeddings_array / norm
    similarity_matrix = np.dot(normalized_embeddings, normalized_embeddings.T)

    # Each unordered pair once: strict upper triangle, diagonal excluded
    upper_tri = similarity_matrix[np.triu_indices(len(similarity_matrix), k=1)]

    # Spread of the similarities is used as the polysemy score
    polysemy_score = np.std(upper_tri)

    return {
        'word': word,
        'occurrences': len(word_embeddings),
        'polysemy_score': polysemy_score
    }


In [19]:
# Main entry point
def quick_polysemy_analysis(file_path, target_words, sample_size=100):
    """Run the polysemy analysis for every word in ``target_words``.

    The CSV at ``file_path`` is read once, then ``analyze_single_word`` is
    called for each word with the given ``sample_size``. Each non-empty
    result is printed and collected.

    Returns:
        The list of result dicts, in the order of ``target_words``, leaving
        out words for which the analysis returned nothing.
    """
    print("Reading data...")
    reviews = pd.read_csv(file_path)

    collected = []
    for target in target_words:
        outcome = analyze_single_word(target, reviews, sample_size)
        if not outcome:
            # Nothing to report for this word
            continue

        collected.append(outcome)
        print(f"Word: {outcome['word']}")
        print(f"Occurrences: {outcome['occurrences']}")
        print(f"Polysemy score: {outcome['polysemy_score']:.4f}\n")

    return collected


In [20]:
# Usage example
file_path = 'test.csv'  # or the path to your own file
# French nouns to score
target_words = [
    'film',
    'histoire',
    'bureau',
    'opéra',
    'rouge',
    'carte',
    'règle',
    'avocat',
]
results = quick_polysemy_analysis(file_path, target_words, sample_size=50)

Reading data...

Analyzing word 'film' in 50 contexts...
Word: film
Occurrences: 50
Polysemy score: 0.0931


Analyzing word 'histoire' in 50 contexts...
Word: histoire
Occurrences: 50
Polysemy score: 0.0802


Analyzing word 'bureau' in 37 contexts...
Word: bureau
Occurrences: 37
Polysemy score: 0.1691


Analyzing word 'opéra' in 50 contexts...
Word: opéra
Occurrences: 50
Polysemy score: 0.1950


Analyzing word 'rouge' in 50 contexts...
Word: rouge
Occurrences: 50
Polysemy score: 0.1403


Analyzing word 'carte' in 50 contexts...
Word: carte
Occurrences: 44
Polysemy score: 0.1788


Analyzing word 'règle' in 50 contexts...
Word: règle
Occurrences: 50
Polysemy score: 0.1765


Analyzing word 'avocat' in 48 contexts...
Word: avocat
Occurrences: 48
Polysemy score: 0.1441

