In [1]:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
BART中文文本编码解码Demo
使用Hugging Face的transformers库实现文本编码为embeddings，然后解码回文本
"""

import torch
from transformers import BartTokenizer, BartModel
import numpy as np

def bart_encode_decode_demo(texts=None, model_name="facebook/bart-base"):
    """Tokenize texts with BART, run the encoder, and round-trip the tokens.

    For every text the demo prints the token IDs, the shape / mean / standard
    deviation of the encoder's last hidden state, the text obtained by
    decoding the token IDs, and whether that text equals the input once
    spaces are ignored.

    The "decode" step is ``tokenizer.decode`` applied to the token IDs; the
    encoder hidden states are only summarised and are never turned back into
    text.

    Args:
        texts: Iterable of strings to process. ``None`` (the default) uses
            three built-in Chinese sample sentences.
        model_name: Checkpoint identifier passed to ``from_pretrained`` of
            both ``BartTokenizer`` and ``BartModel``.

    Returns:
        None. All results are printed. If loading the tokenizer or model
        fails, the error and a troubleshooting hint are printed and the
        function returns early without raising.
    """
    print("=== BART中文文本编码解码Demo ===\n")

    # 1. Load the pretrained tokenizer and model.
    # NOTE(review): the original comment described the default checkpoint as
    # Chinese-capable and named "fnlp/bart-base-chinese" as an alternative;
    # confirm which checkpoint is intended and that it loads with
    # BartTokenizer before switching.
    try:
        print("正在加载模型和分词器...")
        tokenizer = BartTokenizer.from_pretrained(model_name)
        model = BartModel.from_pretrained(model_name)
        print("模型加载完成！\n")
    except Exception as e:
        # The module-level imports have already succeeded when this runs, so
        # a missing package is not the only possible cause; the hint also
        # points at the download and the checkpoint name.
        print(f"模型加载失败: {e}")
        print("请检查网络连接和模型名称，并确认已安装依赖: pip install transformers torch")
        return

    # 2. Fall back to the built-in Chinese sample sentences.
    if texts is None:
        texts = [
            "今天天气很好，我们去公园散步吧。",
            "人工智能正在改变我们的生活方式。",
            "学习Python编程需要持续的练习和思考。",
        ]

    # 3. Encode and decode each text in turn.
    for i, text in enumerate(texts, 1):
        print(f"--- 示例 {i} ---")
        print(f"原始文本: {text}")

        # Text -> token IDs, returned as a PyTorch tensor.
        encoded = tokenizer.encode(text, return_tensors='pt', add_special_tokens=True)
        # Assumes `encoded` carries a leading batch dimension of size 1 (the
        # original squeezed it away). Indexing that dimension keeps the
        # result 1-D whatever the sequence length is.
        token_ids = encoded[0]
        print(f"Token IDs: {token_ids.tolist()}")

        # Encoder hidden states; no gradients are needed for inspection.
        with torch.no_grad():
            embeddings = model.encoder(encoded).last_hidden_state

        print(f"Embeddings形状: {embeddings.shape}")
        print(f"Embeddings均值: {embeddings.mean().item():.4f}")
        print(f"Embeddings标准差: {embeddings.std().item():.4f}")

        # Token IDs -> text (this does not use the embeddings).
        decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
        print(f"解码文本: {decoded_text}")

        # Compare with spaces removed, so spacing differences between the
        # input and the decoded text do not count as a mismatch.
        is_identical = (text.replace(" ", "") == decoded_text.replace(" ", ""))
        print(f"编解码一致性: {'✓' if is_identical else '✗'}")
        print()

def main():
    """Entry point: run the demo and report any failure on stdout.

    An ``ImportError`` prints installation instructions; any other
    ``Exception`` prints its message. Nothing is re-raised, so the function
    always returns ``None``.

    NOTE(review): the module-level imports execute before this function is
    called, so a missing package would presumably fail there rather than
    here; confirm whether the ``ImportError`` branch is still needed.
    """
    try:
        # Basic encode/decode example.
        bart_encode_decode_demo()
    except ImportError:
        print(
            "请安装所需的依赖包:",
            "pip install transformers torch numpy",
            sep="\n",
        )
    except Exception as err:
        print(f"运行出错: {err}")

# Run the demo only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

=== BART中文文本编码解码Demo ===

正在加载模型和分词器...


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

  [2m2025-09-16T15:10:07.346180Z[0m [33m WARN[0m  [33mRetry attempt #0. Sleeping 655.338855ms before the next attempt[0m
    [2;3mat[0m /Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs:171

  [2m2025-09-16T15:10:07.548152Z[0m [33m WARN[0m  [33mRetry attempt #0. Sleeping 2.714463543s before the next attempt[0m
    [2;3mat[0m /Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs:171

  [2m2025-09-16T15:10:08.585814Z[0m [33m WARN[0m  [33mRetry attempt #0. Sleeping 2.823508966s before the next attempt[0m
    [2;3mat[0m /Users/runner/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs:171



Cancellation requested; stopping current tasks.


KeyboardInterrupt: 