抱抱脸的模型都存在本地什么位置

In [None]:
import os
# Point huggingface_hub at a mirror endpoint. This is done before the
# huggingface_hub import below so the library sees the override.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
# Alternative: the Tsinghua (TUNA) mirror.
#os.environ['HF_ENDPOINT'] = 'https://mirrors.tuna.tsinghua.edu.cn/hugging-face-models'
from huggingface_hub import snapshot_download
from tqdm.auto import tqdm
# Download the full model repository snapshot, showing progress, and keep the
# local path that snapshot_download returns.
# NOTE(review): no local_dir is passed here, so local_dir_use_symlinks
# presumably has no effect; newer huggingface_hub releases also deprecate it --
# confirm against the installed version.
model_path = snapshot_download(
    repo_id="Qwen/Qwen2.5-3B-Instruct",
    local_dir_use_symlinks=False,  # recommended to be False on Windows
    tqdm_class=tqdm  # progress bar implementation
)
print(f"模型保存路径: {model_path}")

In [None]:
from huggingface_hub import scan_cache_dir, HfApi

cache_info = scan_cache_dir()
api = HfApi()

# Report every repository found in the local Hugging Face hub cache:
# its id, local path, whether it is reachable on the Hub, and its disk usage.
for repo in cache_info.repos:
    print(f"模型名称: {repo.repo_id}")
    print(f"本地路径: {repo.repo_path}")
    try:
        # The cache holds models, datasets and spaces. repo_info() looks up a
        # *model* unless told otherwise, so pass the cached repo's own type;
        # otherwise every cached dataset/space is reported as inaccessible.
        api.repo_info(repo_id=repo.repo_id, repo_type=repo.repo_type)
    except Exception:
        # Best-effort status probe: network errors, missing repos and gated
        # or private repos all end up here.
        print("模型状态: 不可访问或需要认证")
    else:
        print("模型状态: 可访问")
    print(f"占用空间: {repo.size_on_disk / 1024 / 1024:.2f} MB")
    print("-" * 50)

In [12]:
import os
from pathlib import Path
import shutil
from huggingface_hub import scan_cache_dir
import pyarrow.parquet as pq
import pyarrow as pa
from datasets import load_dataset, load_from_disk
import json
import pandas as pd

# Hugging Face datasets cache directory.
# Honour the environment overrides that the `datasets` library itself reads
# (HF_DATASETS_CACHE first, then HF_HOME/datasets) so this script inspects the
# directory the library actually uses; otherwise fall back to the default
# location under the user's home directory.
cache_dir = os.path.expanduser(
    os.environ.get('HF_DATASETS_CACHE')
    or (os.path.join(os.environ['HF_HOME'], 'datasets') if os.environ.get('HF_HOME') else '')
    or '~/.cache/huggingface/datasets'
)

def _print_sample_rows(df, limit=10):
    """Print the first *limit* rows of *df*, one ``column: value`` per line."""
    # option_context restores the caller's pandas display settings afterwards,
    # even if printing raises, instead of resetting them to library defaults.
    with pd.option_context('display.max_colwidth', None, 'display.width', 1000):
        for i, (_, row) in enumerate(df.head(limit).iterrows()):
            print(f"\n--- 数据 {i+1} ---")
            for col, val in row.items():
                print(f"{col}: {val}")
            print("-" * 40)


def _print_frame_summary(df):
    """Print sample rows of *df* followed by its row count and column names."""
    _print_sample_rows(df)
    print(f"\n数据集大小: {len(df)} 条")
    print(f"字段列表: {', '.join(map(str, df.columns))}")


def _identify_dataset(info_file, cache_root):
    """Infer ``(dataset_name, config_name)`` from the cache directory layout.

    The datasets cache stores each build under
    ``<cache>/<namespace>___<name>/<config>/<version>/<hash>/``; the first
    component gives the dataset name and the second the config name.
    ``config_name`` is ``None`` when the path is too shallow to match that
    layout.
    """
    try:
        parts = info_file.parent.relative_to(cache_root).parts
    except ValueError:
        parts = ()
    if not parts:
        return info_file.parent.name, None
    dataset_name = parts[0].replace("___", "/")
    config_name = parts[1] if len(parts) >= 4 else None
    return dataset_name, config_name


def _load_cached_dataset(dataset_name, config_name, info):
    """Load the dataset through ``load_dataset`` and return the result.

    When no config name could be inferred from the path, the ``config_name``
    entry of ``dataset_info.json`` is tried first, then a config-less load.
    """
    # NOTE(review): trust_remote_code=True allows the dataset repository to
    # run its own loading script on this machine. It is kept because the
    # original code relied on it; confirm it is acceptable for every dataset
    # in the cache and supported by the installed `datasets` version.
    if config_name:
        return load_dataset(dataset_name, config_name, trust_remote_code=True)

    recorded_config = info.get('config_name')
    if recorded_config:
        print(f"自动选择配置: {recorded_config}")
        try:
            return load_dataset(dataset_name, recorded_config, trust_remote_code=True)
        except Exception:
            # Fall through to a plain load, as a best effort.
            pass
    return load_dataset(dataset_name, trust_remote_code=True)


def _preview_loaded_dataset(dataset):
    """Print sample rows, size and columns of a loaded dataset.

    For a dict of splits only the first split is shown.
    """
    split = dataset
    if isinstance(dataset, dict):
        split = dataset[next(iter(dataset.keys()))]

    # Convert only the rows that are displayed rather than the whole split.
    sample = split.select(range(min(10, len(split)))).to_pandas()
    _print_sample_rows(sample)
    print(f"\n数据集大小: {len(split)} 条")
    print(f"字段列表: {', '.join(split.column_names)}")


def _read_ipc_as_frame(arrow_file, opener):
    """Memory-map *arrow_file*, read it with *opener* and return a DataFrame."""
    with pa.memory_map(str(arrow_file), 'r') as source:
        return opener(source).read_all().to_pandas()


def _preview_arrow_file(arrow_file):
    """Print a preview of one ``.arrow`` file.

    Handles both Arrow IPC flavours: the random-access *file* format (which
    starts with the ``ARROW1`` magic) and the *stream* format, which has no
    such magic. If neither works, the file is tried as Parquet.
    """
    with open(arrow_file, 'rb') as fh:
        is_file_format = fh.read(6) == b'ARROW1'

    if is_file_format:
        print(f"找到有效的Arrow文件: {arrow_file.name}")
        try:
            df = _read_ipc_as_frame(arrow_file, pa.ipc.open_file)
        except Exception as read_err:
            print(f"读取Arrow文件内容失败: {str(read_err)}")
            return
    else:
        try:
            df = _read_ipc_as_frame(arrow_file, pa.ipc.open_stream)
        except Exception:
            df = None
        if df is None:
            print(f"文件 {arrow_file.name} 不是有效的Arrow文件")
            try:
                df = pq.read_table(str(arrow_file)).to_pandas()
            except Exception as parquet_err:
                print(f"尝试其他格式读取失败: {str(parquet_err)}")
                return
            print(f"成功以Parquet格式读取: {arrow_file.name}")
            _print_frame_summary(df)
            return
        print(f"找到有效的Arrow文件: {arrow_file.name}")

    print(f"成功读取数据，共 {len(df)} 条记录")
    _print_frame_summary(df)


def _preview_data_files(data_dir):
    """Preview the first ``.arrow`` file in *data_dir*, else the first ``.parquet``."""
    arrow_files = sorted(data_dir.glob('*.arrow'))
    if arrow_files:
        print("\n尝试直接读取 arrow 文件:")
        arrow_file = arrow_files[0]  # one file is enough as a sample
        try:
            _preview_arrow_file(arrow_file)
        except Exception as e:
            print(f"读取 {arrow_file.name} 失败: {str(e)}")
        return

    print("未找到Arrow文件")
    parquet_files = sorted(data_dir.glob('*.parquet'))
    if parquet_files:
        print("\n尝试读取Parquet文件:")
        parquet_file = parquet_files[0]
        try:
            df = pq.read_table(str(parquet_file)).to_pandas()
            print(f"成功读取Parquet文件: {parquet_file.name}")
            _print_frame_summary(df)
        except Exception as e:
            print(f"读取Parquet文件失败: {str(e)}")


def _report_dataset(info_file, cache_root):
    """Print the report for one cached dataset and return its size in bytes."""
    data_dir = info_file.parent
    with open(info_file, 'r', encoding='utf-8') as f:
        info = json.load(f)

    dir_size = sum(p.stat().st_size for p in data_dir.glob('**/*') if p.is_file())
    dataset_name, config_name = _identify_dataset(info_file, cache_root)

    print(f"\n{'='*50}")
    print(f"数据集: {dataset_name}")
    if config_name:
        print(f"配置: {config_name}")
    print(f"路径: {data_dir}")
    print(f"大小: {dir_size / (1024 * 1024):.2f} MB")

    # `or ''` guards against a null description/citation in the JSON.
    if 'description' in info:
        print(f"\n描述: {(info['description'] or '')[:200]}...")
    if 'citation' in info:
        print(f"引用: {(info['citation'] or '')[:200]}...")

    try:
        dataset = _load_cached_dataset(dataset_name, config_name, info)
        print("\n数据样例(前10条):")
        _preview_loaded_dataset(dataset)
    except Exception as load_err:
        print(f"加载数据集失败: {str(load_err)}")
        # Fall back to reading the cached data files directly.
        try:
            _preview_data_files(data_dir)
        except Exception as scan_err:
            print(f"查找数据文件失败: {str(scan_err)}")

    return dir_size


def get_dataset_info():
    """Print a report of every dataset found in the local datasets cache.

    Walks ``cache_dir`` for ``dataset_info.json`` files. For each one it
    prints the dataset name, config, path, on-disk size, description and
    citation, then a sample of up to 10 rows. Rows come from ``load_dataset``
    when that works, otherwise from the cached Arrow/Parquet files. Finally
    prints the total size of all datasets that were processed without error.

    Returns:
        None. All results are written to standard output.
    """
    if not os.path.exists(cache_dir):
        print("未找到本地数据集缓存")
        return

    print(f"数据集缓存目录: {cache_dir}")
    print("\n本地数据集信息:")

    total_size = 0
    for info_file in Path(cache_dir).rglob('dataset_info.json'):
        # One broken dataset must not abort the whole report.
        try:
            total_size += _report_dataset(info_file, cache_dir)
        except Exception as e:
            print(f"处理数据集时出错: {str(e)}")

    print(f"\n总占用空间: {total_size / (1024 * 1024):.2f} MB")

# Run the cache report when this code is executed as the main module.
if __name__ == "__main__":
    get_dataset_info()

数据集缓存目录: C:\Users\jabel/.cache/huggingface/datasets

本地数据集信息:

数据集: openai/gsm8k
配置: main
路径: C:\Users\jabel\.cache\huggingface\datasets\openai___gsm8k\main\0.0.0\e53f048856ff4f594e959d75785d2c2d37b678ee
大小: 8.71 MB

描述: ...
引用: ...

数据样例(前10条):

--- 数据 1 ---
question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?
answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72
----------------------------------------

--- 数据 2 ---
question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?
answer: Weng earns 12/60 = $<<12/60=0.2>>0.2 per minute.
Working 50 minutes, she earned 0.2 x 50 = $<<0.2*50=10>>10.
#### 10
----------------------------------------

--- 数据 3 ---
question: Betty is saving money for a new wallet which costs $100. Betty has onl