In [1]:
import sys
import os

# 自动获取项目根目录（无需修改）
project_root = os.path.dirname(os.path.abspath(''))  # 获取当前notebook所在目录
sys.path.insert(0, project_root)  # 添加到Python路径

# 验证路径
print(f"✅ Notebook位置: {os.path.abspath('')}")
print(f"✅ 项目根目录: {project_root}")
print(f"✅ data_provider路径: {os.path.join(project_root, 'data_provider')}")

✅ Notebook位置: /root/autodl-tmp/Time-Series-Library
✅ 项目根目录: /root/autodl-tmp
✅ data_provider路径: /root/autodl-tmp/data_provider


In [2]:
import sktime
from sktime.datasets import load_from_tsfile_to_dataframe

# 指定你的TS文件路径
train_file = "/root/autodl-tmp/Time-Series-Library/dataset/CHBMIT_EEG_Standard/CHBMIT_EEG_Standard_TRAIN.ts"
test_file = "/root/autodl-tmp/Time-Series-Library/dataset/CHBMIT_EEG_Standard/CHBMIT_EEG_Standard_TEST.ts"

# 加载数据
try:
    X_train, y_train = load_from_tsfile_to_dataframe(train_file)
    X_test, y_test = load_from_tsfile_to_dataframe(test_file)
    print("✅ 数据加载成功！")
    print(f"训练集形状: {X_train.shape}, 标签数: {len(y_train)}")
    print(f"测试集形状: {X_test.shape}, 标签数: {len(y_test)}")
    
    # 检查第一条数据
    print("\n第一条样本的第一个通道前5个值:")
    print(X_train.iloc[0, 0][:5])  # 假设是多变量数据
    print("对应标签:", y_train[0])
    
except Exception as e:
    print("❌ 加载失败:", str(e))

❌ 加载失败: could not convert string to float: '(-8.417232'


In [9]:
import sktime
from sktime.datasets import load_from_tsfile_to_dataframe
import pandas as pd
# 指定你的TS文件路径
train_file = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts"
test_file = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts"

def inspect_ts_structure(filepath):
    """详细检查TS文件的结构"""
    try:
        X, y = load_from_tsfile_to_dataframe(filepath)
        print("\n" + "="*50)
        print(f"文件: {filepath}")
        print("="*50)
        
        # 1. 基本形状信息
        print(f"\n[数据形状]")
        print(f"样本数: {X.shape[0]}, 通道数: {X.shape[1]}")
        
        # 2. 检查是否为多通道
        is_multivariate = isinstance(X.iloc[0, 0], (list, pd.Series))
        print(f"是否为多通道数据: {'是' if is_multivariate else '否'}")
        
        # 3. 时间步长统计
        if is_multivariate:
            lengths = [len(channel) for sample in X.values for channel in sample]
        else:
            lengths = [len(sample) for sample in X.iloc[:, 0]]
        print(f"\n[时间步长统计]")
        print(f"最小长度: {min(lengths)}")
        print(f"最大长度: {max(lengths)}")
        print(f"平均长度: {sum(lengths)/len(lengths):.1f}")
        
        # 4. 查看第一条样本的完整结构
        print("\n[第一条样本结构]")
        print(f"标签: {y[0]}")
        for ch_idx in range(min(3, X.shape[1])):  # 只显示前3个通道
            channel_data = X.iloc[0, ch_idx]
            print(f"通道 {ch_idx+1} 的前5个值: {channel_data[:5].values}")
            print(f"通道 {ch_idx+1} 的长度: {len(channel_data)}")
        
        # 5. 标签分布
        print("\n[标签分布]")
        print(pd.Series(y).value_counts())
        
    except Exception as e:
        print(f"❌ 加载失败: {str(e)}")

# 检查训练集和测试集
inspect_ts_structure(train_file)
inspect_ts_structure(test_file)


文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts

[数据形状]
样本数: 204, 通道数: 61
是否为多通道数据: 是

[时间步长统计]
最小长度: 405
最大长度: 405
平均长度: 405.0

[第一条样本结构]
标签: normal
通道 1 的前5个值: [0.000949 0.001488 0.000314 0.000995 0.002099]
通道 1 的长度: 405
通道 2 的前5个值: [0.001288 0.00114  0.00043  0.000532 0.001492]
通道 2 的长度: 405
通道 3 的前5个值: [0.000529 0.001635 0.002146 0.001744 0.001424]
通道 3 的长度: 405

[标签分布]
abnormal    147
normal       57
dtype: int64

文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts

[数据形状]
样本数: 205, 通道数: 61
是否为多通道数据: 是

[时间步长统计]
最小长度: 405
最大长度: 405
平均长度: 405.0

[第一条样本结构]
标签: normal
通道 1 的前5个值: [0.004578 0.011682 0.017709 0.016318 0.011723]
通道 1 的长度: 405
通道 2 的前5个值: [0.007504 0.005658 0.018021 0.017266 0.002325]
通道 2 的长度: 405
通道 3 的前5个值: [0.002795 0.002748 0.012083 0.015224 0.003833]
通道 3 的长度: 405

[标签分布]
abnormal    148
normal       57
dtype: int64


In [13]:
def inspect_raw_ts_file(filepath, num_lines=30):
    """查看TS文件的原始内容"""
    with open(filepath, 'r') as f:
        print(f"=== 文件: {filepath} ===")
        for i, line in enumerate(f):
            if i >= num_lines:
                break
            print(f"Line {i}: {line.strip()}")

# 示例：查看Handwriting数据集的训练文件
train_file = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts"
inspect_raw_ts_file(train_file)

=== 文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts ===
Line 0: #Heartbeat Description:
Line 1: #This dataset is derived from the PhysioNet/CinC Challenge 2016.
Line 2: #Heart sound recordings were sourced from several contributors around the world, collected at either a clinical or nonclinical environment, from both healthy subjects and pathological patients.
Line 3: #The heart sound recordings were collected from different locations on the body. The typical four locations are aortic area, pulmonic area, tricuspid area and mitral area, but could be one of nine different locations.
Line 4: #The sounds were divided into two classes: normal and abnormal. The normal recordings were from healthy subjects and the abnormal ones were from patients with a confirmed cardiac diagnosis.
Line 5: #The patients suffer from a variety of illnesses, but typically they are heart valve defects and coronary artery disease patients.
Line 6: #Heart valve defects include mitral v

In [23]:
import numpy as np
import os
from collections import defaultdict
import random

def convert_npz_to_ts_simple(npz_files, output_dir, dataset_name="CHBMIT_EEG", test_subjects=3, random_state=42):
    """
    简化版TS格式转换器（纯逗号分隔，无括号）
    """
    # === 初始化 ===
    random.seed(random_state)
    np.random.seed(random_state)
    os.makedirs(output_dir, exist_ok=True)
    
    # === 数据加载 ===
    subject_data = defaultdict(list)
    for npz_file in sorted(npz_files):
        try:
            with np.load(npz_file, allow_pickle=True) as data:
                subject_id = os.path.basename(npz_file).split('_')[1]
                windows = np.nan_to_num(data['windows'], nan=0.0)
                labels = data['labels'].astype(int)
                subject_data[subject_id].append({
                    'windows': windows.astype(np.float32),
                    'labels': labels
                })
        except Exception as e:
            print(f"❌ 加载失败 {npz_file}: {str(e)}")

    # === 数据划分 ===
    all_subjects = sorted(subject_data.keys())
    np.random.shuffle(all_subjects)
    test_subjs = all_subjects[:test_subjects]
    train_subjs = all_subjects[test_subjects:]
    
    def combine_subjects(subjects):
        samples, labels = [], []
        for subj in sorted(subjects):
            for data in subject_data[subj]:
                samples.append(data['windows'])
                labels.append(data['labels'])
        return np.concatenate(samples, axis=0), np.concatenate(labels, axis=0)
    
    X_train, y_train = combine_subjects(train_subjs)
    X_test, y_test = combine_subjects(test_subjs)

    # === 数值格式化 ===
    def format_value(x):
        """数值标准化为6位小数"""
        x = float(x)
        if not np.isfinite(x):
            return "0.000000"
        return f"{x:.6f}".replace("-0.000000", "0.000000")

    def write_ts_file(data, labels, subset):
        """生成纯逗号分隔的TS文件"""
        ts_path = os.path.join(output_dir, f"{dataset_name}_{subset}.ts")
        error_log = os.path.join(output_dir, f"{subset}_errors.txt")
        
        with open(ts_path, 'w') as f, open(error_log, 'w') as err_f:
            # 文件头（保持多变量声明，但实际用单变量格式）
            f.write(f"""@problemName {dataset_name}
@timestamps false
@univariate false   
@equalLength true
@seriesLength {data.shape[1]*data.shape[2]}  
@classLabel true 0 1
@data
""")
            error_count = 0
            # 总时间步长=通道数×每通道长度
            for idx in range(len(data)):
                try:
                    # 展平多通道数据（通道1的所有值 + 通道2的所有值 + ...）
                    flattened = data[idx].flatten()  # 形状: (6, 7680) → (46080,)
                    values_str = ",".join([format_value(x) for x in flattened])
                    line = f"{values_str}:{int(labels[idx])}"
                    
                    # 验证总长度
                    if len(flattened) != 6 * 7680:
                        raise ValueError(f"长度应为46080，实际为{len(flattened)}")
                    
                    f.write(line + "\n")
                except Exception as e:
                    error_count += 1
                    err_f.write(f"样本{idx}错误: {str(e)}\n")
        
        return error_count

    # === 执行转换 ===
    print("\n=== 开始转换 ===")
    train_errors = write_ts_file(X_train, y_train, "TRAIN")
    test_errors = write_ts_file(X_test, y_test, "TEST")

    # === 生成元数据 ===
    with open(os.path.join(output_dir, f"{dataset_name}_README.txt"), 'w') as f:
        f.write(f"""# {dataset_name} Dataset
- Format: Comma-separated values (no parentheses)
- Channels: 6 (flattened into single series)
- Series length: {6 * 7680} (6 channels × 7680 steps)
- Labels: 0 (non-seizure), 1 (seizure)
- Train samples: {len(X_train)}
- Test samples: {len(X_test)}
""")

    print(f"✅ 转换完成！训练集错误: {train_errors}, 测试集错误: {test_errors}")

if __name__ == "__main__":
    input_files = [
        '/root/autodl-tmp/balanced_data/balanced_chb19.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb02.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb23.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb01.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb03.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb21.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb20.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb18.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb14.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb13.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb10.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb09.npz',
        '/root/autodl-tmp/balanced_data/balanced_chb05.npz'
    ]
    
    output_dir = "/root/autodl-tmp/Time-Series-Library/dataset"
    dataset_name = "CHBMIT_EEG_Flat"

    convert_npz_to_ts_simple(
        npz_files=[f for f in input_files if os.path.exists(f)],
        output_dir=output_dir,
        dataset_name=dataset_name,
        test_subjects=3,
        random_state=42
    )


=== 开始转换 ===
✅ 转换完成！训练集错误: 0, 测试集错误: 0


In [None]:
##
解析后形状: (5890, 144), 标签数: 5890
 (3524, 144),

In [25]:
import os
import re
import pandas as pd
import matplotlib.pyplot as plt

def safe_float_convert(s):
    """安全转换字符串到float，过滤非法字符"""
    try:
        return float(s.strip())
    except ValueError:
        # 处理科学计数法或特殊字符
        s = re.sub(r'[^\d.eE+-]', '', s)
        return float(s) if s else 0.0

def inspect_heartbeat_structure(filepath, max_samples=3):
    """增强版TS文件检查器（兼容非标准格式）"""
    print(f"\n=== 检查文件: {filepath} ===")
    
    # 1. 读取并过滤文件内容
    with open(filepath, 'r') as f:
        lines = []
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):  # 跳过注释行
                lines.append(line)
    
    # 2. 解析文件头（兼容非标准标签）
    header = {}
    data_start_line = 0
    for i, line in enumerate(lines):
        if line.startswith('@'):
            parts = line[1:].split(maxsplit=1)  # 只分割第一个空格
            key = parts[0]
            value = parts[1] if len(parts) > 1 else ""
            header[key] = value
            data_start_line = i + 1
        else:
            break
    
    print("\n=== 文件头信息 ===")
    for k, v in header.items():
        print(f"{k}: {v}")
    
    # 3. 检查数据行结构
    print("\n=== 数据行结构 ===")
    valid_samples = 0
    
    for i, line in enumerate(lines[data_start_line : data_start_line + max_samples]):
        try:
            print(f"\n样本 {i+1}:")
            
            # 分割标签和数据部分
            if ':' not in line:
                print(f"❌ 错误格式: 缺少标签分隔符 ':'")
                continue
            
            data_part, label = line.rsplit(':', 1)
            print(f"标签: {label.strip()}")
            
            # 检查是否为多通道格式
            if '(' in data_part and ')' in data_part:
                print("格式: 多通道（括号分隔）")
                channels = re.findall(r'\(([^)]+)\)', data_part)
                print(f"通道数: {len(channels)}")
                
                # 解析每个通道
                for ch_idx, ch_values in enumerate(channels[:3]):
                    try:
                        values = [safe_float_convert(x) for x in ch_values.split(',')[:5]]
                        print(f"  通道 {ch_idx+1} 前5值: {values}")
                    except Exception as e:
                        print(f"  通道 {ch_idx+1} 解析失败: {str(e)}")
                
            else:
                print("格式: 单通道（纯逗号分隔）")
                try:
                    values = [safe_float_convert(x) for x in data_part.split(',')[:10]]
                    print(f"前10个值: {values}")
                except Exception as e:
                    print(f"数值解析失败: {str(e)}")
            
            valid_samples += 1
        
        except Exception as e:
            print(f"样本 {i+1} 解析异常: {str(e)}")
            continue

    # 4. 可视化（仅当成功解析多通道数据时）
    if valid_samples > 0 and '(' in lines[data_start_line]:
        plt.figure(figsize=(12, 6))
        for i in range(min(3, valid_samples)):
            line = lines[data_start_line + i]
            try:
                data_part = line.split(':')[0]
                channels = re.findall(r'\(([^)]+)\)', data_part)
                
                for ch_idx in range(min(3, len(channels))):
                    values = [safe_float_convert(x) for x in channels[ch_idx].split(',')[:100]]
                    plt.plot(values, label=f'样本{i+1} 通道{ch_idx+1}')
            except Exception as e:
                print(f"可视化样本 {i+1} 失败: {str(e)}")
        
        plt.title("多通道数据可视化（前100时间步）")
        plt.xlabel("时间步")
        plt.ylabel("数值")
        plt.legend()
        plt.grid()
        plt.show()

# 示例用法
heartbeat_train = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts"
heartbeat_test = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts"

if os.path.exists(heartbeat_train):
    inspect_heartbeat_structure(heartbeat_train)
else:
    print(f"文件不存在: {heartbeat_train}")

if os.path.exists(heartbeat_test):
    inspect_heartbeat_structure(heartbeat_test)
else:
    print(f"文件不存在: {heartbeat_test}")


=== 检查文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts ===

=== 文件头信息 ===
problemName: Heartbeat
timeStamps: false
missing: false
univariate: false
dimensions: 61
equalLength: true
seriesLength: 405
classLabel: true normal abnormal
data: 

=== 数据行结构 ===

样本 1:
标签: normal
格式: 单通道（纯逗号分隔）
前10个值: [0.000949, 0.001488, 0.000314, 0.000995, 0.002099, 0.001732, 0.002043, 0.000955, 0.000379, 0.001024]

样本 2:
标签: normal
格式: 单通道（纯逗号分隔）
前10个值: [0.020264, 0.017023, 0.006052, 0.00395, 0.023982, 0.034286, 0.027913, 0.019146, 0.019155, 0.011031]

样本 3:
标签: normal
格式: 单通道（纯逗号分隔）
前10个值: [0.021082, 0.020687, 0.009948, 0.002039, 0.000885, 0.001094, 0.001832, 0.002248, 0.002351, 0.000861]

=== 检查文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts ===

=== 文件头信息 ===
problemName: Heartbeat
timeStamps: false
missing: false
univariate: false
dimensions: 61
equalLength: true
seriesLength: 405
classLabel: true normal abnormal
data: 

=== 数据行结构 ===

样本 1:
标签: 

In [26]:
import os

def check_specific_lines(filepath, line_numbers=[25, 26], context_lines=3):
    """
    检查TS文件的特定行及其上下文
    :param filepath: 文件路径
    :param line_numbers: 需要检查的行号列表（0-based索引）
    :param context_lines: 显示上下文的行数
    """
    if not os.path.exists(filepath):
        print(f"❌ 文件不存在: {filepath}")
        return

    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    print(f"\n=== 检查文件: {filepath} ===")
    print(f"总行数: {len(lines)}")
    
    # 检查数据起始行（跳过文件头）
    data_start = 0
    for i, line in enumerate(lines):
        if line.startswith('@data'):
            data_start = i + 1
            break
    
    # 计算实际数据行号（文件行号从1开始）
    actual_line_nums = [data_start + n for n in line_numbers]
    
    for line_num in actual_line_nums:
        if line_num >= len(lines):
            print(f"⚠️ 行 {line_num+1} 超出文件范围")
            continue
            
        print(f"\n── 行 {line_num+1} ──")
        # 显示上下文
        start = max(0, line_num - context_lines)
        end = min(len(lines), line_num + context_lines + 1)
        
        for i in range(start, end):
            prefix = ">>> " if i == line_num else "    "
            print(f"{prefix}{i+1}: {lines[i][:100]}" + ("..." if len(lines[i]) > 100 else ""))
        
        # 特别检查括号
        line_content = lines[line_num]
        has_parentheses = '(' in line_content and ')' in line_content
        print(f"\n括号检查: {'✅ 存在' if has_parentheses else '❌ 不存在'}")
        if has_parentheses:
            print(f"通道数估计: {line_content.count('(')}")

# 检查Heartbeat数据集
heartbeat_train = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts"
heartbeat_test = "/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TEST.ts"

check_specific_lines(heartbeat_train, [25, 26])
check_specific_lines(heartbeat_test, [25, 26])


=== 检查文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts ===
总行数: 230

── 行 52 ──
    49: 0.001732,0.012121,0.01595,0.009004,0.003202,0.009445,0.009729,0.004761,0.004244,8.37E-4,0.007065,0.0...
    50: 0.007213,0.00631,0.0058,0.008099,0.007211,0.008123,0.009863,0.001098,0.002408,0.004292,0.001912,0.00...
    51: 0.002582,0.004288,0.006043,0.004039,0.005647,0.00763,0.003223,0.004864,0.002788,0.001664,0.002542,0....
>>> 52: 0.002291,0.005461,0.002758,6.91E-4,0.004543,0.002719,0.001347,2.36E-4,8.9E-4,9.06E-4,0.001258,0.0068...
    53: 0.001256,0.001453,0.002834,0.001036,0.002603,0.001306,0.002437,0.002458,0.002257,0.001173,4.22E-4,5....
    54: 9.13E-4,0.001594,0.002246,9.04E-4,0.002295,0.002789,0.001846,0.005605,0.00134,0.002681,0.002159,0.00...
    55: 0.001131,0.004136,0.006491,0.006804,0.018412,0.003221,5.38E-4,0.011317,0.010064,0.002987,0.002222,0....

括号检查: ❌ 不存在

── 行 53 ──
    50: 0.007213,0.00631,0.0058,0.008099,0.007211,0.008123,0.009863,0.001098,0.00

In [31]:
import os

def inspect_ts_structure(filepath):
    """暴力解析TS文件真实结构"""
    if not os.path.exists(filepath):
        print(f"❌ 文件不存在: {filepath}")
        return

    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    print(f"\n=== 文件: {filepath} ===")
    
    # 提取文件头
    header = {}
    data_start = 0
    for i, line in enumerate(lines):
        if line.startswith('@'):
            key = line[1:].split()[0]
            header[key] = line[1:].split(maxsplit=1)[1] if ' ' in line[1:] else ""
            data_start = i + 1
        else:
            break
    
    print("\n=== 文件头声明 ===")
    for k, v in header.items():
        print(f"{k}: {v}")

    # 分析前3个数据行
    print("\n=== 实际数据格式 ===")
    for i in range(data_start, min(data_start+27, len(lines))):
        line = lines[i]
        print(f"\n行 {i+1}:")
        
        # 基础分割
        parts = line.split(':')
        label = parts[-1].strip() if len(parts) > 1 else "无标签"
        data_part = ':'.join(parts[:-1]) if len(parts) > 1 else line
        
        # 通道结构检测
        if '(' in data_part and ')' in data_part:
            print("🔍 检测到多通道格式（括号分隔）")
            channels = data_part.split('),(')
            channels = [ch.strip('()') for ch in channels]
            print(f"  通道数: {len(channels)}")
            print(f"  第1通道前3值: {channels[0].split(',')[:27]}...")
        else:
            print("🔍 检测到单通道格式（纯逗号分隔）")
            values = data_part.split(',')
            print(f"  总数值数: {len(values)}")
            print(f"  前3值: {values[:27]}...")
        print(f"  标签: '{label}'")

# 检查你的文件
inspect_ts_structure("/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts")


=== 文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts ===

=== 文件头声明 ===

=== 实际数据格式 ===

行 1:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['#Heartbeat Description']...
  标签: ''

行 2:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['#This dataset is derived from the PhysioNet/CinC Challenge 2016.']...
  标签: '无标签'

行 3:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 3
  前3值: ['#Heart sound recordings were sourced from several contributors around the world', ' collected at either a clinical or nonclinical environment', ' from both healthy subjects and pathological patients.']...
  标签: '无标签'

行 4:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 4
  前3值: ['#The heart sound recordings were collected from different locations on the body. The typical four locations are aortic area', ' pulmonic area', ' tricuspid area and mitral area', ' but could be one of nine different locations.']...
  标签: '无标签'

行 5:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['#The sounds were divided into two classes']...
  标签: 'normal and abnormal. The normal r

In [33]:
import os

def count_data_lines_and_values(filepath):
    """直接统计数据行数和每行数值数量"""
    if not os.path.exists(filepath):
        print(f"文件不存在: {filepath}")
        return

    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    # 找到数据开始行（跳过以@开头的文件头）
    data_lines = []
    for line in lines:
        if not line.startswith('@'):
            data_lines.append(line)

    # 统计每行数值数量
    line_stats = []
    for line in data_lines:
        # 分割数值和标签（如果有标签）
        if ':' in line:
            values_part = line.split(':')[0]
        else:
            values_part = line
        # 统计数值数量
        num_values = len(values_part.split(','))
        line_stats.append(num_values)

    # 输出结果
    print(f"文件: {filepath}")
    print(f"数据总行数: {len(data_lines)}")
    print(f"每行数值数量统计:")
    print(f"  最小: {min(line_stats)}")
    print(f"  最大: {max(line_stats)}")
    print(f"  平均: {sum(line_stats)/len(line_stats):.1f}")
    
    # 检查是否所有行数值数量相同
    if len(set(line_stats)) == 1:
        print(f"所有行数值数量相同，均为: {line_stats[0]}")
    else:
        print("警告: 不同行的数值数量不一致")
        # 打印不同数值数量的行数
        from collections import Counter
        count = Counter(line_stats)
        for num, cnt in count.items():
            print(f"  {num}个数值的行数: {cnt}")

# 使用示例
count_data_lines_and_values("/root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts")

文件: /root/autodl-tmp/Time-Series-Library/dataset/Heartbeat/Heartbeat_TRAIN.ts
数据总行数: 221
每行数值数量统计:
  最小: 1
  最大: 405
  平均: 374.0
警告: 不同行的数值数量不一致
  1个数值的行数: 13
  3个数值的行数: 2
  4个数值的行数: 1
  2个数值的行数: 1
  405个数值的行数: 204


In [34]:
import os

def inspect_ts_structure(filepath):
    """暴力解析TS文件真实结构"""
    if not os.path.exists(filepath):
        print(f"❌ 文件不存在: {filepath}")
        return

    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    print(f"\n=== 文件: {filepath} ===")
    
    # 提取文件头
    header = {}
    data_start = 0
    for i, line in enumerate(lines):
        if line.startswith('@'):
            key = line[1:].split()[0]
            header[key] = line[1:].split(maxsplit=1)[1] if ' ' in line[1:] else ""
            data_start = i + 1
        else:
            break
    
    print("\n=== 文件头声明 ===")
    for k, v in header.items():
        print(f"{k}: {v}")

    # 分析前3个数据行
    print("\n=== 实际数据格式 ===")
    for i in range(data_start, min(data_start+27, len(lines))):
        line = lines[i]
        print(f"\n行 {i+1}:")
        
        # 基础分割
        parts = line.split(':')
        label = parts[-1].strip() if len(parts) > 1 else "无标签"
        data_part = ':'.join(parts[:-1]) if len(parts) > 1 else line
        
        # 通道结构检测
        if '(' in data_part and ')' in data_part:
            print("🔍 检测到多通道格式（括号分隔）")
            channels = data_part.split('),(')
            channels = [ch.strip('()') for ch in channels]
            print(f"  通道数: {len(channels)}")
            print(f"  第1通道前3值: {channels[0].split(',')[:27]}...")
        else:
            print("🔍 检测到单通道格式（纯逗号分隔）")
            values = data_part.split(',')
            print(f"  总数值数: {len(values)}")
            print(f"  前3值: {values[:27]}...")
        print(f"  标签: '{label}'")

# 检查你的文件
inspect_ts_structure("/root/autodl-tmp/Time-Series-Library/dataset/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts")


=== 文件: /root/autodl-tmp/Time-Series-Library/dataset/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts ===

=== 文件头声明 ===

=== 实际数据格式 ===

行 1:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 3
  前3值: ['#A set of eight simple gestures generated from accelerometers. The data consists of the X', 'Y', 'Z coordinates of each motion. Each series is 315 long. We have']...
  标签: '无标签'

行 2:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['#First described in [1].']...
  标签: '无标签'

行 3:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['#']...
  标签: '无标签'

行 4:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 5
  前3值: ['#J. Liu', ' Z. Wang', ' L. Zhong', ' J. Wickramasuriya and V. Vasudevan', ' "uWave']...
  标签: 'Accelerometer-based personalized gesture recognition and its applications,"'

行 5:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 5
  前3值: ['#2009 IEEE International Conference on Pervasive Computing and Communications', ' Galveston', ' TX', ' 2009', ' pp. 1-9.']...
  标签: '无标签'

行 6:
🔍 检测到单通道格式（纯逗号分隔）
  总数值数: 1
  前3值: ['@problemName UWaveGestureLibrary']...
  标签: '无标签'

行 7:

In [35]:
import os

def count_data_lines_and_values(filepath):
    """直接统计数据行数和每行数值数量"""
    if not os.path.exists(filepath):
        print(f"文件不存在: {filepath}")
        return

    with open(filepath, 'r') as f:
        lines = [line.strip() for line in f if line.strip()]

    # 找到数据开始行（跳过以@开头的文件头）
    data_lines = []
    for line in lines:
        if not line.startswith('@'):
            data_lines.append(line)

    # 统计每行数值数量
    line_stats = []
    for line in data_lines:
        # 分割数值和标签（如果有标签）
        if ':' in line:
            values_part = line.split(':')[0]
        else:
            values_part = line
        # 统计数值数量
        num_values = len(values_part.split(','))
        line_stats.append(num_values)

    # 输出结果
    print(f"文件: {filepath}")
    print(f"数据总行数: {len(data_lines)}")
    print(f"每行数值数量统计:")
    print(f"  最小: {min(line_stats)}")
    print(f"  最大: {max(line_stats)}")
    print(f"  平均: {sum(line_stats)/len(line_stats):.1f}")
    
    # 检查是否所有行数值数量相同
    if len(set(line_stats)) == 1:
        print(f"所有行数值数量相同，均为: {line_stats[0]}")
    else:
        print("警告: 不同行的数值数量不一致")
        # 打印不同数值数量的行数
        from collections import Counter
        count = Counter(line_stats)
        for num, cnt in count.items():
            print(f"  {num}个数值的行数: {cnt}")

# 使用示例
count_data_lines_and_values("/root/autodl-tmp/Time-Series-Library/dataset/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts")

文件: /root/autodl-tmp/Time-Series-Library/dataset/UWaveGestureLibrary/UWaveGestureLibrary_TRAIN.ts
数据总行数: 125
每行数值数量统计:
  最小: 1
  最大: 315
  平均: 302.5
警告: 不同行的数值数量不一致
  3个数值的行数: 1
  1个数值的行数: 2
  5个数值的行数: 2
  315个数值的行数: 120


结论：！！！ 实际存储结构：通道1值1,通道1值2,...,通道1值315,通道2值1,...,通道2值315,通道3值1,...,通道3值315:标签

In [None]:
def load_single(self, filepath):
    df, labels = load_from_tsfile_to_dataframe(filepath, return_separate_X_and_y=True)
    # df的shape为 (样本数, 通道数)
    # 每个单元格是一个pd.Series包含该通道的时间序列

In [None]:
lengths = df.applymap(lambda x: len(x)).values  # 每个通道的长度
horiz_diffs = np.abs(lengths - lengths[:, [0]])  # 检查同一样本各通道长度是否一致
if np.sum(horiz_diffs) > 0:
    df = df.applymap(subsample)  # 如果不一致则降采样

In [None]:
class DataEmbedding(nn.Module):
    def __init__(self, c_in, seq_len, d_model, dropout=0.1):
        # c_in: 输入通道数（来自@dimensions）
        # seq_len: 时间步长（来自@seriesLength）
        self.value_embedding = TokenEmbedding(c_in, d_model)
        self.position_embedding = PositionalEmbedding(d_model, seq_len)
        
    def forward(self, x):
        # x形状: [batch_size, seq_len, c_in]
        x = self.value_embedding(x) + self.position_embedding(x)
        return x  # 输出形状: [batch_size, seq_len, d_model]