In [None]:
import os
import numpy as np
import librosa
import pandas as pd

# Manually annotated chorus intervals, one CSV row per (file, interval).
df_labels = pd.read_csv("chorus_labels.csv")

# Maps filename -> [(start_sec, end_sec), ...], in CSV row order.
label_dict = {}

# Walk the three columns in lockstep rather than using DataFrame.iterrows(),
# which builds a Series for every row and may coerce values to a common dtype.
for fname, start_sec, end_sec in zip(
    df_labels["filename"], df_labels["start_sec"], df_labels["end_sec"]
):
    label_dict.setdefault(fname, []).append((float(start_sec), float(end_sec)))

# Raw audio files that are iterated over next.
AUDIO_FOLDER = "../data/raw/"
# os.listdir() returns entries in arbitrary order; sort them so the rows of
# the resulting dataset come out in the same order on every run and machine.
all_files = sorted(f for f in os.listdir(AUDIO_FOLDER) if f.endswith(".mp3"))

X_list = []  # one feature vector per frame
y_list = []  # matching 0/1 chorus label per frame

# Frame length in seconds; a full frame covers frame_size * sr samples.
frame_size = 2.0

for afile in all_files:
    # Named `audio` (not `y`) so it cannot be confused with the label array
    # `y` that is built from y_list further down.
    audio, sr = librosa.load(os.path.join(AUDIO_FOLDER, afile), sr=None)
    n_samples = len(audio)
    # Files without annotations simply yield no chorus frames.
    chorus_intervals = label_dict.get(afile, ())

    i = 0
    start_sample = 0
    while start_sample < n_samples:
        end_sample = int((i + 1) * frame_size * sr)
        # Slicing past the end is fine: the last frame is just shorter.
        segment = audio[start_sample:end_sample]

        # Feature vector for the frame: MFCCs averaged over time, which has
        # the same length for every frame regardless of the frame duration.
        # NOTE(review): uses librosa's default MFCC settings -- tune as needed.
        feats = librosa.feature.mfcc(y=segment, sr=sr).mean(axis=1)

        # Frame bounds in seconds, derived from the actual sample indices and
        # clamped to the end of the audio so that a short trailing frame is
        # judged against its real duration rather than a full frame_size.
        frame_start_sec = start_sample / sr
        frame_end_sec = min(end_sample, n_samples) / sr
        min_overlap = 0.5 * (frame_end_sec - frame_start_sec)

        # The frame counts as chorus when more than half of it lies inside
        # any single annotated (start, end) interval.
        is_chorus = int(any(
            min(frame_end_sec, e) - max(frame_start_sec, s) > min_overlap
            for s, e in chorus_intervals
        ))

        X_list.append(feats)
        y_list.append(is_chorus)

        i += 1
        # The next frame starts exactly where this one ended.
        start_sample = end_sample

# Stack the collected per-frame features and labels into the training arrays.
X = np.array(X_list)
y = np.array(y_list)

# Report the class balance: the sum of the labels is the chorus-frame count,
# and the remainder of the frames are non-chorus.
n_chorus = int(y.sum())
n_other = len(y) - n_chorus
print("副歌帧:", n_chorus, " 非副歌帧:", n_other)
# Save to .npy, or use the arrays directly for training afterwards.