In [9]:
import json
audio_list = []
with open("merged.jsonl", 'r', encoding='utf-8') as jsonl_file:
    # 2. 逐行读取文件内容
    for line in jsonl_file:
        # 3. 解析每一行的 JSON 数据
        try:
            data = json.loads(line.strip())
            audio_list.append(data["audio_path"])
            # 4. 对解析后的数据进行操作
            # 在这里可以对 data 进行你需要的操作，例如打印或保存到其他文件
        except json.JSONDecodeError as e:
            # 处理解析错误，例如跳过或记录错误
            print(f"JSON解析错误：{e}")
print(audio_list)

['要到几岁才开始不相信圣诞老人的存在00:00:02.270000.wav', '这种无聊到家的话题对我来说根本不痛不痒的00:00:05.580000.wav', '不过00:00:08.830000.wav', '如果说到我是几岁开始不相信圣诞老人00:00:09.530000.wav', '那个只存在于幻想世界的穿着红衣服的老公公的存在00:00:11.900000.wav', '我能很确定地说00:00:14.620000.wav', '我根本打从一开始就不相信00:00:16.220000.wav', '我知道幼儿园圣诞节庆祝会时出现的圣诞老人是假的00:00:19.870000.wav', '即使没有撞见老妈正在亲吻圣诞老人00:00:24.450000.wav', '还有邪恶组织以及和他们战斗的动画特摄漫画英雄00:00:37.410000.wav', '我发现他们根本不存在于这个世界上的时候已经很晚了00:00:41.520000.wav', '只不过一直不想承认而已00:00:48.430000.wav', '因为我的内心深处00:00:50.650000.wav', '还有邪恶组织能够出现在眼前的00:00:55.150000.wav', '于是我开始常常惊叹世界的物理法则是多么正确00:01:01.900000.wav', '宇宙人？未来人？超能力者？00:01:09.520000.wav', '这种东西怎么可能存在嘛00:01:11.740000.wav', '不过还是希望他们存在啊…00:01:13.010000.wav', '我就像是参考着真实与幻想之间的最大公约数般渐渐地长大了00:01:15.020000.wav', '逐渐习惯了这个世界的平凡00:01:27.880000.wav', '我就这样没什么感慨地成为了高中生…00:01:30.390000.wav', '——然后遇到了那家伙00:01:33.070000.wav', '我对普通的人类没有兴趣00:01:41.540000.wav', '就尽管来找我吧！00:01:49.310000.wav', '说完了00:01:51.380000.wav', '刚刚那是噱头吗？00:01:54.700000.wav', '她还真是个不折不扣的美女啊00:01:58.470000.wa

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!git clone https://github.com/BlairLeng/VoiceForge.git

Cloning into 'VoiceForge'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 61 (delta 15), reused 51 (delta 9), pack-reused 0[K
Receiving objects: 100% (61/61), 11.96 MiB | 12.58 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [5]:
import os
import numpy as np
import torch
import pickle


import requests
from VoiceForge.audio_feature_ext.modules.ecapa_tdnn import EcapaTdnn, SpeakerIdetification
from VoiceForge.audio_feature_ext.data_utils.reader import load_audio, CustomDataset

class AudioFeatureExtraction:
    def __init__(self,model_director='./audio_feature_ext/models', feature_method='melspectrogram' ):
        self.use_model = ''
        self.model_director = model_director
        self.feature_method = feature_method
        self.model = None
        self.device = None
        self.load_model()

    def init_models(self,path):
        model_urls = ['https://huggingface.co/scixing/voicemodel/resolve/main/model.pth',
                      'https://huggingface.co/scixing/voicemodel/resolve/main/model.state',
                      'https://huggingface.co/scixing/voicemodel/resolve/main/optimizer.pth']
        listdir = os.listdir(path)
        for url in model_urls:
            filename = url.split('/')[-1]
            if filename in listdir:
                continue
            r = requests.get(url, allow_redirects=True)
            print(f'downloading model pth {filename}')
            open(f'{path}/{filename}', 'wb').write(r.content)
            print(f'{filename} success download')

    def load_model(self):
        dataset = CustomDataset(data_list_path=None, feature_method=self.feature_method)
        ecapa_tdnn = EcapaTdnn(input_size=dataset.input_size)
        self.model = SpeakerIdetification(backbone=ecapa_tdnn)
        self.device = torch.device("cuda")
        self.model.to(self.device)

        if not os.path.exists(self.model_director):
            os.makedirs(self.model_director)
        model_files = ['model.pth', 'model.state', 'optimizer.pth']
        for file in model_files:
            if not os.path.exists(f'{self.model_director}/{file}'):
                self.init_models(self.model_director)

        # 加载模型
        model_path = os.path.join(self.model_director, 'model.pth')
        model_dict = self.model.state_dict()
        param_state_dict = torch.load(model_path)
        for name, weight in model_dict.items():
            if name in param_state_dict.keys():
                if list(weight.shape) != list(param_state_dict[name].shape):
                    param_state_dict.pop(name, None)
        self.model.load_state_dict(param_state_dict, strict=False)
        print(f"成功加载模型参数和优化方法参数：{model_path}")
        self.model.eval()

    def infer(self, audio_path, duration):
        data = load_audio(audio_path, mode='infer', feature_method=self.feature_method,
                          chunk_duration=duration)
        data = data[np.newaxis, :]
        data = torch.tensor(data, dtype=torch.float32, device=self.device)
        feature = self.model.backbone(data)
        return feature.data.cpu().numpy()

In [6]:
AFE = AudioFeatureExtraction()

downloading model pth model.pth
model.pth success download
downloading model pth model.state
model.state success download
downloading model pth optimizer.pth
optimizer.pth success download
成功加载模型参数和优化方法参数：./audio_feature_ext/models/model.pth


In [7]:
import wave
import contextlib

In [None]:
import json

# 要写入的内容
data = [
    {"name": "John", "age": 30},
    {"name": "Alice", "age": 25},
    {"name": "Bob", "age": 35}
]

# 指定要写入的 JSONL 文件名
file_name = "data.jsonl"

# 打开文件以进行写入
with open(file_name, "w") as jsonl_file:
    # 逐行写入内容
    for item in data:
        # 将字典转换为 JSON 字符串，并添加换行符
        json_str = json.dumps(item)
        jsonl_file.write(json_str + "\n")

print(f"已将内容写入文件 {file_name}")

In [17]:
import os
import fnmatch

# 定义文件夹路径
folder_path = '/content/drive/MyDrive/GPTData/Haruhi_audio'

audio_feature_list = []
index_list = []
# 使用os模块列出文件夹中所有的文件和子文件夹
for index, audio_path in enumerate(audio_list):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            # 使用fnmatch模块匹配以.wav结尾的文件
            if fnmatch.fnmatch(file, audio_path):
                # 打印或处理符合条件的文件
                joint_path = os.path.join(root, file)
                with contextlib.closing(wave.open(joint_path, 'rb')) as f:
                    frames = f.getnframes()
                    rate = f.getframerate()
                    duration = frames / float(rate)
                audio_feature_list.append(AFE.infer(joint_path, duration)[0])
                index_list.append(index)





In [18]:
data_total = []
with open("merged.jsonl", 'r') as input_file:
    # 逐行处理JSONL文件
    for line in input_file:
        # 加载JSON数据
        data = json.loads(line.strip())
        data_total.append(data)

print("处理完成，已将新项添加到JSONL文件中。")

处理完成，已将新项添加到JSONL文件中。


In [19]:
print(len(audio_feature_list))

1144


In [20]:
print(len(data_total))

1144


In [24]:
with open("feature.jsonl", 'w') as output_file:
    for index, value in enumerate(audio_feature_list):
        data_total[index]["audio_feature"] = audio_feature_list[index].tolist()
        updated_json_str = json.dumps(data_total[index],ensure_ascii=False)
    # 写入修改后的JSON字符串到输出文件
        output_file.write(updated_json_str + '\n')

In [25]:
import json
from torch.utils.data import Dataset, DataLoader

In [36]:
class CustomDataset(Dataset):
    def __init__(self, file_path):
        self.data = []
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # 假设每行都是一个JSON对象
                item = json.loads(line)
                self.data.append(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        # 返回单个数据项，根据需要进行预处理
        role = sample["role"]
        text = sample["text"]
        audio_feature = sample["audio_feature"]
        position_feature = sample["position"]
        return {
            "role": role,
            "text": text,
            "audio_feature": torch.tensor(audio_feature),
            "position_feature": position_feature
        }

In [37]:
dataset = CustomDataset('/content/feature.jsonl')

In [56]:
from torch.utils.data import DataLoader, random_split

# 定义数据集大小
dataset_size = len(dataset)

# 定义训练集、验证集和测试集的大小比例（例如，60%训练，20%验证，20%测试）
train_size = int(0.85 * dataset_size)
test_size = int(0.1 * dataset_size)
val_size = dataset_size - train_size - test_size

# 随机切分数据集为训练集和临时集（包括验证集和测试集）
train_dataset, temp_dataset = random_split(dataset, [train_size, dataset_size - train_size])

# 再次随机切分临时集为验证集和测试集
val_dataset, test_dataset = random_split(temp_dataset, [val_size, test_size])

# 创建训练集、验证集和测试集的 DataLoader
batch_size = 64  # 设置每个批次的大小
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [57]:
len(train_loader)

16

In [58]:
len(test_loader)

2

In [59]:
len(val_loader)

1

In [60]:
for batch in val_loader:
    print(batch)

{'role': ['长门', '阿虚', '阿虚', '古泉', '阿虚', '长门', '长门', '阿虚', '阿虚', '春日', '阿虚', '新川先生', '春日', '阿虚', '朝比奈', '春日', '长门', '古泉', '古泉', '阿虚', '阿虚', '阿虚', '春日', '阿虚', '长门', '阿虚', '阿虚', '春日', '春日', '阿虚', '古泉', '阿虚妹妹', '阿虚', '朝比奈', '阿虚', '朝比奈', '阿虚', '古泉', '阿虚', '长门', '阿虚', '春日', '春日', '新川先生', '春日', '阿虚', '长门', '长门', '春日', '阿虚', '春日', '朝比奈', '新川先生', '古泉', '朝比奈', '阿虚', '阿虚', '春日'], 'text': ['在某弓形群岛的一个地区喷出的信息爆炸', '如果凉宫春日闭上嘴坐在位子上的时候', '但是很奇怪啊', '但合宿不是也没什么不好吗', '我只是随便问她一些问题而已啊', '我都是这样度过的', '因为它们没有语言', '我说你也给我一起来阻止她啊！', '而且还有可能把你赶出去哦', '这次一定要找到什么！', '犹豫了很长时间都没有说出一句话的朝比奈学姐', '这个很难得知', '从现在起这件屋子就是我们的社团活动室了', '我这么说的原因线索之一', '也许你不会相信', '这可是要把我们SOS团的活动记录流传后世的珍贵资料！', '你曾对我说不论什么人来都不可以开门', '每个人都有自己娱乐的方式', '不过我们只是在以最值得畏惧的可能性为前提而行动着', '那么这里应该是文艺社吧？', '也许是地热的缘故', '…接不上话了', '我没绑架她啦', '留下这么一句就走了', '这三年里没有出现什么特别的不确定因素', '关于创立同好会的规定', '周三', '…我想是不会的', '这里是SOS团', '什么死刑啊', '我们输了', '清了～第三名！', '保留意见', '是？', '简称为SOS团', '因为你是被凉宫同学选中的人！', '这种东西怎么可能存在嘛', '这个情况正是所谓的「Closed Circle」', '但是我最惊讶的是春日会好好地回答我的问题', '我喜欢你', '恐怕会轻松很多吧'

In [None]:
class Cluster:
    def __init__(self):
        pass
        self.audio_feature_list = []
        self.position_feature_list = []

    def get_audio_argmax_m(self, m, batch):
        pass

    # def get


class ClusterSystem:
    def __init__(self, k, m, batch):
        pass
        self.k = k
        self.m = m
        self.roles = batch["role"]
        self.texts = batch["text"]
        self.audio_features = batch["audio_feature"]
        self.position_features = batch["position_feature"]

    def initialize_clusters(self):
        pass


    def choose_cluster(self):
        pass

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# 定义一个简单的2层MLP模型
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)  # 如果是多类分类，使用softmax作为激活函数

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.softmax(x)
        return x

# 定义模型的超参数
input_size = 784
hidden_size = 128
output_size = 5

# 创建模型实例
model = MLP(input_size, hidden_size, output_size)
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()  # 交叉熵损失函数通常用于分类问题
optimizer = optim.Adam(model.parameters(), lr=0.001)  # 使用Adam优化器