In [None]:
# pfrl is installed from the voidful fork on GitHub; no branch, tag or commit is
# given, so the installed code can differ between runs.
%pip install pfrl@git+https://github.com/voidful/pfrl.git
# textrl is pinned to an exact release.
%pip install textrl==0.2.15
# ipywidgets is installed without a version pin.
%pip install ipywidgets

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# The tokenizer and the seq2seq model are loaded from the same pretrained
# checkpoint id, so the id is spelled out once.
AR_CHECKPOINT = "lca0503/speech-chatgpt-base-ar-v2-epoch10-wotrans"

tokenizer = AutoTokenizer.from_pretrained(AR_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(AR_CHECKPOINT)

# Put the model in eval mode and move it to the GPU. Both calls return the
# model itself, so the cell still ends by displaying the model.
model.eval().cuda()

In [None]:
# Third-party RL pieces: pfrl, plus the TextRL environment and actor classes.
import pfrl
from textrl import TextRLEnv,TextRLActor
# NOTE(review): AutoTokenizer is already imported in the model-loading cell, and the
# other transformers names are not referenced in the visible cells — confirm before pruning.
from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer, AutoModelWithLMHead
import logging
import sys
import torch
# Route INFO-and-above log records to stdout so they appear in the cell output.
# NOTE(review): format='' passes an empty format string — presumably meant to print
# the bare message only; confirm this is the intended log layout.
logging.basicConfig(level=logging.INFO, stream=sys.stdout, format='')
# NISQA model class used by the reward function below.
# NOTE(review): presumably imported from a local NISQA checkout that must be on the
# import path — confirm where the NISQA package lives.
from NISQA.nisqa.NISQA_model import nisqaModel

In [None]:
# Print the loaded model and tokenizer, each prefixed with its label.
for label, loaded in (("model", model), ("tokenizer", tokenizer)):
    print(label, loaded)

In [None]:
import os

# Root folder that the notebook's NISQA weights/wav/result folders, the
# `data-encodec` dataset and the `output` folder are resolved against.
# Set the TEXTRL_BASE_PATH environment variable to run on another machine;
# when it is unset or empty the original location is used.
base_path = os.environ.get('TEXTRL_BASE_PATH') or '/work/b0990106x/TextRL'

**Environment: NISQA**

In [None]:
class MyRLEnv(TextRLEnv):
    """TextRL environment whose reward is a NISQA prediction for a speech file.

    The reward is computed by running `nisqaModel` in 'predict_file' mode on a
    wav file and reading the `mos_pred` value of the first prediction row.
    """

    def __init__(self, model, tokenizer, device="cuda", observation_input=[], max_length=100, compare_sample=2):
        """Initialise the parent environment and keep references on the instance.

        Args:
            model: Model handed to `TextRLEnv` and stored as `self.model`.
            tokenizer: Tokenizer handed to `TextRLEnv` and stored as `self.tokenizer`.
            device: Device string turned into a `torch.device` and stored as
                `self.device`.
            observation_input: Observations handed to `TextRLEnv`. The default
                list object is shared between calls; this class never mutates it.
            max_length: Forwarded unchanged to `TextRLEnv`.
            compare_sample: Forwarded unchanged to `TextRLEnv`.
        """
        super().__init__(model, tokenizer, observation_input, max_length, compare_sample)
        self.device = torch.device(device)
        self.model = model
        self.tokenizer = tokenizer
        self.observation_input = observation_input

    def get_reward(self, input_path=None, output_dir=None):
        """Score one speech file with NISQA and return the prediction as an int.

        NOTE(review): this signature takes file paths rather than generated
        tokens; verify it matches how the TextRL training loop calls
        `get_reward` before using this environment for training.

        Args:
            input_path: Path of the wav file to score. Defaults to
                `{base_path}/NISQA/wav/test.wav`.
            output_dir: Folder passed to NISQA as its results folder. Defaults
                to `{base_path}/NISQA/result`.

        Returns:
            `mos_pred` of the first prediction row, truncated with `int()`.
            NOTE(review): truncation drops the fractional part of the score —
            confirm an integer reward is intended.
        """
        nisqa_args = {
            'mode': 'predict_file',  # For example, 'predict_file', 'predict_dir', or 'predict_csv'
            'pretrained_model': f'{base_path}/NISQA/weights/nisqa.tar',  # Name of the pretrained model file
            'deg': f'{base_path}/NISQA/wav/test.wav',  # Path to the speech file if mode is predict_file
            'data_dir': None,  # Folder with speech files if mode is predict_dir
            'output_dir': f'{base_path}/NISQA/result',  # Folder to output results.csv
            'csv_file': None,  # Name of the csv file if mode is predict_csv
            'csv_deg': None,  # Column name in csv with file names/paths if mode is predict_csv
            'num_workers': 0,  # Number of workers for PyTorch's DataLoader
            'bs': 1,  # Batch size for predicting
            'ms_channel': None,  # Audio channel in case of a stereo file, if needed
        }

        # Caller-supplied paths override the defaults. Previously `output_dir`
        # was accepted but never applied.
        if input_path is not None:
            nisqa_args['deg'] = input_path
        if output_dir is not None:
            nisqa_args['output_dir'] = output_dir

        # Mirror the prediction batch size / worker count under the `tr_*` keys.
        nisqa_args['tr_bs_val'] = nisqa_args['bs']
        nisqa_args['tr_num_workers'] = nisqa_args['num_workers']

        # A fresh nisqaModel is built on every call.
        nisqa = nisqaModel(nisqa_args)
        prediction = nisqa.predict()
        return int(prediction['mos_pred'].iloc[0])


In [None]:
# Smoke test: build the environment with one sample observation and print the
# reward for the bundled test wav.
observation_list = [{'input': 'Hello, how are you?'}]
input_path = f'{base_path}/NISQA/wav/test.wav'
output_dir = f'{base_path}/NISQA/result'

env = MyRLEnv(
    model,
    tokenizer,
    observation_input=observation_list,
    max_length=100,
    compare_sample=2,
)
# Pass the paths defined above explicitly instead of leaving them unused and
# relying on get_reward's built-in defaults.
print(env.get_reward(input_path=input_path, output_dir=output_dir))

In [None]:
# %pip install datasets
# from datasets import load_from_disk ,load_dataset

# dataset = load_dataset("lca0503/soxdata_encodec")
# dataset.save_to_disk("data")

# dataset = load_dataset("lca0503/soxdata_encodec", split="+".join(["train"]))
# dataset = dataset.filter(lambda x : len(x[f"src_encodec_0"]) <= 700)
# dataset = dataset.shuffle(0).select(range(1))

**Agent: Text-Instruction-Guided Voice Conversion Model**

In [None]:
# from datasets import load_from_disk ,load_dataset

# # dataset.save_to_disk("data-encodec")
# dataset = load_from_disk("data-encodec")

In [None]:
import random
import numpy as np
import soundfile as sf
import torch
from datasets import load_dataset, load_from_disk
from encodec import EncodecModel
from vc.encodec_model.nar_bart_model import NARBartForConditionalGeneration
from argparse import ArgumentParser, Namespace
from transformers import (AutoTokenizer, BartForConditionalGeneration,
                          BatchEncoding)
import vc.trainer_encodec_vc_inference as vc_inference
from types import SimpleNamespace

# Settings handed to the voice-conversion inference entry point. They are
# written as a plain mapping and exposed through attribute access.
args = SimpleNamespace(**{
    # Dataset selection
    "dataset": "lca0503/soxdata_small_encodec",
    "splits": ["train"],
    # Mode switches
    "ground_truth_only": False,
    "cascade_ar_nar": True,
    "nar_model_only": False,
    # Model identifiers
    "ground_truth_model_name": "voidful/bart-base-unit",
    "ar_checkpoint": "lca0503/speech-chatgpt-base-ar-v2-epoch10-wotrans",
    "nar_checkpoint": "lca0503/speech-chatgpt-base-nar-v2-epoch4-wotrans",
    # Output wav locations
    "ground_truth_output_path": "output_wav/vc/ground_truth/train_1.wav",
    "cascade_output_path": "output_wav/vc/ar_nar_cascade/train_1.wav",
    "nar_output_path": "output_wav/vc/nar/train_1.wav",
    # Runtime
    "seed": 0,
    "device": "cuda",
})

# NOTE: `input_path` and `output_dir` rebind the names already used by the
# reward smoke-test cell; from here on they hold the values below.
# Presumably the dataset previously saved to disk as "data-encodec" — confirm.
input_path = f'{base_path}/data-encodec'
# NOTE(review): despite its name, `output_dir` holds a .wav file path, not a folder.
output_dir = f'{base_path}/output/test.wav'
# Call the inference entry point with the settings namespace and the two paths.
# NOTE(review): what `run` reads, writes and returns is not visible here — check
# vc/trainer_encodec_vc_inference.py before relying on the value bound to `vc`.
vc = vc_inference.run(args, input_path, output_dir)

