# Generowanie danych testowych

In [1]:
from src.prepare_datasets import prepare_2_mix_dataset

# Generate the test data into data/2mix with n=1000, sr=16000 and a fixed seed.
# NOTE(review): the positional arguments 5, 10, 4, 8 are not explained in this notebook —
# check prepare_2_mix_dataset's signature and consider passing them by keyword.
prepare_2_mix_dataset('data/2mix', 5, 10, 4, 8, n=1000, sr=16000, seed=42)

100%|██████████| 1000/1000 [00:09<00:00, 106.07it/s]


# Przetwarzanie przez modele

In [1]:
import librosa
import torch
import numpy as np
from scipy.io.wavfile import write
from tqdm import tqdm
import time
import os

from src.metrics import get_SI_SDR

In [4]:
def match_size(ref_wav, gen_wav):
    """Return ``gen_wav`` adjusted to the length of ``ref_wav``.

    A shorter ``gen_wav`` is zero-padded at the end, a longer one is truncated,
    and one of equal length is returned unchanged.
    """
    target_len = len(ref_wav)
    missing = target_len - len(gen_wav)
    if missing > 0:
        # Too short: append zeros up to the reference length.
        return np.pad(gen_wav, (0, missing), 'constant')
    if missing < 0:
        # Too long: keep only the leading part.
        return gen_wav[:target_len]
    return gen_wav

In [3]:
def _peak_normalize(wav):
    """Scale ``wav`` so its largest absolute sample is 1; silent signals are returned as-is."""
    peak = np.max(np.abs(wav))
    # A zero peak would make the division produce NaNs, which would then poison
    # the SI-SDR comparison and the written WAV file.
    if peak > 0:
        return wav / peak
    return wav


def split_with_function(model, split_function, n=1000, sr=16000, data_dir='data/2mix'):
    """Run a separation function over the mixtures and store the separated speakers.

    For every index ``i`` in ``range(n)`` the mixture ``{data_dir}/mixed/{i}.wav`` and
    both reference signals are loaded at ``sr``, ``split_function`` is timed on the
    mixture, and the two peak-normalised outputs are written to
    ``{data_dir}/{model}/speaker1/{i}.wav`` and ``{data_dir}/{model}/speaker2/{i}.wav``.
    The outputs are assigned to the speakers so that the summed SI-SDR against the
    references is the larger of the two possible assignments.

    Args:
        model: Name of the output sub-directory created under ``data_dir``.
        split_function: Callable taking the loaded mixture and returning two signals.
        n: Number of mixtures to process (files ``0.wav`` .. ``n-1.wav``).
        sr: Sampling rate passed to ``librosa.load`` and used for the written files.
        data_dir: Root directory of the dataset (defaults to the original hard-coded path).

    Returns:
        Mean wall-clock time in seconds of the ``split_function`` calls, or NaN when
        nothing was processed.
    """
    times = []
    os.makedirs(f'{data_dir}/{model}/speaker1', exist_ok=True)
    os.makedirs(f'{data_dir}/{model}/speaker2', exist_ok=True)
    for i in tqdm(range(n)):
        mixed = librosa.load(f'{data_dir}/mixed/{i}.wav', sr=sr)[0]
        ref1 = librosa.load(f'{data_dir}/speaker1/{i}.wav', sr=sr)[0]
        ref2 = librosa.load(f'{data_dir}/speaker2/{i}.wav', sr=sr)[0]

        # Only the separation call itself is timed; loading and scoring are excluded.
        start = time.perf_counter()
        speaker1, speaker2 = split_function(mixed)
        end = time.perf_counter()

        times.append(end - start)

        speaker1 = _peak_normalize(speaker1)
        speaker2 = _peak_normalize(speaker2)

        # Length-matched copies are used for scoring only; the full outputs are written.
        spk1 = match_size(ref1, speaker1)
        spk2 = match_size(ref2, speaker2)

        # Score both possible output -> reference assignments.
        sdr1_1 = get_SI_SDR(ref1, spk1)
        sdr1_2 = get_SI_SDR(ref2, spk2)
        sdr2_1 = get_SI_SDR(ref1, spk2)
        sdr2_2 = get_SI_SDR(ref2, spk1)
        if sdr1_1 + sdr1_2 < sdr2_1 + sdr2_2:
            # The swapped assignment scores higher, so exchange the outputs.
            speaker1, speaker2 = speaker2, speaker1
        write(f'{data_dir}/{model}/speaker1/{i}.wav', sr, speaker1)
        write(f'{data_dir}/{model}/speaker2/{i}.wav', sr, speaker2)

    # np.mean([]) warns and yields NaN; make the "nothing processed" case explicit.
    return np.mean(times) if times else float('nan')

### MossFormer2

In [7]:
import sys
# Add the model directory to the import path — presumably the MossFormer2 code
# imports its own modules relative to that directory (TODO confirm).
sys.path.append('src/mossformer2')

from src.mossformer2.mossformer2 import MossFormer2_adapted

# Instantiate the wrapper with the local checkpoint directory; this object is
# used by split_with_mossformer2.
mossformer_16 = MossFormer2_adapted('src/mossformer2/checkpoints')

  checkpoint = torch.load(


In [8]:
def split_with_mossformer2(input):
    """Separate one mixture with the global ``mossformer_16`` model.

    The mixture is wrapped into a one-element array before the call, and the
    two values returned by the model are passed back as a tuple.
    """
    batch = np.array([input])
    first, second = mossformer_16(batch)
    return first, second

In [9]:
# Separate the 1000 mixtures with MossFormer2 at 16 kHz; the outputs are written to
# data/2mix/mossformer2/speaker{1,2}/ by split_with_function.
mossformer2_time = split_with_function('mossformer2', split_with_mossformer2, n=1000, sr=16000)

100%|██████████| 1000/1000 [30:34<00:00,  1.83s/it]


In [10]:
# Mean wall-clock seconds per mixture spent in the separation call
# (the value returned by split_with_function).
print(f'Mossformer2 time: {mossformer2_time}')

Mossformer2 time: 1.809367655600101


### SepReformer

In [4]:
from src.sepreformer.sepreformer import SepReformer
# NOTE(review): no checkpoint path or device is passed here — presumably the wrapper
# handles weight loading and device placement internally; confirm.
sepreformer = SepReformer()
# Device on which split_with_sepreformer places its input tensor.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def split_with_sepreformer(input):
    """Separate one mixture with the global ``sepreformer`` model.

    The mixture is converted to a tensor on ``device``, the model output is
    converted to a NumPy array, and that array is unpacked into two signals.
    """
    mixture = torch.Tensor(input).to(device)
    separated = np.array(sepreformer(mixture))
    # Unpacking requires exactly two entries along the first axis.
    first, second = separated
    return first, second

In [6]:
# SepReformer is run with sr=8000: split_with_function loads the mixtures at 8 kHz and
# writes the outputs in data/2mix/sepreformer/ at 8 kHz as well.
sepreformer_time = split_with_function('sepreformer', split_with_sepreformer, n=1000, sr=8000)

100%|██████████| 1000/1000 [14:26<00:00,  1.15it/s]


In [7]:
# Mean wall-clock seconds per mixture spent in the separation call
# (the value returned by split_with_function).
print(f'SepReformer time: {sepreformer_time}')

SepReformer time: 0.8386903082000208


### Fast-GeCo

In [5]:
from src.fastgeco.fastgeco import GeCo

# Instantiate the Fast-GeCo wrapper with the local checkpoint directory; this object
# is used by split_with_fastgeco.
fastgeco = GeCo('src/fastgeco/Fast-GeCo/checkpoints')
# Device on which split_with_fastgeco places its input tensor.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  encoder_weights = torch.load(os.path.join(ckpt_path, 'encoder.ckpt'))
  masknet_weights = torch.load(os.path.join(ckpt_path, 'masknet.ckpt'))
  decoder_weights = torch.load(os.path.join(ckpt_path, 'decoder.ckpt'))
  if ismodule(module) and hasattr(module, '__file__'):
Lightning automatically upgraded your loaded checkpoint from v1.5.10 to v2.4.0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint c:\Studia\Master_Thesis\src\fastgeco\Fast-GeCo\checkpoints\fastgeco.ckpt`


In [6]:
def split_with_fastgeco(input):
    """Separate one mixture with the global ``fastgeco`` model.

    The mixture becomes a tensor with an added leading dimension on ``device``;
    the two model outputs are moved to the CPU and returned as NumPy arrays.
    """
    batch = torch.Tensor(input).unsqueeze(0).to(device)
    first, second = fastgeco(batch)
    first_np = np.array(first.cpu())
    second_np = np.array(second.cpu())
    return first_np, second_np

In [7]:
# Fast-GeCo is run with sr=8000: split_with_function loads the mixtures at 8 kHz and
# writes the outputs in data/2mix/fastgeco/ at 8 kHz as well.
fastgeco_time = split_with_function('fastgeco', split_with_fastgeco, n=1000, sr=8000)

100%|██████████| 1000/1000 [24:11<00:00,  1.45s/it]


In [8]:
# Mean wall-clock seconds per mixture spent in the separation call
# (the value returned by split_with_function).
print(f'Fast-GeCo time: {fastgeco_time}')

Fast-GeCo time: 1.4423784811999358


# Mierzenie metryk

In [1]:
from src.metrics import calculate_metrics

In [2]:
import pandas as pd
import torch

In [3]:
# Dataset size and evaluated models, defined once so every file list stays in sync.
N_SAMPLES = 1000
MODELS = ['mossformer2', 'sepreformer', 'fastgeco']

# Reference signals and mixtures, indexed by sample number.
ref_files1 = [f'data/2mix/speaker1/{i}.wav' for i in range(N_SAMPLES)]
ref_files2 = [f'data/2mix/speaker2/{i}.wav' for i in range(N_SAMPLES)]
mixed_files = [f'data/2mix/mixed/{i}.wav' for i in range(N_SAMPLES)]

# Separated outputs per model: {model name: [path for sample 0, 1, ...]}.
est_files1 = {model: [f'data/2mix/{model}/speaker1/{i}.wav' for i in range(N_SAMPLES)] for model in MODELS}
est_files2 = {model: [f'data/2mix/{model}/speaker2/{i}.wav' for i in range(N_SAMPLES)] for model in MODELS}

# Texts for both speakers, later passed to calculate_metrics as `texts`.
# NOTE(review): assumes row i of texts.csv belongs to file {i}.wav — confirm against
# prepare_2_mix_dataset.
texts_df = pd.read_csv('data/2mix/texts.csv')
texts1 = list(texts_df['text1'])
texts2 = list(texts_df['text2'])

In [5]:
import whisper

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Whisper "turbo" model, handed to calculate_metrics as the transcription model.
transcription_model = whisper.load_model("turbo", device=device)

  checkpoint = torch.load(fp, map_location=device)


In [13]:
# Speaker-1 metrics for every model, evaluated with sr=16000.
# NOTE(review): the sepreformer and fastgeco estimates were written at 8 kHz
# (split_with_function was called with sr=8000) — presumably calculate_metrics
# resamples the files to `sr` on load; confirm.
results_spk1 = calculate_metrics(ref_files1, est_files1, mixed_files,
                                 sr=16000, texts=texts1, transcription_model=transcription_model)

100%|██████████| 1000/1000 [00:01<00:00, 946.07it/s]
100%|██████████| 1000/1000 [1:09:10<00:00,  4.15s/it]


In [14]:
import json
import pandas as pd

# Save the speaker-1 / 16 kHz metrics to disk as JSON.
with open('results_spk1_16kHz.json', 'w') as json_file:
    json.dump(results_spk1, json_file, indent=4)

# Tabular view: one row per top-level key of the results dict.
results_df = pd.DataFrame.from_dict(results_spk1, orient='index')
results_df

Unnamed: 0,SI-SDR,MCD,SDR,SIM,PESQ,SDRi,WER,CER,SI-SDRi,STOI
mossformer2,18.029698,1.129666,17.921039,0.997427,2.9718,21.776397,0.0065,0.003352,23.363458,0.976391
sepreformer,-5.443266,7.884259,-3.655967,0.705213,1.204853,0.199391,0.50225,0.405541,-0.109506,0.663606
fastgeco,10.886481,2.773467,10.958125,0.969844,1.89367,14.813482,0.064208,0.036664,16.220241,0.937498


In [6]:
# Speaker-1 metrics for every model, evaluated with sr=8000.
# NOTE(review): the references and the mossformer2 estimates were produced at 16 kHz —
# presumably calculate_metrics resamples the files to `sr` on load; confirm.
results_spk1_8k = calculate_metrics(ref_files1, est_files1, mixed_files,
                                 sr=8000, texts=texts1, transcription_model=transcription_model)

100%|██████████| 1000/1000 [00:03<00:00, 279.61it/s]
100%|██████████| 1000/1000 [1:16:57<00:00,  4.62s/it]


In [7]:
import json
import pandas as pd

# Save the speaker-1 / 8 kHz metrics to disk as JSON.
with open('results_spk1_8kHz.json', 'w') as json_file:
    json.dump(results_spk1_8k, json_file, indent=4)

# Tabular view: one row per top-level key of the results dict.
results_df = pd.DataFrame.from_dict(results_spk1_8k, orient='index')
results_df

Unnamed: 0,CER,SIM,SI-SDR,MCD,SDRi,SDR,WER,SI-SDRi,STOI,PESQ
mossformer2,0.004648,0.996549,19.35869,1.129666,23.122032,19.194258,0.0095,24.766047,0.976239,3.263695
sepreformer,0.40618,0.704465,-5.20149,7.884259,0.194192,-3.733581,0.50325,0.205868,0.662982,1.269351
fastgeco,0.036664,0.969844,14.315315,2.773467,17.861085,13.933312,0.064208,19.722672,0.940168,2.434325


In [8]:
# Speaker-2 metrics for every model, evaluated with sr=16000.
# NOTE(review): the sepreformer and fastgeco estimates were written at 8 kHz
# (split_with_function was called with sr=8000) — presumably calculate_metrics
# resamples the files to `sr` on load; confirm.
results_spk2 = calculate_metrics(ref_files2, est_files2, mixed_files,
                                 sr=16000, texts=texts2, transcription_model=transcription_model)

100%|██████████| 1000/1000 [00:01<00:00, 980.61it/s]
100%|██████████| 1000/1000 [1:16:28<00:00,  4.59s/it]


In [9]:
import json
import pandas as pd

# Save the speaker-2 / 16 kHz metrics to disk as JSON.
with open('results_spk2_16kHz.json', 'w') as json_file:
    json.dump(results_spk2, json_file, indent=4)

# Tabular view: one row per top-level key of the results dict.
results_df = pd.DataFrame.from_dict(results_spk2, orient='index')
results_df

Unnamed: 0,CER,SIM,SI-SDR,MCD,SDRi,SDR,WER,SI-SDRi,STOI,PESQ
mossformer2,6.9e-05,0.99996,23.275609,0.92455,16.558932,22.74028,0.00025,17.948322,0.991391,3.710053
sepreformer,0.47092,0.715802,1.572594,8.008774,-8.097942,-1.916594,0.582562,-3.754693,0.716087,1.369325
fastgeco,0.002046,0.998683,14.232491,3.242107,7.651517,13.832865,0.004313,8.905204,0.967227,2.431882


In [10]:
# Speaker-2 metrics for every model, evaluated with sr=8000.
# NOTE(review): the references and the mossformer2 estimates were produced at 16 kHz —
# presumably calculate_metrics resamples the files to `sr` on load; confirm.
results_spk2_8k = calculate_metrics(ref_files2, est_files2, mixed_files,
                                 sr=8000, texts=texts2, transcription_model=transcription_model)

100%|██████████| 1000/1000 [00:02<00:00, 376.27it/s]
100%|██████████| 1000/1000 [1:16:37<00:00,  4.60s/it]


In [11]:
import json
import pandas as pd

# Save the speaker-2 / 8 kHz metrics to disk as JSON.
with open('results_spk2_8kHz.json', 'w') as json_file:
    json.dump(results_spk2_8k, json_file, indent=4)

# Tabular view: one row per top-level key of the results dict.
results_df = pd.DataFrame.from_dict(results_spk2_8k, orient='index')
results_df

Unnamed: 0,CER,SIM,SI-SDR,MCD,SDRi,SDR,WER,SI-SDRi,STOI,PESQ
mossformer2,1.1e-05,0.999994,24.749967,0.92455,17.752468,23.997347,6.3e-05,19.349058,0.990359,3.857428
sepreformer,0.470839,0.71617,1.892908,8.008774,-8.163492,-1.918614,0.5825,-3.508,0.719825,1.50333
fastgeco,0.002046,0.998683,19.52517,3.242107,11.789016,18.033894,0.004313,14.124261,0.975052,3.298651


In [6]:
import librosa

# Sanity check: try to load every mixture and report the files that cannot be read.
for file in mixed_files:
    try:
        data, sr = librosa.load(file, sr=16000, mono=True)
    except Exception as e:
        # Report the failure reason too; the path alone does not distinguish a
        # missing file from a corrupt one.
        print(f"{file}: {type(e).__name__}: {e}")

In [14]:
import librosa

# Sanity check: try to load every Fast-GeCo speaker-1 output and report the files
# that cannot be read.
for file in est_files1['fastgeco']:
    try:
        data, sr = librosa.load(file, sr=16000, mono=True)
    except Exception as e:
        # Report the failure reason too; the path alone does not distinguish a
        # missing file from a corrupt one.
        print(f"{file}: {type(e).__name__}: {e}")