In [1]:
import librosa, librosa.display
import numpy as np
from pydub import AudioSegment
import my_utils
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

n_epochs = 3
# batch_size_train = 64
# batch_size_test = 1000
learning_rate = 0.01
# momentum = 0.5
log_interval = 10

n_fft = 1024
hop_length = 8
sr = 16000



In [2]:
n_fft = 1024
hop_length = 8
sr = 16000

voice1, sr_female = librosa.load("../recordings/voice1/arctic_a0001_female.wav", sr=16000) 
voice2, sr_male = librosa.load("../recordings/voice2/arctic_a0002_male.wav", sr=16000) 

# pad smaller array with zeros, so both audio files have the same length
voice1, voice2 = my_utils.make_wav_files_same_size(voice1, voice2)

# tried to reconstruct from spectrogram - np.abs(spectrogram) is used inside 'reconstruct' function
# we have to send the spectrogram itself and not np.abs(spectrogram) because :
# Z = np.abs(spectrogram) * np.exp(np.angle(spectrogram) * 1j)
spectrogram = librosa.stft(librosa.to_mono(voice1), window='hann', n_fft=n_fft, hop_length=hop_length)
voice1_rec = my_utils.reconstruct(spectrogram=spectrogram)
librosa.output.write_wav('../recordings/REC.wav', voice1_rec, sr = 16000)

In [3]:
def get_STFT_AF_frames(AF, PF_size):
    frames = []
    
    i = 0
    start = (int)(i * (PF_size / 2))
    stop = start + PF_size
    
    while start < len(AF) and stop <= len(AF):
        frame = AF[start:stop]
        frame = librosa.stft(librosa.to_mono(frame), window='hann', n_fft=n_fft, hop_length=hop_length)
        frame = np.abs(frame)
        
        tensor = []
        tensor.append(frame)
#         print("tensor", np.asarray(tensor).shape)
        
        frames.append(tensor)
        
        i += 1
        start = (int)(i * (PF_size / 2))
        stop = start + PF_size
    
    return np.asarray(frames)

In [4]:
def get_AF_from_mix(mix_frames, current_position, number_of_PF_per_AF, PF_length):
    AF = []
    if current_position < (number_of_PF_per_AF-1):
        # ex: get_AF_from_mix(mix, 2, 4, 320) -> trebuie AF din mix[0: 940]
        # start = 0
        # stop = (2+1) * 320 = 940 => adica am frame-urile {0, 1, 2}
        stop = (current_position + 1) * PF_length
        AF = mix[: stop]
    else:
        # ex: get_AF_from_mix(mix, 5, 4, 320) -> trebuie AF din mix[640: 1920]
        # start = (5-4) + 1  * 320 = 640
        # stop = (5+1) * 320 = 1920 => adica am frame-urile {2, 3, 4, 5}
        start = (current_position - number_of_PF_per_AF + 1) * PF_length
        stop = (current_position + 1) * PF_length
        AF = mix[start : stop]
        
    return AF

In [5]:
def get_input_target_masks_for_voices(voice1, voice2):
    voice1_frames = np.array([female[i:i + samples_per_frame] for i in range(0, len(mix), samples_per_frame)])
    voice2_frames = np.array([male[i:i + samples_per_frame] for i in range(0, len(mix), samples_per_frame)])

    mask_frames = []
    n_fft = 1024
    hop_length = 8
    sr = 16000
    
    for i in range(0, voice1_frames.shape[0]):
        stft_1 = librosa.stft(librosa.to_mono(voice1_frames[i]), window='hann', n_fft=n_fft, hop_length=hop_length)
        stft_2 = librosa.stft(librosa.to_mono(voice2_frames[i]), window='hann', n_fft=n_fft, hop_length=hop_length)
        
        mask_frames.append(my_utils.compute_mask(stft_1, stft_2))
        
    return np.array(mask_frames)

In [6]:
def get_input_stft_frames_for_audio(mix):
    stft_frames = []
    n_fft = 1024
    hop_length = 8
    sr = 16000
    
    for i in range(0, mix.shape[0]):
        stft = librosa.stft(librosa.to_mono(mix[i]), window='hann', n_fft=n_fft, hop_length=hop_length)
        
        stft_frames.append(stft)
        
    return np.array(stft_frames)

In [7]:
male_filename = '../recordings/male1/arctic_a0002.wav'
female_filename = '../recordings/female1/arctic_a0001.wav'

male, sr = librosa.load(male_filename, sr=16000) 
female, sr = librosa.load(female_filename, sr=16000) 

# pad smaller array with zeros, so both audio files have the same length
female, male = my_utils.make_wav_files_same_size(female, male)

# load the mixed audio 
mix = female + male

male = np.array(male)
female = np.array(female)
mix = np.array(mix)

frame_length_ms = 20
mix_length_ms = len(mix) / sr * 1000
samples_per_frame = (int)(len(mix) * frame_length_ms / mix_length_ms)

print("mix total length: ",len(mix))
print("mix total length (ms) : ", mix_length_ms)
print("nb of samples for", frame_length_ms,"ms frame =", samples_per_frame, "samples/array")

mix_audio_frames = np.array([mix[i:i + samples_per_frame] for i in range(0, len(mix), samples_per_frame)])

mask_frames = get_input_target_masks_for_voices(voice1, voice2)
print("Mask frames shape: ", mask_frames.shape)
print("Mask -one- frame shape: ", mask_frames[0].shape)


mix total length:  80000
mix total length (ms) :  5000.0
nb of samples for 20 ms frame = 320 samples/array
Mask frames shape:  (250, 513, 41)
Mask -one- frame shape:  (513, 41)


In [8]:
import torch

# train set for only one audio
train_set_input = torch.from_numpy(mix_audio_frames)
# target contains the calculated masks for each PF from mix
train_set_target = torch.from_numpy(mask_frames)

# 250 ferestre a cate 320 valori fiecare / 250 PF-uri
print("Train set input shape: ", train_set_input.shape)
# targetul are aceeasi dim la shape[0] ca si inputul -> acelasi nr de frame-uri
print("Train set input shape: ", train_set_target.shape)

# asa ar trebui sa calculez un AF, luand cate 4 PF-uri, ultimul fiind cel curent
AF = get_AF_from_mix(mix_frames=mix, current_position= 3, number_of_PF_per_AF= 4, PF_length=samples_per_frame)

# AF = train_set_input[0, :]
# AF = np.append(AF, train_set_input[1, :])
# AF = np.append(AF, train_set_input[2, :])
# AF = np.append(AF, train_set_input[3, :])

print("AF: ", AF.shape)
assert(AF.shape == (1280,))


# AF_frames va intra ca "data" in CNN cu network(data)
PF_size = len(train_set_input[0, :])
AF_frames = get_STFT_AF_frames(AF, PF_size)

# create the shape for the tensor how the pytorch input data needs: entries, nr of channels, height, width
print("AF_frames shape after concatenating 4 PFs + overlap: ", AF_frames.shape)
assert(AF_frames.shape == (7, 1, 513, 41))


# train_set_input need to be bounded, so each PF from it needs to be associated with PF from voice1 + voice2
# keys will be train_set_input
# values will be train_set_target = each position of train_set_target will have a coresp mask
train_set = dict(zip(train_set_input, train_set_target))
print("train set PFs: ", len(train_set))

Train set input shape:  torch.Size([250, 320])
Train set input shape:  torch.Size([250, 513, 41])
AF:  (1280,)
AF_frames shape after concatenating 4 PFs + overlap:  (7, 1, 513, 41)
train set PFs:  250


In [16]:
class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        
        # layer 1
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5, stride=1, padding=2)
        self.conv1_batch = nn.BatchNorm2d(20)
        
        # layer 2
        self.conv2 = nn.Conv2d(20, 20, kernel_size=5, stride=1, padding=2)
        self.conv2_batch = nn.BatchNorm2d(20)
        
        # layer 3
        self.conv3 = nn.Conv2d(20, 1, kernel_size=5, stride=1, padding=2)
        self.conv3_batch = nn.BatchNorm2d(1)

    def forward(self, x):
        
        # layer 1
        x = self.conv1(x)
#         x = F.max_pool2d(x, 2)
        x = nn.functional.sigmoid(x)
        x = self.conv1_batch(x)
        print("1: ", self.conv1..shape)
        
        # layer 2
        x = self.conv2(x)
#         x = F.max_pool2d(x, 2)
        x = nn.functional.sigmoid(x)
        x = self.conv2_batch(x)
        print("2: ", x.shape)
        
        # layer 2
        x = self.conv3(x)
#         x = F.max_pool2d(x, 2)
        x = nn.functional.sigmoid(x)
        x = self.conv3_batch(x)
        print("3: ", x.shape)
        
        return x

In [17]:
# create network
network = Network()

# if cuda is avaibable, send network's params to gpu
if torch.cuda.is_available():
    network.cuda()
    
# set optimizer -> article : Adam, lr = 0.001, b1 = 0.9, b2 = 0.999
optimizer = optim.Adam(network.parameters(), lr = 0.001, betas = (0.9, 0.999))
loss_function = nn.MSELoss()

In [18]:
# init arrays for train/test errors
train_losses = []
train_counter = []

test_losses = []
test_counter = [i*len(train_set) for i in range(n_epochs)]

In [40]:
number_of_PF_per_AF = 4
samples_per_frame = 320 # pentru 20ms per frame

def train(epoch):
    mask = []
    network.train()

    mix_frames = [train_set.keys()]
#     print(type(mix_frames))
    
    # input is current PF => network will learn the mask for the current PF
    for index, (input, target) in enumerate(train_set.items()):
        
        # asa ar trebui sa calculez un AF, luand cate N PF-uri, ultimul fiind cel curent
        input_AF = get_AF_from_mix(mix_frames=mix_frames, current_position= index, number_of_PF_per_AF= number_of_PF_per_AF, 
                                 PF_length=samples_per_frame)
        
        # np.abs pe fiecare frame -> torch nu accepta numere complexe
        input_AF_STFT_frames = get_STFT_AF_frames(AF=input_AF, PF_size=len(input))
#         print("train_set_input:", input_AF_STFT_frames.shape)
        
        data = torch.from_numpy(input_AF_STFT_frames)

        # if cuda is available, send (data, target) to gpu
        if torch.cuda.is_available():
            data, target = data.cuda(), target.cuda()

        optimizer.zero_grad()

        output = network(data)
        print(output.shape, data.shape)
        print(torch.equal(output,data))
        print("train-output shape: ", output.shape, target.shape)
        
        loss = loss_function(output, target)
        print("Loss:",loss.item())

        loss.backward()
        
        # face update la weights
        optimizer.step()
        
#         if index == 0 :
#             for name, param in network.named_parameters():
#                 print(name, '\t\t', param.shape)
            
#         break
        mask.append(output[output.shape[0]-1][0].shape)

#         print("Epoch", epoch, "[", index * len(data), "/", len(data))
        if index % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, index, len(train_set),
            100. * index / len(train_set), loss.item()))
            train_losses.append(loss.item())
            train_counter.append(
            (index) + ((epoch-1)*len(train_set.items())))
#             torch.save(network.state_dict(), './results/model.pth')
#             torch.save(optimizer.state_dict(), './results/optimizer.pth')
    print(len(mask))
    return mask

In [41]:
import torch

# -------------------------------------------- LOAD AUDIO FILE AND CREATE MIX------------------------------
male_filename = '../recordings/male1/arctic_a0002.wav'
female_filename = '../recordings/female1/arctic_a0001.wav'

male, sr = librosa.load(male_filename, sr=16000) 
female, sr = librosa.load(female_filename, sr=16000) 

# pad smaller array with zeros, so both audio files have the same length
female, male = my_utils.make_wav_files_same_size(female, male)

# load the mixed audio 
mix = female + male

male = np.array(male)
female = np.array(female)
mix = np.array(mix)


# -------------------------------- CREATE TRAIN_SET_INPUT & TRAIN_SET_TARGET FOR ONE AUDIO FILE

# train set for only one audio
mix_audio_frames = np.array([mix[i:i + samples_per_frame] for i in range(0, len(mix), samples_per_frame)])
train_set_input = torch.from_numpy(mix_audio_frames)

# target contains the calculated masks for each PF from mix
mask_frames = get_input_target_masks_for_voices(voice1, voice2)
train_set_target = torch.from_numpy(mask_frames)

# 250 ferestre a cate 320 valori fiecare / 250 PF-uri
# print("Train set input shape: ", train_set_input.shape)

# targetul are aceeasi dim la shape[0] ca si inputul -> acelasi nr de frame-uri
# print("Train set target shape: ", train_set_target.shape)

# train_set_input need to be bounded, so each PF from it needs to be associated with PF from voice1 + voice2
# keys will be train_set_input
# values will be train_set_target = each position of train_set_target will have a coresp mask

train_set = dict(zip(train_set_input, train_set_target))
# print("train set PFs: ", len(train_set))

# ------------------------------------------- CALL NETWORK ---------------------------------------------
mask = train(0)
# for epoch in range(1, n_epochs + 1):
#     train(epoch)
#     test()



1:  torch.Size([1, 20, 513, 41])
2:  torch.Size([1, 20, 513, 41])
3:  torch.Size([1, 1, 513, 41])
torch.Size([1, 1, 513, 41]) torch.Size([1, 1, 513, 41])
False
train-output shape:  torch.Size([1, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.4610132575035095
1:  torch.Size([3, 20, 513, 41])
2:  torch.Size([3, 20, 513, 41])
3:  torch.Size([3, 1, 513, 41])
torch.Size([3, 1, 513, 41]) torch.Size([3, 1, 513, 41])
False
train-output shape:  torch.Size([3, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.45454150438308716
1:  torch.Size([5, 20, 513, 41])
2:  torch.Size([5, 20, 513, 41])
3:  torch.Size([5, 1, 513, 41])
torch.Size([5, 1, 513, 41]) torch.Size([5, 1, 513, 41])
False
train-output shape:  torch.Size([5, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5006790161132812
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])


1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.4463821053504944
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.4264862835407257
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5322068929672241
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
L

1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 1.129584550857544
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 1.1197940111160278
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 1.1096289157867432
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Lo

1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5883923172950745
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.6884921789169312
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.8011838793754578
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
L

1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.8262348175048828
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.8293415904045105
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.8351647853851318
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
L

1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5141059160232544
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5515415072441101
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.5447453260421753
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
L

1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.2981683313846588
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.2972499430179596
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.29632648825645447
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])


1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.2681380808353424
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.26721587777137756
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])
Loss: 0.2662948966026306
1:  torch.Size([7, 20, 513, 41])
2:  torch.Size([7, 20, 513, 41])
3:  torch.Size([7, 1, 513, 41])
torch.Size([7, 1, 513, 41]) torch.Size([7, 1, 513, 41])
False
train-output shape:  torch.Size([7, 1, 513, 41]) torch.Size([513, 41])


In [56]:
# print(mask)

l = np.array(0,)
new_mask = torch.tensor(l)
torch.cat(mask, out=new_mask)

print(new_mask.shape)

TypeError: expected Tensor as element 0 in argument 0, but got torch.Size