In [1]:
#!pip install soundfile seaborn psutil pydub librosa  pydub  tqdm 

In [1]:
# from google.colab import drive # To Get an Access to Google Drive
import os
import pandas as pd
import numpy as np
from pydub import AudioSegment
from pydub.playback import play
import random
import librosa
import librosa.display
from tqdm import tqdm
import matplotlib.pyplot as plt
import IPython.display as ipd
import warnings
import psutil
import sys
import soundfile as sf

#!rm -r newClips mixAudio spectrogram mask temp vector csv # to reset the folders
warnings.filterwarnings("ignore", message="")



In [152]:
# ----- main -------------------------------------------------------------------------

# --> selcting path
!mkdir newClips mixAudio spectrogram mask temp vector csv
def path(folder):

    directory = {
        "clips": "clips/",
        "newClips": "newClips/",
        "mixAudio": "mixAudio/",
        "spectrogram": "spectrogram/",
        "mask": "mask/",
        "temp": "temp/",
        "vector":"vector/",
        "csv":"csv/"
    }

    return directory[folder]

# --> Print summary of Hyperparameters
def summary(num_of_samples, length, start_time, end_time, num_of_audios, num_combined_voices, num_lowest_length, num_highest_length):
    print(f"--> Number of samples : {num_of_samples}")
    print(f"--> Number of newClips: {num_of_audios}")
    print(f"--> Number of combined voices: {num_combined_voices}")
    print(f"--> Clip length : {length}")
    print(f"--> Clip Timing : [ {start_time / 1000} : {end_time / 1000} ] ")
    print(f"--> Voices Domain : [ {num_lowest_length / 1000} : {num_highest_length / 1000} ] ")

# --> Load Files
def load(path):
    var = np.load(path)
    return var

# --> Save DataFrames
def save_df(file, name):
    newData_path = path('csv') + name
    file.to_csv(f'{newData_path}.csv', index=False)  # Save as CSV file
    
# ----- Tools -------------------------------------------------------------------------

# --> check a varible size in mb
def tool_var_size(var):
    size_in_bytes = sys.getsizeof(var)
    size_in_mb = size_in_bytes / (1024 * 1024)
    print(f"The size is {size_in_mb:.2f} MB.")

# --> check RAM size
def tool_ram_size():    
    vm_stats = psutil.virtual_memory()
    available_str = psutil._common.bytes2human(vm_stats.available)
    
    print("Available memory: {}".format(available_str))
    
# --> Get ID number from a path
def tool_file_id(file_path):
    file_path = file_path.split('.')[0]
    id_number = file_path.split('_')[-1]
    return id_number
    
# ----- Audio -------------------------------------------------------------------------

# --> play [audio_path]
def audio_play_path(audio_path):
    audio = AudioSegment.from_file(audio_path)
    return audio

# --> play [spectrogram]
def audio_play_spec(spec, name='spec_to_audio', target_dbfs= -24):    
    audio = librosa.istft(spec)
  
    newData_path = os.path.join('temp', f'{name}.wav')

    sf.write(newData_path, audio, 22050) # to convert it to AudioSegment type
    audio = AudioSegment.from_file(newData_path)
    
    current_dbfs = audio.dBFS
    gain_needed = target_dbfs - current_dbfs
    audio = audio + gain_needed
    return audio

# ----- Spectogram -------------------------------------------------------------------------

# --> Create Spectrogram
def spec_creation(audio_path, sr=22050):
    y, sr = librosa.load(audio_path)
    spectrogram = librosa.stft(y)
    spectrogram = spec_normalization(spectrogram)
    return spectrogram

# --> Scale Factor Normalization 
def spec_normalization(spec):
    spec_max = np.max(spec)
    spec_min = np.min(spec)

    scale_factor = max(abs(spec_max), abs(spec_min))

    spec_norm = spec / scale_factor
    normalized_spec = np.clip(spec_norm, -1.0, 1.0)
    return(normalized_spec)

# --> visualization  
def spec_visualize(spec):
    librosa.display.specshow((spec), sr=22050, x_axis='time', y_axis='log')

# --> Applay mask on Spectrogram
def spec_applay_mask(spec_path, mask_path):
    
    spec = np.load(spec_path) if isinstance(spec_path, str) else spec_path
    mask = np.load(mask_path) if isinstance(mask_path, str) else mask_path        
    
    masked_spectrogram = spec * mask   
    return masked_spectrogram

# --> save audio as spectrgram and return the path
def spec_save(audio_path):
    newData_path = path('spectrogram') + 'spec_' + tool_file_id(audio_path)+ '.npy'
    spec = spec_creation(audio_path)
    
    np.save(newData_path, spec)
    return(newData_path)

# ----- Mask -------------------------------------------------------------------------

# --> create spectrogram binary mask
def mask_creation(audio_path, percentile=97):
    spec = spec_creation(audio_path)
    mag = np.abs(spec)

    threshold = np.percentile(mag, percentile)
    mask = (mag > threshold)
    
    return mask.astype(int)

# --> save audio as binary mask and return the path
def mask_save(audio_path):
    newData_path = path('mask') + 'mask_' + tool_file_id(audio_path)+ '.npy'
    mask = mask_creation(audio_path)
    
    np.save(newData_path, mask)
    return(newData_path)

# --> Convert a many maskS to one vector 
def mask_vector(m1_path, m2_path, m3_path):
    mask_1 = load(m1_path)
    mask_2 = load(m2_path)
    mask_3 = load(m3_path)
    
    masks = np.concatenate((mask_1, mask_2, mask_3), axis=1)
    vector = np.reshape(masks, (-1,))
    
    return vector

# --> save vector 
def mask_vector_save(m1_path, m2_path, m3_path):
    mID_1 = tool_file_id(m1_path)
    mID_2 = tool_file_id(m2_path)
    mID_3 = tool_file_id(m3_path)
    
    newData_path = path('vector') + f'vector_{mID_1}_{mID_2}_{mID_3}.npy'
    vector = mask_vector(m1_path, m2_path, m3_path)
    
    np.save(newData_path, vector)
    return newData_path

# convert vector to masks
def mask_vector_resahpe(vector, num_frequency_bins = 1025):
    masks = vector.reshape(num_frequency_bins,-1)
    coulmns = int(masks.shape[1]/num_combined_voices)

    mask_1 = masks[:, :coulmns]
    mask_2 = masks[:, coulmns:coulmns*2]
    mask_3 = masks[:, coulmns*2:]
    
    return mask_1, mask_2, mask_3

mkdir: cannot create directory ‘newClips’: File exists
mkdir: cannot create directory ‘mixAudio’: File exists
mkdir: cannot create directory ‘spectrogram’: File exists
mkdir: cannot create directory ‘mask’: File exists
mkdir: cannot create directory ‘temp’: File exists
mkdir: cannot create directory ‘vector’: File exists
mkdir: cannot create directory ‘csv’: File exists


In [4]:
# FINAL Numbers of samples in the DataFrame
num_of_samples= 10

# The clip length in seconds
length = 2

# Define the clip from audio with a fixed length (end_time - start_time) in milliseconds 
start_time = 1500
end_time = start_time + (length * 1000) #FIXED

# Total number of new fixed length audios
num_of_audios = num_of_samples * 3 # FIXED

# Number of merging voices fixed for now
num_combined_voices = 3 # FIXED

# Define the domain to pick voices
num_lowest_length = end_time #FIXED (end_time)
num_highest_length = 6000

# Summary 
summary(num_of_samples, length, start_time, end_time, num_of_audios, num_combined_voices, num_lowest_length, num_highest_length)

--> Number of samples : 10
--> Number of newClips: 30
--> Number of combined voices: 3
--> Clip length : 2
--> Clip Timing : [ 1.5 : 3.5 ] 
--> Voices Domain : [ 3.5 : 6.0 ] 


In [5]:
audio_files = os.listdir('clips')
data = pd.read_csv('validated.tsv', sep='\t')
audio_time = pd.read_table('times.txt', header=None, names=['Time']).squeeze()

In [6]:
split_data = audio_time.apply(lambda x: x.split('/')[-1])
split_data = split_data.apply(lambda x: x.split('='))

file_names = split_data.apply(lambda x: x[0].strip()).str[:-1]
time = split_data.apply(lambda x: x[1].strip())

mask = file_names.isin(data.path) 
mask = mask[mask == True]

file_names = file_names.where(mask).dropna()
time = time.where(mask).dropna()

In [7]:
# Split time.txt into file Name and time
split_data = audio_time.apply(lambda x: x.split('/')[-1])
split_data = split_data.apply(lambda x: x.split('='))

file_names = split_data.apply(lambda x: x[0].strip()).str[:-1]
time = split_data.apply(lambda x: x[1].strip())

mask = file_names.isin(data.path) 
mask = mask[mask == True]

file_names = file_names.where(mask).dropna()
time = time.where(mask).dropna()

df_time = pd.DataFrame({'path':file_names, 'time':time}) 
df_time = df_time.sort_values(by='time', key=lambda x: x.astype(int), ignore_index=True)

df_time = df_time.where((df_time['time'].astype(int) > num_lowest_length) & (df_time['time'].astype(int) < num_highest_length)).dropna()
df_time = df_time.reset_index(drop=True)

In [8]:
csv = data.drop(['client_id', 'up_votes','down_votes','variant','segment', 'accents', 'locale', 'age', 'gender','sentence'], axis=1)
csv = csv.merge(df_time, on='path', how='right')
csv = csv.sample(frac=1).reset_index(drop=True)

In [9]:
csv['exist'] = False

for i in tqdm(range(len(csv))):
    newData_path = path('clips') + csv['path'][i]
    if os.path.exists(newData_path):
        csv.loc[i, 'exist'] = True

csv = csv[csv['exist'] == True]
csv = csv.reset_index(drop=True)

100%|██████████| 2252/2252 [00:06<00:00, 335.73it/s]


In [10]:
csv['newClip'] = ''
for i in tqdm(range(0,num_of_audios)): 
    newData_path = path('newClips') + csv['path'][i]
    
    clip = AudioSegment.from_file(path('clips') + csv['path'][i])
    clip = clip[start_time:end_time]

    clip.export(newData_path, format='mp3')
    csv['newClip'][i] = newData_path
    
csv = csv[:num_of_audios]

100%|██████████| 30/30 [00:10<00:00,  2.93it/s]


In [11]:
df_audio = pd.DataFrame({'mixAudio': [], 'x1': [], 'x2': [], 'x3': []})

index = 0

for i in tqdm(range(0,num_of_samples)): 
    newData_path = path('mixAudio') + 'mixAudio_' + str(i) + '.mp3'
    mix = AudioSegment.from_file(csv['newClip'][index])
    
    for j in range(1,num_combined_voices): 
        Speaker_voice = AudioSegment.from_file(csv['newClip'][index+j])
        mix = mix.overlay(Speaker_voice)
    
    mix.export(newData_path, format='mp3')    
    df_audio = df_audio.append({'mixAudio': newData_path,'x1':csv['newClip'][index], 'x2':csv['newClip'][index+1], 'x3':csv['newClip'][index+2]}, ignore_index=True)
    index = index + num_combined_voices # Skip merge audio and go for next
save_df(df_audio, 'audio_path')

100%|██████████| 10/10 [00:08<00:00,  1.21it/s]


In [12]:
tqdm.pandas()
df = pd.DataFrame({'spectrogram': [], 'mask_1': [], 'mask_2': [], 'mask_3': [], 'vector': []})

df['spectrogram'] = df_audio['mixAudio'].progress_apply(lambda x: spec_save(x))

df['mask_1'] = df_audio['x1'].progress_apply(lambda x: mask_save(x))
df['mask_2'] = df_audio['x2'].progress_apply(lambda x: mask_save(x))
df['mask_3'] = df_audio['x3'].progress_apply(lambda x: mask_save(x))

df['vector'] = df[['mask_1', 'mask_2', 'mask_3']].progress_apply(lambda row: mask_vector_save(row['mask_1'], row['mask_2'], row['mask_3']), axis=1)
save_df(df, 'Final')

100%|██████████| 10/10 [00:02<00:00,  4.09it/s]
100%|██████████| 10/10 [00:00<00:00, 13.42it/s]
100%|██████████| 10/10 [00:00<00:00, 11.67it/s]
100%|██████████| 10/10 [00:00<00:00, 11.70it/s]
100%|██████████| 10/10 [00:00<00:00, 24.61it/s]


In [13]:
# Used for reshape vectore back to Masks 
num_frequency_bins = load(df['mask_1'][0]).shape[0]
print( '' if num_frequency_bins == 1025 else f'!! Need to change num_frequency_bins to {num_frequency_bins}', end='')

In [31]:
df = pd.read_csv('csv/Final.csv')
df.head()

Unnamed: 0,spectrogram,mask_1,mask_2,mask_3,vector
0,spectrogram/spec_0.npy,mask/mask_37870464.npy,mask/mask_37532523.npy,mask/mask_37300591.npy,vector/vector_37870464_37532523_37300591.npy
1,spectrogram/spec_1.npy,mask/mask_37457241.npy,mask/mask_37701314.npy,mask/mask_38011105.npy,vector/vector_37457241_37701314_38011105.npy
2,spectrogram/spec_2.npy,mask/mask_37531336.npy,mask/mask_37539141.npy,mask/mask_37396167.npy,vector/vector_37531336_37539141_37396167.npy
3,spectrogram/spec_3.npy,mask/mask_37306217.npy,mask/mask_37679894.npy,mask/mask_37935235.npy,vector/vector_37306217_37679894_37935235.npy
4,spectrogram/spec_4.npy,mask/mask_37780661.npy,mask/mask_37289274.npy,mask/mask_37937553.npy,vector/vector_37780661_37289274_37937553.npy
