In [None]:
# Install SpeechBrain from source: clone the repository under /notebooks,
# update the checkout, then install its requirements and the package itself
# in editable mode.
%cd /notebooks
!git clone https://github.com/speechbrain/speechbrain/
%cd /notebooks/speechbrain
# `git pull` brings an already existing checkout up to date.
!git pull
%pip install -r requirements.txt
%pip install -e . 
# Go back to /notebooks.
%cd /notebooks

In [None]:
# Install Hugging Face transformers from source: clone the repository under
# /notebooks, update the checkout and install it in editable mode.
%cd /notebooks/
!git clone https://github.com/huggingface/transformers.git
%cd transformers
# `git pull` brings an already existing checkout up to date.
!git pull
%pip install -e .
# Go back to /notebooks.
%cd /notebooks/

In [None]:
# Force-reinstall torch, torchvision and torchaudio, with the PyTorch cu121
# wheel index added as an extra package source.
%pip install --force-reinstall   torch torchvision  torchaudio --extra-index-url https://download.pytorch.org/whl/cu121

In [None]:
# Install the Python packages used for downloading and audio processing.
# NOTE(review): the helper functions in this notebook also invoke the `aria2c`
# and `yt-dlp` executables, which are not installed here - confirm they are
# provided by the environment.
%pip install requests moviepy librosa pytube pydub

In [None]:
from pydub import AudioSegment
import os

def download_video_at_url(url,output_path):
    """
    Download a file with the ``aria2c`` command-line tool.

    :param url: The URL to download.
    :param output_path: The file name handed to aria2c's ``-o`` option.
    :raises subprocess.CalledProcessError: If aria2c exits with a non-zero status.
    :raises FileNotFoundError: If the ``aria2c`` executable cannot be found.
    """
    import subprocess  # local import keeps this helper self-contained

    # The command is passed as an argument list (no shell), so characters such
    # as '&', '?' or spaces in the URL or the output path reach aria2c intact
    # instead of being interpreted by the shell.
    command = [
        "aria2c",
        "--console-log-level=error",
        "-c",
        "-x", "16",
        "-s", "16",
        "-k", "1M",
        "-o", str(output_path),
        str(url),
    ]
    # check=True turns a failed download into an exception instead of a silent no-op.
    subprocess.run(command, check=True)
def split_audio_into_chunks(file_path, output_dir,minutes=5):
    """
    Splits an audio file into fixed-length chunks and saves them as WAV files.

    Chunks are written in order to ``{output_dir}/chunk_{i}.wav``; the last
    chunk holds whatever audio remains and may therefore be shorter.

    :param file_path: The path to the input audio file.
    :param output_dir: The directory where the audio chunks will be saved
        (created, including parents, if it does not exist).
    :param minutes: Length of each chunk in minutes (default 5). Fractional
        values are accepted; the length is rounded to whole milliseconds.
    :raises ValueError: If ``minutes`` does not yield a chunk of at least 1 ms.
    """
    # range() needs an integer step, so convert before using it; this is what
    # makes fractional values such as minutes=0.5 work.
    chunk_length_ms = int(round(minutes * 60 * 1000))
    if chunk_length_ms <= 0:
        raise ValueError(
            f"minutes must give a chunk length of at least 1 ms, got {minutes!r}"
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Load the audio file
    audio = AudioSegment.from_file(file_path)

    # Slice and export one chunk at a time instead of holding every chunk in a list.
    for index, start_ms in enumerate(range(0, len(audio), chunk_length_ms)):
        chunk_filename = f"{output_dir}/chunk_{index}.wav"
        audio[start_ms:start_ms + chunk_length_ms].export(chunk_filename, format="wav")
        print(f"Exported {chunk_filename}")



# Function to download YouTube video
def download_youtube_video(youtube_url, output_path):
    """
    Download a video with the ``yt-dlp`` command-line tool.

    The executable at ``/root/miniconda3/envs/tts/bin/yt-dlp`` is used when it
    exists; otherwise ``yt-dlp`` is looked up on ``PATH``.

    :param youtube_url: URL of the video to download.
    :param output_path: Value handed to yt-dlp's ``-o`` option.
    :return: ``output_path``, unchanged.
    :raises subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
    :raises FileNotFoundError: If no yt-dlp executable can be started.
    """
    import shutil
    import subprocess

    yt_dlp = "/root/miniconda3/envs/tts/bin/yt-dlp"
    if not os.path.isfile(yt_dlp):
        # Fall back to whatever yt-dlp is on PATH; keep the original path if
        # none is found so the resulting error names it.
        yt_dlp = shutil.which("yt-dlp") or yt_dlp

    # An argument list (no shell) keeps URLs containing '&' (e.g. '&t=10s' or
    # '&list=...') and paths containing spaces intact.
    # check=True prevents returning a path to a file that was never downloaded.
    subprocess.run([yt_dlp, "-o", str(output_path), str(youtube_url)], check=True)
    return output_path

# Function to extract audio from video
def extract_audio_from_video(video_path, audio_output_path):
    """
    Write the audio track of a video file to ``audio_output_path`` using the AAC codec.

    :param video_path: Path of the video file to open with ``VideoFileClip``.
    :param audio_output_path: Path of the audio file to write.
    :raises ValueError: If the video has no audio track.
    """
    # The context manager closes the clip even when writing fails.
    with VideoFileClip(video_path) as video:
        audio = video.audio
        if audio is None:
            # Without this check a silent video fails with an opaque
            # AttributeError on None.
            raise ValueError(f"No audio track found in {video_path!r}")
        audio.write_audiofile(audio_output_path, codec='aac')

# Function to resample audio
def resample_audio(input_audio_path, output_audio_path, target_sr):
    """
    Resample an audio file to ``target_sr`` and write the result to a new file.

    :param input_audio_path: Path of the audio file to read.
    :param output_audio_path: Path of the resampled file to write.
    :param target_sr: Sampling rate of the output.
    """
    # sr=None: load at the file's own sampling rate, without resampling on load.
    waveform, native_sr = librosa.load(input_audio_path, sr=None)
    sf.write(
        output_audio_path,
        librosa.resample(waveform, orig_sr=native_sr, target_sr=target_sr),
        target_sr,
    )


In [None]:
from pytube import YouTube
from moviepy.editor import VideoFileClip
import librosa
import soundfile as sf
from IPython.display import Audio

# URL of the YouTube video to process
youtube_url = 'https://www.youtube.com/watch?v=OT_KEqdJvUA'

# File names for each stage: the downloaded video, the audio track extracted
# from it, and the resampled audio. They are relative paths, so they resolve
# against the current working directory.
video_path = 'downloaded_video.mp4'
audio_path = 'extracted_audio.aac'
resampled_audio_path = 'resampled_audio.wav'




In [None]:

!rm /notebooks/{video_path}
# Download the video from YouTube
download_youtube_video(youtube_url, video_path)

In [None]:
# Target sampling rate passed to resample_audio as target_sr
target_sampling_rate = 24000

# Extract audio from the downloaded video (written to audio_path)
extract_audio_from_video(video_path, audio_path)

# Resample the extracted audio to the target sampling rate
# (read from audio_path, written to resampled_audio_path)
resample_audio(audio_path, resampled_audio_path, target_sampling_rate)

print(f"Resampled audio saved to: {resampled_audio_path}")

In [None]:
# Set the CUDA_VISIBLE_DEVICES environment variable to 0 for this kernel,
# so that CUDA-based libraries only see GPU 0.
%env CUDA_VISIBLE_DEVICES=0

In [None]:
# Install Cython and SpeechBrain with pip.
# NOTE(review): SpeechBrain is also installed in editable mode from a git
# checkout at the top of this notebook - confirm this second install is intended.
%pip install cython speechbrain


In [None]:
# Install pyannote.audio, which provides the speaker-diarization Pipeline used later.
%pip install pyannote.audio

In [None]:
# Start from an empty chunk directory, then cut the resampled audio into
# 2-minute WAV chunks (the third argument is the chunk length in minutes).
!rm -rf ./reseampled_audio
split_audio_into_chunks('resampled_audio.wav', './reseampled_audio',2)

In [None]:
# Authenticate with the Hugging Face Hub. login() is called without a token
# argument (it is expected to ask for one interactively).
from huggingface_hub import login
login()

In [None]:
from speechbrain.pretrained import SepformerSeparation as separator
import torchaudio

# Load the pretrained "speechbrain/sepformer-libri3mix" separation model on the GPU;
# downloaded files are kept under pretrained_models/.
model = separator.from_hparams(
    source="speechbrain/sepformer-libri3mix",
    savedir='pretrained_models/sepformer-libri3mix',
    run_opts={"device": "cuda"},
)

# Separate the resampled recording into its estimated sources.
est_sources = model.separate_file(path='resampled_audio.wav')

# The last axis of est_sources selects the source; write each of the three
# to its own WAV file with a sample rate of 8000.
for source_idx, wav_name in enumerate(("source1hat.wav", "source2hat.wav", "source3hat.wav")):
    torchaudio.save(wav_name, est_sources[:, :, source_idx].detach().cpu(), 8000)

In [None]:
# Play the third separated source inline (the cell's last expression is displayed).
Audio('source3hat.wav')

In [None]:
import shutil

# NOTE: importing `Audio` here rebinds the name that earlier cells imported from
# IPython.display; cells that want the notebook audio player must re-import it.
from pyannote.audio import Pipeline, Audio
from pydub import AudioSegment
import torch
import os

# Load the pretrained speaker-diarization pipeline and move it to the GPU.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1")
if pipeline is None:
    # Guard against a failed load so the error is explicit instead of an
    # AttributeError on None in the next line.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'; "
        "check the Hugging Face login and that the model's access conditions were accepted."
    )
pipeline.to(torch.device("cuda"))
audio_path="resampled_audio.wav"
# apply pretrained pipeline
diarization = pipeline(audio_path)
audio = AudioSegment.from_wav(audio_path)
output_dir = "by_speaker"
# Start from an empty output directory on every run.
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)

# Concatenate, per speaker label, every turn attributed to that speaker.
speakers = {}
for turn, _, speaker in diarization.itertracks(yield_label=True):
    # turn.start / turn.end are multiplied by 1000 to get the millisecond
    # positions used to slice the pydub AudioSegment.
    start_ms = int(turn.start * 1000)
    end_ms = int(turn.end * 1000)

    speaker_audio = audio[start_ms:end_ms]
    if speaker not in speakers:
        speakers[speaker] = speaker_audio
    else:
        speakers[speaker] += speaker_audio

# Export one WAV file per speaker. The loop variable is deliberately not named
# `audio`, so the full recording loaded above is not overwritten.
for speaker, speaker_track in speakers.items():
    speaker_track.export(os.path.join(output_dir, f"{speaker}.wav"), format="wav")

In [None]:
# Empty the chunk directory again, then cut the SPEAKER_01 file from the
# by_speaker directory into 2-minute WAV chunks.
!rm -rf ./reseampled_audio
split_audio_into_chunks('/notebooks/by_speaker/SPEAKER_01.wav', './reseampled_audio',2)

In [None]:
from pydub import AudioSegment

# Load audio file
audio = AudioSegment.from_wav("/notebooks/by_speaker/SPEAKER_01.wav") 

# Start and end of the excerpt in milliseconds, written as
# minutes*60*1000 + seconds*1000
start_time = 0.05*60*1000 + 0*1000 # 0.05 min = 3 s
end_time = 0.15*60*1000 + 0*1000  # 0.15 min = 9 s

# Extract segment
segment = audio[start_time:end_time]

# Resample to 16000 Hz
segment = segment.set_frame_rate(16000)

# Convert to mono
segment = segment.set_channels(1)

# Export to wav file
# NOTE: the "24k" in the file name does not match the 16000 Hz frame rate set above.
segment.export("segment_24kmono.wav", format="wav")

In [None]:
from speechbrain.pretrained import SepformerSeparation as separator
import torchaudio
from IPython.display import Audio

# Load the pretrained "speechbrain/sepformer-dns4-16k-enhancement" model on the GPU;
# downloaded files are kept under pretrained_models/.
model = separator.from_hparams(
    source="speechbrain/sepformer-dns4-16k-enhancement",
    savedir='pretrained_models/sepformer-dns4-16k-enhancement',
    run_opts={"device": "cuda"},
)

# for custom file, change path
est_sources = model.separate_file(path='/notebooks/by_speaker/SPEAKER_01.wav')

# Take the first estimated source once, save it with a sample rate of 16000,
# then play it (the cell's last expression is displayed).
first_source = est_sources[:, :, 0].detach().cpu()
torchaudio.save("source2hat.wav", first_source, 16000)
Audio(first_source, rate=16000)

In [None]:
# Play the source at index 1 of est_sources' last axis at a rate of 8000.
# NOTE(review): est_sources is assigned by more than one cell in this notebook
# (a three-source separation saved at 8000 and an enhancement saved at 16000);
# confirm which result this cell is meant to play.
Audio(est_sources[:, :, 1].detach().cpu(),rate=8000)

In [None]:
# Play the source at index 0 of est_sources' last axis at a rate of 8000.
# NOTE(review): est_sources is assigned by more than one cell in this notebook
# (a three-source separation saved at 8000 and an enhancement saved at 16000);
# confirm which result this cell is meant to play.
Audio(est_sources[:, :, 0].detach().cpu(),rate=8000)

In [None]:
import torchaudio
from speechbrain.pretrained import WaveformEnhancement

# Load the pretrained "speechbrain/mtl-mimic-voicebank" enhancement model;
# downloaded files are kept under pretrained_models/.
enhance_model = WaveformEnhancement.from_hparams(
    source="speechbrain/mtl-mimic-voicebank",
    savedir="pretrained_models/mtl-mimic-voicebank",
)
# Enhance the excerpt file exported by the segment-extraction cell.
enhanced = enhance_model.enhance_file("segment_24kmono.wav")

# Saving enhanced signal on disk
# (unsqueeze(0) adds a leading dimension before saving; 16000 is the sample rate)
torchaudio.save('enhanced.wav', enhanced.unsqueeze(0).cpu(), 16000)

In [None]:
# Re-import the notebook audio player and play the enhanced file inline.
from IPython.display import Audio

Audio('enhanced.wav')


In [None]:
import torch
import torchaudio
from speechbrain.pretrained import SpectralMaskEnhancement

# Load the pretrained "speechbrain/metricgan-plus-voicebank" enhancement model;
# downloaded files are kept under pretrained_models/.
enhance_model = SpectralMaskEnhancement.from_hparams(
    source="speechbrain/metricgan-plus-voicebank",
    savedir="pretrained_models/metricgan-plus-voicebank",
)

# Load and add fake batch dimension
noisy = enhance_model.load_audio(
    "source3hat.wav"
).unsqueeze(0)

# Add relative length tensor
# (a single entry of 1. for the single item in the batch)
enhanced = enhance_model.enhance_batch(noisy, lengths=torch.tensor([1.]))

# Saving enhanced signal on disk
# NOTE: this writes to the same 'enhanced.wav' file name used by the
# WaveformEnhancement cell, so that earlier output is overwritten.
torchaudio.save('enhanced.wav', enhanced.cpu(), 16000)