In [None]:
# Install dependencies
!pip install wget
!apt-get install sox libsndfile1 ffmpeg
!pip install text-unidecode
!pip install matplotlib>=3.3.2

## Install NeMo
BRANCH = 'r2.0.0rc0'
!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]

## Grab the config we'll use in this example
!mkdir configs
!wget -P configs/ https://raw.githubusercontent.com/NVIDIA/NeMo/$BRANCH/examples/asr/conf/citrinet/config_bpe.yaml

"""
Remember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!
Alternatively, you can uncomment the exit() below to crash and restart the kernel, in the case
that you want to use the "Run All Cells" (or similar) option.
"""
# exit()


Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9656 sha256=2d64d209187cf96f9bca533b8d824f20ffc80a161cd77eec85cf114771306757
  Stored in directory: /root/.cache/pip/wheels/8b/f1/7f/5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsndfile1 is already the newest version (1.0.31-2ubuntu0.1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
The following additional packages will be installed:
  libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa libsox-fmt-base libsox3 libwavpack1
Suggested packages:
  libsox-fmt-all
The following NEW packages will be installed

'\nRemember to restart the runtime for the kernel to pick up any upgraded packages (e.g. matplotlib)!\nAlternatively, you can uncomment the exit() below to crash and restart the kernel, in the case\nthat you want to use the "Run All Cells" (or similar) option.\n'

In [None]:
!git clone https://github.com/7egment/pyannote-3.1-offline
!pip install pyannote-audio==3.1.1

Cloning into 'pyannote-3.1-offline'...
remote: Enumerating objects: 9, done.[K
remote: Counting objects: 100% (9/9), done.[K
remote: Compressing objects: 100% (8/8), done.[K
remote: Total 9 (delta 1), reused 9 (delta 1), pack-reused 0[K
Receiving objects: 100% (9/9), 28.80 MiB | 17.98 MiB/s, done.
Resolving deltas: 100% (1/1), done.
Collecting pyannote-audio==3.1.1
  Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asteroid-filterbanks>=0.4 (from pyannote-audio==3.1.1)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)
Collecting lightning>=2.0.1 (from pyannote-audio==3.1.1)
  Downloading lightning-2.3.3-py3-none-any.whl (808 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m808.5/808.5 kB[0m [31m25.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyannote.pipeline>=3.0.1 (from pyannote-audio==3.1.1)
  

In [None]:
# Update the paths in the configuration file
!sed -i 's|/home/e6quisitory/pyannote-3.1-offline/seg.bin|pyannote-3.1-offline/seg.bin|g' /content/pyannote-3.1-offline/config.yaml
!sed -i 's|/home/e6quisitory/pyannote-3.1-offline/vox.bin|pyannote-3.1-offline/vox.bin|g' /content/pyannote-3.1-offline/config.yaml




In [None]:
# NeMo's "core" package
import nemo
# NeMo's ASR collection - this collection contains complete ASR models and
# building blocks (modules) for ASR
import nemo.collections.asr as nemo_asr

In [None]:
import os
import json
import torch
from nemo.collections.asr.models import EncDecCTCModelBPE
from pyannote.audio import Pipeline
from pydub import AudioSegment
import numpy as np

# Initialize the ASR model from the local .nemo checkpoint and switch it to
# inference mode.
asr_model = EncDecCTCModelBPE.restore_from(restore_path="/content/MetanoiaLabsModel.nemo")
asr_model.eval()

# Initialize the diarization pipeline from the config inside the cloned
# pyannote-3.1-offline checkout (path is relative to the working directory).
pipeline = Pipeline.from_pretrained("pyannote-3.1-offline/config.yaml")

# Directory containing WAV files
audio_dir = "./"

# Directory to save the JSON files
output_dir = "./Finaljsonfiles"

# Create the output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# List all WAV files in the directory.
# - sorted(): os.listdir returns entries in arbitrary order; sorting makes the
#   processing order (and therefore the log output) reproducible between runs.
# - lower(): also pick up files whose extension is upper-case (".WAV").
audio_files = sorted(
    os.path.join(audio_dir, f)
    for f in os.listdir(audio_dir)
    if f.lower().endswith('.wav')
)

# Function to transcribe a segment
def transcribe_segment(audio_segment, sample_rate=16000, model=None):
    """Transcribe one pydub audio segment with the ASR model.

    Args:
        audio_segment: pydub ``AudioSegment`` holding the audio to transcribe.
        sample_rate: Sample rate (Hz) the audio is resampled to before it is
            handed to the model. Defaults to 16000, the rate reported in the
            restored model's data configuration.
        model: ASR model exposing ``transcribe``; defaults to the notebook-level
            ``asr_model``.

    Returns:
        The transcription of the segment, or ``""`` when the segment contains
        no samples.
    """
    if model is None:
        model = asr_model

    # Down-mix to mono and resample so the waveform matches `sample_rate`.
    mono = audio_segment.set_channels(1).set_frame_rate(sample_rate)
    samples = np.array(mono.get_array_of_samples(), dtype=np.float32)

    # Nothing to transcribe (e.g. a sub-millisecond diarization turn).
    if samples.size == 0:
        return ""

    # pydub returns integer PCM values; scale them to floats in [-1, 1).
    # NOTE(review): presumably the range the model saw when audio was loaded
    # from files during training - confirm against the training pipeline.
    samples /= float(1 << (8 * mono.sample_width - 1))

    # NeMo refuses raw tensors when it cannot find a sample rate on the model
    # ("Please set `sample_rate` attribute to the model explicitly."), so set
    # it unless the model already carries one.
    if not hasattr(model, "sample_rate"):
        model.sample_rate = sample_rate

    with torch.no_grad():
        hypothesis = model.transcribe([torch.from_numpy(samples)], batch_size=1)[0]

    # Some versions return hypothesis objects instead of plain strings; unwrap
    # them so the result stays JSON-serializable.
    return getattr(hypothesis, "text", hypothesis)

# Process each audio file: diarize it, transcribe every speaker turn, and write
# one JSON file per input into `output_dir`.
for audio_file in audio_files:
    # splitext drops only the final extension. Splitting on the first "." would
    # turn "call.2024.wav" into "call" and let different inputs overwrite the
    # same JSON file.
    audio_id = os.path.splitext(os.path.basename(audio_file))[0]

    try:
        # Load the audio file using pydub
        audio = AudioSegment.from_wav(audio_file)

        # Diarize the audio file
        diarization = pipeline(audio_file)

        # Prepare a list to store JSON output
        json_output = []

        # Extract and transcribe each segment
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            start_time = turn.start
            end_time = turn.end

            # Check if the segment duration is valid
            if end_time <= start_time:
                print(f"Skipping invalid segment from {start_time} to {end_time} in {audio_file}")
                continue

            # Extract the audio segment using pydub
            segment_audio = audio[start_time * 1000:end_time * 1000]  # pydub works in milliseconds

            # Transcribe the audio segment
            text = transcribe_segment(segment_audio)

            # Append the transcription to the JSON output
            json_output.append({
                "start": start_time,
                "end": end_time,
                "speaker": speaker,
                "text": text
            })
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next
        # file; no JSON is written for a file that failed.
        print(f"Error processing {audio_file}: {e}")
        continue

    # Save the JSON output to a file
    json_file = os.path.join(output_dir, f"{audio_id}.json")
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(json_output, f, ensure_ascii=False, indent=4)

    print(f"Processed and saved {json_file}")

print("All files processed.")


[NeMo I 2024-07-21 17:08:06 mixins:172] Tokenizer SentencePieceTokenizer initialized with 64 tokens


[NeMo W 2024-07-21 17:08:07 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /content/Last_Hope.json
    sample_rate: 16000
    batch_size: 16
    trim_silence: true
    max_duration: 16.7
    shuffle: true
    num_workers: 8
    pin_memory: true
    is_tarred: false
    tarred_audio_filepaths: null
    shard_strategy: scatter
    shuffle_n: 2048
    bucketing_strategy: synced_randomized
    bucketing_batch_size: null
    
[NeMo W 2024-07-21 17:08:07 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: /content/Last_Hope.json
    sample_rate: 16000
    batch_size: 16
    shuffle: false
    num_work

[NeMo I 2024-07-21 17:08:07 features:305] PADDING: 16
[NeMo I 2024-07-21 17:08:07 save_restore_connector:263] Model EncDecCTCModelBPE was successfully restored from /content/MetanoiaLabsModel.nemo.
Error processing ./audio_sample_12.wav: Provided `audio` data contains numpy or torch tensors, however the class does not have `sample_rate` attribute. Please set `sample_rate` attribute to the model explicitly.
All files processed.
