In [None]:
! apt-get update && apt-get install -y libsndfile1 ffmpeg
! pip install Cython
! pip install packaging
! pip -q install nemo_toolkit['asr']

In [None]:
import nemo
import nemo.collections.asr as nemo_asr
from nemo.collections.asr.models import EncDecCTCModel

In [None]:
import os
import zipfile
import wave
import json
import pandas as pd
import torch
from omegaconf import OmegaConf
from omegaconf import DictConfig
from pytorch_lightning.utilities.model_summary import ModelSummary
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import Callback
import matplotlib.pyplot as plt
from pytorch_lightning.callbacks import ModelCheckpoint
import librosa

# Manifest Creation:
For inference, we need a manifest file that contains one JSON row describing each audio sample the model should diarize. <br><br>
Manifest Row Example:
```json
{"audio_filepath": "wavs/audio_sample_75.wav", "offset": 0, "duration": 11.0, "label": "infer", "text": "-", "rttm_filepath": null}
```

In [None]:
# Code to create the manifest file

import os
import json
import librosa

def create_wav_manifest(folder_path, output_json_path):
    """Write a JSON-lines manifest with one row per ``.wav`` file in a folder.

    Every row contains ``audio_filepath``, ``offset`` (always 0), ``duration``
    (seconds, as reported by librosa), ``label`` (always ``"infer"``) and
    ``text`` (always ``"-"``).

    Args:
        folder_path: Directory scanned (non-recursively) for ``.wav`` files.
            The extension check is case-insensitive.
        output_json_path: Path of the manifest to write; an existing file is
            overwritten.
    """
    # os.listdir returns entries in arbitrary order; sort so that repeated
    # runs over the same folder always produce the same manifest.
    wav_names = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith('.wav')
    )

    with open(output_json_path, 'w', encoding='utf-8') as fout:
        for filename in wav_names:
            file_path = os.path.join(folder_path, filename)

            try:
                # librosa >= 0.10 takes the file location as `path`.
                duration = librosa.get_duration(path=file_path)
            except TypeError:
                # Older librosa releases only know the `filename` keyword.
                duration = librosa.get_duration(filename=file_path)

            file_entry = {
                "audio_filepath": file_path,
                "offset": 0,
                # Plain float keeps the value JSON-serialisable.
                "duration": float(duration),
                "label": "infer",
                "text": "-"
            }

            json.dump(file_entry, fout)
            fout.write('\n')

    print(f"Manifest file created at {output_json_path}")

# Example usage
# Scan the inference audio folder and write the manifest; the output path is
# the same one the diarizer config references as "manifest_filepath".
folder_path = 'inference/wavs/'
output_json_path = 'inference/test_manifest.json'
create_wav_manifest(folder_path, output_json_path)

# Inference

In [None]:
# Inference configuration handed to NeuralDiarizer(cfg=inf_config).
# Built directly as an OmegaConf config so `inf_config` only ever refers to
# one kind of object.
inf_config = OmegaConf.create({
    "name": "ClusterDiarizer",
    "num_workers": 1,
    "sample_rate": 16000,
    "batch_size": 64,
    # Fall back to CPU so the notebook still runs on machines without a GPU.
    "device": 'cuda' if torch.cuda.is_available() else 'cpu',
    "verbose": True,
    "diarizer": {
        # Manifest written by create_wav_manifest.
        "manifest_filepath": "inference/test_manifest.json",
        # NOTE(review): absolute Kaggle path; adjust when running elsewhere.
        "out_dir": "/kaggle/working/",
        "oracle_vad": False,
        "collar": 0.25,
        "ignore_overlap": True,
        "vad": {
            "model_path": "models/vad_multilingual_marblenet.nemo",
            "external_vad_manifest": None,
            "parameters": {
                "window_length_in_sec": 0.63,
                "shift_length_in_sec": 0.01,
                "smoothing": False,
                "overlap": 0.5,
                "onset": 0.8,
                "offset": 0.6,
                "pad_onset": -0.05,
                "pad_offset": 0,
                "min_duration_on": 0,
                "min_duration_off": 0.6,
                "filter_speech_first": True
            }
        },
        "speaker_embeddings": {
            "model_path": "models/titanet-l.nemo",
            "parameters": {
                # One entry per scale; the three lists have matching lengths.
                "window_length_in_sec": [1.5, 1.25, 1.0, 0.75, 0.5],
                "shift_length_in_sec": [0.75, 0.625, 0.5, 0.375, 0.1],
                "multiscale_weights": [1, 1, 1, 1, 1],
                "save_embeddings": True
            }
        },
        "clustering": {
            "parameters": {
                "oracle_num_speakers": False,
                "max_num_speakers": 5,
                "enhanced_count_thres": 80,
                "max_rp_threshold": 0.25,
                "sparse_search_volume": 30,
                "maj_vote_spk_count": False,
                "chunk_cluster_count": 50,
                "embeddings_per_chunk": 10000
            }
        },
        "msdd_model": {
            "model_path": 'models/diar_msdd_telephonic.nemo',
            "parameters": {
                "use_speaker_model_from_ckpt": True,
                "infer_batch_size": 25,
                "sigmoid_threshold": [0.7, 1.0],
                "seq_eval_mode": False,
                "split_infer": True,
                "diar_window_length": 50,
                "overlap_infer_spk_limit": 5
            }
        }
    }
})

In [None]:
from nemo.collections.asr.models.msdd_models import NeuralDiarizer
# Build the diarizer from the inference config defined in the previous cell.
# NOTE(review): the variable name mentions "oracle_vad", yet inf_config sets
# "oracle_vad": False — the name looks like a leftover; confirm before relying on it.
oracle_vad_msdd_model = NeuralDiarizer(cfg=inf_config)

# Run diarization on the samples listed in the configured manifest.
oracle_vad_msdd_model.diarize()

The output will be `.rttm` files containing the diarization segments. We then need to apply the speech recognition model to these segments so that the final output looks like this:

```json
[
    {
        "start": 0.05,
        "end": 1.95,
        "speaker": 2,
        "text": "في هاي لايك"
    },
    {
        "start": 1.95,
        "end": 3.45,
        "speaker": 4,
        "text": "أنا حبيت أوي"
    },
    {
        "start": 3.45,
        "end": 5.45,
        "speaker": 0,
        "text": " إن أنا كوفين جزء منها و ده"
    },
    {
        "start": 5.45,
        "end": 6.05,
        "speaker": 4,
        "text": "و ده اللي بحب"
    },
    {
        "start": 6.05,
        "end": 7.949999999999999,
        "speaker": 1,
        "text": "شكر عليه والله كمان برضو"
    },
    {
        "start": 7.95,
        "end": 9.55,
        "speaker": 3,
        "text": "الأبضاء في واتش"
    },
    {
        "start": 10.36,
        "end": 11.0,
        "speaker": 4,
        "text": "إن إحنا"
    }
]
```

In [None]:
# Restore the Conformer-CTC speech recognition checkpoint and print its layer summary.
conformer_ctc = nemo_asr.models.EncDecCTCModel.restore_from('models/Conformer-CTC-Char_final.nemo')
summary = ModelSummary(conformer_ctc)
print(summary)

# Move the model to the GPU when one is available; otherwise stay on the CPU
# instead of failing on machines without CUDA.
conformer_ctc.to('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
def parse_rttm(rttm_file):
    """Read an RTTM file into a list of segment dictionaries.

    Each non-blank line is split on whitespace; field 3 is read as the start
    time, field 4 as the duration and field 7 as the speaker label.

    Args:
        rttm_file: Path of the RTTM file to read.

    Returns:
        A list of dicts with ``"start"`` (float), ``"end"`` (start plus
        duration, float) and ``"speaker"`` (the label string), in file order.

    Raises:
        IndexError: If a non-blank line has fewer than eight fields.
        ValueError: If the start or duration field is not a number.
    """
    segments = []
    with open(rttm_file, 'r') as file:
        for line in file:
            parts = line.split()
            if not parts:
                # Blank lines (e.g. a trailing empty line) carry no segment.
                continue
            start_time = float(parts[3])
            duration = float(parts[4])
            segments.append({
                "start": start_time,
                "end": start_time + duration,
                "speaker": parts[7]
            })
    return segments

def transcribe_segment(wav_file, start, end):
    """Transcribe the ``[start, end)`` slice of a WAV file with ``conformer_ctc``.

    The slice is copied into a temporary WAV file (same channel count, sample
    width and frame rate as the source), passed to the module-level
    ``conformer_ctc`` model, and the temporary file is always removed afterwards.

    Args:
        wav_file: Path of the source WAV file.
        start: Segment start in seconds.
        end: Segment end in seconds.

    Returns:
        The transcription text of the segment.
    """
    import tempfile

    with wave.open(wav_file, 'rb') as wf:
        framerate = wf.getframerate()
        n_channels = wf.getnchannels()
        sample_width = wf.getsampwidth()
        wf.setpos(int(start * framerate))
        segment_frames = wf.readframes(int((end - start) * framerate))

    # A unique temp file avoids clashes with a leftover or concurrently used
    # 'temp_segment.wav' in the working directory.
    fd, segment_wav_file = tempfile.mkstemp(suffix='.wav')
    os.close(fd)
    try:
        with wave.open(segment_wav_file, 'wb') as segment_wf:
            segment_wf.setnchannels(n_channels)
            segment_wf.setsampwidth(sample_width)
            segment_wf.setframerate(framerate)
            segment_wf.writeframes(segment_frames)
        transcription = conformer_ctc.transcribe([segment_wav_file])[0]
    finally:
        # Clean up even when transcription raises.
        os.remove(segment_wav_file)

    # Depending on the NeMo version, transcribe() yields plain strings or
    # hypothesis objects exposing `.text`; always hand back the text so the
    # result stays JSON-serialisable.
    return getattr(transcription, 'text', transcription)

def process_rttm_and_transcribe(rttm_file, wav_file, output_json):
    """Transcribe every RTTM segment of one recording and save the result as JSON.

    Args:
        rttm_file: RTTM file with the diarization segments of ``wav_file``.
        wav_file: Audio file the segments refer to.
        output_json: Destination path; receives a JSON list with one object per
            segment holding ``start``, ``end``, ``speaker`` (int) and ``text``.

    Raises:
        ValueError: If a speaker label does not end with a numeric index.
    """
    import re

    results = []
    for segment in parse_rttm(rttm_file):
        label = segment["speaker"]
        # Take the whole trailing number, not just the last character, so
        # labels such as "speaker_12" map to 12 instead of 2.
        index_match = re.search(r'\d+$', label)
        if index_match is None:
            raise ValueError(f"Speaker label {label!r} does not end with a numeric index")

        text = transcribe_segment(wav_file, segment["start"], segment["end"])
        results.append({
            "start": segment["start"],
            "end": segment["end"],
            "speaker": int(index_match.group()),
            "text": text
        })

    # ensure_ascii=False keeps non-ASCII transcriptions readable in the file.
    with open(output_json, 'w', encoding='utf-8') as json_file:
        json.dump(results, json_file, indent=4, ensure_ascii=False)

In [None]:
def process_folder(wav_folder, rttm_folder, output_folder):
    """Transcribe every ``.wav`` file of a folder that has a matching RTTM file.

    For ``<name>.wav`` the RTTM is expected at ``<rttm_folder>/<name>.rttm`` and
    the result is written to ``<output_folder>/<name>.json``. Files without an
    RTTM are skipped; a failure on one file is reported and does not stop the
    remaining files.

    Args:
        wav_folder: Directory scanned (non-recursively) for ``.wav`` files; the
            extension check is case-insensitive.
        rttm_folder: Directory holding the RTTM files.
        output_folder: Directory for the JSON results; created if missing.
    """
    wav_ext = '.wav'

    # Ensure output folder exists
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so files are handled in a reproducible order.
    for filename in sorted(os.listdir(wav_folder)):
        if not filename.lower().endswith(wav_ext):
            continue

        # Strip the extension by position: the check above is case-insensitive,
        # so a case-sensitive str.replace would leave "X.WAV" unchanged and
        # would also rewrite any ".wav" occurring earlier in the name.
        stem = filename[:-len(wav_ext)]
        wav_file = os.path.join(wav_folder, filename)
        rttm_file = os.path.join(rttm_folder, stem + '.rttm')
        output_json = os.path.join(output_folder, stem + '.json')

        if not os.path.exists(rttm_file):
            print(f"RTTM file not found for {filename}. Skipping...")
            continue

        # Best effort: report the failure and carry on with the next file.
        try:
            process_rttm_and_transcribe(rttm_file, wav_file, output_json)
            print(f"Processed {filename} successfully.")
        except Exception as e:
            print(f"Error processing {filename}: {str(e)}")

# Example usage
# wav_folder is the same directory the manifest was built from.
# NOTE(review): rttm_folder is a relative path while the diarizer config uses
# out_dir "/kaggle/working/" — presumably the predicted RTTMs end up in
# <out_dir>/pred_rttms and the notebook runs from that directory; confirm.
wav_folder = 'inference/wavs/'
rttm_folder = 'pred_rttms/'
output_folder = 'Final_Submission/'

# Transcribe every recording that has diarization output and write one JSON per file.
process_folder(wav_folder, rttm_folder, output_folder)