# References

### Diarization
https://colab.research.google.com/drive/1X5XTiob6irFq8NJM831S0ADwz5_wIS-r#scrollTo=M_i_C1dVFp0J

---


Import all the required modules and set the cache-related environment variables.

In [1]:
import json
import logging
import os
import wget
import re
from pathlib import Path
from torch import hub

# Device string handed to the alignment step further down.
device = 'cuda'
# All intermediate artefacts (manifest, word timestamps, transcript, configs)
# are kept in a `.cache` folder under the current working directory.
CACHE_DIRECTORY = Path(os.getcwd())/'.cache'
# Create the folder up front: later cells open files inside it for writing,
# which raises FileNotFoundError when the directory does not exist yet.
CACHE_DIRECTORY.mkdir(parents=True, exist_ok=True)
# hub.set_dir(str(CACHE_DIRECTORY))
# NOTE(review): these are set before the remaining imports, presumably so the
# libraries imported below pick them up at import time — confirm.
os.environ['TRANSFORMERS_CACHE'] = str(CACHE_DIRECTORY)
os.environ['NEMO_CACHE_DIR'] = str(CACHE_DIRECTORY)

from omegaconf import OmegaConf
from tqdm import tqdm
import whisperx

from Configs import RESOURCES
from TranscriptionUtils import Transcriber
from SummarizationUtils import SummaryUtils



Initial transcription

In [2]:
# Location of the sample recording, kept both as a path object (its stem is
# needed later) and as a plain string for APIs that take string paths.
sampleAudio = RESOURCES/"audio"/"audio.wav"
sampleAudioPath = str(sampleAudio)

# Run the project's transcriber on the recording.
transcriber = Transcriber()
resultInitial = transcriber.transcribe(sampleAudioPath)

# Show the raw transcript text as the cell output.
resultInitial["text"]

" I can create an end point for you that at the end of the day, just before the person come in, you just update it. For that person to become that signatory. Because this was already proposed by Choraya the last time to be able to update the signatory. This is not a problem. So meaning if that's the case, during the very first time that we call you to create the agreement that time, we only pass you the borrower. Correct. Then after once we know who is who already, then we pass you another secretary. Because what happen is when I talk to the secure PDF, I do not certify, once I certify that part, I do not flatten the debt field. The rest all I flatten, but only the one I don't flatten is those signing fields for the lawyer. That's the only one we are going to do. So that means the hashing that we have at that time will not be the same. So once the lawyer come in and put in the details, I think that hashing will change again. That will affect your partner. I think the hashing throughout

In [None]:
# Show the language entry of the transcription result (used as `language_code` for alignment below).
resultInitial["language"]

Alignment using whisperX

In [None]:
# Name of the alignment model requested from whisperx.load_align_model.
modelName = 'WAV2VEC2_ASR_LARGE_LV60K_960H'
# Load the alignment model and its metadata for the language reported by the initial transcription.
alignmentModel, metadata = whisperx.load_align_model(language_code=resultInitial["language"], device=device, model_name=modelName)
# Align the transcribed segments against the audio file; the 'word_segments' entry of the result is consumed next.
resultAligned = whisperx.align(resultInitial["segments"], alignmentModel, metadata, sampleAudioPath, device)

In [None]:
# Persist the word <> timestamp mapping as one JSON object per line.
with open(str(CACHE_DIRECTORY/'word_ts.text'), 'w+') as f:
    for segment in resultAligned['word_segments']:
        # WhisperX should not leave whitespace around a word; strip it
        # defensively anyway.  The aligned segment itself is left untouched.
        record = {**segment, 'text': segment['text'].strip()}
        f.write(json.dumps(record) + '\n')

In [None]:
# Single-entry manifest describing the recording to diarize.
diarize_manifest = {
    'audio_filepath': sampleAudioPath,
    'offset': 0,
    'duration': None,
    'label': "infer",
    'text': "-",
    'num_speakers': None,
    'rttm_filepath': str(CACHE_DIRECTORY/"diarized.rttm"),
    'uniq_id': "",
}

# Serialise the manifest as JSON into the cache directory.
(CACHE_DIRECTORY/"manifest.json").write_text(json.dumps(diarize_manifest))

In [None]:
# Expected location of the diarization inference config inside the cache directory.
MODEL_CONFIG = str(CACHE_DIRECTORY/'diar_infer_meeting.yaml')
if not os.path.exists(MODEL_CONFIG):
    # Not cached yet: fetch it from the NeMo repository. MODEL_CONFIG is
    # rebound to whatever path wget.download returns.
    config_url = "https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/diar_infer_meeting.yaml"
    MODEL_CONFIG = wget.download(config_url, str(CACHE_DIRECTORY))

# Parse the YAML file into an OmegaConf config object.
config = OmegaConf.load(MODEL_CONFIG)

In [None]:
# Top-level settings.
config.num_workers = 1
config.batch_size = 32

# Diarizer input/output locations and general switches.
diarizer_cfg = config.diarizer
diarizer_cfg.manifest_filepath = str(CACHE_DIRECTORY/"manifest.json")
diarizer_cfg.out_dir = str(CACHE_DIRECTORY/"diarized")
diarizer_cfg.ignore_overlap = False
diarizer_cfg.oracle_vad = False  # ----> Not using oracle VAD
diarizer_cfg.collar = 0.25

# Speaker-embedding model and its multi-scale window settings.
embeddings_cfg = diarizer_cfg.speaker_embeddings
embeddings_cfg.model_path = 'titanet_large'
embeddings_cfg.parameters.window_length_in_sec = [1.5, 1.0, 0.5]
embeddings_cfg.parameters.shift_length_in_sec = [0.75, 0.5, 0.25]
embeddings_cfg.parameters.multiscale_weights = [0.33, 0.33, 0.33]
embeddings_cfg.parameters.save_embeddings = False

# Voice-activity-detection model.
diarizer_cfg.vad.model_path = 'vad_multilingual_marblenet'

In [None]:
# NOTE(review): imported here rather than in the first cell, presumably so NeMo
# is loaded only after the cache environment variables are set — confirm.
from nemo.collections.asr.models.msdd_models import ClusteringDiarizer

# Build the diarizer from the configuration prepared in the previous cells.
model = ClusteringDiarizer(cfg=config)

In [None]:
# Run diarization. NOTE(review): the next cell reads an RTTM file from
# <out_dir>/pred_rttms, so this call is presumably what writes it — confirm against NeMo docs.
model.diarize()

In [None]:
# Parse the predicted RTTM file into [start_ms, end_ms, speaker_index] turns.
#
# RTTM records are whitespace-delimited: field 3 is the turn onset in seconds,
# field 4 the turn duration in seconds, field 7 the speaker name.  Splitting on
# arbitrary whitespace (instead of on single spaces with hard-coded offsets)
# keeps the parser independent of how many spaces the writer used for padding,
# and blank lines are skipped instead of raising IndexError.
speaker_ts = []
with open(str(CACHE_DIRECTORY/"diarized"/"pred_rttms"/(str(sampleAudio.stem) + ".rttm")), 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        s = int(float(fields[3]) * 1000)
        e = s + int(float(fields[4]) * 1000)
        # Speaker names look like "<prefix>_<n>"; keep only the numeric suffix.
        speaker_ts.append([s, e, int(fields[7].split('_')[-1])])

In [None]:
# Reload the word <> timestamp records written earlier (one JSON object per line).
word_ts = []
with open(CACHE_DIRECTORY/'word_ts.text', 'r+') as f:
    word_ts.extend(json.loads(raw_line) for raw_line in f)

In [None]:
def get_word_ts_anchor(s, e, option="start"):
    """Pick the timestamp that represents a word spanning ``s``..``e``.

    ``option`` selects the anchor: ``"end"`` returns ``e``, ``"mid"`` returns
    the midpoint ``(s + e) / 2``, and any other value returns ``s``.
    """
    if option == "mid":
        return (s + e) / 2
    return e if option == "end" else s

def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    """Label every word with the speaker of the turn its anchor falls into.

    ``wrd_ts`` is an iterable of dicts with ``"start"``/``"end"`` (multiplied
    by 1000 and truncated to int here) and ``"text"``.  ``spk_ts`` is a
    non-empty sequence of ``(start, end, speaker)`` turns that is walked
    forward only.  Returns a list of dicts with the keys ``"word"``,
    ``"start_time"``, ``"end_time"`` and ``"speaker"``.
    """
    final_turn = len(spk_ts) - 1
    turn = 0
    _, turn_end, speaker = spk_ts[turn]
    mapping = []
    for entry in wrd_ts:
        start_ms = int(entry["start"] * 1000)
        end_ms = int(entry["end"] * 1000)
        word = entry["text"]
        anchor = get_word_ts_anchor(start_ms, end_ms, word_anchor_option)
        # Advance to the first turn ending at or after the anchor; words past
        # the last turn stay attached to the last speaker.
        while anchor > float(turn_end) and turn != final_turn:
            turn += 1
            _, turn_end, speaker = spk_ts[turn]
        mapping.append(
            {"word": word, "start_time": start_ms, "end_time": end_ms, "speaker": speaker}
        )
    return mapping

In [None]:
# Assign a speaker to every word, anchoring each word at its start timestamp.
wsm = get_words_speaker_mapping(word_ts, speaker_ts, 'start')

In [None]:
# Characters that mark the end of a sentence when they terminate a word.
sentence_ending_punctuations = '.?!'


def _ends_sentence(word_list, idx):
    """Return True when ``word_list[idx]`` exists and ends a sentence.

    Out-of-range indices and empty words count as "not a sentence end"
    instead of raising IndexError.
    """
    if not 0 <= idx < len(word_list):
        return False
    return word_list[idx].endswith(tuple(sentence_ending_punctuations))


def get_first_word_idx_of_sentence(word_idx, word_list, speaker_list, max_words):
    """Find the index of the first word of the sentence containing ``word_idx``.

    Walks left from ``word_idx`` while the previous word has the same speaker,
    does not end a sentence, and fewer than ``max_words`` steps were taken.

    Returns the index reached if it is a real sentence start (index 0 or
    directly after a sentence-ending word); otherwise ``-1``.
    """
    left_idx = word_idx
    while (
        left_idx > 0
        and word_idx - left_idx < max_words
        and speaker_list[left_idx - 1] == speaker_list[left_idx]
        and not _ends_sentence(word_list, left_idx - 1)
    ):
        left_idx -= 1

    if left_idx == 0 or _ends_sentence(word_list, left_idx - 1):
        return left_idx
    return -1


def get_last_word_idx_of_sentence(word_idx, word_list, max_words):
    """Find the index of the last word of the sentence containing ``word_idx``.

    Walks right from ``word_idx`` until a sentence-ending word, the last word
    of ``word_list``, or ``max_words`` steps — whichever comes first.

    Returns the index reached if it is a real sentence end (a word with
    terminal punctuation or the final word of the list); otherwise ``-1``.
    """
    last_idx = len(word_list) - 1
    right_idx = word_idx
    # Stop at the final valid index: stepping past it would make the boundary
    # check below index beyond the end of the list.
    while (
        right_idx < last_idx
        and right_idx - word_idx < max_words
        and not _ends_sentence(word_list, right_idx)
    ):
        right_idx += 1

    if right_idx == last_idx or _ends_sentence(word_list, right_idx):
        return right_idx
    return -1

def get_realigned_ws_mapping_with_punctuation(word_speaker_mapping, max_words_in_sentence = 50):
    """Smooth speaker labels so a speaker change does not split a sentence.

    Wherever the speaker label changes on a word that does not end a sentence,
    the enclosing sentence is located with ``get_first_word_idx_of_sentence``
    and ``get_last_word_idx_of_sentence``.  When both boundaries are found and
    the most frequent speaker holds at least ``len(sentence) // 2`` of its
    labels, the whole sentence is relabelled with that speaker.

    Returns a new list of copied word dicts with updated ``'speaker'`` values;
    the input entries are not modified.
    """
    total = len(word_speaker_mapping)
    pairs = [(entry['word'], entry['speaker']) for entry in word_speaker_mapping]
    words = [word for word, _ in pairs]
    speakers = [speaker for _, speaker in pairs]

    def ends_sentence(idx):
        return idx >= 0 and word_speaker_mapping[idx]['word'][-1] in sentence_ending_punctuations

    pos = 0
    while pos < total:
        speaker_changes = pos < total - 1 and speakers[pos] != speakers[pos + 1]
        if speaker_changes and not ends_sentence(pos):
            first = get_first_word_idx_of_sentence(pos, words, speakers, max_words_in_sentence)
            if first > -1:
                # Whatever part of the word budget was not spent walking left
                # is available for walking right.
                last = get_last_word_idx_of_sentence(
                    pos, words, max_words_in_sentence - pos + first - 1
                )
            else:
                last = -1

            if min(first, last) != -1:
                window = speakers[first: last + 1]
                dominant = max(set(window), key=window.count)
                if window.count(dominant) >= len(window) // 2:
                    speakers[first: last + 1] = [dominant] * (last - first + 1)
                    # Resume scanning right after the relabelled sentence.
                    pos = last
        pos += 1

    realigned_list = []
    for entry, speaker in zip(word_speaker_mapping, speakers):
        updated = entry.copy()
        updated['speaker'] = speaker
        realigned_list.append(updated)
    return realigned_list

In [None]:
def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    """Group consecutive words by speaker into transcript segments.

    The first segment is seeded from ``spk_ts[0]`` (start, end, speaker).  A
    new segment starts whenever a word's speaker differs from the previous
    word's; otherwise the current segment's ``'end_time'`` is extended.  Each
    word is appended to ``'text'`` followed by a single space.

    Returns a list of dicts with the keys ``'speaker'`` (``"Speaker <n>"``),
    ``'start_time'``, ``'end_time'`` and ``'text'``.
    """
    first_start, first_end, last_speaker = spk_ts[0]
    current = {
        'speaker': f'Speaker {last_speaker}',
        'start_time': first_start,
        'end_time': first_end,
        'text': '',
    }
    segments = []

    for entry in word_speaker_mapping:
        word, speaker = entry['word'], entry['speaker']
        start, end = entry['start_time'], entry['end_time']
        if speaker == last_speaker:
            current['end_time'] = end
        else:
            # Speaker changed: close the running segment and open a new one.
            segments.append(current)
            current = {
                'speaker': f'Speaker {speaker}',
                'start_time': start,
                'end_time': end,
                'text': '',
            }
        current['text'] += word + ' '
        last_speaker = speaker

    segments.append(current)
    return segments

def get_speaker_aware_transcript(sentences_speaker_mapping):
    """Write the speaker-labelled transcript to ``diarization.txt`` in the cache directory.

    Each segment is written as two newlines followed by ``"<speaker>: <text>"``.
    Any existing file is overwritten.
    """
    with open(CACHE_DIRECTORY/'diarization.txt', 'w') as transcript:
        transcript.writelines(
            f"\n\n{segment['speaker']}: {segment['text']}"
            for segment in sentences_speaker_mapping
        )

In [None]:
# Smooth the word-level speaker labels along sentence boundaries.
wsm = get_realigned_ws_mapping_with_punctuation(wsm)
# Group the words into per-speaker segments.
ssm = get_sentences_speaker_mapping(wsm, speaker_ts)
# Write the labelled transcript to diarization.txt in the cache directory.
get_speaker_aware_transcript(ssm)

In [None]:
# Load the transcript back as one lower-cased string with all newlines removed.
diarized = ""
with open(CACHE_DIRECTORY/"diarization.txt", "r") as f:
    diarized = f.read().replace("\n", "").lower()
diarized

In [3]:
# Diarize through the project's Diarizer helper instead of the step-by-step cells.
# NOTE(review): presumably wraps the same pipeline — confirm in DiarizationUtils.
from DiarizationUtils import Diarizer
diarizer = Diarizer()
# Takes the initial transcription result and the audio path; rebinds `diarized`.
diarized = diarizer.diarize(resultInitial, sampleAudioPath)

[NeMo W 2023-01-13 17:43:00 optimizers:55] Apex was not found. Using the lamb or fused_adam optimizer will error out.
[NeMo W 2023-01-13 17:43:00 experimental:27] Module <class 'nemo.collections.asr.models.audio_to_audio_model.AudioToAudioModel'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-01-13 17:43:00 experimental:27] Module <class 'nemo.collections.asr.modules.audio_modules.SpectrogramToMultichannelFeatures'> is experimental, not ready for production and is not fully supported. Use at your own risk.
    
[NeMo W 2023-01-13 17:43:01 experimental:27] Module <class 'nemo.collections.asr.data.audio_to_audio.BaseAudioDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.
[NeMo W 2023-01-13 17:43:01 experimental:27] Module <class 'nemo.collections.asr.data.audio_to_audio.AudioToTargetDataset'> is experimental, not ready for production and is not fully supported. Use at your own ris

[00:01.103 --> 00:06.840]  I can create an end point for you that at the end of the day, just before the person come in, you just update it.
[00:07.564 --> 00:09.919]  For that person to become that signatory.
[00:10.161 --> 00:15.739]  Because this was already proposed by Choraya the last time to be able to update the signatory.
[00:16.869 --> 00:17.556]  This is not a problem.
[00:18.040 --> 00:26.419]  So meaning if that's the case, during the very first time that we call you to create the agreement that time, we only pass you the borrower.
[00:27.102 --> 00:27.796]  Correct.
[00:28.221 --> 00:32.274]  Then after once we know who is who already, then we pass you another secretary.
[00:34.601 --> 00:46.817]  Because what happen is when I talk to the secure PDF, I do not certify, once I certify that part, I do not flatten the debt field.
[00:49.301 --> 00:54.940]  The rest all I flatten, but only the one I don't flatten is those signing fields for the lawyer.
[00:55.963 --> 01:00.960]

splitting manifest: 100%|██████████| 1/1 [00:04<00:00,  4.43s/it]
vad: 100%|██████████| 7/7 [00:04<00:00,  1.62it/s]
creating speech segments: 100%|██████████| 1/1 [00:00<00:00,  3.50it/s]
[1/3] extract embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.64it/s]
[2/3] extract embeddings: 100%|██████████| 4/4 [00:01<00:00,  3.13it/s]
[3/3] extract embeddings: 100%|██████████| 7/7 [00:01<00:00,  4.59it/s]
clustering: 100%|██████████| 1/1 [00:01<00:00,  1.25s/it]


In [7]:
# Show the diarized transcript with escapes visible.
repr(diarized)

'"Speaker 0: i can create an end point for you that at the end of the day, just before the person come in, you just update it. for that person to become that signatory. because this was already proposed by choraya the last time to be able to update the signatory.  Speaker 1: this is not a problem. so meaning if that\'s the case, during the very first time that we call you to create the agreement that time, we only pass you the borrower.  Speaker 0: correct.  Speaker 1: then after once we know who is who already, then we pass you another secretary.  Speaker 0: because what happen is when i talk to the secure pdf, i do not certify, once i certify that part, i do not flatten the debt field. the rest all i flatten, but only the one i don\'t flatten is those signing fields for the lawyer.  Speaker 1: that\'s the only one we are going to do. so that means the hashing that we have at that time will not be the same.  Speaker 0: so once the lawyer come in and put in the details, i think that ha

In [4]:
# Summarize the diarized transcript with the project's summarization helper.
summarizer = SummaryUtils()
# NOTE(review): the length/penalty keywords are interpreted by SummaryUtils.summarize;
# their exact semantics are not visible here — confirm in SummarizationUtils.
summary = summarizer.summarize(diarized, maxLen=400, minLen=100, lengthPenalty=2.0, repetitionPenalty=1.2)

In [5]:
# Print the summary with escapes (e.g. embedded newlines) visible.
print(repr(summary))

"Speaker 0 and Speaker 1 discuss how to update the document before the person comes in to become a signatory. They agree that the hashing of the document will not change once the lawyer comes in and changes the details. The lawyer will use a digital signature on the document. Speaker 1 suggests to use a mobile app for the lawyer's signature, so they don't have to worry about it when the application is finished.   It was already proposed by choraya the last time, so nothing has changed.\nbefore hand, no? ah, yep. in general. "
