In [1]:
import os
import torch
import librosa
import soundfile as sf
import warnings
import numpy
warnings.filterwarnings("ignore", category=FutureWarning, module="whisper")
import faster_whisper
# Run Whisper and store .pkl
from faster_whisper import WhisperModel, BatchedInferencePipeline, decode_audio
import pickle
import numpy as np
from dataclasses import dataclass
from typing import List, Sequence

# Helper Functions

In [2]:
punct_model_langs = [
    "en",
    "fr",
    "de",
    "es",
    "it",
    "nl",
    "pt",
    "bg",
    "pl",
    "cs",
    "sk",
    "sl",
]

LANGUAGES = {
    "en": "english",
    "zh": "chinese",
    "de": "german",
    "es": "spanish",
    "ru": "russian",
    "ko": "korean",
    "fr": "french",
    "ja": "japanese",
    "pt": "portuguese",
    "tr": "turkish",
    "pl": "polish",
    "ca": "catalan",
    "nl": "dutch",
    "ar": "arabic",
    "sv": "swedish",
    "it": "italian",
    "id": "indonesian",
    "hi": "hindi",
    "fi": "finnish",
    "vi": "vietnamese",
    "he": "hebrew",
    "uk": "ukrainian",
    "el": "greek",
    "ms": "malay",
    "cs": "czech",
    "ro": "romanian",
    "da": "danish",
    "hu": "hungarian",
    "ta": "tamil",
    "no": "norwegian",
    "th": "thai",
    "ur": "urdu",
    "hr": "croatian",
    "bg": "bulgarian",
    "lt": "lithuanian",
    "la": "latin",
    "mi": "maori",
    "ml": "malayalam",
    "cy": "welsh",
    "sk": "slovak",
    "te": "telugu",
    "fa": "persian",
    "lv": "latvian",
    "bn": "bengali",
    "sr": "serbian",
    "az": "azerbaijani",
    "sl": "slovenian",
    "kn": "kannada",
    "et": "estonian",
    "mk": "macedonian",
    "br": "breton",
    "eu": "basque",
    "is": "icelandic",
    "hy": "armenian",
    "ne": "nepali",
    "mn": "mongolian",
    "bs": "bosnian",
    "kk": "kazakh",
    "sq": "albanian",
    "sw": "swahili",
    "gl": "galician",
    "mr": "marathi",
    "pa": "punjabi",
    "si": "sinhala",
    "km": "khmer",
    "sn": "shona",
    "yo": "yoruba",
    "so": "somali",
    "af": "afrikaans",
    "oc": "occitan",
    "ka": "georgian",
    "be": "belarusian",
    "tg": "tajik",
    "sd": "sindhi",
    "gu": "gujarati",
    "am": "amharic",
    "yi": "yiddish",
    "lo": "lao",
    "uz": "uzbek",
    "fo": "faroese",
    "ht": "haitian creole",
    "ps": "pashto",
    "tk": "turkmen",
    "nn": "nynorsk",
    "mt": "maltese",
    "sa": "sanskrit",
    "lb": "luxembourgish",
    "my": "myanmar",
    "bo": "tibetan",
    "tl": "tagalog",
    "mg": "malagasy",
    "as": "assamese",
    "tt": "tatar",
    "haw": "hawaiian",
    "ln": "lingala",
    "ha": "hausa",
    "ba": "bashkir",
    "jw": "javanese",
    "su": "sundanese",
    "yue": "cantonese",
}

# language code lookup by name, with a few language aliases
TO_LANGUAGE_CODE = {
    **{language: code for code, language in LANGUAGES.items()},
    "burmese": "my",
    "valencian": "ca",
    "flemish": "nl",
    "haitian": "ht",
    "letzeburgesch": "lb",
    "pushto": "ps",
    "panjabi": "pa",
    "moldavian": "ro",
    "moldovan": "ro",
    "sinhalese": "si",
    "castilian": "es",
}


langs_to_iso = {
    "af": "afr",
    "am": "amh",
    "ar": "ara",
    "as": "asm",
    "az": "aze",
    "ba": "bak",
    "be": "bel",
    "bg": "bul",
    "bn": "ben",
    "bo": "tib",
    "br": "bre",
    "bs": "bos",
    "ca": "cat",
    "cs": "cze",
    "cy": "wel",
    "da": "dan",
    "de": "ger",
    "el": "gre",
    "en": "eng",
    "es": "spa",
    "et": "est",
    "eu": "baq",
    "fa": "per",
    "fi": "fin",
    "fo": "fao",
    "fr": "fre",
    "gl": "glg",
    "gu": "guj",
    "ha": "hau",
    "haw": "haw",
    "he": "heb",
    "hi": "hin",
    "hr": "hrv",
    "ht": "hat",
    "hu": "hun",
    "hy": "arm",
    "id": "ind",
    "is": "ice",
    "it": "ita",
    "ja": "jpn",
    "jw": "jav",
    "ka": "geo",
    "kk": "kaz",
    "km": "khm",
    "kn": "kan",
    "ko": "kor",
    "la": "lat",
    "lb": "ltz",
    "ln": "lin",
    "lo": "lao",
    "lt": "lit",
    "lv": "lav",
    "mg": "mlg",
    "mi": "mao",
    "mk": "mac",
    "ml": "mal",
    "mn": "mon",
    "mr": "mar",
    "ms": "may",
    "mt": "mlt",
    "my": "bur",
    "ne": "nep",
    "nl": "dut",
    "nn": "nno",
    "no": "nor",
    "oc": "oci",
    "pa": "pan",
    "pl": "pol",
    "ps": "pus",
    "pt": "por",
    "ro": "rum",
    "ru": "rus",
    "sa": "san",
    "sd": "snd",
    "si": "sin",
    "sk": "slo",
    "sl": "slv",
    "sn": "sna",
    "so": "som",
    "sq": "alb",
    "sr": "srp",
    "su": "sun",
    "sv": "swe",
    "sw": "swa",
    "ta": "tam",
    "te": "tel",
    "tg": "tgk",
    "th": "tha",
    "tk": "tuk",
    "tl": "tgl",
    "tr": "tur",
    "tt": "tat",
    "uk": "ukr",
    "ur": "urd",
    "uz": "uzb",
    "vi": "vie",
    "yi": "yid",
    "yo": "yor",
    "yue": "yue",
    "zh": "chi",
}


whisper_langs = sorted(LANGUAGES.keys()) + sorted(
    [k.title() for k in TO_LANGUAGE_CODE.keys()]
)


def create_config(output_dir):
    DOMAIN_TYPE = "telephonic"  # Can be meeting, telephonic, or general based on domain type of the audio file
    CONFIG_FILE_NAME = f"diar_infer_{DOMAIN_TYPE}.yaml"
    CONFIG_URL = f"https://raw.githubusercontent.com/NVIDIA/NeMo/main/examples/speaker_tasks/diarization/conf/inference/{CONFIG_FILE_NAME}"
    MODEL_CONFIG = os.path.join(output_dir, CONFIG_FILE_NAME)
    if not os.path.exists(MODEL_CONFIG):
        MODEL_CONFIG = wget.download(CONFIG_URL, output_dir)

    config = OmegaConf.load(MODEL_CONFIG)

    data_dir = os.path.join(output_dir, "data")
    os.makedirs(data_dir, exist_ok=True)

    meta = {
        "audio_filepath": os.path.join(output_dir, "mono_file.wav"),
        "offset": 0,
        "duration": None,
        "label": "infer",
        "text": "-",
        "rttm_filepath": None,
        "uem_filepath": None,
    }
    with open(os.path.join(data_dir, "input_manifest.json"), "w") as fp:
        json.dump(meta, fp)
        fp.write("\n")

    pretrained_vad = "vad_multilingual_marblenet"
    pretrained_speaker_model = "titanet_large"
    config.num_workers = 0  # Workaround for multiprocessing hanging with ipython issue
    config.diarizer.manifest_filepath = os.path.join(data_dir, "input_manifest.json")
    config.diarizer.out_dir = (
        output_dir  # Directory to store intermediate files and prediction outputs
    )

    config.diarizer.speaker_embeddings.model_path = pretrained_speaker_model
    config.diarizer.oracle_vad = (
        False  # compute VAD provided with model_path to vad config
    )
    config.diarizer.clustering.parameters.oracle_num_speakers = False

    # Here, we use our in-house pretrained NeMo VAD model
    config.diarizer.vad.model_path = pretrained_vad
    config.diarizer.vad.parameters.onset = 0.8
    config.diarizer.vad.parameters.offset = 0.6
    config.diarizer.vad.parameters.pad_offset = -0.05
    config.diarizer.msdd_model.model_path = (
        "diar_msdd_telephonic"  # Telephonic speaker diarization model
    )

    return config


def get_word_ts_anchor(s, e, option="start"):
    if option == "end":
        return e
    elif option == "mid":
        return (s + e) / 2
    return s

def get_audio_extraction(input_file, output_file, start_min, end_min):
    audio = AudioSegment.from_file(input_file, format="wav")
    start_min *= 60000
    end_min *= 60000
    ten_minutes = audio[start_min:end_min]  # Pydub uses milliseconds
    ten_minutes.export(output_file, format="wav")


def get_words_speaker_mapping(wrd_ts, spk_ts, word_anchor_option="start"):
    s, e, sp = spk_ts[0]
    wrd_pos, turn_idx = 0, 0
    wrd_spk_mapping = []
    for wrd_dict in wrd_ts:
        ws, we, wrd = (
            int(wrd_dict["start"] * 1000),
            int(wrd_dict["end"] * 1000),
            wrd_dict["text"],
        )
        wrd_pos = get_word_ts_anchor(ws, we, word_anchor_option)
        while wrd_pos > float(e):
            turn_idx += 1
            turn_idx = min(turn_idx, len(spk_ts) - 1)
            s, e, sp = spk_ts[turn_idx]
            if turn_idx == len(spk_ts) - 1:
                e = get_word_ts_anchor(ws, we, option="end")
        wrd_spk_mapping.append(
            {"word": wrd, "start_time": ws, "end_time": we, "speaker": sp}
        )
    return wrd_spk_mapping


sentence_ending_punctuations = ".?!"


def get_first_word_idx_of_sentence(word_idx, word_list, speaker_list, max_words):
    is_word_sentence_end = (
        lambda x: x >= 0 and word_list[x][-1] in sentence_ending_punctuations
    )
    left_idx = word_idx
    while (
        left_idx > 0
        and word_idx - left_idx < max_words
        and speaker_list[left_idx - 1] == speaker_list[left_idx]
        and not is_word_sentence_end(left_idx - 1)
    ):
        left_idx -= 1

    return left_idx if left_idx == 0 or is_word_sentence_end(left_idx - 1) else -1


def get_last_word_idx_of_sentence(word_idx, word_list, max_words):
    is_word_sentence_end = (
        lambda x: x >= 0 and word_list[x][-1] in sentence_ending_punctuations
    )
    right_idx = word_idx
    while (
        right_idx < len(word_list) - 1
        and right_idx - word_idx < max_words
        and not is_word_sentence_end(right_idx)
    ):
        right_idx += 1

    return (
        right_idx
        if right_idx == len(word_list) - 1 or is_word_sentence_end(right_idx)
        else -1
    )


def get_realigned_ws_mapping_with_punctuation(
    word_speaker_mapping, max_words_in_sentence=50
):
    is_word_sentence_end = (
        lambda x: x >= 0
        and word_speaker_mapping[x]["word"][-1] in sentence_ending_punctuations
    )
    wsp_len = len(word_speaker_mapping)

    words_list, speaker_list = [], []
    for k, line_dict in enumerate(word_speaker_mapping):
        word, speaker = line_dict["word"], line_dict["speaker"]
        words_list.append(word)
        speaker_list.append(speaker)

    k = 0
    while k < len(word_speaker_mapping):
        line_dict = word_speaker_mapping[k]
        if (
            k < wsp_len - 1
            and speaker_list[k] != speaker_list[k + 1]
            and not is_word_sentence_end(k)
        ):
            left_idx = get_first_word_idx_of_sentence(
                k, words_list, speaker_list, max_words_in_sentence
            )
            right_idx = (
                get_last_word_idx_of_sentence(
                    k, words_list, max_words_in_sentence - k + left_idx - 1
                )
                if left_idx > -1
                else -1
            )
            if min(left_idx, right_idx) == -1:
                k += 1
                continue

            spk_labels = speaker_list[left_idx : right_idx + 1]
            mod_speaker = max(set(spk_labels), key=spk_labels.count)
            if spk_labels.count(mod_speaker) < len(spk_labels) // 2:
                k += 1
                continue

            speaker_list[left_idx : right_idx + 1] = [mod_speaker] * (
                right_idx - left_idx + 1
            )
            k = right_idx

        k += 1

    k, realigned_list = 0, []
    while k < len(word_speaker_mapping):
        line_dict = word_speaker_mapping[k].copy()
        line_dict["speaker"] = speaker_list[k]
        realigned_list.append(line_dict)
        k += 1

    return realigned_list


def get_sentences_speaker_mapping(word_speaker_mapping, spk_ts):
    sentence_checker = nltk.tokenize.PunktSentenceTokenizer().text_contains_sentbreak
    s, e, spk = spk_ts[0]
    prev_spk = spk

    snts = []
    snt = {"speaker": f"Speaker {spk}", "start_time": s, "end_time": e, "text": ""}

    for wrd_dict in word_speaker_mapping:
        wrd, spk = wrd_dict["word"], wrd_dict["speaker"]
        s, e = wrd_dict["start_time"], wrd_dict["end_time"]
        if spk != prev_spk or sentence_checker(snt["text"] + " " + wrd):
            snts.append(snt)
            snt = {
                "speaker": f"Speaker {spk}",
                "start_time": s,
                "end_time": e,
                "text": "",
            }
        else:
            snt["end_time"] = e
        snt["text"] += wrd + " "
        prev_spk = spk

    snts.append(snt)
    return snts


def get_speaker_aware_transcript(sentences_speaker_mapping, f):
    previous_speaker = sentences_speaker_mapping[0]["speaker"]
    f.write(f"{previous_speaker}: ")

    for sentence_dict in sentences_speaker_mapping:
        speaker = sentence_dict["speaker"]
        sentence = sentence_dict["text"]

        # If this speaker doesn't match the previous one, start a new paragraph
        if speaker != previous_speaker:
            f.write(f"\n\n{speaker}: ")
            previous_speaker = speaker

        # No matter what, write the current sentence
        f.write(sentence + " ")


def format_timestamp(
    milliseconds: float, always_include_hours: bool = False, decimal_marker: str = "."
):
    assert milliseconds >= 0, "non-negative timestamp expected"

    hours = milliseconds // 3_600_000
    milliseconds -= hours * 3_600_000

    minutes = milliseconds // 60_000
    milliseconds -= minutes * 60_000

    seconds = milliseconds // 1_000
    milliseconds -= seconds * 1_000

    hours_marker = f"{hours:02d}:" if always_include_hours or hours > 0 else ""
    return (
        f"{hours_marker}{minutes:02d}:{seconds:02d}{decimal_marker}{milliseconds:03d}"
    )


def write_srt(transcript, file):
    """
    Write a transcript to a file in SRT format.

    """
    for i, segment in enumerate(transcript, start=1):
        # write srt lines
        print(
            f"{i}\n"
            f"{format_timestamp(segment['start_time'], always_include_hours=True, decimal_marker=',')} --> "
            f"{format_timestamp(segment['end_time'], always_include_hours=True, decimal_marker=',')}\n"
            f"{segment['speaker']}: {segment['text'].strip().replace('-->', '->')}\n",
            file=file,
            flush=True,
        )


def find_numeral_symbol_tokens(tokenizer):
    numeral_symbol_tokens = [
        -1,
    ]
    for token, token_id in tokenizer.get_vocab().items():
        has_numeral_symbol = any(c in "0123456789%$£" for c in token)
        if has_numeral_symbol:
            numeral_symbol_tokens.append(token_id)
    return numeral_symbol_tokens


def _get_next_start_timestamp(word_timestamps, current_word_index, final_timestamp):
    # if current word is the last word
    if current_word_index == len(word_timestamps) - 1:
        return word_timestamps[current_word_index]["start"]

    next_word_index = current_word_index + 1
    while current_word_index < len(word_timestamps) - 1:
        if word_timestamps[next_word_index].get("start") is None:
            # if next word doesn't have a start timestamp
            # merge it with the current word and delete it
            word_timestamps[current_word_index]["word"] += (
                " " + word_timestamps[next_word_index]["word"]
            )

            word_timestamps[next_word_index]["word"] = None
            next_word_index += 1
            if next_word_index == len(word_timestamps):
                return final_timestamp

        else:
            return word_timestamps[next_word_index]["start"]


def filter_missing_timestamps(
    word_timestamps, initial_timestamp=0, final_timestamp=None
):
    # handle the first and last word
    if word_timestamps[0].get("start") is None:
        word_timestamps[0]["start"] = (
            initial_timestamp if initial_timestamp is not None else 0
        )
        word_timestamps[0]["end"] = _get_next_start_timestamp(
            word_timestamps, 0, final_timestamp
        )

    result = [
        word_timestamps[0],
    ]

    for i, ws in enumerate(word_timestamps[1:], start=1):
        # if ws doesn't have a start and end
        # use the previous end as start and next start as end
        if ws.get("start") is None and ws.get("word") is not None:
            ws["start"] = word_timestamps[i - 1]["end"]
            ws["end"] = _get_next_start_timestamp(word_timestamps, i, final_timestamp)

        if ws["word"] is not None:
            result.append(ws)
    return result


def cleanup(path: str):
    """path could either be relative or absolute."""
    # check if file or directory exists
    if os.path.isfile(path) or os.path.islink(path):
        # remove file
        os.remove(path)
    elif os.path.isdir(path):
        # remove directory and all its content
        shutil.rmtree(path)
    else:
        raise ValueError("Path {} is not a file or dir.".format(path))


def process_language_arg(language: str, model_name: str):
    """
    Process the language argument to make sure it's valid and convert language names to language codes.
    """
    if language is not None:
        language = language.lower()
    if language not in LANGUAGES:
        if language in TO_LANGUAGE_CODE:
            language = TO_LANGUAGE_CODE[language]
        else:
            raise ValueError(f"Unsupported language: {language}")

    if model_name.endswith(".en") and language != "en":
        if language is not None:
            logging.warning(
                f"{model_name} is an English-only model but received '{language}'; using English instead."
            )
        language = "en"
    return language

# Transcriptions

In [None]:

# num_list = ["1554","1713","1731","1738","1833","1944"]
num_list = ["1554"]

model_name = "large-v3"
device_type = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device_type)

compute_type = "float16"
suppress_numerals = True
whisper_model = faster_whisper.WhisperModel(
    model_name, device=device_type, compute_type=compute_type
)
whisper_pipeline = faster_whisper.BatchedInferencePipeline(whisper_model)

language = None  # autodetect language
batch_size = 8
is_fast = True
config_path = ""
is_configured = True

for num in num_list:
    #audio_path = f'./Dataset/Audio/{num}.wav'
    audio_path = "first_1_minutes.wav"
    vocal_target = audio_path

    audio_waveform = faster_whisper.decode_audio(vocal_target)
    # if sample_rate != 16000:
    #     print(f"Resampling from {sample_rate} Hz to 16000 Hz...")
    #     speech, sample_rate = librosa.load(audio_file, sr=None)
    #     speech = librosa.resample(speech, orig_sr=sample_rate, target_sr=16000)
    #     sample_rate = 16000
    #     sf.write(audio_file, speech, sample_rate)

    if is_configured:
        transcript_segments, info = whisper_pipeline.transcribe(
            audio_waveform,
            language=language,
            beam_size=1,
            temperature=0.0,
            without_timestamps=False,
            vad_filter=True,
        )
    else:
        transcript_segments, info = whisper_pipeline.transcribe(
            audio_waveform,
            language=language,
            without_timestamps=False,
            vad_filter=True,
        )
    result = []
    for seg in transcript_segments:
        result.append({"text": seg.text,"start":seg.start, "end": seg.end})
        print(str(seg.start) + "-" + str(seg.end) + ": "+ seg.text)
        

        
    # full_transcript = "".join(segment.text for segment in transcript_segments)
    # file_path = f"./output/{path_name}{config_path}_{num}.txt"
    # output_path = f"./output/{path_name}{config_path}_{num}_L.txt"
    # with open(file_path, "w", encoding="utf-8") as file:
    #     file.write(full_transcript.lower())
    # utils.process_text(file_path,output_path)

del whisper_model, whisper_pipeline
torch.cuda.empty_cache()

Using device: cuda
0.336-3.036:  So I'm just going to be asking you to do some talking.
3.516-3.856:  Okay.
4.656-7.256:  So how do you think your speech is these days?
7.256-16.136:  It's good, but it will be better.
35.344-43.344:  I have trouble with uh, and and the and all that.
43.344-44.344:  Mm-hmm.
44.344-45.344:  Mm-hmm.
45.344-46.344:  Mm-hmm.
46.344-49.344:  Do you remember when you had your stroke?
49.344-50.344:  Yeah.
50.344-51.344:  Mm-hmm.
51.344-52.344:  Um, could you tell me about it?
52.344-53.344:  Oh gosh.
53.344-59.344:  Well, I got um, I got up to check the


# Wav2Vec

In [None]:
from transformers import pipeline
#https://github.com/huggingface/transformers/blob/main/examples/research_projects/wav2vec2/FINE_TUNE_XLSR_WAV2VEC2.md
#https://github.com/crazycloud/mispronunciation-detection-diagnosis-wav2vec2-and-llm/blob/main/notebooks/mispronunciation-detection.ipynb
#https://github.com/NeuralVox/OpenPhonemizer
pipe = pipeline(model="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme")
transcription = pipe("first_1_minutes.wav")["text"]
print(transcription)

Device set to use cuda:0


soʊ aɪ d͡ʒʌs ɡˌɪnɪ bi æskɪŋ ju tʌ tisʌn hɔkioʊkʌsow hæv ju θɪŋk jɚ spit͡ʃɪz ðiz deɪzɪts ɡʊd beɪ ʌm  mubi bɛtɚ ʌ aɪ kʌn  lɪtʌl aɪ kʌn bɹid ʌ lɪtʌl b  ʌ aɪ hæv tɹʌbʌl wɪd  n ʌn tnhoʊæd du ju ɹɪmɛmɚ wɛn iju haɪdɚistɹˌʌkjʌ ʌnd kʊd ju tɛl m ju bɑt ɪt oʊɡɑʃʌwɛl aɪ ɡɑt ɛm aɪ ɡɑt ʌp tɪ t͡ʃɛk dʌ


In [7]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
import torch
import torchaudio

# Load model and processor
model_name = "vitouphy/wav2vec2-xls-r-300m-phoneme"
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForCTC.from_pretrained(model_name)


In [2]:
from transformers import pipeline

# Load the model
# pipe = pipeline(model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
pipe = pipeline(model="vitouphy/wav2vec2-xls-r-300m-phoneme")

# Process raw audio
output = pipe("first_1_minutes.wav", chunk_length_s=10, stride_length_s=(4, 2))

# Print the transcription
print(output)


Device set to use cuda:0


{'text': 'h#sowaymjhahs gahntahbiyaes kihngyuwtihduwsahm taakihngowkeyh#sowhhawdyuwthihngk yerr s piychihzih dhiyz deyzh#iht s guhdh#bayh#ahmh#ahwihl biybehterahmh#aykahnh#ahh#lihtahlh#hhaykahn riydahlihtahl bahth#ahmh#dahh#ayhhaevh#chahbahl wihthahihnahn dternaanh#hhaabayh#ihaaiyayh#duwyuwrihmehm berwehn yuwhhaed yers t rahngkh#yawhhh#ahndkuhd yuwtehl miyahbawtihtowgaashh#hhahwehlaygaaahmh#aygaatahp h#tuwchehkdher'}


# Fine-Tuned Model

In [None]:


model_name   = "/home/arsalan77x/Whisper/chatwhisper-en-ct2"
device_type  = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16"
print("Using device:", device_type)
whisper_model  = WhisperModel(
    model_name,
    device=device_type,
    compute_type=compute_type,
)
whisper_pipeline = BatchedInferencePipeline(whisper_model)
language   = "en"          
batch_size = 8
num_list = ["1554", "1713","1731","1738","1833","1944"]


for num in num_list:
    folder_path = f'Dataset/{num}_chunks'
    wav_count = sum(1 for file in os.listdir(folder_path) if file.endswith('.wav'))
    for chunk_num in range(1, wav_count+1):
        print(chunk_num)
        filename = f"chunk_{chunk_num:03}.wav"
        audio_path = os.path.join(f"Dataset/{num}_chunks", filename)
        audio_waveform = decode_audio(audio_path)

        segments, info = whisper_pipeline.transcribe(
            audio_waveform,
            language=language,
            beam_size=1,
            word_timestamps=True,
            temperature=0.0,
            # without_timestamps=False,
            # vad_filter=True,
        )

        result = ""
        path = f"Dataset/{num}_whisper"
        if not os.path.exists(path):
            os.makedirs(path)
        out_path = os.path.join(path, f'{chunk_num:03}.pkl')
        is_speech = False
        for seg in segments:
            is_speech = True
            with open(out_path, 'wb') as f:
                pickle.dump(seg.words, f)
        if not is_speech:
            with open(out_path, 'wb') as f:
                pickle.dump([], f)


del whisper_model, whisper_pipeline
torch.cuda.empty_cache() 

Using device: cuda
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236


In [9]:


# --- data model ---------------------------------------------------------------
@dataclass
class Word:
    start: float
    end: float
    word: str
    probability: float  # 0‒1 range


# --- robust word-level anomaly detector ---------------------------------------
def robust_logprob_threshold(logps: Sequence[float], k: float = 2.5) -> float:
    """
    Compute a *lower* threshold on log-probs using Median ± k·MAD.
    Words whose log-p < threshold are flagged as anomalies.

    Parameters
    ----------
    logps : 1-D iterable of per-word ln(probability)
    k     : how many "robust σ" below the median to cut off. 2.5‒3 ≈ 99 % conf.
    """
    logps = np.asarray(logps)
    med = np.median(logps)
    mad = np.median(np.abs(logps - med)) * 1.4826  # convert MAD→σ
    return med - k * mad


def flag_anomalous_words(segment: List[Word], k: float = 2.5) -> List[Word]:
    """
    Return the list of Word objects whose confidence is anomalously low
    relative to the rest of *this* segment.
    """
    logps = [np.log(max(w.probability, 1e-12)) for w in segment]
    thr = robust_logprob_threshold(logps, k)
    return [w for w, lp in zip(segment, logps) if lp < thr]


# --- demo on the user's segment ----------------------------------------------
user_segment = [
    Word(0.33, 0.79, ' So', 0.48828125),
    Word(0.79, 0.95, " I'm", 0.9443359375),
    Word(0.95, 1.09, ' just', 0.91650390625),
    Word(1.09, 1.23, ' gonna', 0.6044921875),
    Word(1.23, 1.43, ' be', 0.90673828125),
    Word(1.43, 1.77, ' asking', 0.91259765625),
    Word(1.77, 2.05, ' you', 0.92724609375),
    Word(2.05, 2.27, ' to', 0.819580078125),
    Word(2.27, 2.49, ' do', 0.89208984375),
    Word(2.49, 2.67, ' some', 0.849609375),
    Word(2.67, 3.07, ' talking.', 0.91650390625),
    Word(3.53, 3.85, ' Okay.', 0.68798828125),
    Word(4.47, 5.11, ' So', 0.8515625),
    Word(5.11, 5.59, ' how', 0.71875),
    Word(5.59, 5.71, ' do', 0.9091796875),
    Word(5.71, 5.81, ' you', 0.93310546875),
    Word(5.81, 6.03, ' think', 0.85205078125),
    Word(6.03, 6.19, ' your', 0.8583984375),
    Word(6.19, 6.51, ' speech', 0.76123046875),
    Word(6.51, 6.71, ' is', 0.95068359375),
    Word(6.71, 6.99, ' these', 0.74609375),
    Word(6.99, 7.25, ' days?', 0.8681640625),
    Word(7.83, 8.19, ' Mm,', 0.1995849609375),
    Word(8.63, 9.13, " it's", 0.9716796875),
    Word(9.13, 9.45, ' good,', 0.9404296875),
    Word(9.73, 10.13, ' but', 0.9423828125),
    Word(10.13, 12.41, ' um...', 0.4840087890625),
    Word(12.41, 13.99, ' uh...', 0.5841064453125)
]
num_list = ['1554']
for num in num_list:
    folder_path = f'Dataset/{num}_whisper'
    wav_count = sum(1 for file in os.listdir(folder_path) if file.endswith('.pkl'))
    word_list = []
    for chunk_num in range(1, wav_count+1):
        with open(f'Dataset/{num}_whisper/{chunk_num:03}.pkl', 'rb') as f:
            words = pickle.load(f)
            for w in words:
                word_list.append([w.word,w.probability])

print(len(word_list))
print(word_list)
#anomalies = flag_anomalous_words(user_segment, k=2.5)
#[(w.word.strip(), w.probability) for w in anomalies]


6952


In [None]:
import os
import pickle
# Write transcript .txt files
for num in num_list:
    whole_text = ""
    folder_path = f'Dataset/{num}_whisper'
    wav_count = sum(1 for file in os.listdir(folder_path) if file.endswith('.pkl'))
    for chunk_num in range(1, wav_count+1):
        with open(f'Dataset/{num}_whisper/{chunk_num:03}.pkl', 'rb') as f:
            words = pickle.load(f)
            print(words)
            text = ""
            for w in words:
                text += w.word
            with open(f'Dataset/{num}_whisper/{chunk_num:03}.txt', 'w') as out_file:
                out_file.write(text.strip())
            whole_text += text + "\n"
    with open(f'Dataset/whisper_{num}.txt', 'w') as out_file:
        out_file.write(whole_text.strip())

[Word(start=np.float64(1.9799999999999978), end=np.float64(2.679999999999999), word=' okay', probability=np.float64(0.470947265625)), Word(start=np.float64(2.679999999999999), end=np.float64(3.38), word=' so', probability=np.float64(0.7646484375)), Word(start=np.float64(3.38), end=np.float64(4.02), word=' first', probability=np.float64(0.8779296875)), Word(start=np.float64(4.02), end=np.float64(4.54), word=" I'm", probability=np.float64(0.8076171875)), Word(start=np.float64(4.54), end=np.float64(4.72), word=' just', probability=np.float64(0.93212890625)), Word(start=np.float64(4.72), end=np.float64(4.88), word=' gonna', probability=np.float64(0.7021484375)), Word(start=np.float64(4.88), end=np.float64(5.08), word=' be', probability=np.float64(0.9013671875)), Word(start=np.float64(5.08), end=np.float64(5.52), word=' asking', probability=np.float64(0.91259765625)), Word(start=np.float64(5.52), end=np.float64(5.8), word=' you', probability=np.float64(0.92529296875)), Word(start=np.float64

# Gemini Align

In [None]:
from google import genai
gen_client = genai.Client(api_key="AIzaSyBI5a5BNcHB3EwLfmRq3guzlrFMy1GYRPM")  


In [15]:
from transformers import pipeline

# Load the model
# pipe = pipeline(model="vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
pipe = pipeline(model="vitouphy/wav2vec2-xls-r-300m-phoneme")

# Process raw audio
output = pipe("1.mp3", chunk_length_s=10, stride_length_s=(4, 2))

# Print the transcription
print(output)


Device set to use cuda:0


{'text': 'h#kuhd yuwtehl miyahbawtihth#aaihzwliyihndihsehm berh#aywahzh#h#ahlownh#ahn d mays maalhhawzh#ahh#biyhhayn d mayahh#f rehn dwah mayh#ahh#daazh#aen dahh#'}


In [16]:
output = output['text'].replace(" ", "").replace("h#", " \n")
output

' \nkuhdyuwtehlmiyahbawtiht \naaihzwliyihndihsehmber \naywahz \n \nahlown \nahndmaysmaalhhawz \nah \nbiyhhayndmayah \nfrehndwahmay \nah \ndaaz \naendah \n'

In [21]:
from google import genai
from pydantic import BaseModel
import re
from google.genai import types
gen_client = genai.Client(api_key="AIzaSyBI5a5BNcHB3EwLfmRq3guzlrFMy1GYRPM")
cfg = types.GenerateContentConfig(
    automatic_function_calling=types.AutomaticFunctionCallingConfig(
        disable=True           
    )
)
myfile = gen_client.files.upload(file='1.mp3')
prompt = "Given the audio file and phoneme transcription below, try to separate words by adding space between characters. note that the trancription might not be accurate but do not change the characters and just add spaces." \
 " \n ' \nkuhdyuwtehlmiyahbawtiht \naaihzwliyihndihsehmber \naywahz \n \nahlown \nahndmaysmaalhhawz \nah \nbiyhhayndmayah \nfrehndwahmay \nah \ndaaz \naendah \n'"

response = gen_client.models.generate_content(
  model="gemini-2.5-flash-preview-05-20",
  contents=[prompt, myfile]
)

print(response.text)

Here's the separation of words based on the provided phonemes and the audio:

`kuhd yuw tehl miy ah bawtiht`
`aaih zw liy ihn dihsehmber`
`ay wahz`

`ahlown`
`ahnd may smaalh hawz`
`ah`
`biyhhaynd may ah`
`frehnd wah may`
`ah`
`daaz`
`aend ah`


# Gemini Transcribe

In [34]:
import os
from pydub import AudioSegment
num_list = ["1731","1738","1833","1944"]
for num in num_list:
    # Path to the directory containing .wav files
    input_dir = f'Dataset/{num}_chunks'

    # Path to the directory where .mp3 files will be saved
    output_dir = f'{num}_mp3'

    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Loop over files in input directory
    for filename in os.listdir(input_dir):
        if filename.lower().endswith('.wav'):
            wav_path = os.path.join(input_dir, filename)
            mp3_filename = os.path.splitext(filename)[0] + '.mp3'
            mp3_path = os.path.join(output_dir, mp3_filename)
            
            # Load WAV and export as MP3
            audio = AudioSegment.from_wav(wav_path)
            audio.export(mp3_path, format='mp3')
            
            print(f'Converted: {wav_path} -> {mp3_path}')


Converted: Dataset/1731_chunks/chunk_104.wav -> 1731_mp3/chunk_104.mp3
Converted: Dataset/1731_chunks/chunk_248.wav -> 1731_mp3/chunk_248.mp3
Converted: Dataset/1731_chunks/chunk_300.wav -> 1731_mp3/chunk_300.mp3
Converted: Dataset/1731_chunks/chunk_230.wav -> 1731_mp3/chunk_230.mp3
Converted: Dataset/1731_chunks/chunk_147.wav -> 1731_mp3/chunk_147.mp3
Converted: Dataset/1731_chunks/chunk_142.wav -> 1731_mp3/chunk_142.mp3
Converted: Dataset/1731_chunks/chunk_135.wav -> 1731_mp3/chunk_135.mp3
Converted: Dataset/1731_chunks/chunk_140.wav -> 1731_mp3/chunk_140.mp3
Converted: Dataset/1731_chunks/chunk_179.wav -> 1731_mp3/chunk_179.mp3
Converted: Dataset/1731_chunks/chunk_033.wav -> 1731_mp3/chunk_033.mp3
Converted: Dataset/1731_chunks/chunk_107.wav -> 1731_mp3/chunk_107.mp3
Converted: Dataset/1731_chunks/chunk_088.wav -> 1731_mp3/chunk_088.mp3
Converted: Dataset/1731_chunks/chunk_061.wav -> 1731_mp3/chunk_061.mp3
Converted: Dataset/1731_chunks/chunk_228.wav -> 1731_mp3/chunk_228.mp3
Conver

In [30]:
from google import genai
from pydantic import BaseModel
import re
from google.genai import types
gen_client = genai.Client(api_key="AIzaSyBI5a5BNcHB3EwLfmRq3guzlrFMy1GYRPM")
cfg = types.GenerateContentConfig(
    automatic_function_calling=types.AutomaticFunctionCallingConfig(
        disable=True           
    )
)

In [None]:
import os
import time

num_list = ["1713"]

for num in num_list:
    folder_path = f'Dataset/{num}_mp3'
    output_path = f'Dataset/{num}_gemini'
    os.makedirs(output_path, exist_ok=True)  # create output directory if not exists

    mp3_count = sum(1 for file in os.listdir(folder_path) if file.endswith('.mp3'))
    
    for chunk_num in range(1, mp3_count + 1):
        filename = f"chunk_{chunk_num:03}.mp3"
        audio_path = os.path.join(folder_path, filename)

        myfile = gen_client.files.upload(file=audio_path)
        prompt = "Transcribe the given Speech."

        response = gen_client.models.generate_content(
            model="gemini-2.5-flash-preview-05-20",
            contents=[prompt, myfile]
        )

        print(response.text)
        time.sleep(4)

        # Save response text as a .txt file
        txt_filename = f"chunk_{chunk_num:03}.txt"
        txt_path = os.path.join(output_path, txt_filename)
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(response.text)


ClientError: 400 FAILED_PRECONDITION. {'error': {'code': 400, 'message': 'User location is not supported for the API use.', 'status': 'FAILED_PRECONDITION'}}

In [2]:
import os

def combine_txt_files(input_folder, output_file):
    txt_files = sorted([f for f in os.listdir(input_folder) if f.endswith('.txt')])
    
    with open(output_file, 'w', encoding='utf-8') as outfile:
        for filename in txt_files:
            file_path = os.path.join(input_folder, filename)
            with open(file_path, 'r', encoding='utf-8') as infile:
                content = infile.read()
                outfile.write(content + "\n")  # Add a newline between files (optional)

# Example usage
input_dir = 'Dataset/1833_gemini'
output_txt = 'Dataset/gemini_1833.txt'
combine_txt_files(input_dir, output_txt)
