In [1]:
# Cell 1: Imports and nest_asyncio setup
#
# Brings every module used by the later cells into scope and patches asyncio so
# that coroutines can be awaited from inside the notebook's already-running loop.

import os
import asyncio
import torch

# --- Project configuration ---
# Cell 2 reads the storage directories, sample rate, recording duration and
# TTS flag from this module.
import config

# --- Shared AudioManager and ModelLoaderManager instances ---
# NOTE(review): the saved output of this cell shows
# "ModelLoaderManager initialized. Detected device: CUDA", so these imports
# appear to run initialization code at import time — keep their order unchanged.
from AudioManager import audio_manager
from ModelLoaderManager import model_loader_manager
# Menu class that Cell 3 instantiates to run an interactive session
from CommandLineMenu import CommandLineMenu

# nest_asyncio patches asyncio to allow re-entering an event loop that is
# already running, which is the situation inside a Jupyter kernel.
import nest_asyncio
nest_asyncio.apply()

print("All necessary modules imported and nest_asyncio applied.")

ModelLoaderManager initialized. Detected device: CUDA
All necessary modules imported and nest_asyncio applied.


In [2]:
# Cell 2: Model and Audio Manager Initialization (loads once per kernel; safe to re-run)
#
# Creates the storage directories, loads the NeMo models through
# model_loader_manager, and hands them to audio_manager.
#
# Loading takes over a minute (see the timestamps in the saved log), so a
# notebook-level flag turns a repeated run of this cell into a no-op instead of
# reloading every model. The flag is only set after every step succeeded, so a
# failed run can simply be retried. To force a reload, execute
# `MODELS_INITIALIZED = False` and run this cell again.

if globals().get("MODELS_INITIALIZED", False):
    print("Models and AudioManager are already initialized in this kernel; skipping reload.")
else:
    print("Starting model and audio manager initialization...")

    # Ensure necessary directories exist (exist_ok=True makes this a no-op when they already do)
    os.makedirs(config.AUDIO_RECORDINGS_DIR, exist_ok=True)
    os.makedirs(config.EMBEDDING_OUTPUT_DIR, exist_ok=True)
    print(f"Created directories: {config.AUDIO_RECORDINGS_DIR}, {config.EMBEDDING_OUTPUT_DIR}")

    # --- STEP 1: Load all NeMo models through ModelLoaderManager ---
    print("Initializing all NeMo models via ModelLoaderManager...")
    model_loader_manager.initialize_all_models()
    print("NeMo models initialized successfully.")

    # --- STEP 2: Give AudioManager the models loaded in step 1 ---
    # NOTE(review): the saved output shows the two AudioManager messages below
    # twice, which suggests audio_manager.initialize() prints them as well —
    # consider dropping one side.
    print("Initializing AudioManager...")
    audio_manager.initialize(
        asr_model=model_loader_manager.asr_model,
        tts_model=model_loader_manager.tts_model,
        vocoder_model=model_loader_manager.vocoder_model,
        # NOTE(review): the TTS rate is passed as AudioManager's only sample
        # rate, while the ASR model's logged config reports 16000 Hz — confirm
        # AudioManager resamples recordings before transcription.
        sample_rate=config.TTS_SAMPLE_RATE,
        recording_duration_seconds=config.RECORDING_DURATION_SECONDS,
        audio_records_dir=config.AUDIO_RECORDINGS_DIR,
        use_tts_for_answers_flag=config.USE_TTS_FOR_ANSWERS
    )
    print("AudioManager initialized successfully.")

    # Reached only when every step above completed without raising.
    MODELS_INITIALIZED = True

print("Models are now loaded and ready in memory.")

Starting model and audio manager initialization...
Created directories: /home/olexandro/NLP/vocal-chatbot-grag-quote-assistant/storage/audio/audio_recordings, /home/olexandro/NLP/vocal-chatbot-grag-quote-assistant/storage/audio/speaker_embeddings
Initializing all NeMo models via ModelLoaderManager...

--- Starting Model Loading Process ---
Attempting to load pre-trained Speaker Verification model (TitaNet-Large)...
[NeMo I 2025-07-05 09:48:38 cloud:58] Found existing object /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2025-07-05 09:48:38 cloud:64] Re-using file from: /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2025-07-05 09:48:38 common:815] Instantiating model from pre-trained checkpoint


[NeMo W 2025-07-05 09:48:39 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2025-07-05 09:48:39 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2025-07-05 09:48:39 features:305] PADDING: 16
[NeMo I 2025-07-05 09:48:40 save_restore_connector:263] Model EncDecSpeakerLabelModel was successfully restored from /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
Successfully loaded pre-trained TitaNet-Large model.

Attempting to load ASR model (FastConformer-Large)...
[NeMo I 2025-07-05 09:48:41 mixins:172] Tokenizer SentencePieceTokenizer initialized with 1024 tokens


[NeMo W 2025-07-05 09:48:42 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 1
    shuffle: true
    num_workers: 8
    pin_memory: true
    use_start_end_token: false
    trim_silence: false
    max_duration: 20
    min_duration: 0.1
    is_tarred: false
    tarred_audio_filepaths: null
    shuffle_n: 2048
    bucketing_strategy: fully_randomized
    bucketing_batch_size: null
    
[NeMo W 2025-07-05 09:48:42 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    manifest_filepath: null
    sample_rate: 16000
    batch_size: 32
    shuffle: false
    num_workers: 8
    pin_m

[NeMo I 2025-07-05 09:48:42 features:305] PADDING: 0


    


[NeMo I 2025-07-05 09:48:43 rnnt_models:224] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}
[NeMo I 2025-07-05 09:48:43 rnnt_models:224] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-07-05 09:48:43 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2025-07-05 09:48:43 rnnt_models:224] Using RNNT Loss : warprnnt_numba
    Loss warprnnt_numba_kwargs: {'fastemit_lambda': 0.0, 'clamp': -1.0}


[NeMo W 2025-07-05 09:48:43 rnnt_loop_labels_computer:270] No conditional node support for Cuda.
    Cuda graphs with while loops are disabled, decoding speed will be slower
    Reason: No `cuda-python` module. Please do `pip install cuda-python>=12.3`


[NeMo I 2025-07-05 09:48:44 save_restore_connector:263] Model EncDecRNNTBPEModel was successfully restored from /home/olexandro/.cache/huggingface/hub/models--nvidia--stt_en_fastconformer_transducer_large/snapshots/24a0e16aa9ebfb00b61300514c89cd84a4950021/stt_en_fastconformer_transducer_large.nemo.
Successfully loaded ASR model from HuggingFace.

Attempting to load FastPitch TTS model...
[NeMo I 2025-07-05 09:48:44 cloud:58] Found existing object /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
[NeMo I 2025-07-05 09:48:44 cloud:64] Re-using file from: /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo
[NeMo I 2025-07-05 09:48:44 common:815] Instantiating model from pre-trained checkpoint


 NeMo-text-processing :: INFO     :: Creating ClassifyFst grammars.
[NeMo W 2025-07-05 09:49:06 en_us_arpabet:66] apply_to_oov_word=None, This means that some of words will remain unchanged if they are not handled by any of the rules in self.parse_one_word(). This may be intended if phonemes and chars are both valid inputs, otherwise, you may see unexpected deletions in your input.
[NeMo W 2025-07-05 09:49:06 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.torch.data.TTSDataset
      manifest_filepath: /ws/LJSpeech/nvidia_ljspeech_train_clean_ngc.json
      sample_rate: 22050
      sup_data_path: /raid/LJSpeech/supplementary
      sup_data_types:
      - align_prior_matrix
      - pitch
      n_fft: 1024
      win_length: 1024
      hop_length: 256
      window: hann
      n_mels: 80
 

[NeMo I 2025-07-05 09:49:06 features:305] PADDING: 1
[NeMo I 2025-07-05 09:49:06 save_restore_connector:263] Model FastPitchModel was successfully restored from /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_en_fastpitch_align/b7d086a07b5126c12d5077d9a641a38c/tts_en_fastpitch_align.nemo.
Successfully loaded FastPitch TTS model.

Attempting to load HiFi-GAN vocoder model...
[NeMo I 2025-07-05 09:49:08 cloud:58] Found existing object /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
[NeMo I 2025-07-05 09:49:08 cloud:64] Re-using file from: /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo
[NeMo I 2025-07-05 09:49:08 common:815] Instantiating model from pre-trained checkpoint


[NeMo W 2025-07-05 09:49:23 modelPT:176] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/train_finetune.txt
      min_duration: 0.75
      n_segments: 8192
    dataloader_params:
      drop_last: false
      shuffle: true
      batch_size: 64
      num_workers: 4
    
[NeMo W 2025-07-05 09:49:23 modelPT:183] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method and provide a valid configuration file to setup the validation data loader(s). 
    Validation config : 
    dataset:
      _target_: nemo.collections.tts.data.datalayers.MelAudioDataset
      manifest_filepath: /home/fkreuk/data/val_finetune.txt
      min_duration: 3
      n_segments: 66150


[NeMo I 2025-07-05 09:49:24 features:305] PADDING: 0


[NeMo W 2025-07-05 09:49:24 features:282] Using torch_stft is deprecated and has been removed. The values have been forcibly set to False for FilterbankFeatures and AudioToMelSpectrogramPreprocessor. Please set exact_pad to True as needed.


[NeMo I 2025-07-05 09:49:24 features:305] PADDING: 0


    


[NeMo I 2025-07-05 09:49:53 save_restore_connector:263] Model HifiGanModel was successfully restored from /home/olexandro/.cache/torch/NeMo/NeMo_2.0.0rc0/tts_hifigan/e6da322f0f7e7dcf3f1900a9229a7e69/tts_hifigan.nemo.
Successfully loaded HiFi-GAN vocoder model.
--- All Models Loading Process Completed ---
NeMo models initialized successfully.
Initializing AudioManager...
Initializing AudioManager...
AudioManager initialized successfully.
AudioManager initialized successfully.
Models are now loaded and ready in memory.


In [None]:
# Cell 3: Start Interactive Menu (Re-run for new chatbot sessions)

print("\n--- Starting new interactive menu session ---")

# Create an instance of CommandLineMenu and start it
# The CommandLineMenu now encapsulates the interactive loop
# and the ConversationalNeo4jChatbot handles its own LLM calls.
# It will use the pre-loaded models from audio_manager.
menu = CommandLineMenu()
await menu.start_interactive_menu()

print("\n--- Interactive menu session ended. ---")
# You can now run this cell again to start a new session without reloading models.


--- Starting new interactive menu session ---
Ollama client initialized with default model 'deepseek-r1:1.5b'.
Using internal _default_ollama_call for LLM interactions.
Welcome to the Vocal Chatbot!
(Speaking response...)
NOTICE: Text to speech for computer answers is currently **enabled**.

VOCAL CHATBOT SYSTEM MENU
1. Register a new speaker
2. Authenticate User (Voice Identification)
3. Start Voice Chat (Authenticated)
4. Conversational Neo4j Query (Fully Vocal)
5. Transcribe an audio file (ASR only)
6. Convert an existing audio file to embedding
7. Synthesize speech from text (TTS only)
8. List all registered speaker embeddings
9. Delete a registered speaker embedding
10. Clear ALL registered speaker embeddings
11. Toggle TTS for computer answers (Currently: ENABLED)
0. Exit
Please speak your choice from the menu.
(Speaking response...)

--- Starting Audio Recording Setup ---
Please speak your menu choice (e.g., 'one', 'two', 'exit') for 3 seconds.
Using input device: default (ID: 

Transcribing: 100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


The Transcription is: "four"
(Speaking response...)
The Transcription is: "four"
Cleaned up temporary menu audio: /home/olexandro/NLP/vocal-chatbot-grag-quote-assistant/storage/audio/audio_recordings/menu_choice_temp.wav
User's spoken choice (transcribed): four -> Processed Choice: 4
What's your initial question for the knowledge graph?
(Speaking response...)
You can say 'root' at any time to go back to the main menu.
(Speaking response...)

--- Starting Audio Recording Setup ---
Speak your initial question (or 'root') for 3 seconds.
Using input device: default (ID: 1)

Recording started for 3 seconds...
Recording finished.
Temporary audio saved for processing: /home/olexandro/NLP/vocal-chatbot-grag-quote-assistant/storage/audio/audio_recordings/temp_vocal_input.wav

--- Starting ASR Transcription for 'temp_vocal_input.wav' ---


Transcribing: 100%|██████████| 1/1 [00:00<00:00,  4.96it/s]


The Transcription is: "peer dismissed the last stage of capitalism"
(Speaking response...)
Cleaned up temporary audio: /home/olexandro/NLP/vocal-chatbot-grag-quote-assistant/storage/audio/audio_recordings/temp_vocal_input.wav
Processing your question...
(Speaking response...)
Successfully connected to Neo4j.
Neo4j driver closed.

--- Chatbot: Error: Ollama response structure was unexpected or content was empty.
(Speaking response...)
Would you like to know more about it?
(Speaking response...)
