In [1]:
import torch
from transformers import pipeline



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Path to one sample WAV recording.
audio = 'D:/IIIT-B-ML/data_converted/SandalWoodNewsStories_23.wav'

# Use the first CUDA GPU when PyTorch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

In [3]:
# Speech-recognition pipeline built on the `vasista22/whisper-kannada-tiny`
# checkpoint, configured with 30-second chunks on the device selected above.
transcribe = pipeline(
    task="automatic-speech-recognition",
    model="vasista22/whisper-kannada-tiny",
    chunk_length_s=30,
    device=device,
)

# Pin the decoder prompt to language "kn" and the "transcribe" task.
transcribe.model.config.forced_decoder_ids = transcribe.tokenizer.get_decoder_prompt_ids(
    language="kn",
    task="transcribe",
)




In [4]:
import torch
import warnings
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
from pathlib import Path

def load_audio(file_path, target_sr=16000):
    """
    Read an audio file with librosa and return its normalized samples.

    The file is loaded at ``target_sr`` and passed through
    ``librosa.util.normalize``. Any failure is printed and reported to the
    caller as ``None`` instead of being raised.
    """
    try:
        print(f"Loading audio from {file_path}")
        # Passing sr= makes librosa return the samples at the requested rate.
        samples, sample_rate = librosa.load(file_path, sr=target_sr)
        print(f"Audio loaded - Shape: {samples.shape}, Sample rate: {sample_rate}")
        return librosa.util.normalize(samples)
    except Exception as e:
        print(f"Error loading audio: {e}")
        return None

def setup_model():
    """
    Load the Whisper model and processor and configure them for Kannada.

    Returns a ``(model, processor)`` pair, or ``(None, None)`` after printing
    the error when loading or configuring fails.
    """
    checkpoint = "vasista22/whisper-kannada-tiny"
    try:
        processor = WhisperProcessor.from_pretrained(checkpoint)
        model = WhisperForConditionalGeneration.from_pretrained(checkpoint)

        # Force the decoder prompt to language "kn" / task "transcribe".
        prompt_ids = processor.get_decoder_prompt_ids(language="kn", task="transcribe")
        model.config.forced_decoder_ids = prompt_ids

        # Give the config a suppress_tokens attribute only if it lacks one.
        config_has_suppress = hasattr(model.config, 'suppress_tokens')
        if not config_has_suppress:
            model.config.suppress_tokens = []
    except Exception as e:
        print(f"Error setting up model: {e}")
        return None, None
    return model, processor

def transcribe_chunk(chunk, model, processor):
    """
    Turn one chunk of audio samples into text.

    The chunk is converted to input features by ``processor`` (declared as
    16 kHz audio), decoded greedily by ``model.generate`` and converted back to
    a stripped string. On any error the message is printed and ``""`` returned.
    """
    # Greedy decoding; suppress_tokens=None explicitly disables token suppression.
    generation_options = dict(
        do_sample=False,
        num_beams=1,
        max_length=448,
        min_length=1,
        length_penalty=1.0,
        temperature=0.0,
        use_cache=True,
        suppress_tokens=None,
    )
    try:
        features = processor(chunk, sampling_rate=16000, return_tensors="pt").input_features

        # Inference only, so gradient tracking is switched off.
        with torch.no_grad():
            token_ids = model.generate(features, **generation_options)

        decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
        return decoded[0].strip()
    except Exception as e:
        print(f"Error transcribing chunk: {e}")
        return ""

def transcribe_audio(file_path, chunk_duration=30):
    """
    Transcribe a whole audio file by cutting it into fixed-length chunks.

    Args:
        file_path: Path of the audio file to transcribe.
        chunk_duration: Chunk length in seconds (int or float). Values above
            30 are clamped to 30: the Whisper feature extractor keeps at most
            30 seconds per input, so a longer chunk would silently lose its
            tail.

    Returns:
        The non-empty chunk transcriptions joined by single spaces, or ``None``
        (after printing the reason) when the model or audio cannot be loaded,
        ``chunk_duration`` is not positive, or any other error occurs.
    """
    sample_rate = 16000        # rate load_audio resamples to by default
    max_chunk_duration = 30    # longest input the feature extractor keeps (seconds)

    try:
        if chunk_duration > max_chunk_duration:
            print(
                f"chunk_duration={chunk_duration}s exceeds the "
                f"{max_chunk_duration}s model window; using {max_chunk_duration}s chunks"
            )
            chunk_duration = max_chunk_duration

        # int() keeps the slice bounds integral when a float duration is given.
        chunk_length = int(sample_rate * chunk_duration)
        if chunk_length < 1:
            raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

        # Load model and processor
        print("Setting up model...")
        model, processor = setup_model()
        if model is None or processor is None:
            return None

        # Load audio
        audio = load_audio(file_path)
        if audio is None:
            return None

        # Integer ceiling division: number of chunks needed to cover the audio.
        num_chunks = -(-len(audio) // chunk_length)
        print(f"Processing audio in {num_chunks} chunks")

        transcription = []
        for i in range(num_chunks):
            print(f"Processing chunk {i+1}/{num_chunks}")

            # Slicing past the end is safe; the last chunk is simply shorter.
            chunk = audio[i * chunk_length:(i + 1) * chunk_length]

            # Zero-pad the final chunk up to the common chunk length.
            if len(chunk) < chunk_length:
                chunk = np.pad(chunk, (0, chunk_length - len(chunk)))

            chunk_text = transcribe_chunk(chunk, model, processor)
            if chunk_text:
                transcription.append(chunk_text)

        return " ".join(transcription)

    except Exception as e:
        print(f"Error in transcribe_audio: {e}")
        return None

def main():
    """Transcribe the hard-coded sample recording and print the result."""
    # Your audio file path
    audio_path = "D:/IIIT-B-ML/data_converted/SandalWoodNewsStories_23.wav"

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"PyTorch version: {torch.__version__}")
    print(f"Device: {device_name}")

    # Keep UserWarning noise out of the transcription output.
    warnings.filterwarnings("ignore", category=UserWarning)

    print("\nStarting transcription...")
    result = transcribe_audio(audio_path)

    # An empty string or None both count as failure.
    if not result:
        print("\nTranscription failed")
        return

    print("\nTranscription completed successfully!")
    print("\nTranscription:", result)

# Run the single-file demo only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

PyTorch version: 2.4.1+cpu
Device: cpu

Starting transcription...
Setting up model...
Loading audio from D:/IIIT-B-ML/data_converted/SandalWoodNewsStories_23.wav


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Audio loaded - Shape: (1216355,), Sample rate: 16000
Processing audio in 3 chunks
Processing chunk 1/3
Processing chunk 2/3
Processing chunk 3/3

Transcription completed successfully!

Transcription: ನವಿರೋಧವು ಗಂಧದ ಗುಡಿಯಲ್ಲಿ ಜಗತ್ತಿಗೆ ಗಂಧದ ಪರಿಮಳವನ್ನು ಹಬ್ಬಿಸಿದಂಥ ಗಂಧದ ನಾಡಿನಲ್ಲಿ ಇಂಥ ಗಂಧದ ನಾಡಿನ ಕಂಪನವನ್ನು ಮತ್ತಷ್ಟು ಹೆಚ್ಚಿಸುವುದಕ್ಕೆ ಪಣತೊಟ್ಟು ನಿಂತಿದ್ದಾರೆ ಗಂಧದ ಗುಡಿಯ ಗಂಡುಗಳಿಗೆ ಇವರ ಹೆಸರು ರಮೇಶ್ ಬಲೂಟೆ ಕುಪ್ಪಳದ ಕುಷ್ಟಗಿಯವರು ಕೃಷಿಯನೇ ಬುದಕ್ಕಾಗಿಸಿಕೊಂಡಿರುವಂತಾ ರಮೇಶ್ ಗಂಧದ ನಾಡನ ಬೆಳಗು� ಸಿರಿಗಂಧವನ್ನು ಬೆಳೆದು ಸಿರಿವಂತನಾಗುವಂತಹ ಯುವ ರೈತರಿಗೆ ಕರೆಕೊಟ್ಟ ರಮೇಶ್ ಶ್ರೀಗಂಧವನ್ನು ಬೆಳೆಯುವುದು ಹೇಗೆ ಅಂತ ಕೂಡ ರೈನಿಂಗನ ಕೊಟ್ಟಿದ್ದ ಇವರ ಶ್ರಮದ ಫಲವಾಗಿ ಕುಶ್ರಗೀಯ ಸಾವಿರದ ಅಯ್ದು ನೂರು ಎಕರೆ ಪ್ರದೇಶದಲ್ಲಿ ಶ್ರೀಗಂಧ ಪರಿಮಳ ಹತ್ತಿದೆ ಶ್ರೀಗಂಧ ಬೆಳೆಯುವ ಬಗ್ಗೆ ಸುಮಾರು ಎನ್ನೂರು ಐವತ್ತು ಕ್ಕೂ ಹೆಚ್ಚು ಯುವ ರೈತರಿಗ ಶ್ರೀಗಂಧದ ಕಂಪನಿಗೆ ಹೆಚ್ಚಿಸುತ್ತಿರುವ ಈ ಕಣ್ಮನಿಗೆ ಕನ್ನಡ ಪ್ರಭಾ ಹಾಗೂ ದಶಕದ ಸಂಭ್ರಮದಲ್ಲಿರುವಂತಹ ಸುವರ್ಣ ನ್ಯೂಸ್ ವತಿಯಿಂದ ಅಸಾಮಾನ್ಯ ಕನ್ನಡಿಗ ಪ್ರಶಸ್ತಿಯ ಗೌರವ


## Audio to text 1: batch transcription of a folder of WAV files

In [None]:
import torch
import warnings
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
from pathlib import Path
import json
import time
from tqdm import tqdm
import os

class WhisperTranscriber:
    """
    Chunked speech-to-text helper around a Whisper checkpoint.

    The model and processor are loaded once in ``__init__`` and reused for
    every file, with the decoder prompt forced to language "kn" and the
    "transcribe" task.
    """

    # Sampling rate (Hz) declared to the processor and used for chunk arithmetic.
    SAMPLE_RATE = 16000
    # Longest chunk (seconds) sent to the processor; see transcribe_audio.
    MAX_CHUNK_SECONDS = 30

    def __init__(self, model_id="vasista22/whisper-kannada-tiny"):
        """Load the processor and model for ``model_id`` and configure decoding."""
        print("Initializing Whisper model...")
        self.processor = WhisperProcessor.from_pretrained(model_id)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_id)

        # Set model configuration
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="kn",
            task="transcribe"
        )

        # Make sure the config has a suppress_tokens attribute (empty = suppress nothing).
        if not hasattr(self.model.config, 'suppress_tokens'):
            self.model.config.suppress_tokens = []

        print("Model initialized successfully")

    def load_audio(self, file_path, target_sr=16000):
        """
        Load an audio file at ``target_sr`` and normalize it with librosa.

        Returns the samples, or ``None`` after printing the error on failure.
        """
        try:
            y, sr = librosa.load(file_path, sr=target_sr)
            y = librosa.util.normalize(y)
            return y
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
            return None

    def transcribe_chunk(self, chunk):
        """
        Transcribe a single chunk of audio samples.

        Returns the stripped text, or ``""`` after printing the error on failure.
        """
        try:
            inputs = self.processor(
                chunk,
                sampling_rate=self.SAMPLE_RATE,
                return_tensors="pt"
            ).input_features

            # Inference only, so gradient tracking is switched off.
            with torch.no_grad():
                generated_ids = self.model.generate(
                    inputs,
                    do_sample=False,
                    num_beams=1,
                    max_length=448,
                    min_length=1,
                    length_penalty=1.0,
                    temperature=0.0,
                    use_cache=True,
                    suppress_tokens=None
                )

            return self.processor.batch_decode(
                generated_ids,
                skip_special_tokens=True
            )[0].strip()
        except Exception as e:
            print(f"Error transcribing chunk: {e}")
            return ""

    def transcribe_audio(self, file_path, chunk_duration=30):
        """
        Transcribe a complete audio file chunk by chunk.

        Args:
            file_path: Path of the audio file.
            chunk_duration: Chunk length in seconds (int or float). Values
                above ``MAX_CHUNK_SECONDS`` are clamped, because the Whisper
                feature extractor keeps at most 30 seconds per input and a
                longer chunk would silently lose its tail.

        Returns:
            The non-empty chunk transcriptions joined by spaces, or ``None``
            (after printing the reason) if loading fails, ``chunk_duration``
            is not positive, or another error occurs.
        """
        try:
            if chunk_duration > self.MAX_CHUNK_SECONDS:
                print(
                    f"chunk_duration={chunk_duration}s exceeds the "
                    f"{self.MAX_CHUNK_SECONDS}s model window; "
                    f"using {self.MAX_CHUNK_SECONDS}s chunks"
                )
                chunk_duration = self.MAX_CHUNK_SECONDS

            # int() keeps slice bounds integral when a float duration is given.
            chunk_length = int(self.SAMPLE_RATE * chunk_duration)
            if chunk_length < 1:
                raise ValueError(f"chunk_duration must be positive, got {chunk_duration}")

            # Load audio at the same rate the chunk arithmetic assumes.
            audio = self.load_audio(file_path, target_sr=self.SAMPLE_RATE)
            if audio is None:
                return None

            transcription = []
            for start in range(0, len(audio), chunk_length):
                chunk = audio[start:start + chunk_length]

                # Zero-pad the final chunk up to the common chunk length.
                if len(chunk) < chunk_length:
                    chunk = np.pad(chunk, (0, chunk_length - len(chunk)))

                chunk_text = self.transcribe_chunk(chunk)
                if chunk_text:
                    transcription.append(chunk_text)

            return " ".join(transcription)

        except Exception as e:
            print(f"Error transcribing {file_path}: {e}")
            return None

def process_folder(input_folder, output_folder):
    """
    Transcribe every ``*.wav`` file in ``input_folder``.

    Each successful transcription is written to ``<stem>.txt`` in
    ``output_folder``. A ``transcription_report.json`` summary is written there
    as well, even when the run is interrupted or fails part-way, so the
    results gathered so far are not lost.

    Args:
        input_folder: Folder containing the WAV files.
        output_folder: Folder for the transcripts and report (created if missing).

    Returns:
        Dict mapping each processed file name to a status entry
        (``'success'``, ``'failed'`` or ``'error'``).
    """
    # Create output folder if it doesn't exist
    output_folder = Path(output_folder)
    output_folder.mkdir(parents=True, exist_ok=True)

    # One transcriber (one model load) shared by all files.
    transcriber = WhisperTranscriber()

    # Sorted so the processing order is reproducible across runs and platforms.
    input_folder = Path(input_folder)
    audio_files = sorted(input_folder.glob("*.wav"))

    print(f"Found {len(audio_files)} WAV files to process")

    results = {}
    report_path = output_folder / 'transcription_report.json'

    try:
        for audio_file in tqdm(audio_files, desc="Processing files"):
            print(f"\nProcessing: {audio_file.name}")

            try:
                transcription = transcriber.transcribe_audio(str(audio_file))

                if transcription:
                    # Save individual transcription
                    output_file = output_folder / f"{audio_file.stem}.txt"
                    with open(output_file, 'w', encoding='utf-8') as f:
                        f.write(transcription)

                    results[audio_file.name] = {
                        'status': 'success',
                        'transcription': transcription,
                        'output_file': str(output_file)
                    }
                    print(f"Success: {audio_file.name}")
                else:
                    # None or an empty string both count as a failed transcription.
                    results[audio_file.name] = {
                        'status': 'failed',
                        'error': 'Transcription failed'
                    }
                    print(f"Failed: {audio_file.name}")

            except Exception as e:
                results[audio_file.name] = {
                    'status': 'error',
                    'error': str(e)
                }
                print(f"Error processing {audio_file.name}: {e}")
    finally:
        # Written in `finally` so a KeyboardInterrupt or crash during a long
        # batch still leaves a report of everything finished so far.
        with open(report_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

    return results

def main():
    """Batch-transcribe the configured folder and print a run summary."""
    # Configure paths
    input_folder = "D:/IIIT-B-ML/data_converted"
    output_folder = "D:/IIIT-B-ML/transcriptions"

    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"PyTorch version: {torch.__version__}")
    print(f"Device: {device_name}")

    # Keep UserWarning noise out of the progress output.
    warnings.filterwarnings("ignore", category=UserWarning)

    # Time the whole batch.
    start_time = time.time()
    results = process_folder(input_folder, output_folder)

    # Everything that is not 'success' is reported as failed.
    statuses = [entry['status'] for entry in results.values()]
    total_files = len(statuses)
    successful = statuses.count('success')
    failed = total_files - successful

    print("\nProcessing Complete!")
    print(f"Total files processed: {total_files}")
    print(f"Successfully transcribed: {successful}")
    print(f"Failed: {failed}")
    print(f"Total time taken: {time.time() - start_time:.2f} seconds")
    print(f"\nResults saved in: {output_folder}")

# Start the batch transcription only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

PyTorch version: 2.4.1+cpu
Device: cpu
Initializing Whisper model...
Model initialized successfully
Found 71 WAV files to process


Processing files:   0%|          | 0/71 [00:00<?, ?it/s]


Processing: SandalWoodNewsStories_1.wav


Processing files:   1%|▏         | 1/71 [01:52<2:10:44, 112.07s/it]

Success: SandalWoodNewsStories_1.wav

Processing: SandalWoodNewsStories_107.wav


Processing files:   3%|▎         | 2/71 [01:56<55:58, 48.68s/it]   

Success: SandalWoodNewsStories_107.wav

Processing: SandalWoodNewsStories_112.wav


Processing files:   4%|▍         | 3/71 [01:58<30:54, 27.27s/it]

Success: SandalWoodNewsStories_112.wav

Processing: SandalWoodNewsStories_144.wav


Processing files:   6%|▌         | 4/71 [03:07<48:46, 43.68s/it]

Success: SandalWoodNewsStories_144.wav

Processing: SandalWoodNewsStories_146.wav


Processing files:   7%|▋         | 5/71 [03:42<44:42, 40.65s/it]

Success: SandalWoodNewsStories_146.wav

Processing: SandalWoodNewsStories_148.wav


Processing files:   8%|▊         | 6/71 [03:46<30:39, 28.29s/it]

Success: SandalWoodNewsStories_148.wav

Processing: SandalWoodNewsStories_156.wav


Processing files:  10%|▉         | 7/71 [05:19<52:48, 49.51s/it]

Success: SandalWoodNewsStories_156.wav

Processing: SandalWoodNewsStories_158.wav


Processing files:  11%|█▏        | 8/71 [06:10<52:16, 49.79s/it]

Success: SandalWoodNewsStories_158.wav

Processing: SandalWoodNewsStories_159.wav


Processing files:  13%|█▎        | 9/71 [11:15<2:13:56, 129.62s/it]

Success: SandalWoodNewsStories_159.wav

Processing: SandalWoodNewsStories_167.wav


Processing files:  14%|█▍        | 10/71 [11:57<1:44:17, 102.58s/it]

Success: SandalWoodNewsStories_167.wav

Processing: SandalWoodNewsStories_168.wav


Processing files:  15%|█▌        | 11/71 [12:44<1:25:43, 85.73s/it] 

Success: SandalWoodNewsStories_168.wav

Processing: SandalWoodNewsStories_169.wav


Processing files:  17%|█▋        | 12/71 [17:23<2:22:01, 144.42s/it]

Success: SandalWoodNewsStories_169.wav

Processing: SandalWoodNewsStories_171.wav


Processing files:  18%|█▊        | 13/71 [18:14<1:52:17, 116.17s/it]

Success: SandalWoodNewsStories_171.wav

Processing: SandalWoodNewsStories_172.wav


Processing files:  20%|█▉        | 14/71 [19:52<1:45:03, 110.58s/it]

Success: SandalWoodNewsStories_172.wav

Processing: SandalWoodNewsStories_173.wav


Processing files:  21%|██        | 15/71 [20:42<1:26:18, 92.47s/it] 

Success: SandalWoodNewsStories_173.wav

Processing: SandalWoodNewsStories_174.wav


Processing files:  23%|██▎       | 16/71 [22:42<1:32:09, 100.54s/it]

Success: SandalWoodNewsStories_174.wav

Processing: SandalWoodNewsStories_175.wav


Processing files:  24%|██▍       | 17/71 [25:35<1:50:14, 122.49s/it]

Success: SandalWoodNewsStories_175.wav

Processing: SandalWoodNewsStories_176.wav


Processing files:  25%|██▌       | 18/71 [28:10<1:56:41, 132.10s/it]

Success: SandalWoodNewsStories_176.wav

Processing: SandalWoodNewsStories_179.wav


Processing files:  27%|██▋       | 19/71 [30:15<1:52:43, 130.07s/it]

Success: SandalWoodNewsStories_179.wav

Processing: SandalWoodNewsStories_181.wav


Processing files:  28%|██▊       | 20/71 [30:30<1:21:06, 95.42s/it] 

Success: SandalWoodNewsStories_181.wav

Processing: SandalWoodNewsStories_184.wav


Processing files:  30%|██▉       | 21/71 [31:20<1:08:13, 81.86s/it]

Success: SandalWoodNewsStories_184.wav

Processing: SandalWoodNewsStories_191.wav


Processing files:  31%|███       | 22/71 [32:35<1:05:17, 79.95s/it]

Success: SandalWoodNewsStories_191.wav

Processing: SandalWoodNewsStories_197.wav


Processing files:  32%|███▏      | 23/71 [33:29<57:35, 71.98s/it]  

Success: SandalWoodNewsStories_197.wav

Processing: SandalWoodNewsStories_2.wav


Processing files:  34%|███▍      | 24/71 [33:38<41:40, 53.20s/it]

Success: SandalWoodNewsStories_2.wav

Processing: SandalWoodNewsStories_200.wav


Processing files:  35%|███▌      | 25/71 [34:30<40:24, 52.70s/it]

Success: SandalWoodNewsStories_200.wav

Processing: SandalWoodNewsStories_211.wav


Processing files:  37%|███▋      | 26/71 [34:50<32:19, 43.09s/it]

Success: SandalWoodNewsStories_211.wav

Processing: SandalWoodNewsStories_215.wav


Processing files:  38%|███▊      | 27/71 [37:28<56:50, 77.50s/it]

Success: SandalWoodNewsStories_215.wav

Processing: SandalWoodNewsStories_223.wav


Processing files:  39%|███▉      | 28/71 [38:05<46:52, 65.41s/it]

Success: SandalWoodNewsStories_223.wav

Processing: SandalWoodNewsStories_229.wav


Processing files:  41%|████      | 29/71 [40:12<58:44, 83.92s/it]

Success: SandalWoodNewsStories_229.wav

Processing: SandalWoodNewsStories_23.wav


Processing files:  42%|████▏     | 30/71 [40:37<45:12, 66.15s/it]

Success: SandalWoodNewsStories_23.wav

Processing: SandalWoodNewsStories_230.wav


Processing files:  44%|████▎     | 31/71 [41:30<41:28, 62.22s/it]

Success: SandalWoodNewsStories_230.wav

Processing: SandalWoodNewsStories_239.wav


Processing files:  45%|████▌     | 32/71 [42:52<44:13, 68.03s/it]

Success: SandalWoodNewsStories_239.wav

Processing: SandalWoodNewsStories_242.wav


In [4]:
import torch
import warnings
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa
import numpy as np
from pathlib import Path
import json
import time
from tqdm import tqdm
import os
from difflib import SequenceMatcher
from concurrent.futures import ThreadPoolExecutor
import re

In [None]:
import torch
import warnings
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import librosa
import numpy as np
from pathlib import Path
import json
from gtts import gTTS
import os

class WhisperTranscriber:
    """
    Speech-to-text helper used to transcribe the spoken question.

    The model and processor are loaded once in ``__init__``, with the decoder
    prompt forced to language "kn" and the "transcribe" task.
    """

    # Sampling rate (Hz) the audio is loaded at and declared to the processor.
    SAMPLE_RATE = 16000
    # Seconds of audio sent to the processor per call. The Whisper feature
    # extractor keeps at most 30 s per input, so longer audio must be chunked.
    CHUNK_SECONDS = 30

    def __init__(self, model_id="vasista22/whisper-kannada-tiny"):
        """Load the processor and model for ``model_id`` and configure decoding."""
        print("Initializing Whisper model...")
        self.processor = WhisperProcessor.from_pretrained(model_id)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_id)

        # Set model configuration for Kannada transcription
        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="kn",
            task="transcribe"
        )

        # Only make sure the attribute exists; do not invent ids to suppress.
        # NOTE(review): an earlier version filled this with list(range(1, 20)).
        # In Whisper's byte-level vocabulary those ids appear to be ordinary
        # punctuation and digit tokens, which could then never be emitted -
        # confirm with processor.tokenizer.convert_ids_to_tokens(range(1, 20)).
        if not hasattr(self.model.config, 'suppress_tokens'):
            self.model.config.suppress_tokens = []

        print("Model initialized successfully")

    def load_audio(self, file_path, target_sr=16000):
        """
        Load an audio file at ``target_sr`` and normalize it with librosa.

        Raises:
            RuntimeError: if the file cannot be loaded or normalized.
        """
        try:
            y, sr = librosa.load(file_path, sr=target_sr)
            y = librosa.util.normalize(y)
            return y
        except Exception as e:
            raise RuntimeError(f"Error loading audio file: {str(e)}")

    def transcribe_audio(self, file_path):
        """
        Transcribe an audio file (question) of any length.

        The audio is processed in ``CHUNK_SECONDS`` pieces so that recordings
        longer than 30 seconds are transcribed in full instead of being cut
        off after the first 30 seconds.

        Returns:
            The non-empty chunk transcriptions joined by single spaces.

        Raises:
            ValueError: if no text at all was produced.
            Exception: any loading/decoding error is printed and re-raised.
        """
        try:
            audio = self.load_audio(file_path)
            chunk_length = self.SAMPLE_RATE * self.CHUNK_SECONDS

            pieces = []
            for start in range(0, len(audio), chunk_length):
                inputs = self.processor(
                    audio[start:start + chunk_length],
                    sampling_rate=self.SAMPLE_RATE,
                    return_tensors="pt"
                ).input_features

                # Inference only, so gradient tracking is switched off.
                with torch.no_grad():
                    generated_ids = self.model.generate(
                        inputs,
                        do_sample=False,
                        num_beams=1,
                        max_length=448,
                        suppress_tokens=None  # same setting as the batch transcriber
                    )

                text = self.processor.batch_decode(
                    generated_ids,
                    skip_special_tokens=True
                )[0].strip()
                if text:
                    pieces.append(text)

            transcription = " ".join(pieces)
            if not transcription:
                raise ValueError("Empty transcription generated")

            return transcription

        except Exception as e:
            print(f"Error during transcription: {str(e)}")
            raise

# Embedding models already loaded, keyed by model name, so repeated searches
# do not re-instantiate the model.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def search_transcriptions(question, transcriptions_folder, threshold=0.5,
                          model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """
    Search the stored transcriptions for texts semantically similar to a question.

    Args:
        question: Question text to compare against.
        transcriptions_folder: Folder holding the ``*.txt`` transcriptions.
        threshold: A transcription is kept only when its cosine similarity to
            the question is strictly greater than this value.
        model_name: Name of the sentence-transformers model used for embeddings.

    Returns:
        Dict mapping file name to ``{"transcription", "similarity_score"}`` for
        every non-empty transcription above the threshold.

    Raises:
        FileNotFoundError: if ``transcriptions_folder`` does not exist (the
            error is printed, then re-raised).
    """
    embeddings_model = _get_embedding_model(model_name)
    question_embedding = embeddings_model.encode(question, convert_to_tensor=True)
    relevant_answers = {}

    try:
        transcriptions_path = Path(transcriptions_folder)
        if not transcriptions_path.exists():
            raise FileNotFoundError(f"Transcriptions folder not found: {transcriptions_folder}")

        # Sorted so results (and tie-breaking by callers) are reproducible.
        for transcription_file in sorted(transcriptions_path.glob("*.txt")):
            try:
                with open(transcription_file, 'r', encoding='utf-8') as f:
                    transcription_text = f.read()

                # An empty transcript can never be a useful answer.
                if not transcription_text.strip():
                    continue

                transcription_embedding = embeddings_model.encode(
                    transcription_text, convert_to_tensor=True
                )
                similarity_score = util.cos_sim(question_embedding, transcription_embedding).item()

                if similarity_score > threshold:
                    relevant_answers[transcription_file.name] = {
                        "transcription": transcription_text,
                        "similarity_score": similarity_score
                    }
            except Exception as e:
                # One unreadable file should not abort the whole search.
                print(f"Error processing file {transcription_file}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during transcription search: {str(e)}")
        raise

    return relevant_answers



def text_to_speech_kannada(text, output_path="response_audio.mp3"):
    """
    Synthesize ``text`` with gTTS (language code "kn") and save it to ``output_path``.

    Returns ``output_path`` on success, or ``None`` after printing the error.
    """
    try:
        gTTS(text=text, lang="kn").save(output_path)
        print(f"Audio saved at: {output_path}")
    except Exception as e:
        print(f"Error during text-to-speech conversion: {str(e)}")
        return None
    return output_path

def main():
    """
    Answer one recorded question from the stored transcriptions.

    Transcribes the question audio, lists every transcription returned by
    ``search_transcriptions`` and converts the highest-scoring one to speech.
    Any error is printed and re-raised.
    """
    # Configure paths
    question_audio_path = "D:/IIIT-B-ML/question.wav"
    transcriptions_folder = "D:/IIIT-B-ML/transcriptions"

    # Keep UserWarning noise out of the output.
    warnings.filterwarnings("ignore", category=UserWarning)

    try:
        # Step 1: speech -> text for the question.
        question_text = WhisperTranscriber().transcribe_audio(question_audio_path)
        print(f"Question Transcription: {question_text}")

        # Step 2: candidate answers from the transcription folder.
        candidates = search_transcriptions(question_text, transcriptions_folder)

        # Step 3: show every candidate while tracking the best-scoring one.
        print("\nRelevant Answers:")
        best_answer, best_score = None, -1
        for filename, entry in candidates.items():
            score = entry['similarity_score']
            print(f"\nFile: {filename}")
            print(f"Similarity Score: {score}")
            print(f"Answer: {entry['transcription']}")

            # Strict '>' keeps the first of several equally scored answers.
            if score > best_score:
                best_score, best_answer = score, entry['transcription']

        # Step 4: speak the best answer, if there is one.
        if not best_answer:
            print("No relevant answers found.")
        else:
            audio_file = text_to_speech_kannada(best_answer, output_path="best_answer_audio.mp3")
            print(f"Audio response saved at: {audio_file}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

# Run the question-answering demo only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Initializing Whisper model...
Model initialized successfully
Question Transcription: ಗಂಧದ ಮರದ ಬೆಳೆ ಎಷ್ಟು ಕರೆ ವಿಸ್ತೀರ್ಣದಲ್ಲಿ ಬೆಳೆಯಲಾಗುತ್ತಿದೆ ಮತ್ತು ಎಷ್ಟು ರೈತರು ಈ ಬೆಳೆಯನ್ನು ಬೆಳೆಯುತ್ತಿದ್ದಾರೆ

Relevant Answers:

File: SandalWoodNewsStories_1.txt
Similarity Score: 0.8587470650672913
Answer: ಮುಖ್ಯಾಂಶಗಳು ಮುಖ್ಯಾಂಶಗಳು ಮುಖ್ಯಮಂತ್ರಿ ಎಚ್

File: SandalWoodNewsStories_107.txt
Similarity Score: 0.9359829425811768
Answer: ದರಗಿರಲಿರುವ ಕೊರೊನಾ ಸೋಂಕು ಹರಡಿರುವ ಸಾಧ್ಯತೆ ಇದೆ ದರಲ್ಲಿ ಕೇಂದ್ರ ಸರ್ಕಾರದ ಸಮಿತಿ ಸಭೆಯಲ್ಲಿ ನಿನ್ನೆ ಸಿದ್ದರಾಮಯ್ಯ ಮಾತನಾಡಿದ ಅವರು ಸರ್ಕಾರಿ ಸಮಿತಿ ಸಭೆಯಲ್ಲಿ ಮಾತನಾಡಿದರು

File: SandalWoodNewsStories_112.txt
Similarity Score: 0.947618842124939
Answer: ಗಿಡಾಗಿ ಕೊಟ್ಟರಲ್ಲ ಹಾಕಿದೆ ಬೆಳೆಗಳು ಯಾವ ಕೊಟ್ಟರೆ

File: SandalWoodNewsStories_144.txt
Similarity Score: 0.8889991641044617
Answer: ಹೈ ಇಳಿಕೆ ಮಾಡಿದ್ದು ಇವತ್ತು ನಾವು ನಮ್ಮ ಶ್ರೀಗದ ತೋಟದಲ್ಲಿದೆವೆ ಇದು ಒಂದು ವರ್ಷದ ಹಿಂದೆ ನಾವೊಂದು ಅತಿಲಲ್ಲಿ ಡಾಲಪ್ ಮಾಡಿರುವಂತಹ ತೋಟ ಅತಿನಿಯಿಂದ ಸ್ವಲ್ಪ ದೂರಾಗುತ್ತೆ ಭಟ್ನಾವರ ಫ್ರೆನ್ಸಿಘತ್ತೆ ನಿಯಂತ್ರಿಸುತ್ತಿದ್ದೇನೆ ಯಾಕಂದ್ರೆ ಇದೇ ತೋಟದ ಬಗ್ಗೆ ನಾವು ಇದರ ಕುರಿತು ಸ್ವಲ್ಪ ಡಿಟೆಲ

In [2]:
import torch
import warnings
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
import librosa
import numpy as np
from pathlib import Path
import json
from gtts import gTTS
import os
import sounddevice as sd
import soundfile as sf
import time
import wave
from datetime import datetime

class AudioRecorder:
    """Records single-channel float32 audio via ``sounddevice`` and saves it with ``soundfile``."""

    def __init__(self, sample_rate=16000):
        """Store the recording settings: ``sample_rate`` Hz, 1 channel, float32 samples."""
        self.sample_rate = sample_rate
        self.channels = 1
        self.dtype = 'float32'

    def record_audio(self, duration=10, output_path="recorded_question.wav"):
        """
        Record ``duration`` seconds of audio and write it to ``output_path``.

        The recording is flattened, normalized with ``librosa.util.normalize``
        and written at ``self.sample_rate``. Returns ``output_path``.
        """
        print(f"\nRecording for {duration} seconds... Speak your question now!")

        # Number of frames to capture for the requested duration.
        frame_count = int(duration * self.sample_rate)
        captured = sd.rec(
            frame_count,
            samplerate=self.sample_rate,
            channels=self.channels,
            dtype=self.dtype
        )

        # Block until sounddevice reports the recording is done.
        sd.wait()
        print("Recording finished!")

        # Flatten to one dimension, normalize, then write to disk.
        mono = captured.flatten()
        sf.write(output_path, librosa.util.normalize(mono), self.sample_rate)
        return output_path

class WhisperTranscriber:
    """
    Speech-to-text helper used to transcribe recorded questions.

    The model and processor are loaded once in ``__init__``, with the decoder
    prompt forced to language "kn" and the "transcribe" task.
    """

    # Sampling rate (Hz) the audio is loaded at and declared to the processor.
    SAMPLE_RATE = 16000
    # Seconds of audio sent to the processor per call. The Whisper feature
    # extractor keeps at most 30 s per input, so longer audio must be chunked.
    CHUNK_SECONDS = 30

    def __init__(self, model_id="vasista22/whisper-kannada-tiny"):
        """Load the processor and model for ``model_id`` and configure decoding."""
        print("Initializing Whisper model...")
        self.processor = WhisperProcessor.from_pretrained(model_id)
        self.model = WhisperForConditionalGeneration.from_pretrained(model_id)

        self.model.config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language="kn",
            task="transcribe"
        )

        # Only make sure the attribute exists; do not invent ids to suppress.
        # NOTE(review): an earlier version filled this with list(range(1, 20)).
        # In Whisper's byte-level vocabulary those ids appear to be ordinary
        # punctuation and digit tokens, which could then never be emitted -
        # confirm with processor.tokenizer.convert_ids_to_tokens(range(1, 20)).
        if not hasattr(self.model.config, 'suppress_tokens'):
            self.model.config.suppress_tokens = []

        print("Model initialized successfully")

    def load_audio(self, file_path, target_sr=16000):
        """
        Load an audio file at ``target_sr`` and normalize it with librosa.

        Raises:
            RuntimeError: if the file cannot be loaded or normalized.
        """
        try:
            y, sr = librosa.load(file_path, sr=target_sr)
            y = librosa.util.normalize(y)
            return y
        except Exception as e:
            raise RuntimeError(f"Error loading audio file: {str(e)}")

    def transcribe_audio(self, file_path):
        """
        Transcribe an audio file of any length.

        The audio is processed in ``CHUNK_SECONDS`` pieces so that recordings
        longer than 30 seconds are transcribed in full instead of being cut
        off after the first 30 seconds.

        Returns:
            The non-empty chunk transcriptions joined by single spaces.

        Raises:
            ValueError: if no text at all was produced.
            Exception: any loading/decoding error is printed and re-raised.
        """
        try:
            audio = self.load_audio(file_path)
            chunk_length = self.SAMPLE_RATE * self.CHUNK_SECONDS

            pieces = []
            for start in range(0, len(audio), chunk_length):
                inputs = self.processor(
                    audio[start:start + chunk_length],
                    sampling_rate=self.SAMPLE_RATE,
                    return_tensors="pt"
                ).input_features

                # Inference only, so gradient tracking is switched off.
                with torch.no_grad():
                    generated_ids = self.model.generate(
                        inputs,
                        do_sample=False,
                        num_beams=1,
                        max_length=448,
                        suppress_tokens=None  # same setting as the batch transcriber
                    )

                text = self.processor.batch_decode(
                    generated_ids,
                    skip_special_tokens=True
                )[0].strip()
                if text:
                    pieces.append(text)

            transcription = " ".join(pieces)
            if not transcription:
                raise ValueError("Empty transcription generated")

            return transcription

        except Exception as e:
            print(f"Error during transcription: {str(e)}")
            raise

# Embedding models already loaded, keyed by model name. The interactive loop
# searches once per question, so the model must not be rebuilt on every call.
_EMBEDDING_MODEL_CACHE = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only once."""
    if model_name not in _EMBEDDING_MODEL_CACHE:
        _EMBEDDING_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODEL_CACHE[model_name]


def search_transcriptions(question, transcriptions_folder, threshold=0.5,
                          model_name='paraphrase-multilingual-MiniLM-L12-v2'):
    """
    Find stored transcriptions that are semantically similar to a question.

    Args:
        question: Question text to compare against.
        transcriptions_folder: Folder holding the ``*.txt`` transcriptions.
        threshold: A transcription is kept only when its cosine similarity to
            the question is strictly greater than this value.
        model_name: Name of the sentence-transformers model used for embeddings.

    Returns:
        Dict mapping file name to ``{"transcription", "similarity_score"}`` for
        every non-empty transcription above the threshold.

    Raises:
        FileNotFoundError: if ``transcriptions_folder`` does not exist (the
            error is printed, then re-raised).
    """
    embeddings_model = _get_embedding_model(model_name)
    question_embedding = embeddings_model.encode(question, convert_to_tensor=True)
    relevant_answers = {}

    try:
        transcriptions_path = Path(transcriptions_folder)
        if not transcriptions_path.exists():
            raise FileNotFoundError(f"Transcriptions folder not found: {transcriptions_folder}")

        # Sorted so results (and tie-breaking by callers) are reproducible.
        for transcription_file in sorted(transcriptions_path.glob("*.txt")):
            try:
                with open(transcription_file, 'r', encoding='utf-8') as f:
                    transcription_text = f.read()

                # An empty transcript can never be a useful answer.
                if not transcription_text.strip():
                    continue

                transcription_embedding = embeddings_model.encode(
                    transcription_text, convert_to_tensor=True
                )
                similarity_score = util.cos_sim(question_embedding, transcription_embedding).item()

                if similarity_score > threshold:
                    relevant_answers[transcription_file.name] = {
                        "transcription": transcription_text,
                        "similarity_score": similarity_score
                    }
            except Exception as e:
                # One unreadable file should not abort the whole search.
                print(f"Error processing file {transcription_file}: {str(e)}")
                continue

    except Exception as e:
        print(f"Error during transcription search: {str(e)}")
        raise

    return relevant_answers

def text_to_speech_kannada(text, output_path="response_audio.mp3"):
    """
    Synthesize ``text`` with gTTS (language code "kn") and save it to ``output_path``.

    Returns ``output_path`` on success, or ``None`` after printing the error.
    """
    try:
        gTTS(text=text, lang="kn").save(output_path)
        print(f"Audio response saved at: {output_path}")
    except Exception as e:
        print(f"Error during text-to-speech conversion: {str(e)}")
        return None
    return output_path

def play_audio(file_path):
    """
    Play an audio file and block until playback has finished.

    The file is read with ``soundfile`` and played through ``sounddevice``.
    Errors are printed rather than raised.
    """
    try:
        samples, rate = sf.read(file_path)
        sd.play(samples, rate)
        # Wait so the caller continues only after playback is done.
        sd.wait()
    except Exception as e:
        print(f"Error playing audio: {str(e)}")

def main():
    """
    Run the interactive Kannada audio question-answering loop.

    Each round shows a two-item menu. Choosing "1" records a 10-second
    question, transcribes it, looks up the best-matching stored transcription,
    converts that answer to speech and plays it; choosing "2" exits.

    Per-question files (``question.wav`` / ``response.mp3``) go into a
    timestamped folder under ``audio_qa_sessions``. The folder is created only
    when a question is actually recorded, so exiting or mistyping a choice does
    not leave empty session folders behind.

    If no interactive input is available the loop ends instead of reporting
    the same error forever.
    """
    # Configure paths
    transcriptions_folder = "D:/IIIT-B-ML/transcriptions"
    output_folder = Path("audio_qa_sessions")
    output_folder.mkdir(exist_ok=True)

    # Initialize components once; the model is reused for every question.
    recorder = AudioRecorder()
    transcriber = WhisperTranscriber()

    # Suppress warnings
    warnings.filterwarnings("ignore", category=UserWarning)

    print("Welcome to the Kannada Audio Q&A System!")
    print("You can ask questions in Kannada, and I'll provide answers with audio responses.")

    while True:
        print("\nOptions:")
        print("1. Ask a question (record audio)")
        print("2. Exit")

        try:
            choice = input("\nEnter your choice (1 or 2): ").strip()
        except (EOFError, NotImplementedError):
            # EOFError: stdin is closed or exhausted.
            # NOTE(review): NotImplementedError is presumably what a Jupyter
            # frontend without stdin support raises from input() - confirm.
            # Retrying can never succeed in either case, so stop cleanly.
            print("\nNo interactive input available. Exiting.")
            break

        if choice == "2":
            print("Thank you for using the Kannada Audio Q&A System. Goodbye!")
            break

        if choice != "1":
            print("Invalid choice. Please try again.")
            continue

        try:
            # Create the session folder only now that there is something to store.
            session_time = datetime.now().strftime("%Y%m%d_%H%M%S")
            session_folder = output_folder / session_time
            session_folder.mkdir(exist_ok=True)

            # Record the question
            question_audio_path = session_folder / "question.wav"
            recorder.record_audio(duration=10, output_path=str(question_audio_path))

            # Transcribe the question
            question_text = transcriber.transcribe_audio(str(question_audio_path))
            print(f"\nYour Question (transcribed): {question_text}")

            # Search for relevant answers
            relevant_answers = search_transcriptions(question_text, transcriptions_folder)

            # Keep the highest-scoring answer; strict '>' keeps the first on ties.
            best_answer = None
            best_score = -1
            for result in relevant_answers.values():
                if result['similarity_score'] > best_score:
                    best_score = result['similarity_score']
                    best_answer = result['transcription']

            # Generate and play audio response
            if best_answer:
                print("\nBest Answer:", best_answer)
                print("\nGenerating audio response...")

                response_audio_path = session_folder / "response.mp3"
                audio_file = text_to_speech_kannada(best_answer, str(response_audio_path))

                # text_to_speech_kannada returns None when synthesis failed.
                if audio_file:
                    print("\nPlaying response...")
                    play_audio(audio_file)
            else:
                print("\nSorry, I couldn't find a relevant answer to your question.")

        except Exception as e:
            # A failed round should not end the session; show the menu again.
            print(f"\nError: {str(e)}")
            print("Please try again.")

# Start the interactive Q&A loop only when this code is executed directly
# (``__name__`` is "__main__"), not when it is imported.
if __name__ == "__main__":
    main()

Initializing Whisper model...
Model initialized successfully
Welcome to the Kannada Audio Q&A System!
You can ask questions in Kannada, and I'll provide answers with audio responses.

Options:
1. Ask a question (record audio)
2. Exit

Recording for 10 seconds... Speak your question now!
Recording finished!


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Your Question (transcribed): ಹಲೆಯು

Best Answer: ಮುಖ್ಯಾಂಶಗಳು ಮುಖ್ಯಾಂಶಗಳು ಮುಖ್ಯಮಂತ್ರಿ ಎಚ್

Generating audio response...
Audio response saved at: audio_qa_sessions\20241117_202316\response.mp3

Playing response...

Options:
1. Ask a question (record audio)
2. Exit
Thank you for using the Kannada Audio Q&A System. Goodbye!
