In [1]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel

# Initialize the Whisper model
model_size = "large-v3"
model = WhisperModel(model_size, device="cuda", compute_type="float32")

# Parameters for real-time audio capture
SAMPLE_RATE = 16000  # Whisper expects 16 kHz audio
CHUNK_SIZE = 1024    # Size of each audio chunk to process
DURATION = 5         # Duration of each recording chunk in seconds

def real_time_transcription():
    # Callback function to process audio chunks in real-time
    def callback(indata, frames, time, status):
        if status:
            print(status)
        audio_data = indata[:, 0]  
        
        # Resample and normalize audio if needed (Whisper expects float32 values)
        audio_data = np.float32(audio_data)
        
        # Run transcription on the audio chunk
        segments, info = model.transcribe(audio_data, beam_size=5)

        # Output detected language and transcription
        print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
        for segment in segments:
            print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

    # Open an audio stream for real-time processing
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=callback, blocksize=CHUNK_SIZE):
        print("Listening and transcribing in real-time... Press Ctrl+C to stop.")
        sd.sleep(DURATION * 1000)  # Keep running for `DURATION` seconds (or set an infinite loop)

# Start the real-time transcription
real_time_transcription()


Listening and transcribing in real-time... Press Ctrl+C to stop.


Detected language 'en' with probability 0.493422
[0.00s -> 0.04s]  Thank you.


In [2]:
!pip install pyaudio




[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [1]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel
import queue
import threading

# Initialize the Whisper model
model_size = "large-v3"
model = WhisperModel(model_size, device="cuda", compute_type="float32")

# Parameters for real-time audio capture
SAMPLE_RATE = 16000  # Whisper expects 16 kHz audio
CHUNK_SIZE = 1024 
DURATION = 5   
  # Size of each audio chunk to process

# Create a queue to store transcriptions
transcription_queue = queue.Queue()

def audio_callback(indata, frames, time, status):
    """Callback function to process incoming audio data."""
    if status:
        print(status)
    audio_data = indata[:, 0]  # Extract audio from the first channel
    audio_data = np.float32(audio_data)  # Ensure audio data is float32

    # Run transcription on the audio chunk
    segments, info = model.transcribe(audio_data, beam_size=5)

    # Output detected language and transcription
    detected_language = info.language
    language_probability = info.language_probability
    transcription = []

    for segment in segments:
        transcription.append(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

    # Put the transcription in the queue
    transcription_queue.put((detected_language, language_probability, transcription))

def transcribe_audio():
    """Thread function to continuously print transcriptions."""
    while True:
        detected_language, language_probability, transcription = transcription_queue.get()
        
        # Print the detected language and its probability
        print(f"\nDetected language: '{detected_language}' with probability: {language_probability:.2f}")
        print("Transcription:")
        print("\n".join(transcription))

def real_time_transcription():
    """Start the audio input stream and transcription thread."""
    # Start the audio stream
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=audio_callback, blocksize=CHUNK_SIZE):
        print("Listening and transcribing in real-time... Press Ctrl+C to stop.")

        # Start a separate thread for printing transcriptions
        transcription_thread = threading.Thread(target=transcribe_audio, daemon=True)
        transcription_thread.start()

        # Keep the main thread alive to listen for keyboard interrupts
        try:
            while True:
                sd.sleep(1000)
        except KeyboardInterrupt:
            print("Stopping transcription.")

# Start the real-time transcription
if __name__ == "__main__":
    real_time_transcription()


Listening and transcribing in real-time... Press Ctrl+C to stop.

Detected language: 'en' with probability: 0.50
Transcription:
[0.00s -> 0.04s]  Thank you.
input overflow

Detected language: 'ru' with probability: 0.19
Transcription:
[0.00s -> 0.04s]  Субтитры создавал DimaTorzok
input overflow

Detected language: 'ru' with probability: 0.20
Transcription:
[0.00s -> 0.04s]  Субтитры сделал DimaTorzok
input overflow
Stopping transcription.



Detected language: 'ru' with probability: 0.19
Transcription:
[0.00s -> 0.04s]  Субтитры создавал DimaTorzok


In [1]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel

# Initialize the Whisper model
model_size = "large-v3"
model = WhisperModel(model_size, device="cuda", compute_type="float32")

# Parameters for real-time audio capture
SAMPLE_RATE = 16000  # Whisper expects 16 kHz audio
CHUNK_SIZE = 1024    # Size of each audio chunk to process

def audio_callback(indata, frames, time, status):
    """Callback function to process incoming audio data."""
    if status:
        print(status)
    audio_data = indata[:, 0]  # Extract audio from the first channel
    audio_data = np.float32(audio_data)  # Ensure audio data is float32

    # Run transcription on the audio chunk
    segments, info = model.transcribe(audio_data, beam_size=5)

    # Output detected language and transcription immediately
    detected_language = info.language
    language_probability = info.language_probability

    # Print detected language
    print(f"\nDetected language: '{detected_language}' with probability: {language_probability:.2f}")
    
    # Print transcription segments
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

def real_time_transcription():
    """Start the audio input stream for real-time transcription."""
    # Start the audio stream
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=audio_callback, blocksize=CHUNK_SIZE):
        print("Listening and transcribing in real-time... Press Ctrl+C to stop.")

        # Keep the main thread alive to listen for keyboard interrupts
        try:
            while True:
                sd.sleep(1000)  # Sleep in the main thread to keep it running
        except KeyboardInterrupt:
            print("Stopping transcription.")

# Start the real-time transcription
if __name__ == "__main__":
    real_time_transcription()


Listening and transcribing in real-time... Press Ctrl+C to stop.

Detected language: 'en' with probability: 0.51
[0.00s -> 0.04s]  Thank you.
input overflow
Stopping transcription.



Detected language: 'ru' with probability: 0.21


In [1]:
import sounddevice as sd
import numpy as np
from faster_whisper import WhisperModel

# Initialize the Whisper model
model_size = "large-v3"
model = WhisperModel(model_size, device="cpu", compute_type="float32")

# Parameters for real-time audio capture
SAMPLE_RATE = 16000  # Whisper expects 16 kHz audio
CHUNK_SIZE = 512     # Reduced size of each audio chunk

def audio_callback(indata, frames, time, status):
    """Callback function to process incoming audio data."""
    if status:
        print(status)
    audio_data = indata[:, 0]  # Extract audio from the first channel
    audio_data = np.float32(audio_data)  # Ensure audio data is float32

    # Run transcription on the audio chunk
    segments, info = model.transcribe(audio_data, beam_size=3)  # Lower beam size

    # Output detected language and transcription immediately
    detected_language = info.language
    language_probability = info.language_probability

    # Print detected language
    print(f"\nDetected language: '{detected_language}' with probability: {language_probability:.2f}")
    
    # Print transcription segments
    for segment in segments:
        print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")

def real_time_transcription():
    """Start the audio input stream for real-time transcription."""
    # Start the audio stream
    with sd.InputStream(samplerate=SAMPLE_RATE, channels=1, callback=audio_callback, blocksize=CHUNK_SIZE):
        print("Listening and transcribing in real-time... Press Ctrl+C to stop.")

        # Keep the main thread alive to listen for keyboard interrupts
        try:
            while True:
                sd.sleep(1000)  # Sleep in the main thread to keep it running
        except KeyboardInterrupt:
            print("Stopping transcription.")

# Start the real-time transcription
if __name__ == "__main__":
    real_time_transcription()


Listening and transcribing in real-time... Press Ctrl+C to stop.

Detected language: 'en' with probability: 0.39
[0.00s -> 0.04s]  Thank you.
input overflow
Stopping transcription.



Detected language: 'en' with probability: 0.27
[0.00s -> 0.04s]  Thank you.


In [None]:
import sounddevice as sd
import numpy as np
import wave
from faster_whisper import WhisperModel

# Parameters for audio recording
SAMPLE_RATE = 16000  # Whisper expects 16 kHz audio
DURATION = 5         # Duration of recording in seconds
CHUNK_SIZE = 1024    # Size of each audio chunk to process

# Function to record audio
def record_audio(duration, sample_rate):
    """Record audio from the microphone."""
    print(f"Recording for {duration} seconds...")
    recorded_audio = []

    # Callback function to capture audio
    def callback(indata, frames, time, status):
        if status:
            print(status)
        recorded_audio.append(indata.copy())

    # Open the audio stream for recording
    with sd.InputStream(samplerate=sample_rate, channels=1, callback=callback, blocksize=CHUNK_SIZE):
        sd.sleep(duration * 1000)  # Record for the specified duration

    # Concatenate all recorded chunks into a single numpy array
    return np.concatenate(recorded_audio)

# Function to save audio to a WAV file
def save_audio_to_wav(audio_data, filename, sample_rate):
    """Save recorded audio data to a WAV file."""
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)          # Mono
        wf.setsampwidth(2)          # 16-bit PCM
        wf.setframerate(sample_rate) # Sample rate
        wf.writeframes(audio_data.tobytes())  # Write audio data to file

# Function to transcribe audio
def transcribe_audio(filename):
    """Transcribe audio from a file using the Whisper model."""
    # Initialize the Whisper model
    model_size = "large-v3"
    model = WhisperModel(model_size, device="cuda", compute_type="float32")

    # Transcribe audio from the file
    segments, info = model.transcribe(filename, beam_size=5)

    # Output detected language and transcription
    print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

    for segment in segments:
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

# Main workflow
if __name__ == "__main__":
    # Record audio
    audio_data = record_audio(DURATION, SAMPLE_RATE)

    # Save the recorded audio to a WAV file
    audio_filename = "recorded_audio.wav"
    save_audio_to_wav(audio_data, audio_filename, SAMPLE_RATE)

    # Transcribe the saved audio file
    transcribe_audio(audio_filename)


Recording for 5 seconds...


In [None]:
import os
import wave
import pyaudio
import numpy as np
from scipy.io import wavfile
from faster_whisper import WhisperModel

# Default configurations
DEFAULT_MODEL_SIZE = "medium"
DEFAULT_CHUNK_LENGTH = 10  # seconds


def calculate_speaking_pace(transcription, chunk_length):
    """Calculate the speaking pace in words per second."""
    words = transcription.split()
    num_words = len(words)
    speaking_rate = num_words / chunk_length  # Words per second
    return speaking_rate


def is_silence(data, max_amplitude_threshold=3000):
    """Check if audio data contains silence based on max amplitude."""
    max_amplitude = np.max(np.abs(data))
    return max_amplitude <= max_amplitude_threshold


def record_audio_chunk(audio, stream, chunk_length=DEFAULT_CHUNK_LENGTH):
    """Record a chunk of audio and save it to a temporary file."""
    frames = []
    for _ in range(0, int(16000 / 1024 * chunk_length)):
        data = stream.read(1024)
        frames.append(data)

    # Save the recorded chunk to a temporary WAV file
    temp_file_path = 'temp_audio_chunk.wav'
    with wave.open(temp_file_path, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
        wf.setframerate(16000)
        wf.writeframes(b''.join(frames))

    # Check if the recorded chunk contains silence
    try:
        samplerate, data = wavfile.read(temp_file_path)
        if is_silence(data):
            os.remove(temp_file_path)
            return False  # Return False if it's silence
        return True  # Return True if it contains speech
    except Exception as e:
        print(f"Error while reading audio file: {e}")
        return False


def transcribe_audio(model, file_path):
    """Transcribe the audio file using the Whisper model."""
    segments, info = model.transcribe(file_path, beam_size=7)
    transcription = ' '.join(segment.text for segment in segments)
    return transcription


def audio_to_text():
    """Main function to record and transcribe audio."""
    model_size = DEFAULT_MODEL_SIZE + ".en"
    model = WhisperModel(model_size, device="cpu")

    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000,
                        input=True, frames_per_buffer=1024)
    
    try:
        print("Recording... (Press Ctrl+C to stop)")
        while True:
            # Record audio chunk
            print("Listening for speech...")
            if record_audio_chunk(audio, stream):
                # Transcribe audio if speech is detected
                transcription = transcribe_audio(model, 'temp_audio_chunk.wav')
                os.remove('temp_audio_chunk.wav')

                # Calculate and print speaking pace
                speaking_pace = calculate_speaking_pace(transcription, DEFAULT_CHUNK_LENGTH)
                print(f"Customer: {transcription}")
                print(f"Speaking pace: {speaking_pace:.2f} words per second")
            else:
                print("Silence detected, waiting for speech...")

    except KeyboardInterrupt:
        print("\nStopping...")

    finally:
        stream.stop_stream()
        stream.close()
        audio.terminate()


if __name__ == "__main__":
    audio_to_text()


Recording... (Press Ctrl+C to stop)
Listening for speech...
Customer:  Is it working? How are you? The body is working or not? Hello? Hello?
Speaking pace: 1.40 words per second
Listening for speech...


In [1]:
!pip install roboflow

from roboflow import Roboflow
rf = Roboflow(api_key="i26yzPgXSLDL6CMF0ZfF")
project = rf.workspace("project-eqxcg").project("food_detection-vmjvx")
version = project.version(13)
dataset = version.download("multiclass")
                


[notice] A new release of pip is available: 23.3.2 -> 24.2
[notice] To update, run: c:\Users\ayush\AppData\Local\Programs\Python\Python311\python.exe -m pip install --upgrade pip


loading Roboflow workspace...
loading Roboflow project...
Downloading Dataset Version Zip in Food_detection-13 to multiclass: 100% [10738545 / 10738545] bytes


Extracting Dataset Version Zip to Food_detection-13 in multiclass:: 100%|██████████| 475/475 [00:02<00:00, 187.97it/s]


In [1]:
from flask import Flask, render_template, request
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from transformers import BitsAndBytesConfig

# Initialize Flask app
app = Flask(__name__)

from huggingface_hub import login
login(token="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")

# Setup your model and tokenizer (same as your script above)
base_model_id = "mistralai/Mistral-7B-Instruct-v0.2"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    quantization_config=bnb_config,
    device_map="cuda:0",
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=512,
    padding_side="left",
    add_eos_token=True
)

tokenizer.pad_token = tokenizer.eos_token
config = PeftConfig.from_pretrained("D:\project")
model = PeftModel.from_pretrained(base_model, "D:\project")


# Function to generate output based on input prompt
def generate_output(prompt):
    max_new_tokens = 200
    model_input = tokenizer(prompt, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        output = model.generate(**model_input, max_new_tokens=max_new_tokens)

    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_output


# Define Flask routes
@app.route('/')
def index():
    return render_template('index.html')  # Main page with the form

@app.route('/generate', methods=['POST'])
def generate():
    prompt = request.form['prompt']
    generated_text = generate_output(prompt)  # Call generate_output from model
    return jsonify({'generated_text': generated_text})


if __name__ == "__main__":
    app.run(debug=True)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to C:\Users\ayush\.cache\huggingface\token
Login successful


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
 * Restarting with watchdog (windowsapi)


SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
!pip install peft