In [1]:
# Install required packages
!pip install SpeechRecognition pydub gTTS

import speech_recognition as sr
from pydub import AudioSegment
from google.colab import files
from gtts import gTTS

# ===============================
# 1. Function to upload and transcribe audio file
# ===============================
def upload_and_transcribe():
    """Ask for an audio upload in Colab and print its transcription.

    The uploaded file is converted to ``converted.wav`` with pydub unless its
    name already ends in ``.wav`` (case-insensitive), then sent to the Google
    Web Speech recognizer. If several files are uploaded, only the last one
    is transcribed.

    Returns:
        The recognized text, or ``None`` when nothing was uploaded or the
        recognition failed.
    """
    print("Please upload an audio file (.wav or .mp3):")
    uploaded = files.upload()
    if not uploaded:
        # Nothing came back from the upload dialog; without this guard
        # `file_name` would be unbound further down.
        print("❌ No file uploaded.")
        return None

    # Keep the established behaviour of using the last uploaded file.
    file_name = list(uploaded)[-1]

    # Convert anything that is not already WAV. The extension check ignores
    # case so that e.g. "CLIP.WAV" is not needlessly re-encoded.
    if not file_name.lower().endswith(".wav"):
        sound = AudioSegment.from_file(file_name)
        file_name = "converted.wav"
        sound.export(file_name, format="wav")

    recognizer = sr.Recognizer()
    with sr.AudioFile(file_name) as source:
        audio_data = recognizer.record(source)

    try:
        text = recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        print("❌ Could not understand audio.")
        return None
    except sr.RequestError:
        print("❌ API request error.")
        return None

    print("\n📝 Transcription from file:\n", text)
    return text

# ===============================
# 2. Function to generate sample audio for testing
# ===============================
def generate_sample_audio(
    text="Hello, this is a test for the speech to text transcription project.",
    output_path="sample_audio.mp3",
):
    """Synthesize a spoken sentence with gTTS and save it as an MP3 file.

    Args:
        text: Sentence to synthesize. Defaults to the project's test sentence.
        output_path: Path of the MP3 file to write. Defaults to
            ``sample_audio.mp3``.
    """
    tts = gTTS(text)
    tts.save(output_path)
    print(f"✅ Sample audio created: {output_path}")

# ===============================
# 3. Main Menu
# ===============================
print("Speech-to-Text Transcription Tool")
print("Options:\n1 - Upload audio file\n2 - Generate sample audio")

# Each menu key maps to the function that handles it.
menu_actions = {
    "1": upload_and_transcribe,
    "2": generate_sample_audio,
}

choice = input("Enter choice (1/2): ").strip()
selected_action = menu_actions.get(choice)

if selected_action is None:
    print("❌ Invalid choice.")
else:
    selected_action()

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gTTS)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition, click, gTTS
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed SpeechRecognition-3.14.3 click-8.1.8 gTTS-2.5.4
Speech-to-Text Transcription To

Saving sample_audio.mp3 to sample_audio.mp3

📝 Transcription from file:
 hello this is a test for the speech to text transcription project


In [None]:
# Create sample_audio.mp3, a spoken test sentence that can be fed to the transcriber.
generate_sample_audio()

✅ Sample audio created: sample_audio.mp3


In [2]:
!pip install SpeechRecognition pydub

import speech_recognition as sr
import io
import base64
from google.colab import output
from IPython.display import Javascript
from pydub import AudioSegment # Import pydub

# Path of the WAV file that each recorded chunk is written to.
AUDIO_FILE = "live_audio.wav"

# JavaScript for recording audio in the browser.
# `record(sec)` captures `sec` seconds from the microphone and resolves to a
# base64 data URL of the recording.
# NOTE(review): the blob is labelled 'audio/wav', but the bytes are whatever
# MediaRecorder produces (presumably a compressed container); the Python side
# re-encodes to WAV with pydub, so the label is nominal only.
JS_CODE = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time));
const b2text = blob => new Promise(resolve => {
    const reader = new FileReader();
    reader.onloadend = () => resolve(reader.result);
    reader.readAsDataURL(blob);
});

async function record(sec){
  const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
  try {
    const recorder = new MediaRecorder(stream);
    const chunks = [];
    recorder.ondataavailable = e => chunks.push(e.data);
    // Register the stop handler before stopping so the event cannot be missed.
    const stopped = new Promise(resolve => recorder.onstop = resolve);
    recorder.start();
    await sleep(sec*1000);
    recorder.stop();
    await stopped;
    const blob = new Blob(chunks, { type: 'audio/wav' });
    return await b2text(blob);
  } finally {
    // Release the microphone so it does not stay open between chunks.
    stream.getTracks().forEach(track => track.stop());
  }
}
"""

# Load the recorder script into the cell output so record_audio() can call record() through output.eval_js.
display(Javascript(JS_CODE))

# Function to record audio from mic
def record_audio(seconds=5):
    """Record from the browser microphone and store the clip as a WAV file.

    Runs the JavaScript ``record`` helper for ``seconds`` seconds, decodes the
    base64 data URL it returns and re-encodes the bytes to ``AUDIO_FILE`` via
    pydub.

    Returns:
        ``AUDIO_FILE`` on success, or ``None`` if recording or conversion
        failed (the error is printed).
    """
    # Stage 1: capture in the browser and decode the data-URL payload.
    try:
        data_url = output.eval_js(f"record({seconds})")
        encoded_payload = data_url.split(',')[1]
        raw_bytes = base64.b64decode(encoded_payload)
    except Exception as err:
        print("Error recording:", err)
        return None

    # Stage 2: let pydub read the recording and write it out as WAV.
    try:
        segment = AudioSegment.from_file(io.BytesIO(raw_bytes))
        segment.export(AUDIO_FILE, format="wav")
    except Exception as err:
        print(f"Error converting audio: {err}")
        return None

    return AUDIO_FILE

# Transcription function
def transcribe(file_path):
    """Transcribe an audio file with the Google Web Speech recognizer.

    Args:
        file_path: Path of an audio file readable by ``sr.AudioFile``.

    Returns:
        The recognized text, ``"[Could not understand audio]"`` when the
        speech is unintelligible, or an ``"[API request error: ...]"`` marker
        when the recognition service could not be reached.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(file_path) as source:
        audio_data = recognizer.record(source)
    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        return "[Could not understand audio]"
    except sr.RequestError as err:
        # A service or network failure is not the same as unintelligible
        # speech; report it as such instead of hiding it. Anything else
        # (including KeyboardInterrupt) is no longer swallowed.
        return f"[API request error: {err}]"

# "Live" loop
def live_transcription(chunks=3, chunk_duration=5):
    """Record and transcribe several consecutive microphone chunks.

    Args:
        chunks: Number of recordings to make.
        chunk_duration: Length of each recording in seconds.
    """
    print("🎙️ Starting live transcription...\n")
    for chunk_number in range(1, chunks + 1):
        print(f"Recording chunk {chunk_number}/{chunks}...")
        recorded_path = record_audio(chunk_duration)
        if not recorded_path:
            # record_audio signals failure with a falsy return value.
            print("Skipping transcription for this chunk due to recording error.\n")
            continue
        print("You said:", transcribe(recorded_path), "\n")

# Run live mode: 3 chunks of 5 seconds each.
# Recording happens in the browser, which has to grant microphone access.
live_transcription(chunks=3, chunk_duration=5)



<IPython.core.display.Javascript object>

🎙️ Starting live transcription...

Recording chunk 1/3...
You said: [Could not understand audio] 

Recording chunk 2/3...
You said: hello 

Recording chunk 3/3...
You said: hi Daddy hi Daddy 

