In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and echo the full path of every file found.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/audio-s1/Expats Asian Interview -30-35.mp3


In [None]:
# === Install dependencies (in Kaggle or local Jupyter) ===
!pip install -q faster-whisper librosa soundfile pydub

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.7/39.7 MB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.6/38.6 MB[0m [31m44.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.0/46.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.8/86.8 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# === Import libraries ===
from pydub import AudioSegment
from faster_whisper import WhisperModel
import librosa
import numpy as np
import soundfile as sf
from tempfile import NamedTemporaryFile
import os

# === Step 1: Convert MP3 to mono 16kHz WAV ===
input_mp3 = "/kaggle/input/interview/Audio.mp3"
output_wav = "/kaggle/working/audio_fixed.wav"

# The configured path may not match the dataset that is actually attached
# (the directory listing printed by the first cell shows a different MP3).
# When it is missing, fall back to the MP3 under /kaggle/input if there is
# exactly one; otherwise stop with an explicit message instead of guessing.
if not os.path.isfile(input_mp3):
    mp3_candidates = sorted(
        os.path.join(dirname, filename)
        for dirname, _subdirs, filenames in os.walk("/kaggle/input")
        for filename in filenames
        if filename.lower().endswith(".mp3")
    )
    if len(mp3_candidates) != 1:
        raise FileNotFoundError(
            f"{input_mp3} does not exist and {len(mp3_candidates)} MP3 file(s) "
            "were found under /kaggle/input; set input_mp3 explicitly."
        )
    input_mp3 = mp3_candidates[0]
    print(f"Configured input not found; using {input_mp3}")

# Decode the MP3, downmix to one channel, resample to 16 kHz and write a WAV
# that the following steps read back.
audio = AudioSegment.from_mp3(input_mp3)
audio = audio.set_channels(1).set_frame_rate(16000)
audio.export(output_wav, format="wav")

# === Step 2: Read the converted WAV back as a waveform ===
# sr=16000 is the same rate the WAV was exported with, so no resampling occurs.
audio_data, sample_rate = librosa.load(output_wav, sr=16000)

# === Step 3: Locate the non-silent stretches of the recording ===
# top_db=30 is the silence threshold (in dB) passed to librosa; each row of
# `intervals` is a (start, end) pair of sample indices into `audio_data`.
intervals = librosa.effects.split(audio_data, top_db=30)
chunk_count = len(intervals)
print(f"🔹 Detected {chunk_count} audio chunks")

# === Step 4: Load Whisper model ===
model = WhisperModel("large", device="cuda")  # use "cpu" if no GPU

# === Step 5: Transcribe each chunk ===
# Each chunk is handed to faster-whisper as an in-memory waveform instead of
# being written to a NamedTemporaryFile and reopened by name: reopening a
# still-open named temporary file is platform-dependent (it fails on Windows)
# and the disk round trip is unnecessary.
MIN_CHUNK_SECONDS = 1.0      # chunks shorter than this are skipped
WHISPER_SAMPLE_RATE = 16000  # rate faster-whisper expects for raw waveforms

# A raw array carries no sample-rate header, so it must already be at the rate
# the model expects; fail loudly rather than transcribe at the wrong speed.
if sample_rate != WHISPER_SAMPLE_RATE:
    raise ValueError(
        f"Expected audio at {WHISPER_SAMPLE_RATE} Hz but got {sample_rate} Hz; "
        "load the audio with sr=16000 before transcribing."
    )

all_text = []        # one transcript string per successfully transcribed chunk
failed_chunks = []   # 1-based numbers of chunks whose transcription raised

for idx, (start, end) in enumerate(intervals):
    duration = (end - start) / sample_rate
    if duration < MIN_CHUNK_SECONDS:
        continue  # skip very short segments

    print(f"🎧 Transcribing chunk {idx + 1}/{len(intervals)} ({duration:.2f}s)")

    # float32 is the dtype faster-whisper works with; copy=False avoids a copy
    # when the data is already float32.
    chunk = audio_data[start:end].astype(np.float32, copy=False)

    try:
        segments, _info = model.transcribe(chunk, language="en")
        # `segments` is produced lazily, so it is consumed here, inside the
        # try, to make sure decoding errors are caught for this chunk.
        chunk_text = "".join(seg.text for seg in segments)
        all_text.append(chunk_text)
    except Exception as e:
        # Best effort: keep going, but remember which chunk is missing.
        failed_chunks.append(idx + 1)
        print(f"⚠️ Error transcribing chunk {idx + 1}: {e}")

if failed_chunks:
    print(
        f"⚠️ {len(failed_chunks)} chunk(s) failed and are missing from the "
        f"transcript: {failed_chunks}"
    )

# === Step 6: Combine and save transcription ===
# Chunk strings can start or end with whitespace (Whisper segment text
# typically begins with a space — confirm against your output), so joining
# them as-is with " " leaves a leading blank and doubled spaces. Strip each
# chunk and drop the ones that end up empty before joining.
cleaned_chunks = [chunk_text.strip() for chunk_text in all_text]
final_text = " ".join(chunk_text for chunk_text in cleaned_chunks if chunk_text)
output_filename = "whisper_large_transcript.txt"

if not final_text:
    # Still write the (empty) file, but make the situation visible.
    print("⚠️ No text was transcribed; the output file will be empty.")

# Written relative to the current working directory, as UTF-8.
with open(output_filename, "w", encoding="utf-8") as f:
    f.write(final_text)

print(f"\n✅ Transcription complete. Output saved to: {output_filename}")
