# WhisperX Transcription for Silksong Gesture Controller

**Project:** Hollow Knight: Silksong Gesture Recognition

**Purpose:** Transcribe audio recordings with word-level timestamps using WhisperX large-v3

**Hardware:** GPU-accelerated (CUDA)

---

## Setup Instructions

1. **Enable GPU:** Runtime → Change runtime type → GPU
2. **Upload audio files** to Google Drive: `My Drive/silksong_data/[session_name]/audio_16k.wav`
3. **Run all cells** in order (sections 8 and 9 are optional; section 8 reuses the models loaded in section 5)

---

## 1. Verify GPU Access

In [None]:
# Check GPU availability
!nvidia-smi

import torch
print(f"\nCUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")

## 2. Install WhisperX and Dependencies

In [None]:
# Install WhisperX
# Pulled directly from the upstream GitHub repository; -q keeps pip's output quiet.
!pip install -q git+https://github.com/m-bain/whisperx.git

# Install PyTorch with CUDA support
# Uses the PyTorch wheel index for CUDA 11.8 (cu118).
# NOTE(review): this runs after WhisperX has presumably installed its own torch
# dependency, and no --upgrade flag is given, so pip may leave the existing
# torch in place — confirm which torch/CUDA versions actually end up installed.
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# NOTE(review): this message is printed regardless of whether the pip commands
# above succeeded — check the cell output for pip errors.
print("✅ Installation complete!")

## 3. Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Confirm the project folder is reachable and list the session folders in it
import os
base_path = '/content/drive/My Drive/silksong_data'

if not os.path.exists(base_path):
    print(f"⚠️  Path not found: {base_path}")
    print("   Please create the folder structure in Google Drive first.")
else:
    print("✅ Google Drive mounted successfully!")
    print("\nAvailable sessions:")
    # Only sub-directories count as sessions; loose files are ignored
    sessions = []
    for entry in os.listdir(base_path):
        if os.path.isdir(os.path.join(base_path, entry)):
            sessions.append(entry)
    for session in sessions:
        print(f"  - {session}")

## 4. Define Custom Prompt for Gesture Commands

In [None]:
# Prompt text describing the recording context and the spoken gesture commands.
# It is assembled sentence by sentence; joining with single spaces yields one
# continuous paragraph.
_PROMPT_SENTENCES = [
    "The following is a transcription of a person playing the video game "
    "Hollow Knight: Silksong.",
    "They are speaking their character's actions out loud.",
    "The key commands are: jump, punch, attack, turn, walk, walking, walk start, "
    "idle, rest, stop, noise.",
    "The speaker might say phrases like 'I'm gonna jump here', "
    "'punch punch', 'let me walk over there', 'okay, now idle', or 'that was noise'.",
]
CUSTOM_PROMPT = " ".join(_PROMPT_SENTENCES)

# Echo only the first 100 characters as a sanity check
prompt_preview = CUSTOM_PROMPT[:100]
print("Custom prompt configured:")
print(f"  {prompt_preview}...")

## 5. Single Session Transcription

In [None]:
# CONFIGURE THIS: Set your session name
SESSION_NAME = "20251017_125600_session"  # Change this to your session

# Session folder on Google Drive (trailing slash kept) and the audio file inside it
output_dir = f"/content/drive/My Drive/silksong_data/{SESSION_NAME}/"
audio_path = f"{output_dir}audio_16k.wav"

# Report on the audio file if it is present; otherwise tell the user where to put it
if os.path.exists(audio_path):
    print(f"✅ Audio file found: {audio_path}")

    # File size in MiB
    size_bytes = os.path.getsize(audio_path)
    size_mb = size_bytes / (1024 * 1024)
    print(f"   Size: {size_mb:.2f} MB")

    # Duration via librosa; the time estimate assumes half of real time
    import librosa
    duration = librosa.get_duration(path=audio_path)
    duration_minutes = duration / 60
    print(f"   Duration: {duration_minutes:.2f} minutes")
    print(f"   Estimated transcription time: {duration_minutes * 0.5:.1f} minutes")
else:
    print(f"❌ ERROR: Audio file not found: {audio_path}")
    print("   Please upload audio_16k.wav to this location in Google Drive.")

In [None]:
# Run WhisperX transcription: load the ASR model, transcribe the session audio
# with the custom gesture prompt, then force-align to get word-level timestamps.
# Reads `audio_path` and `CUSTOM_PROMPT` from earlier cells; leaves `model`,
# `model_a`, `metadata`, `device`, `audio`, `result` and `word_count` defined
# for the cells that follow.
import whisperx
import torch

print("🚀 Starting WhisperX transcription...\n")

# Set device (float16 is only used on CUDA; CPU falls back to float32)
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "float32"

print(f"Device: {device}")
print(f"Compute type: {compute_type}\n")

# Load model
# WhisperX takes decoding options such as the initial prompt through
# load_model(asr_options=...). The pipeline's transcribe() method has no
# `initial_prompt` parameter, so the prompt must be configured here.
print("Loading WhisperX model (large-v3)...")
model = whisperx.load_model(
    "large-v3",
    device=device,
    compute_type=compute_type,
    language="en",
    asr_options={"initial_prompt": CUSTOM_PROMPT},
)
print("✅ Model loaded\n")

# Load audio
print("Loading audio...")
audio = whisperx.load_audio(audio_path)
print("✅ Audio loaded\n")

# Transcribe (the custom prompt is already part of the model's ASR options)
print("Transcribing with custom gesture prompt...")
result = model.transcribe(
    audio,
    batch_size=16,
)
print("✅ Transcription complete!")
print(f"   Segments: {len(result['segments'])}")
print(f"   Language: {result.get('language', 'unknown')}\n")

# Apply forced alignment for word-level timestamps
print("Applying forced alignment for word-level timestamps...")
model_a, metadata = whisperx.load_align_model(
    language_code="en",
    device=device
)

# `result` is replaced by the aligned output from here on
result = whisperx.align(
    result["segments"],
    model_a,
    metadata,
    audio,
    device,
    return_char_alignments=False
)

# Count words (segments without a 'words' list contribute zero)
word_count = sum(len(seg.get('words', [])) for seg in result.get('segments', []))
print("✅ Alignment complete!")
print(f"   Words with timestamps: {word_count}\n")

In [None]:
# Persist the aligned transcription as JSON plus a short plain-text summary
import json

output_json = os.path.join(output_dir, "whisperx_output.json")
output_txt = os.path.join(output_dir, "whisperx_output_summary.txt")

# Full result as JSON; non-ASCII characters are written as-is
with open(output_json, 'w', encoding='utf-8') as json_file:
    json.dump(result, json_file, indent=2, ensure_ascii=False)
print(f"✅ Saved JSON: {output_json}")

# Summary: header with counts, then the first 50 words with their timings
with open(output_txt, 'w', encoding='utf-8') as summary:
    segment_total = len(result.get('segments', []))
    header = (
        "WhisperX Transcription Summary\n"
        + "=" * 70 + "\n\n"
        + f"Session: {SESSION_NAME}\n"
        + f"Segments: {segment_total}\n"
        + f"Words: {word_count}\n\n"
    )
    summary.write(header)

    summary.write("First 50 words with timestamps:\n")
    summary.write("-" * 70 + "\n")

    # Flatten the per-segment word lists into one list, in transcript order
    all_words = [
        entry
        for seg in result.get('segments', [])
        for entry in seg.get('words', [])
    ]

    # Missing fields fall back to '' / 0 so every row can be formatted
    for rank, entry in enumerate(all_words[:50], start=1):
        text = entry.get('word', '')
        t_start = entry.get('start', 0)
        t_end = entry.get('end', 0)
        confidence = entry.get('score', 0)
        summary.write(
            f"{rank:3d}. {t_start:7.2f}s-{t_end:7.2f}s | {text:20s} | conf: {confidence:.3f}\n"
        )

print(f"✅ Saved summary: {output_txt}")
print("\n🎉 Transcription complete! Results saved to Google Drive.")

## 6. Preview Results

In [None]:
# Display first few segments with their word-level timestamps.
# Reads `result` and `word_count` produced by the transcription cell.
print("First 5 segments with word-level timestamps:\n")
print("=" * 70)

for i, segment in enumerate(result['segments'][:5], 1):
    print(f"\n[Segment {i}] {segment['start']:.2f}s - {segment['end']:.2f}s")
    print(f"Text: {segment['text']}")

    if 'words' in segment:
        print("Words:")
        for word_info in segment['words']:
            # Words the aligner could not place may lack 'start'/'end'/'score',
            # so read them with .get() (as the save cell does) instead of indexing.
            word = word_info.get('word', '')
            start = word_info.get('start')
            end = word_info.get('end')
            conf = word_info.get('score', 0)
            if start is None or end is None:
                # Same 15-character column width as the "start-end" field below
                print(f"  {'(unaligned)':>15s}: '{word}'")
            else:
                print(f"  {start:6.2f}s-{end:6.2f}s: '{word}' (conf: {conf:.3f})")

print("\n" + "=" * 70)
print(f"Total segments: {len(result['segments'])}")
print(f"Total words: {word_count}")

## 7. Gesture Keyword Analysis

In [None]:
# Count gesture keywords in transcription.
# Reads `result` from the transcription cell and prints a per-keyword frequency
# table with a simple bar chart.
import string
from collections import Counter

# Matching is done per word, so only single-word commands are listed here
gesture_keywords = ['jump', 'punch', 'attack', 'turn', 'walk', 'walking', 'idle', 'rest', 'stop', 'noise']
keyword_set = frozenset(gesture_keywords)

# Characters removed from both ends of each token before matching, so that
# "Jump," or "'punch" still count as keywords (inner apostrophes are kept).
edge_chars = string.whitespace + string.punctuation

# Extract all words, lower-cased and with surrounding punctuation removed
all_words = []
for seg in result['segments']:
    for word_info in seg.get('words', []):
        all_words.append(word_info['word'].lower().strip(edge_chars))

# Count gesture keywords
gesture_counts = Counter(word for word in all_words if word in keyword_set)

total_words = len(all_words)
keyword_hits = sum(gesture_counts.values())
# Guard against an empty transcription (no aligned words at all)
coverage = keyword_hits / total_words * 100 if total_words else 0.0

# Display results
print("Gesture Keyword Frequency Analysis")
print("=" * 70)
print(f"\nTotal words transcribed: {total_words}")
print(f"Gesture keywords found: {keyword_hits}")
print(f"Coverage: {coverage:.1f}%\n")

print("Gesture breakdown:")
for keyword in sorted(gesture_keywords):
    count = gesture_counts.get(keyword, 0)
    if count > 0:
        # One bar character per two occurrences
        bar = '█' * (count // 2)
        print(f"  {keyword:10s}: {count:3d} {bar}")
    else:
        print(f"  {keyword:10s}: {count:3d}")

print("\n" + "=" * 70)

## 8. Batch Processing (Optional)

In [None]:
# CONFIGURE THIS: List of session names to process
SESSIONS_TO_PROCESS = [
    "20251017_125600_session",
    # "20251017_130000_session",
    # "20251017_131500_session",
    # Add more session names here
]

# Imported here so this cell does not depend on the import cells above
import json
import os

# Batch-transcribe every listed session with the already-loaded models
# (`model`, `model_a`, `metadata`, `device` come from the single-session cell)
# and write whisperx_output.json into each session folder.
base_path = "/content/drive/My Drive/silksong_data/"

# WhisperX reads the initial prompt from the options the model was loaded with
# (load_model(..., asr_options={"initial_prompt": ...})); transcribe() does not
# accept an `initial_prompt` argument. Warn if the loaded model does not carry
# the custom prompt.
# NOTE(review): assumes the pipeline exposes its options as `model.options` — confirm.
active_prompt = getattr(getattr(model, "options", None), "initial_prompt", None)
if active_prompt != CUSTOM_PROMPT:
    print("⚠️  Loaded model is not configured with CUSTOM_PROMPT; reload it with "
          "asr_options={'initial_prompt': CUSTOM_PROMPT} to apply the prompt.\n")

print(f"Batch processing {len(SESSIONS_TO_PROCESS)} session(s)\n")
print("=" * 70)

for i, session in enumerate(SESSIONS_TO_PROCESS, 1):
    print(f"\n[{i}/{len(SESSIONS_TO_PROCESS)}] Processing: {session}")
    print("-" * 70)

    audio_path = os.path.join(base_path, session, "audio_16k.wav")
    output_dir = os.path.join(base_path, session)

    if not os.path.exists(audio_path):
        print("⚠️  Skipping - audio file not found")
        continue

    # Best-effort per session: a failure is reported and the batch moves on
    try:
        # Transcribe
        audio = whisperx.load_audio(audio_path)
        result = model.transcribe(audio, batch_size=16)

        # Align
        result = whisperx.align(
            result["segments"],
            model_a,
            metadata,
            audio,
            device,
            return_char_alignments=False
        )

        # Save
        output_json = os.path.join(output_dir, "whisperx_output.json")
        with open(output_json, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        word_count = sum(len(seg.get('words', [])) for seg in result.get('segments', []))
        print(f"✅ Complete - {len(result['segments'])} segments, {word_count} words")

    except Exception as e:
        print(f"❌ Error: {e}")

print("\n" + "=" * 70)
print("🎉 Batch processing complete!")

## 9. Download Results (Optional)

In [None]:
# Send the transcription JSON to the browser as a file download
from google.colab import files

# The JSON is looked up in whatever `output_dir` currently points to
output_json = os.path.join(output_dir, "whisperx_output.json")

if not os.path.exists(output_json):
    print(f"❌ File not found: {output_json}")
else:
    print(f"Downloading: {output_json}")
    files.download(output_json)
    print("✅ Download started! Check your browser's download folder.")

---

## Next Steps

After transcription completes:

1. **Download** the `whisperx_output.json` file from Google Drive
2. **Move** to your local project: `data/continuous/[session_name]/whisperx_output.json`
3. **Run label alignment** on your local machine:
   ```bash
   python align_voice_labels.py \
     --session [session_name] \
     --whisper data/continuous/[session_name]/whisperx_output.json
   ```
4. **Continue** with Phase III training data preparation

---

## Resources

- **Project Documentation:** `docs/Phase_V/CLOUD_GPU_GUIDE.md`
- **WhisperX GitHub:** https://github.com/m-bain/whisperx
- **Colab FAQ:** https://research.google.com/colaboratory/faq.html