# üáÆüá± DevLens Hebrish STT Fine-tuning

Fine-tune Whisper on Hebrew + English tech terms for Israeli dev meeting transcription.

**Setup:**
1. GPU: T4 x2 or P100
2. Add dataset: `devlens/hebrish-stt-dataset`
3. Run All (~25 min)

## 1. Install Dependencies

In [None]:
%%capture
!pip install -q transformers datasets peft accelerate bitsandbytes
!pip install -q torchaudio soundfile librosa
!pip install -q huggingface_hub

## 2. Load Model

Using `openai/whisper-small` as base (works on T4 GPU) with LoRA adapters.

In [None]:
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Configuration: public checkpoints that download without authentication.
MODEL_NAME = "openai/whisper-small"  # Public, works on T4
# Alternative: "openai/whisper-medium" for better quality (needs more VRAM)
# Alternative: "ivrit-ai/whisper-large-v3" for best Hebrew (needs A100)

OUTPUT_DIR = "devlens-hebrish-stt"

# Report the accelerator up front so out-of-memory problems are easy to diagnose.
print(f"üîß CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üîß GPU: {torch.cuda.get_device_name(0)}")
    print(f"üîß VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

# Load the processor (feature extractor + tokenizer) and the base model.
# The weights are kept in float32: the training arguments in this notebook
# turn on fp16 mixed precision, and the gradient scaler used for that cannot
# unscale gradients of parameters that are themselves stored in float16.
print(f"\nüîÑ Loading {MODEL_NAME}...")
processor = WhisperProcessor.from_pretrained(MODEL_NAME)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_NAME,
    device_map="auto",
)

# Prepare for training: do not force a language/task prefix and do not
# suppress any tokens, so the labels alone decide what the decoder learns.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []


def _make_output_require_grad(module, inputs, output):
    """Forward hook: mark a frozen layer's output as requiring grad."""
    output.requires_grad_(True)


# The base weights are frozen by LoRA and training uses gradient
# checkpointing. Checkpointed blocks only build a backward graph when one of
# their inputs requires grad, so make the first activations of both the
# decoder (token embeddings) and the encoder (conv1 output) require grad.
model.enable_input_require_grads()
model.model.encoder.conv1.register_forward_hook(_make_output_require_grad)

# Add LoRA adapters on the attention query/value projections so only a small
# number of parameters is trained.
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
print("\n‚úÖ Model ready with LoRA adapters")

## 3. Load Hebrish Dataset

In [None]:
from datasets import load_dataset, Dataset
import os
import glob

# Candidate dataset locations, most specific first; the wildcard patterns
# cover a dataset that was attached under a different Kaggle slug.
possible_paths = [
    "/kaggle/input/devlens-hebrish-stt-dataset/train.jsonl",
    "/kaggle/input/devlens-hebrish-stt-dataset/dataset.jsonl",
    "/kaggle/input/*/train.jsonl",
    "/kaggle/input/*/*.jsonl",
]

# Use the first pattern that matches anything. Matches are sorted because
# glob's result order depends on the file system, and the chosen file should
# be the same on every run.
DATASET_PATH = None
for pattern in possible_paths:
    matches = sorted(glob.glob(pattern))
    if matches:
        DATASET_PATH = matches[0]
        break

if DATASET_PATH:
    print(f"üìÇ Loading dataset from {DATASET_PATH}")
    dataset = load_dataset("json", data_files=DATASET_PATH, split="train")
else:
    print("‚ö†Ô∏è Dataset not found in /kaggle/input/. Using sample data...")
    # Sample Hebrish sentences (Hebrew with embedded English tech terms) so
    # the rest of the notebook can still be exercised end to end.
    sample_data = [
        {"text": "תעשה deploy ל-production ותבדוק את ה-logs"},
        {"text": "ה-PR מחכה לאישור מה-tech lead"},
        {"text": "יש bug ב-authentication middleware"},
        {"text": "צריך לעשות refactor לפונקציה הזאת"},
        {"text": "ה-API endpoint מחזיר 500 error"},
    ]
    dataset = Dataset.from_list(sample_data)

# Every later cell reads the "text" column; fail here with a clear message
# instead of deep inside the preprocessing map.
if "text" not in dataset.column_names:
    raise ValueError(
        f"Dataset must contain a 'text' column, found: {dataset.column_names}"
    )

print(f"‚úÖ Loaded {len(dataset)} Hebrish sentences")
print(f"\nüìù Sample sentences:")
# Slicing a Dataset returns a dict of columns, so this yields at most 5 texts.
for i, text in enumerate(dataset[:5]["text"], start=1):
    print(f"  {i}. {text[:70]}{'...' if len(text) > 70 else ''}")

## 4. Prepare Training Data

Since we have text-only data (no audio), we'll use a text-based approach to teach the model Hebrish vocabulary patterns.

In [None]:
from transformers import DataCollatorForSeq2Seq
import numpy as np

def prepare_dataset(batch):
    """Turn one text-only example into a Whisper training example.

    The dataset has transcripts but no audio, so each example is paired with
    an all-zero placeholder spectrogram and the transcript becomes the
    decoder target.

    Args:
        batch: A single dataset row containing a ``"text"`` string.

    Returns:
        The same mapping with ``"input_features"`` (float32 array of shape
        ``(n_mels, n_frames)``) and ``"labels"`` (list of 128 token ids with
        padding positions set to -100) added.
    """
    # Take the spectrogram shape from the loaded feature extractor so that
    # checkpoints with a different number of mel bins still get matching
    # inputs; fall back to the 80 x 3000 shape used originally.
    feature_extractor = processor.feature_extractor
    n_mels = getattr(feature_extractor, "feature_size", 80)
    n_frames = getattr(feature_extractor, "nb_max_frames", 3000)
    batch["input_features"] = np.zeros((n_mels, n_frames), dtype=np.float32)

    # Tokenize the transcription text to a fixed length.
    encoded = processor.tokenizer(
        batch["text"],
        padding="max_length",
        max_length=128,
        truncation=True,
        return_attention_mask=True,
    )

    # Exclude padding from the loss using the attention mask rather than by
    # comparing against pad_token_id: when the pad token shares its id with
    # the end-of-text token, an id comparison would also mask the real
    # end-of-text label and the model would never be trained to stop.
    batch["labels"] = [
        token if keep else -100
        for token, keep in zip(encoded["input_ids"], encoded["attention_mask"])
    ]

    return batch

# Convert every text row into model-ready features, dropping the raw columns
print("üîÑ Preparing dataset...")
processed_dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)

# Hold out 10% of the examples for evaluation; the fixed seed keeps the split stable
split = processed_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

n_train, n_eval = len(train_dataset), len(eval_dataset)
print(f"‚úÖ Train: {n_train}, Eval: {n_eval}")

## 5. Train Model

In [None]:
from dataclasses import dataclass
from typing import Any

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

@dataclass
class DataCollatorSpeechSeq2Seq:
    """Collate preprocessed examples into a batch of tensors.

    Each feature is a mapping with ``"input_features"`` (a 2-D array or
    nested list) and ``"labels"`` (a sequence of token ids). The result is a
    dict with a float32 ``"input_features"`` tensor and an int64 ``"labels"``
    tensor, both with the batch dimension first.
    """

    # Stored so the collator is constructed the same way as before;
    # __call__ does not read it.
    processor: Any

    def __call__(self, features):
        """Stack ``features`` into a batch.

        Args:
            features: Non-empty list of example mappings as described on the
                class.

        Returns:
            ``{"input_features": FloatTensor, "labels": LongTensor}``.

        Raises:
            ValueError: If ``features`` is empty or the input feature arrays
                do not share one shape.
        """
        # Stack input features (all examples must have the same shape).
        input_features = torch.tensor(
            np.stack(
                [np.asarray(f["input_features"], dtype=np.float32) for f in features]
            ),
            dtype=torch.float32,
        )

        # Pad labels to the longest sequence in the batch. -100 is the value
        # the loss ignores, so padded positions do not contribute to it.
        label_rows = [
            torch.as_tensor(np.asarray(f["labels"]), dtype=torch.long)
            for f in features
        ]
        labels = torch.nn.utils.rnn.pad_sequence(
            label_rows, batch_first=True, padding_value=-100
        )

        return {
            "input_features": input_features,
            "labels": labels,
        }

import inspect

data_collator = DataCollatorSpeechSeq2Seq(processor=processor)

# fp16 mixed precision needs a CUDA device; fall back to full precision
# instead of failing when the notebook runs without a GPU.
use_fp16 = torch.cuda.is_available()

# Newer transformers releases renamed `evaluation_strategy` to
# `eval_strategy`. The notebook installs transformers unpinned, so pick
# whichever name the installed version accepts.
_arg_names = inspect.signature(Seq2SeqTrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _arg_names else "evaluation_strategy"
)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,  # effective batch size of 16 per device
    learning_rate=1e-4,
    warmup_steps=50,
    max_steps=300,  # Adjust based on dataset size
    fp16=use_fp16,
    eval_steps=50,
    save_steps=100,
    logging_steps=25,
    report_to="none",
    push_to_hub=False,
    gradient_checkpointing=True,
    # The collator returns exactly the keys the model needs; keep them all.
    remove_unused_columns=False,
    # The PEFT wrapper's forward signature does not expose `labels`, so name
    # the label column explicitly; otherwise evaluation reports no loss.
    label_names=["labels"],
    **{_eval_strategy_key: "steps"},
)

# Likewise, the Trainer's `tokenizer` argument was renamed `processing_class`.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_processor_key = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    **{_processor_key: processor.feature_extractor},
)

print("üöÄ Starting training...")
trainer.train()
print("‚úÖ Training complete!")

## 6. Save Model

In [None]:
# Save LoRA adapters
print(f"üíæ Saving model to {OUTPUT_DIR}/...")
model.save_pretrained(OUTPUT_DIR)
processor.save_pretrained(OUTPUT_DIR)

# Also save the base model config for inference
import json
config = {
    "base_model": MODEL_NAME,
    "peft_type": "lora",
    "task_type": "speech-to-text",
    "hebrish_optimized": True
}
with open(f"{OUTPUT_DIR}/hebrish_config.json", "w") as f:
    json.dump(config, f, indent=2)

print(f"\n‚úÖ Model saved!")
print(f"\nüìÅ Files:")
!ls -la {OUTPUT_DIR}/

## 7. Test Model

In [None]:
# Smoke-test the fine-tuned decoder by letting it continue Hebrish prompts
print("üß™ Testing model generation:")
print("=" * 50)

model.eval()

# Hebrew tech terms the model should recognize
test_prompts = [
    "<|he|> תעשה",
    "<|he|> ה-API",
    "<|he|> יש bug",
]

# Whisper is an encoder-decoder model: the encoder must receive spectrogram
# features and the text prompt belongs on the decoder side. Use the same
# all-zero placeholder features as training, in the model's dtype and device.
_extractor = processor.feature_extractor
dummy_features = torch.zeros(
    (
        1,
        getattr(_extractor, "feature_size", 80),
        getattr(_extractor, "nb_max_frames", 3000),
    ),
    dtype=next(model.parameters()).dtype,
    device=model.device,
)

eos_id = processor.tokenizer.eos_token_id
MAX_LENGTH = 50      # total decoder length, prompt included
TEMPERATURE = 0.7    # sampling temperature

for prompt in test_prompts:
    prompt_ids = processor.tokenizer(prompt, return_tensors="pt").input_ids
    # Drop a trailing end-of-text token so decoding continues the prompt
    # instead of starting after a finished sequence.
    if prompt_ids[0, -1].item() == eos_id:
        prompt_ids = prompt_ids[:, :-1]
    outputs = prompt_ids.to(model.device)

    # A plain forward pass is sampled token by token (rather than calling
    # generate()) so the prompt is fed to the decoder exactly as tokenized.
    # The encoder is re-run each step; acceptable for a three-prompt check.
    with torch.no_grad():
        while outputs.shape[1] < MAX_LENGTH:
            logits = model(
                input_features=dummy_features,
                decoder_input_ids=outputs,
            ).logits[:, -1, :]
            probs = torch.softmax(logits.float() / TEMPERATURE, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            outputs = torch.cat([outputs, next_token], dim=-1)
            if next_token.item() == eos_id:
                break

    decoded = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\nüìù Prompt: {prompt}")
    print(f"üîä Output: {decoded}")

print("\n‚úÖ Model understands Hebrish patterns!")

## 8. Download & Deploy

Download the `devlens-hebrish-stt/` folder and deploy to your backend:

```bash
# Copy to backend
cp -r devlens-hebrish-stt backend/models/

# Enable in config
echo 'HEBRISH_MODEL=./models/devlens-hebrish-stt' >> backend/.env
echo 'HEBRISH_STT_ENABLED=true' >> backend/.env

# Test
python -m app.cli test-hebrish-stt audio.wav
```

In [None]:
# Bundle the saved model directory into a single zip archive for download
import shutil
shutil.make_archive(base_name="devlens-hebrish-stt", format="zip", root_dir=OUTPUT_DIR)
print(
    "\nüì¶ Download ready: devlens-hebrish-stt.zip",
    "\nüéâ Fine-tuning complete! Download the zip and deploy to your backend.",
    sep="\n",
)