In [None]:
# Cell 1: Data Prep
import pandas as pd
import os
import re

# Root of the competition input data (a Kaggle input path); every other path
# in this cell is derived from it.
BASE_PATH = "/kaggle/input/nppe-2-automatic-disfluency-restoration"
TRAIN_CSV_PATH = os.path.join(BASE_PATH, "train.csv")
DISFLUENCY_CSV_PATH = os.path.join(BASE_PATH, "unique_disfluencies.csv")
# Directory holding the audio clips; later code looks for "<id>.wav" inside it.
AUDIO_PATH = os.path.join(BASE_PATH, "downloaded_audios")

print("Loading data...")
train_df = pd.read_csv(TRAIN_CSV_PATH)
disfluency_df = pd.read_csv(DISFLUENCY_CSV_PATH)
# Set of the values in the 'disfluency' column, used for membership tests by
# create_clean_transcript.
# NOTE(review): presumably each entry is a single whitespace-free token, since
# transcripts are matched word by word — confirm against the CSV.
disfluency_set = set(disfluency_df['disfluency'])

def create_clean_transcript(disfluent_text):
    """Strip known disfluency tokens from a transcript.

    The text is split on whitespace and every token that appears in the
    module-level ``disfluency_set`` is dropped; the surviving tokens are
    re-joined with single spaces.

    Args:
        disfluent_text: The raw transcript. Anything that is not a ``str``
            (for example a NaN from pandas) is treated as missing.

    Returns:
        The cleaned transcript, or ``""`` for non-string input.
    """
    if isinstance(disfluent_text, str):
        kept_tokens = []
        for token in disfluent_text.split():
            if token in disfluency_set:
                continue
            kept_tokens.append(token)
        return " ".join(kept_tokens)
    return ""

print("Creating clean transcripts...")
# Training target: the transcript with every token found in disfluency_set removed.
train_df['clean_transcript'] = train_df['transcript'].map(create_clean_transcript)
# Each sample's audio is looked up as <AUDIO_PATH>/<id>.wav.
train_df['audio_file'] = train_df['id'].map(lambda x: os.path.join(AUDIO_PATH, f"{x}.wav"))

print("Cell 1 Complete. train_df is created.")

In [None]:
# Cell 2: Environment Setup
import os
os.environ["WANDB_DISABLED"] = "true" # Force-disable wandb from the start

# --- 1. Pin Protobuf ---
print("Pinning protobuf==3.20.3...")
!pip install protobuf==3.20.3 -q

# --- 2. Pin Datasets ---
print("Pinning datasets==2.16.1...")
!pip install datasets==2.16.1 -q

# --- 3. Install all other libraries ---
print("Installing transformers, librosa, accelerate, and other libs...")
!pip install transformers soundfile librosa jiwer 'accelerate>=0.26.0' evaluate -q

print("\n--- All dependencies installed. Cell 2 Complete. ---")

In [None]:
# Cell 3: Load, Filter, & Process
import pandas as pd
import os
import re
import numpy as np
import torch
import warnings
import librosa
import jiwer
from datasets import Dataset, DatasetDict, Audio
from sklearn.model_selection import train_test_split
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, AutoConfig
from dataclasses import dataclass
from typing import Any, Dict, List, Union

# --- 1. Set Single GPU ---
# Expose only CUDA device 0 to this process. This is set before the first
# CUDA query in this cell (torch.cuda.is_available() below).
os.environ["CUDA_VISIBLE_DEVICES"] = "0" 

# --- 2. Setup Device ---
# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on device: {DEVICE}")
# NOTE(review): installs a blanket "ignore" filter, so warnings raised after
# this point (including deprecation notices) are hidden.
warnings.filterwarnings("ignore")

# --- 3. Load Model & Processor (WITH DROPOUT) ---
print("Loading model and processor (Plan B: collabora/whisper-base-hindi + dropout)...")
MODEL_ID = "collabora/whisper-base-hindi"

# --- Add dropout ---
# Load the checkpoint's config and override its `dropout` value before the
# model is instantiated from that config.
config = AutoConfig.from_pretrained(MODEL_ID)
config.dropout = 0.1
# ---

# These must be global for the next cells
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    MODEL_ID,
    config=config  # <-- Pass our modified config to the model
).to(DEVICE)
# Set the tokenizer's prefix tokens for Hindi transcription.
processor.tokenizer.set_prefix_tokens(language="hindi", task="transcribe")

# --- 4. Filter Missing Files ---
print("Checking for missing audio files...")
def check_file_exists(filepath):
    """Report whether ``filepath`` is present on disk."""
    return os.path.exists(filepath)
# Record the check on train_df itself, then keep only the rows whose audio exists.
train_df['file_exists'] = train_df['audio_file'].map(check_file_exists)
train_df_filtered = train_df[train_df['file_exists'] == True]
print(f"Filtered from {len(train_df)} to {len(train_df_filtered)} samples.")

# --- 5. Create Dataset ---
# Fixed-seed split with 10% of the filtered rows held out; the held-out part
# is stored under the "test" key.
train_data, val_data = train_test_split(
    train_df_filtered,
    random_state=42,
    test_size=0.1,
)
ds = DatasetDict({
    "train": Dataset.from_pandas(train_data.reset_index(drop=True)),
    "test": Dataset.from_pandas(val_data.reset_index(drop=True)),
})

# --- 6. Preprocessing Function (with librosa) ---
def prepare_dataset(batch):
    """Convert one dataset row into model inputs and label ids.

    The row's audio file is loaded with librosa at 16 kHz and passed through
    the module-level ``processor`` to obtain ``input_features``; the row's
    ``clean_transcript`` is tokenized by the same processor to obtain
    ``labels``.

    Args:
        batch: A single row with ``audio_file`` and ``clean_transcript`` keys.

    Returns:
        The same row with ``input_features`` and ``labels`` added.
    """
    waveform, _ = librosa.load(batch["audio_file"], sr=16000)
    audio_encoding = processor(waveform, sampling_rate=16000)
    text_encoding = processor(text=batch["clean_transcript"])

    batch["input_features"] = audio_encoding.input_features[0]
    batch["labels"] = text_encoding.input_ids
    return batch

print("Applying preprocessing to all samples (this will take a few minutes)...")
# Run prepare_dataset over every split and drop the original columns, leaving
# only the fields that prepare_dataset adds.
processed_ds = ds.map(
    function=prepare_dataset,
    remove_columns=ds["train"].column_names,
)
print("Cell 3 Complete. Data is ready.")

In [None]:
# --- Cell 4: Training ---
import torch
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import evaluate

# 1. Define the Data Collator
# (This acts like a tetris player, stacking audio of different lengths together)
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed samples into one padded training batch.

    Audio features and label ids are padded separately because they have
    different lengths and need different padding treatment.

    Attributes:
        processor: Object providing ``feature_extractor.pad`` for the audio
            inputs and ``tokenizer.pad`` / ``tokenizer.bos_token_id`` for the
            labels. The collator uses this instance rather than any
            notebook-level ``processor`` variable.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build a batch from a list of samples.

        Args:
            features: Samples, each with ``input_features`` and ``labels``.

        Returns:
            The padded audio batch with a ``labels`` tensor added, in which
            padded label positions are set to -100.
        """
        # Split inputs and labels: each is padded by a different component.
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad audio (inputs)
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")

        # Pad text (labels)
        labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")

        # Replace padding with -100 so the model ignores it when calculating loss
        labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

        # If every sequence starts with the BOS token, drop that first column.
        # NOTE(review): for Whisper tokenizers the label prefix may be the
        # decoder start token rather than bos_token_id — confirm this check
        # ever fires for this model.
        if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# 2. Define the Metric (WER - Word Error Rate)
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the word error rate (in percent) for one evaluation pass.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids with -100 at ignored
            positions).

    Returns:
        ``{"wer": <float>}`` where the value is 100 times the WER metric.
    """
    import numpy as np  # local import keeps this cell self-contained

    pad_token_id = processor.tokenizer.pad_token_id

    # Some models return a tuple; the generated ids are its first element.
    pred_ids = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions

    # -100 is not a valid token id and cannot be decoded. Swap it for the pad
    # token on fresh arrays so the caller's `pred.label_ids` is not mutated.
    # Predictions are sanitized as well, since padding added when batches are
    # concatenated can also be -100.
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_token_id)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_token_id)

    # Decode predictions and labels to text
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.batch_decode(label_ids, skip_special_tokens=True)

    # Calculate WER
    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

# 3. Training Arguments (The Rules)
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-finetuned-hindi",
    per_device_train_batch_size=8,      # Reduce to 4 if you run out of memory
    gradient_accumulation_steps=2,      # Helps simulate a larger batch size
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=500,                      # Train for 500 steps (adjust as needed)
    gradient_checkpointing=True,        # Saves memory
    # Half precision only when a CUDA device is present: the notebook falls
    # back to CPU when there is no GPU, and fp16 training is rejected there.
    fp16=torch.cuda.is_available(),
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,         # Evaluate on generated text so WER can be computed
    generation_max_length=225,
    save_steps=250,                     # Kept equal to eval_steps so every checkpoint has a WER
    eval_steps=250,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,            # Lower WER is better!
    push_to_hub=False,
)

# 4. Initialize Trainer
# Collator instance that pads audio features and labels for each batch.
speech_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=speech_collator,
    train_dataset=processed_ds["train"],
    eval_dataset=processed_ds["test"],
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics,
)

# 5. Train
print("Starting training...")
trainer.train()

# 6. Save the model and the processor side by side, so the directory can be
#    reloaded later without the original checkpoint.
print("Saving model to 'best_model_final'...")
trainer.save_model("best_model_final")
processor.save_pretrained("best_model_final")
print("Training Complete and Model Saved!")

In [None]:
# Cell 5: Submission Generation (Hybrid Strategy)
import pandas as pd
import os
import re
import numpy as np
import torch
import warnings
import librosa
import jiwer
from datasets import Dataset, DatasetDict
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments
from dataclasses import dataclass
from typing import Any, Dict, List, Union
from tqdm.auto import tqdm
# No Colab imports needed

print("--- Starting Submission Generation (Hybrid Strategy) ---")

# --- 1. Re-define Global Variables ---
# Same values as in the earlier cells; redefined here, presumably so this cell
# can run without re-executing them.
BASE_PATH = "/kaggle/input/nppe-2-automatic-disfluency-restoration" # <-- Kaggle Path
AUDIO_PATH = os.path.join(BASE_PATH, "downloaded_audios") # <-- Kaggle Path
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Relative path: the directory written by the training cell.
MODEL_PATH = "best_model_final" # <-- Loads from /kaggle/working/

print(f"Loading trained model from: {MODEL_PATH}")
print(f"Using device: {DEVICE}")

# --- 2. Load Saved Model and Processor ---
# Rebinds the notebook-level `processor` and `model` names to the objects
# loaded from MODEL_PATH.
try:
    processor = AutoProcessor.from_pretrained(MODEL_PATH)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_PATH).to(DEVICE)
except EnvironmentError:
    # Print a hint, then re-raise so the cell still fails with the original error.
    print(f"Error: Model not found at '{MODEL_PATH}'.")
    raise

# --- 3. Re-define Collator (Simpler, no labels needed) ---
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed audio samples into one padded inference batch.

    Label handling is omitted because prediction needs only the inputs.

    Attributes:
        processor: Object providing ``feature_extractor.pad``. The collator
            uses this instance rather than any notebook-level ``processor``
            variable.
    """

    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad the ``input_features`` of ``features`` into a tensor batch.

        Args:
            features: Samples, each with an ``input_features`` entry.

        Returns:
            The batch produced by ``feature_extractor.pad``.
        """
        input_features = [{"input_features": feature["input_features"]} for feature in features]
        batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
        return batch

# --- 4. Initialize a new Trainer *just for prediction* ---
# Arguments for a trainer that is used only to run batched generation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./prediction_temp", # <-- Saves to /kaggle/working/
    per_device_eval_batch_size=4,
    # Half precision only when a CUDA device is present: this cell falls back
    # to CPU when there is no GPU, and fp16 is rejected there.
    fp16=torch.cuda.is_available(),
    predict_with_generate=True,     # Return generated token ids from predict()
    generation_max_length=225,      # Same generation limit as the training cell
    report_to="none",
)

# No datasets or metrics are passed: this trainer is only used for predict().
predictor = Seq2SeqTrainer(
    args=training_args,
    model=model,
    data_collator=DataCollatorSpeechSeq2SeqWithPadding(processor=processor),
    tokenizer=processor,
)
print("Predictor is loaded with the best model.")

# --- 5. Load Test Data AND FIND NULLS ---
TEST_CSV_PATH = os.path.join(BASE_PATH, "test.csv")
test_df = pd.read_csv(TEST_CSV_PATH)
# Each sample's audio is looked up as <AUDIO_PATH>/<id>.wav.
test_df['audio_file'] = test_df['id'].map(lambda x: os.path.join(AUDIO_PATH, f"{x}.wav"))

# Only rows with no transcript need a model prediction. Work on a copy so the
# selection is independent of test_df.
samples_to_predict = test_df.loc[test_df['transcript'].isna()].copy()

print(f"Found {len(samples_to_predict)} samples to predict.")

# --- 6. Process and Predict ONLY the Null Samples ---
if not samples_to_predict.empty:
    test_dataset = Dataset.from_pandas(samples_to_predict)

    def prepare_test_dataset(batch):
        """Turn one test row into model input features.

        Loads the row's audio file at 16 kHz, or substitutes one second of
        silence when the file is missing, and stores the processor's
        ``input_features`` on the row.
        """
        filepath = batch["audio_file"]
        if os.path.exists(filepath):
            speech_array, _ = librosa.load(filepath, sr=16000)
        else:
            # Fallback keeps the row in the output instead of dropping it.
            print(f"Warning: File not found {filepath}. Using silent audio.")
            speech_array = np.zeros(16000, dtype=np.float32) # 1 second of silence

        batch["input_features"] = processor(speech_array, sampling_rate=16000).input_features[0]
        return batch

    print("Applying preprocessing to test set...")
    test_ds_processed = test_dataset.map(prepare_test_dataset, remove_columns=test_dataset.column_names)

    print(f"Generating predictions for {len(test_ds_processed)} samples...")
    predictions = predictor.predict(test_ds_processed)
    predicted_ids = predictions.predictions
    if isinstance(predicted_ids, tuple):
        predicted_ids = predicted_ids[0]
    # -100 is not a valid token id and cannot be decoded; padding added when
    # batches are concatenated can be -100, so map it to the pad token first.
    predicted_ids = np.where(predicted_ids != -100, predicted_ids, processor.tokenizer.pad_token_id)

    print("Decoding predictions...")
    decoded_predictions = processor.batch_decode(predicted_ids, skip_special_tokens=True)

    # Map {id: predicted_transcript}. Dataset.map preserves row order, so the
    # i-th prediction belongs to the i-th row of samples_to_predict.
    results_map = dict(zip(samples_to_predict['id'], decoded_predictions))

    # --- 7. Create Submission File (HYBRID) ---
    print("Merging predictions with original test.csv...")

    # Create a new column for our predictions (NaN for rows that were not predicted)
    test_df['predicted_transcript'] = test_df['id'].map(results_map)

    # Use the new prediction ONLY if the original was null. Otherwise, keep the original.
    test_df['transcript'] = test_df['transcript'].fillna(test_df['predicted_transcript'])

    submission_df = test_df[['id', 'transcript']]

else:
    print("No null samples found. Creating submission from original test.csv.")
    submission_df = test_df[['id', 'transcript']]


# --- 8. Save ---
# This saves the file to /kaggle/working/submission.csv
submission_df.to_csv("submission.csv", index=False)
print("\n🎉🎉🎉 --- Submission.csv is ready! (Hybrid Strategy) --- 🎉🎉🎉")