In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, mean_squared_error
from xgboost import XGBRegressor
from tqdm import tqdm
import os
import torch
import torchaudio
from transformers import WhisperProcessor, WhisperModel

2025-05-03 17:25:26.253007: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746293126.439725      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746293126.494148      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# ---- CONFIG ----
BASE_DIR = "/kaggle/input/shl-intern-hiring-assessment/Dataset"
TRAIN_CSV = os.path.join(BASE_DIR, "train.csv")
TEST_CSV = os.path.join(BASE_DIR, "test.csv")
TRAIN_AUDIO_DIR = os.path.join(BASE_DIR, "audios", "train")
TEST_AUDIO_DIR = os.path.join(BASE_DIR, "audios", "test")
OUTPUT_CSV = "/kaggle/working/whisper_turbo_tuned_xgb.csv"

In [3]:
# ---- DEVICE SETUP ----
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Load Whisper ----
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v3-turbo")
model = WhisperModel.from_pretrained("openai/whisper-large-v3-turbo").to(device)
model.eval()

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 1280)
    (layers): ModuleList(
      (0-31): 32 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=1280, out_features=1280, bias=False)
          (v_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (q_proj): Linear(in_features=1280, out_features=1280, bias=True)
          (out_proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=1280, out_features=5120, bias=True)
        (fc2): Linear(in_features=5120, out_features=1280, bias=True)
        (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwis

In [4]:
def extract_embedding(audio_path):
    waveform, sr = torchaudio.load(audio_path)
    if sr != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
        waveform = resampler(waveform)

    waveform = waveform.mean(dim=0).numpy()  # convert to mono and to NumPy 1D
    inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        last_hidden = model.encoder(**inputs).last_hidden_state
    return last_hidden.mean(dim=1).squeeze().cpu().numpy()

# ---- Load datasets ----
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# ---- Extract embeddings ----
def process_dataframe(df, audio_dir, extract_score=True):
    features = []
    labels = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        audio_path = os.path.join(audio_dir, row['filename'])
        emb = extract_embedding(audio_path)
        features.append(emb)
        if extract_score:
            labels.append(row['label'])
    return np.array(features), (np.array(labels) if extract_score else None)

In [5]:
X_train, y_train = process_dataframe(train_df, TRAIN_AUDIO_DIR, extract_score=True)
X_test, _ = process_dataframe(test_df, TEST_AUDIO_DIR, extract_score=False)

100%|██████████| 444/444 [03:26<00:00,  2.15it/s]
100%|██████████| 204/204 [01:33<00:00,  2.18it/s]


In [6]:
# Custom RMSE scorer
def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

rmse_scorer = make_scorer(rmse, greater_is_better=False)


In [7]:
# 🔍 XGBoost Random Search
# -----------------------------
xgb = XGBRegressor(tree_method="gpu_hist", device = "cuda", predictor="gpu_predictor", random_state=42)

xgb_params = {
    'n_estimators': [50, 75, 100, 150, 200, 250, 275, 300, 350],
    'max_depth': [3, 5, 7, 9, 11, 13, 15, 17, 21],
    'learning_rate': [0.005, 0.01, 0.03, 0.05, 0.07, 0.1],
    'subsample': [0.6, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_search = RandomizedSearchCV(
    xgb,
    xgb_params,
    n_iter=1500,
    scoring=rmse_scorer,
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

print("🔍 Tuning XGBoost...")
xgb_search.fit(X_train, y_train)

🔍 Tuning XGBoost...
Fitting 3 folds for each of 1500 candidates, totalling 4500 fits



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.

Parameters: { "predictor" } are not used.

Parameters: { "predictor" } are not used.

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_pr

In [8]:
xgb_best_params = xgb_search.best_params_
print(f"✅ Best XGB Params: {xgb_best_params}")

✅ Best XGB Params: {'subsample': 0.8, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.05, 'colsample_bytree': 0.6}


In [9]:
# ✅ Train Final Models
# -----------------------------
xgb_model = XGBRegressor(**xgb_best_params, tree_method="gpu_hist", device = "cuda", random_state=42)
xgb_model.fit(X_train, y_train)


    E.g. tree_method = "hist", device = "cuda"



In [10]:
xgb_train_preds = xgb_model.predict(X_train)
xgb_test_preds = xgb_model.predict(X_test)


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [11]:
train_rmse = mean_squared_error(y_train, xgb_train_preds, squared=False)
print(f"\n✅ XGB Train RMSE: {train_rmse:.5f}")


✅ XGB Train RMSE: 0.07611


In [12]:
submission = test_df.copy()
submission['label'] = xgb_test_preds
submission[['filename', 'label']].to_csv("whisper_xgb_fine_tuned.csv", index=False)
print("\n📁 Saved")


📁 Saved
