In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate before downloading: per the note above, some sources are private.
kagglehub.login()
# Fetch the competition data. The returned value is stored here but is not
# referenced again in the visible cells, which read from a hard-coded
# /kaggle/input path instead.
shl_intern_hiring_assessment_path = kagglehub.competition_download('shl-intern-hiring-assessment')
print('Data source import complete.')

# Libraries

In [None]:
import os

import numpy as np
import pandas as pd
import torch
import torchaudio
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_score
from tqdm import tqdm
from transformers import WhisperProcessor, WhisperModel
from xgboost import XGBRegressor

2025-05-03 12:00:34.720371: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746273634.902844      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746273634.957868      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Configuration and model loading

In [None]:
# Root of the competition dataset as mounted inside the Kaggle runtime.
BASE_DIR = "/kaggle/input/shl-intern-hiring-assessment/Dataset"

# The two CSV files sit directly under the dataset root.
TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_DIR, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Audio clips live in per-split folders under "audios".
_AUDIO_ROOT = os.path.join(BASE_DIR, "audios")
TRAIN_AUDIO_DIR = os.path.join(_AUDIO_ROOT, "train")
TEST_AUDIO_DIR = os.path.join(_AUDIO_ROOT, "test")

# Destination of the final predictions file.
OUTPUT_CSV = "/kaggle/working/submission.csv"

In [None]:
# ---- DEVICE SETUP ----
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---- Load Whisper ----
# Single source of truth for the checkpoint, so the processor and the model
# cannot accidentally be loaded from two different names.
WHISPER_CHECKPOINT = "openai/whisper-base"
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
# .eval() puts the module in evaluation mode and returns the module itself;
# chaining it into the assignment keeps the cell from echoing the full module
# repr into the notebook output.
model = WhisperModel.from_pretrained(WHISPER_CHECKPOINT).to(device).eval()

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

WhisperModel(
  (encoder): WhisperEncoder(
    (conv1): Conv1d(80, 512, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(512, 512, kernel_size=(3,), stride=(2,), padding=(1,))
    (embed_positions): Embedding(1500, 512)
    (layers): ModuleList(
      (0-5): 6 x WhisperEncoderLayer(
        (self_attn): WhisperSdpaAttention(
          (k_proj): Linear(in_features=512, out_features=512, bias=False)
          (v_proj): Linear(in_features=512, out_features=512, bias=True)
          (q_proj): Linear(in_features=512, out_features=512, bias=True)
          (out_proj): Linear(in_features=512, out_features=512, bias=True)
        )
        (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (activation_fn): GELUActivation()
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (final_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    

In [None]:
# ---- Load datasets ----
# Read both CSVs in one statement; the path order matches the target order.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Preprocessing and feature extraction from audio

In [None]:
def extract_embedding(audio_path):
    """Turn one audio file into a fixed-size Whisper-encoder embedding.

    The file is loaded with torchaudio, resampled to 16 kHz when its native
    rate differs, averaged across channels to mono, converted to model inputs
    by the module-level ``processor``, and run through ``model.encoder``.
    The encoder's last hidden state is averaged over its time axis (dim 1).

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        NumPy array holding the time-averaged encoder hidden state, with
        singleton dimensions removed.

    NOTE(review): the Whisper processor presumably pads or truncates every
    clip to a fixed window, so the mean would include padded frames and ignore
    audio past that window. Confirm against the Transformers documentation.
    """
    target_rate = 16000

    audio, native_rate = torchaudio.load(audio_path)
    if native_rate != target_rate:
        audio = torchaudio.transforms.Resample(
            orig_freq=native_rate, new_freq=target_rate
        )(audio)

    # Collapse the channel axis to get a mono 1-D NumPy signal for the processor.
    mono_signal = audio.mean(dim=0).numpy()
    encoded = processor(mono_signal, sampling_rate=target_rate, return_tensors="pt")
    encoder_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model.encoder(**encoder_inputs).last_hidden_state

    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().cpu().numpy()


In [None]:
# ---- Extract embeddings ----
def process_dataframe(df, audio_dir, extract_score=True):
    """Embed every audio file listed in ``df`` and optionally collect its label.

    Args:
        df: Table with a ``'filename'`` column, plus a ``'label'`` column when
            ``extract_score`` is true. Rows are visited via ``df.iterrows()``.
        audio_dir: Directory that each ``'filename'`` is joined onto.
        extract_score: When true, also gather ``row['label']`` for each row.

    Returns:
        Tuple ``(features, labels)``. ``features`` is a NumPy array built from
        the per-file embeddings. ``labels`` is a NumPy array of the collected
        labels, or ``None`` when ``extract_score`` is false.
    """
    embeddings, scores = [], []

    for _, record in tqdm(df.iterrows(), total=len(df)):
        clip_path = os.path.join(audio_dir, record['filename'])
        embeddings.append(extract_embedding(clip_path))
        if extract_score:
            scores.append(record['label'])

    feature_matrix = np.array(embeddings)
    if not extract_score:
        return feature_matrix, None
    return feature_matrix, np.array(scores)


In [None]:
# Embed every training clip together with its label.
X_train, y_train = process_dataframe(train_df, TRAIN_AUDIO_DIR, extract_score=True)
# Labels are not requested for the test split. Index the returned pair instead
# of unpacking into `_`, which IPython uses for the last cell output.
X_test = process_dataframe(test_df, TEST_AUDIO_DIR, extract_score=False)[0]

# Fail fast if extraction produced misaligned arrays, rather than letting the
# regressor surface a less obvious shape error later.
assert X_train.shape[0] == len(train_df) == len(y_train), "train features/labels are misaligned"
assert X_test.shape[0] == len(test_df), "test features do not cover every test row"
assert X_train.shape[1:] == X_test.shape[1:], "train and test embeddings differ in dimensionality"

100%|██████████| 444/444 [00:35<00:00, 12.44it/s]
100%|██████████| 204/204 [00:20<00:00, 10.15it/s]


# Training XGB Regressor

*Note: The best hyperparameters are used.*
*  I tuned and compared the hyperparameter results separately.

In [None]:
# ---- Train XGBoost regression model ----
# Hyperparameters are gathered in one mapping so they can be read and tweaked
# in a single place before being handed to the estimator.
xgb_params = {
    "n_estimators": 200,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
}
regressor = XGBRegressor(**xgb_params)
regressor.fit(X_train, y_train)

In [None]:
# ---- Predictions ----
# Score both feature matrices with the fitted model; the order of the inputs
# matches the order of the targets.
train_preds, test_preds = (
    regressor.predict(feature_matrix) for feature_matrix in (X_train, X_test)
)

# Training RMSE calculation

In [None]:
# ---- RMSE ----
# Training-set RMSE: measured on the same rows the model was fitted on, so it
# is an optimistic figure and says little about unseen clips.
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
print(f"Train RMSE: {train_rmse:.4f}")

# Held-out estimate: cross_val_score fits clones of the estimator on each
# training fold and scores the left-out fold, leaving the already fitted
# `regressor` untouched. The scorer is negated RMSE, hence the sign flip.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=42)
cv_rmse = -cross_val_score(
    regressor,
    X_train,
    y_train,
    scoring="neg_root_mean_squared_error",
    cv=cv_splitter,
)
print(f"CV RMSE (5-fold): {cv_rmse.mean():.4f} +/- {cv_rmse.std():.4f}")

Train RMSE: 0.0111


# Saving test predictions

In [None]:
# Build the submission table: `assign` returns a new frame with the predictions
# in the 'label' column, so test_df itself is left unmodified.
output_df = test_df.assign(label=test_preds)
output_df.to_csv(OUTPUT_CSV, index=False)
print(f"Predictions saved to {OUTPUT_CSV}")