In [None]:
# (Optional) Only needed when the data has to be downloaded from Kaggle.
# IMPORTANT: some Kaggle data sources are private, so run this cell (and
# complete the login) before trying to import your Kaggle data sources.
import kagglehub
# Log in first so that private competition data can be fetched.
kagglehub.login()
# Download the competition files and keep whatever `competition_download`
# returns (presumably the local download path -- TODO confirm).
# NOTE(review): this variable is not referenced by the later cells visible
# here; they read from the hard-coded BASE_DIR instead.
shl_intern_hiring_assessment_path = kagglehub.competition_download('shl-intern-hiring-assessment')
print('Data source import complete.')

# Libraries

In [None]:
import os
import optuna
import torch
import torchaudio
import numpy as np
import pandas as pd
from tqdm import tqdm
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from transformers import WhisperProcessor, WhisperModel
from sklearn.metrics import make_scorer, mean_squared_error

# Configuration and model loading

In [None]:
# ---- CONFIG ----
# Root folder of the competition dataset.
BASE_DIR = "/kaggle/input/shl-intern-hiring-assessment/Dataset"

# Metadata tables that list the audio filenames (train.csv / test.csv).
TRAIN_CSV, TEST_CSV = (
    os.path.join(BASE_DIR, csv_name) for csv_name in ("train.csv", "test.csv")
)

# Folders that hold the audio clips for each split.
TRAIN_AUDIO_DIR, TEST_AUDIO_DIR = (
    os.path.join(BASE_DIR, "audios", split_name) for split_name in ("train", "test")
)

# Destination path for the submission file.
OUTPUT_CSV = "/kaggle/working/submission.csv"

In [None]:
# ---- DEVICE SETUP ----
# Prefer CUDA when torch reports it as available; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# ---- Load Whisper ----
# The processor and the model are loaded from the same checkpoint name.
WHISPER_CHECKPOINT = "openai/whisper-large-v3-turbo"
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)
model = WhisperModel.from_pretrained(WHISPER_CHECKPOINT)
model = model.to(device)
# Switch to evaluation mode; the model is only used under torch.no_grad() below.
model.eval()

# Preprocessing and feature extraction from audio

In [None]:
def extract_embedding(audio_path):
    """Turn one audio file into a mean-pooled Whisper encoder embedding.

    The clip is loaded with torchaudio, resampled to 16 kHz when its native
    sample rate differs, down-mixed to mono by averaging over channels,
    converted to model inputs by the module-level ``processor`` and passed
    through ``model.encoder`` without gradient tracking. The encoder's last
    hidden state is averaged over dimension 1 and squeezed.

    Args:
        audio_path: Path to an audio file readable by ``torchaudio.load``.

    Returns:
        A NumPy array (moved to the CPU) with the pooled hidden state;
        presumably 1-D with the encoder's hidden size -- TODO confirm.
    """
    target_sr = 16000
    signal, native_sr = torchaudio.load(audio_path)
    if native_sr != target_sr:
        to_target_rate = torchaudio.transforms.Resample(orig_freq=native_sr, new_freq=target_sr)
        signal = to_target_rate(signal)

    # Average the channel dimension to get a single 1-D NumPy waveform.
    mono = signal.mean(dim=0).numpy()
    encoded = processor(mono, sampling_rate=target_sr, return_tensors="pt")
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        hidden = model.encoder(**encoded).last_hidden_state
    # NOTE(review): if the processor pads clips to a fixed length, this mean
    # also averages over the padded frames -- confirm that is intended.
    pooled = hidden.mean(dim=1).squeeze()
    return pooled.cpu().numpy()

# ---- Load datasets ----
# Read the train and test metadata tables from the configured CSV paths.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# ---- Extract embeddings ----
def process_dataframe(df, audio_dir, extract_score=True):
    """Embed every audio file listed in ``df`` and optionally collect labels.

    Args:
        df: DataFrame with a ``filename`` column, plus a ``label`` column
            when ``extract_score`` is true.
        audio_dir: Directory that the filenames are joined onto.
        extract_score: When true, also gather each row's ``label`` value.

    Returns:
        A ``(features, labels)`` tuple. ``features`` is ``np.array`` of the
        per-file embeddings in row order; ``labels`` is ``np.array`` of the
        labels, or ``None`` when ``extract_score`` is false.
    """
    embeddings, targets = [], []
    progress = tqdm(df.iterrows(), total=len(df))
    for _, record in progress:
        wav_file = os.path.join(audio_dir, record['filename'])
        embeddings.append(extract_embedding(wav_file))
        if not extract_score:
            continue
        targets.append(record['label'])

    feature_matrix = np.array(embeddings)
    if extract_score:
        return feature_matrix, np.array(targets)
    return feature_matrix, None

In [None]:
# Embed the labelled training clips (labels are collected by default).
X_train, y_train = process_dataframe(df=train_df, audio_dir=TRAIN_AUDIO_DIR)
# The test table is embedded without labels, so the second element is None.
X_test, _ = process_dataframe(df=test_df, audio_dir=TEST_AUDIO_DIR, extract_score=False)

In [None]:
# Custom RMSE scorer
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of scikit-learn's mean squared error.
    """
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6;
    # taking the square root of the MSE works on every version.
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Lower RMSE is better, so the scorer is built with greater_is_better=False.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [None]:
# Baseline CatBoost regressor with default hyperparameters.
# Train on the GPU only when CUDA is actually available, so the notebook
# still runs on a CPU-only machine instead of failing at fit time.
cat_model = CatBoostRegressor(
    verbose=0,
    task_type="GPU" if torch.cuda.is_available() else "CPU",
)
print("\n🚀 Training Final Models...")

In [None]:
# Fit the baseline regressor on the full set of training embeddings.
# NOTE(review): `cat_model` is not referenced by the later cells visible here.
cat_model.fit(X_train, y_train)

# CatBoost hyperparameter tuning

In [None]:
# Hold out 20% of the embedded training set for validation (fixed seed).
X_train_cat, X_val_cat, y_train_cat, y_val_cat = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
# Wrap each split in a CatBoost Pool for fitting and evaluation.
train_pool = Pool(data=X_train_cat, label=y_train_cat)
val_pool = Pool(data=X_val_cat, label=y_val_cat)

In [None]:
# Objective function for Optuna
def objective(trial):
    """Train one CatBoost candidate and return its validation RMSE.

    Args:
        trial: Optuna trial used to sample learning_rate, depth, l2_leaf_reg,
            bagging_temperature and random_strength.

    Returns:
        RMSE of the fitted model on the held-out validation split, as a float.
    """
    params = {
        "iterations": 200,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 1),
        "random_strength": trial.suggest_float("random_strength", 1, 20),
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "verbose": 0,
        "random_seed": 42,
        "early_stopping_rounds": 50
    }

    # Named `regressor` so it does not shadow the module-level Whisper `model`.
    regressor = CatBoostRegressor(**params)
    regressor.fit(train_pool, eval_set=val_pool, use_best_model=True)
    preds = regressor.predict(X_val_cat)
    # `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
    # the square root of the MSE is equivalent and version-independent.
    return float(np.sqrt(mean_squared_error(y_val_cat, preds)))

# Run Optuna
# Seed the TPE sampler (Optuna's default sampler for single-objective studies)
# so the hyperparameter search is reproducible across runs.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

# Train and test predictions

In [None]:
# Best model
# `study.best_params` only contains the values sampled through `trial`.
# The fixed settings used in every tuning trial (iteration budget, loss,
# metric, seed, early stopping, verbosity) must be merged back in; otherwise
# the final model trains under a different configuration than the one tuned.
# Keep these fixed values in sync with the `objective` function.
best_params = study.best_params
final_params = {
    "iterations": 200,
    "loss_function": "RMSE",
    "eval_metric": "RMSE",
    "verbose": 0,
    "random_seed": 42,
    "early_stopping_rounds": 50,
    **best_params,
}
final_model = CatBoostRegressor(**final_params)
final_model.fit(train_pool, eval_set=val_pool, use_best_model=True)

In [None]:
cat_train_preds = final_model.predict(X_train)
# `squared=False` was removed from mean_squared_error in scikit-learn 1.6;
# the square root of the MSE is equivalent and version-independent.
train_rmse = np.sqrt(mean_squared_error(y_train, cat_train_preds))
# NOTE: X_train also contains the rows held out as the validation split, so
# this is the RMSE over the whole labelled set, not only the fitted rows.
print(f"Train RMSE: {train_rmse:.5f}")  # a previous run printed: Train RMSE: 0.50655

In [None]:
# Predict on the test embeddings and write the filename/label pairs to CSV.
cat_test_preds = final_model.predict(X_test)
final_preds = cat_test_preds
# `assign` returns a new frame, so `test_df` itself is left untouched.
submission = test_df.assign(label=final_preds)
submission.loc[:, ['filename', 'label']].to_csv("cat_tuned_submission.csv", index=False)