In [3]:
!pip install librosa

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.1.0-py3-none-any.whl.metadata (9.0 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-1.0.0-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
Downloading audioread-3.1.0-py3-none-any.whl (23 kB)
Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------- ----------------------------- 0.3/1.0 MB ? eta -:--:--
   ------------------------------ --------- 0.8/1.0 MB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 1.0/1.0 M


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
# ============================================
#      🚀 SHL Grammar Scoring System
# ============================================

import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from tqdm import tqdm
import librosa

# ML Imports
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# -------------------------------
# 0. Paths, column names, raw tables
# -------------------------------
# Columns read from the CSV rows by the later stages.
ID_COL, TARGET_COL = "filename", "label"

# Dataset root; the CSV files and the audio clips live in sibling sub-folders.
BASE_PATH = "/kaggle/input/shl-intern-hiring-assessment-2025/dataset"
CSV_PATH, AUDIO_PATH = (
    os.path.join(BASE_PATH, folder) for folder in ("csvs", "audios")
)

# Read the train table first, then the test table.
train_df, test_df = (
    pd.read_csv(os.path.join(CSV_PATH, csv_name))
    for csv_name in ("train.csv", "test.csv")
)

print("Train shape:", train_df.shape, "Test shape:", test_df.shape)

# -------------------------------
# 1. Audio Feature Extraction
# -------------------------------
def extract_features(file_path, sr=16000, n_mfcc=20):
    """Summarise one audio file as a flat dict of scalar acoustic features.

    The clip is loaded with ``librosa.load`` and reduced to: duration, mean
    RMS energy, mean zero-crossing rate, mean spectral centroid / bandwidth /
    roll-off, mean and std of the chroma and tonnetz matrices, and for each
    of the ``n_mfcc`` MFCC coefficients its mean, its std, and the means of
    its first- and second-order deltas.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    sr : int, default 16000
        Sample rate passed to ``librosa.load``.
    n_mfcc : int, default 20
        Number of MFCC coefficients to compute.

    Returns
    -------
    dict or None
        Mapping of feature name to scalar value. ``None`` is returned when
        loading or any feature step other than tonnetz raises; the error is
        printed rather than propagated.
    """
    try:
        # Keep the loader's returned rate under its own name instead of
        # rebinding the ``sr`` argument.
        signal, rate = librosa.load(file_path, sr=sr)
        feats = {}

        # Basic stats
        feats["duration"] = librosa.get_duration(y=signal, sr=rate)
        feats["rms_mean"] = np.mean(librosa.feature.rms(y=signal))
        feats["zcr_mean"] = np.mean(librosa.feature.zero_crossing_rate(signal))

        # Spectral features
        feats["spec_centroid"] = np.mean(librosa.feature.spectral_centroid(y=signal, sr=rate))
        feats["spec_bw"] = np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=rate))
        feats["spec_rolloff"] = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=rate))

        # Chroma
        chroma = librosa.feature.chroma_stft(y=signal, sr=rate)
        feats["chroma_mean"] = np.mean(chroma)
        feats["chroma_std"] = np.std(chroma)

        # Tonnetz (harmony). Deliberately best-effort: if this step fails the
        # two features fall back to 0.0 instead of discarding the whole file.
        try:
            tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(signal), sr=rate)
            feats["tonnetz_mean"] = np.mean(tonnetz)
            feats["tonnetz_std"] = np.std(tonnetz)
        except Exception:
            feats["tonnetz_mean"] = feats["tonnetz_std"] = 0.0

        # MFCCs and deltas
        mfccs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
        delta_mfcc = librosa.feature.delta(mfccs)
        delta2_mfcc = librosa.feature.delta(mfccs, order=2)

        # Key insertion order fixes the column order of the feature frame,
        # so the four statistics stay grouped per coefficient.
        for i in range(n_mfcc):
            feats[f"mfcc_{i+1}_mean"] = np.mean(mfccs[i])
            feats[f"mfcc_{i+1}_std"] = np.std(mfccs[i])
            feats[f"mfcc_delta_{i+1}_mean"] = np.mean(delta_mfcc[i])
            feats[f"mfcc_delta2_{i+1}_mean"] = np.mean(delta2_mfcc[i])

        return feats
    except Exception as e:
        # The warning glyph was previously mis-encoded (UTF-8 bytes shown as
        # Mac Roman); emit the intended character.
        print(f"⚠️ Error processing {file_path}: {e}")
        return None

def build_feature_df(df, split="train"):
    """Extract features for every row of ``df`` and gather them in a DataFrame.

    Each row's ``ID_COL`` value names a ``.wav`` file under
    ``AUDIO_PATH/<split>/``. Rows whose file makes ``extract_features``
    return ``None`` are left out. Every output row carries the file id, and
    when ``split == "train"`` also the row's ``TARGET_COL`` value.
    """
    rows = []
    progress = tqdm(df.iterrows(), total=len(df), desc=f"Extracting {split} features")
    for _, record in progress:
        file_id = str(record[ID_COL])
        features = extract_features(os.path.join(AUDIO_PATH, split, f"{file_id}.wav"))
        if features is not None:
            # Id and target are appended after the acoustic features, so they
            # end up as the last columns of the frame.
            features[ID_COL] = file_id
            if split == "train":
                features[TARGET_COL] = record[TARGET_COL]
            rows.append(features)
    return pd.DataFrame(rows)

# Run feature extraction over both splits.
train_feat_df = build_feature_df(train_df, "train")
test_feat_df = build_feature_df(test_df, "test")

print("Train Features:", train_feat_df.shape, "Test Features:", test_feat_df.shape)

# build_feature_df leaves out files whose extraction failed. Report how many
# were lost per split, and stop with a clear message if a split is empty
# (otherwise the column drops in the preprocessing step fail with a KeyError).
for _split, _raw_df, _feat_df in (
    ("train", train_df, train_feat_df),
    ("test", test_df, test_feat_df),
):
    _n_skipped = len(_raw_df) - len(_feat_df)
    if _n_skipped:
        print(f"{_split}: {_n_skipped} of {len(_raw_df)} files skipped (feature extraction failed)")
    if _feat_df.empty:
        raise RuntimeError(
            f"No features could be extracted for the {_split} split; "
            f"check the audio files under {os.path.join(AUDIO_PATH, _split)}"
        )

# -------------------------------
# 2. Preprocessing
# -------------------------------
# (This section used to be pasted twice; it now runs once, so the scaler and
# PCA are fitted a single time and "Reduced dims" is printed once.)

# Separate model inputs from the id / target columns.
X = train_feat_df.drop(columns=[ID_COL, TARGET_COL])
y = train_feat_df[TARGET_COL].astype(float)
X_test = test_feat_df.drop(columns=[ID_COL])

# Standardise with statistics fitted on the training features only, then
# apply the same transform to the test features.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Optional PCA for noise reduction. A fractional n_components asks PCA to keep
# as many components as needed to explain that share of the variance.
# NOTE(review): scaler and PCA are fitted on all training rows before the
# K-fold split below, so validation folds influence the transform; the CV
# score may be slightly optimistic.
pca = PCA(n_components=0.95, random_state=42)
X_pca = pca.fit_transform(X_scaled)
X_test_pca = pca.transform(X_test_scaled)
print("Reduced dims:", X_pca.shape[1])

# -------------------------------
# 3. Modeling
# -------------------------------
# Hyper-parameters are kept in plain dicts so each learner's settings can be
# read (and tweaked) in one place before the estimator is built.

# Base learner: XGBoost.
xgb_params = {
    "n_estimators": 800,
    "learning_rate": 0.03,
    "max_depth": 7,
    "subsample": 0.8,
    "colsample_bytree": 0.9,
    "reg_lambda": 1.0,
    "reg_alpha": 0.5,
    "tree_method": "hist",
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = XGBRegressor(**xgb_params)

# Base learner: LightGBM.
lgb_params = {
    "n_estimators": 1000,
    "learning_rate": 0.02,
    "num_leaves": 64,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "reg_alpha": 0.5,
    "reg_lambda": 1.0,
    "random_state": 42,
    "n_jobs": -1,
}
lgb_model = LGBMRegressor(**lgb_params)

# Base learner: CatBoost (verbose=0 silences its per-iteration log).
cat_params = {
    "iterations": 700,
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 3,
    "random_seed": 42,
    "verbose": 0,
}
cat_model = CatBoostRegressor(**cat_params)

# Meta-learner: a smaller XGBoost trained on the base learners' predictions.
meta_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",
}
meta_model = XGBRegressor(**meta_params)

# Stacked ensemble of the three base learners under the meta-learner.
base_learners = [("xgb", xgb_model), ("lgb", lgb_model), ("cat", cat_model)]
stack = StackingRegressor(
    estimators=base_learners,
    final_estimator=meta_model,
    n_jobs=-1,
)

# -------------------------------
# 4. Cross-validation
# -------------------------------
# 5-fold CV: the stack is refitted on each training fold, scored by RMSE on
# the held-out fold, and its test-set prediction contributes 1/n_splits to
# the final averaged prediction.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
val_scores = []
test_preds = np.zeros(len(X_test_pca))
y_values = y.values  # positional array, so the fold indices from kf.split apply directly

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_pca)):
    X_tr, X_val = X_pca[tr_idx], X_pca[val_idx]
    y_tr, y_val = y_values[tr_idx], y_values[val_idx]

    stack.fit(X_tr, y_tr)
    val_pred = stack.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    val_scores.append(rmse)

    test_preds += stack.predict(X_test_pca) / kf.n_splits
    print(f"Fold {fold+1} RMSE: {rmse:.4f}")

# The check-mark glyph was previously mis-encoded; print the intended character.
print("✅ Mean CV RMSE:", np.mean(val_scores))

# -------------------------------
# 5. Submission
# -------------------------------
# Clamp the fold-averaged predictions into [0.0, 5.0].
# NOTE(review): presumably the valid label range for this task; confirm.
test_preds = np.clip(test_preds, 0.0, 5.0)

# Ids come from the feature frame (not test.csv), so they line up row-for-row
# with the predictions. The lookup uses ID_COL, the same key that
# build_feature_df writes the id under.
submission = pd.DataFrame({
    "filename": test_feat_df[ID_COL].values,
    "label": test_preds
})

os.makedirs("/kaggle/working/output", exist_ok=True)
submission.to_csv("/kaggle/working/output/submission.csv", index=False)
# The check-mark glyph was previously mis-encoded; print the intended character.
print("✅ Saved submission.csv with shape:", submission.shape)