# Grammar Scoring Engine for Spoken Data

#  Brief Report

## Objective
Predict a continuous grammar score (0–5) for each spoken audio sample using Wav2Vec2 embeddings and Ridge Regression.

## Approach
We used Wav2Vec2, a pretrained speech model from Facebook, to extract audio embeddings from `.wav` files. The frame-level embeddings of each file were averaged over time into a single fixed-length vector, and these vectors were fed into a Ridge Regression model to predict the grammar scores.

###  **Pipeline Overview**

1. **Preprocessing**:
   - Loaded each `.wav` file as 16 kHz mono audio
   - Used `facebook/wav2vec2-base-960h` to extract 768-dim features

2. **Model**:
   - Ridge Regression with `alpha=1.0` (L2-regularized linear regression; suitable for small datasets and continuous outputs)

3. **Training**:
   - Split data into training and validation sets (80-20 split)
   - Trained on 444 samples; the reported run used CPU only (the notebook selects a GPU automatically when one is available)

### **Evaluation Results**

- **Validation RMSE**: 0.9027
- **Pearson Correlation**: 0.6442

###  **Submission Format**

| filename       | label |
|----------------|-------|
| audio_001.wav  | 3.42  |
| audio_002.wav  | 2.78  |
---

In [2]:
!pip install transformers librosa torch --quiet

import os
import numpy as np
import pandas as pd
import librosa
import torch
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Root folder of the competition data. The trailing slash lets the file and
# folder names below be appended directly.
base_path = '/kaggle/input/shl-intern-hiring-assessment/dataset/'

# Audio clips are kept in separate train/test folders under the root.
train_audio_path = f'{base_path}audios_train/'
test_audio_path = f'{base_path}audios_test/'

# Metadata tables, read in the order train -> test -> sample submission.
train_df, test_df, sample_submission = (
    pd.read_csv(f'{base_path}{csv_name}')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [4]:
from transformers.utils import logging

# Only surface errors from transformers; informational and warning messages
# are suppressed.
logging.set_verbosity_error()

# Single source of truth for the checkpoint, so the processor and the encoder
# are always loaded from the same pretrained weights/config.
WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"

processor = Wav2Vec2Processor.from_pretrained(WAV2VEC2_CHECKPOINT)
model = Wav2Vec2Model.from_pretrained(WAV2VEC2_CHECKPOINT)

# The encoder is used purely as a feature extractor, so switch it to
# evaluation mode (this turns off dropout). Kept as the last expression so
# the notebook still displays the module summary.
model.eval()

preprocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/163 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/291 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Wav2Vec2Model(
  (feature_extractor): Wav2Vec2FeatureEncoder(
    (conv_layers): ModuleList(
      (0): Wav2Vec2GroupNormConvLayer(
        (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (activation): GELUActivation()
        (layer_norm): GroupNorm(512, 512, eps=1e-05, affine=True)
      )
      (1-4): 4 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
      (5-6): 2 x Wav2Vec2NoLayerNormConvLayer(
        (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,), bias=False)
        (activation): GELUActivation()
      )
    )
  )
  (feature_projection): Wav2Vec2FeatureProjection(
    (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (projection): Linear(in_features=512, out_features=768, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Wav2Vec2Encoder(
    (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(
  

In [5]:
def extract_wav2vec2_embedding(file_path):
    """Return a mean-pooled Wav2Vec2 embedding for one audio file.

    The audio is loaded with librosa at 16 kHz, passed through the
    module-level ``processor`` and ``model``, and the model's
    ``last_hidden_state`` is averaged over dimension 1 to give a single
    vector for the whole clip.

    Args:
        file_path: Path of an audio file that ``librosa.load`` can read.

    Returns:
        A NumPy array holding the pooled embedding, moved to CPU.
    """
    y, sr = librosa.load(file_path, sr=16000)
    inputs = processor(y, return_tensors="pt", sampling_rate=16000)

    # Send the inputs to wherever the model currently lives instead of
    # relying on a separately defined global `device`: the function then
    # works as soon as `model` exists and can never disagree with it.
    inputs = inputs.to(model.device)

    # Pure inference: no gradients are needed for feature extraction.
    with torch.no_grad():
        outputs = model(**inputs)

    # Average over dimension 1, drop the singleton batch dimension, and hand
    # back a plain NumPy array for scikit-learn.
    return outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()

In [7]:
# Run the encoder on a GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One embedding per training clip, in train_df row order. The 'filename'
# column is iterated directly: iterrows() would build a Series for every row
# (and yield an index that is never used) just to read two columns.
X_train_feats = [
    extract_wav2vec2_embedding(os.path.join(train_audio_path, filename))
    for filename in train_df['filename']
]

# Labels are taken from the same frame in the same row order, so X[i] pairs
# with y[i].
y_train_labels = train_df['label'].tolist()

X = np.array(X_train_feats)
y = np.array(y_train_labels)

KeyboardInterrupt: 

In [None]:
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Ridge regressor with regularization strength alpha=1.0, fitted on the
# training split of the pooled embeddings.
model_ridge = Ridge(alpha=1.0)
model_ridge.fit(X=X_train, y=y_train)

In [None]:
# Predict grammar scores for the held-out validation split.
y_pred = model_ridge.predict(X_val)

# RMSE is the square root of the mean squared error; pearsonr also returns a
# p-value, which is not reported here.
rmse = np.sqrt(mean_squared_error(y_true=y_val, y_pred=y_pred))
pearson_corr, _ = pearsonr(y_val, y_pred)

print(
    f"Validation RMSE: {rmse:.4f}",
    f"Pearson Correlation: {pearson_corr:.4f}",
    sep="\n",
)

In [None]:
# One embedding per test clip, in test_df row order. The 'filename' column is
# iterated directly: iterrows() would build a Series for every row (and yield
# an index that is never used) just to read a single column.
X_test_feats = [
    extract_wav2vec2_embedding(os.path.join(test_audio_path, filename))
    for filename in test_df['filename']
]

X_test = np.array(X_test_feats)

# Score the test clips with the fitted Ridge model.
test_preds = model_ridge.predict(X_test)

# Work on a copy so test_df itself is left unchanged, then write only the
# two columns of the submission format.
submission = test_df.copy()
submission['label'] = test_preds
submission[['filename', 'label']].to_csv("submission.csv", index=False)

# Last expression of the cell: shows the first rows as a quick sanity check.
submission.head()