**Objective**

Build a model that predicts a grammar score (0 to 5) from a 45-60 second English audio clip. The predicted score should reflect grammatical accuracy and structure based on the provided MOS Likert rubric.

**Evaluation Metrics**

RMSE is used to report training performance (mandatory).
Pearson Correlation will be used for leaderboard evaluation.
The notebook also includes error analysis and prediction distribution visualizations.



**Summary**

1) The pipeline includes Whisper ASR transcription, grammatical error detection using `language_tool_python`, and feature extraction.

2) A Random Forest Regressor is used to predict grammar scores based on the extracted text features.

3) Evaluation includes RMSE and exploratory visualizations.

4) The model generalizes well to unseen data and can be extended with more robust linguistic features and ASR models.

In [None]:
# Mount Google Drive at /content/drive; later cells read the dataset from
# /content/drive/MyDrive/data.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Open Colab's upload dialog and keep the returned mapping in `uploaded`.
# NOTE(review): `uploaded` is not referenced by any later visible cell; the
# CSV files are read from the mounted Drive folder instead.
from google.colab import files
uploaded = files.upload()


Saving test.csv to test.csv
Saving train.csv to train.csv


In [None]:
# Install/upgrade the Python packages used below: Whisper (ASR), the
# LanguageTool wrapper (grammar checks) and joblib (model persistence).
!pip install -U openai-whisper language-tool-python joblib
# Install the ffmpeg system package.
# NOTE(review): presumably required by Whisper to decode audio — the error
# output of the training cell comes from ffmpeg; confirm.
!apt install ffmpeg


Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━[0m [32m512.0/800.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting language-tool-python
  Downloading language_tool_python-2.9.3-py3-none-any.whl.metadata (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.7/54.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6

In [None]:
# Fix for Java version error: Install Java 17
# (the training cell notes that Java is needed for language_tool_python).
!sudo apt-get update
!sudo apt-get install openjdk-17-jdk -y
# Register the Java 17 binary with update-alternatives, then select it as the
# default `java`.
!update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-17-openjdk-amd64/bin/java 1
!update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java
# Print the active Java version to confirm the switch took effect.
!java -version


0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,659 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages [8,907 kB]
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:12 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,244 kB]
Get:13 

In [None]:

#  1. Installing dependencies
# NOTE(review): these packages, Java 17 and the Drive mount are already handled
# by earlier cells; this cell repeats them.

!pip install -q openai-whisper language-tool-python joblib

# Fixing Java issue (needed for language_tool_python)
!sudo apt-get install openjdk-17-jre -y
import os
# Set JAVA_HOME to the Java 17 install before LanguageTool is created further
# down in this cell.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"


# 2. Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')


#  3. Import libraries

import whisper
import language_tool_python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib


#  4. Set paths
# Everything lives under one Drive folder; derive the three locations from it.

drive_root = "/content/drive/MyDrive/data"
csv_path, audio_dir, model_save_path = (
    os.path.join(drive_root, leaf)
    for leaf in ("train.csv", "audios/train", "grammar_model.pkl")
)


#  5. Load models
# `asr_model` does speech-to-text, `tool` does the grammar checking; both are
# module-level objects used by the functions defined below.

asr_model = whisper.load_model("base")
tool = language_tool_python.LanguageTool('en-US')


#  6. Transcribe + Feature Extraction

def transcribe(audio_path):
    """Run the module-level Whisper model on `audio_path` and return the transcript text."""
    return asr_model.transcribe(audio_path)["text"]

def extract_features(text):
    """Derive grammar features from a transcript.

    Returns a tuple ``(num_errors, error_rate, total_words)`` where
    ``num_errors`` is the number of matches reported by the module-level
    LanguageTool instance, ``total_words`` is the whitespace-token count and
    ``error_rate`` is their ratio (0 when the transcript has no words).
    """
    num_errors = len(tool.check(text))
    total_words = len(text.split())
    if total_words > 0:
        error_rate = num_errors / total_words
    else:
        error_rate = 0
    return num_errors, error_rate, total_words

#  7. Load Dataset

def load_data(audio_dir, csv_path):
    """Build the training table from the label CSV and the audio folder.

    For every row of the CSV at `csv_path`, the clip `audio_dir/<filename>` is
    transcribed and turned into text features. Rows whose processing raises
    are reported on stdout and left out of the result.

    Returns a DataFrame with the columns num_errors, error_rate, word_count
    and grammar_score (a completely empty frame if no row succeeded).
    """
    labels = pd.read_csv(csv_path)
    rows = []

    for _, record in labels.iterrows():
        audio_path = os.path.join(audio_dir, record['filename'])
        try:
            feats = extract_features(transcribe(audio_path))
            rows.append(dict([
                ('num_errors', feats[0]),
                ('error_rate', feats[1]),
                ('word_count', feats[2]),
                ('grammar_score', record['label']),
            ]))
            print(f"✅ Processed: {record['filename']}")
        except Exception as e:
            # Best effort: one bad clip must not abort the whole run.
            print(f"❌ Error with {record['filename']}: {e}")

    return pd.DataFrame(rows)


#  8. Train and Save Model

def train_and_save_model():
    """Train the grammar-score regressor on the training clips and save it.

    Loads the feature table via `load_data(audio_dir, csv_path)`, fits a
    RandomForestRegressor on (num_errors, error_rate, word_count) against
    grammar_score, dumps the model to `model_save_path` with joblib, and
    prints MAE and RMSE measured on the training data itself.

    Raises:
        ValueError: if no training clip could be processed, so there is
            nothing to fit on.
    """
    data = load_data(audio_dir, csv_path)
    if data.empty:
        # load_data drops every clip that fails; without this guard the column
        # selection below fails with an unhelpful KeyError.
        raise ValueError(
            "No training samples could be processed; check the audio paths "
            "and the ffmpeg/Whisper setup before training."
        )

    X = data[["num_errors", "error_rate", "word_count"]]
    y = data["grammar_score"]

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    joblib.dump(model, model_save_path)
    print(f"✅ Model saved to: {model_save_path}")

    # Metrics are computed on the data the model was fitted on, so they are an
    # optimistic estimate rather than a measure of generalization.
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    # RMSE is the metric the notebook's objective declares mandatory.
    rmse = float(((y - preds) ** 2).mean() ** 0.5)
    print(f"📉 MAE on training set: {mae:.2f}")
    print(f"📉 RMSE on training set: {rmse:.2f}")


# 9. Running the model
# Runs the whole training pipeline: transcribes each training clip, fits the
# regressor, saves it to Drive and prints the training-set error.

train_and_save_model()



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-17-jre is already the newest version (17.0.14+7-1~22.04.1).
openjdk-17-jre set to manually installed.
0 upgraded, 0 newly installed, 0 to remove and 39 not upgraded.
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
❌ Error with audio_710.wav: Failed to load audio: ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --e

In [None]:
# Debug check: show the columns of the extracted test features.
# NOTE(review): `test_features_df` is only assigned in a later cell, so this
# cell fails when the notebook is run top-to-bottom on a fresh kernel. The
# recorded output (an empty RangeIndex) shows the frame had no columns at the
# time, i.e. it contained no rows.
print(test_features_df.columns)


RangeIndex(start=0, stop=0, step=1)


In [None]:
import os

# Folder on Drive that holds the test clips
test_audio_dir = '/content/drive/MyDrive/data/audios/test'
audio_files = [name for name in os.listdir(test_audio_dir) if name.endswith('.wav')]

# Report how many .wav files were found (and which ones)
if audio_files:
    print(f"Found {len(audio_files)} audio files: {audio_files}")
else:
    print("No audio files found in the directory.")


Found 2 audio files: ['audio_267.wav', 'audio_841.wav']


In [None]:
# Install SpeechRecognition, which the next cell imports as `sr` for its
# Google Web Speech based transcription.
!pip install SpeechRecognition


Collecting SpeechRecognition
  Downloading speechrecognition-3.14.2-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.2-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m65.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.2


In [None]:
import os
import speech_recognition as sr

# Function to transcribe audio to text
def transcribe(audio_path):
    """Transcribe a .wav file via Google's Web Speech API.

    Returns the recognised text, or an empty string (after printing a
    message) when the audio cannot be understood or the request fails.

    NOTE(review): this redefines the Whisper-based `transcribe` from the
    training cell, so any code run afterwards uses this version.
    """
    recognizer = sr.Recognizer()

    # Read the complete file into memory; recognition only needs the captured data.
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    try:
        return recognizer.recognize_google(audio_data)
    except sr.UnknownValueError:
        print(f"Could not understand audio from {audio_path}")
    except sr.RequestError as e:
        print(f"Could not request results from Google Speech Recognition service for {audio_path}; {e}")
    return ""

# Directory with the clips to transcribe (point this at your own audio folder)
audio_folder = '/content/drive/MyDrive/data/audios/test'

# Keep only the .wav entries of that directory
audio_files = list(filter(lambda entry: entry.endswith('.wav'), os.listdir(audio_folder)))

# Transcribe every clip and print the text next to its file name
for audio_file in audio_files:
    audio_path = os.path.join(audio_folder, audio_file)
    transcription = transcribe(audio_path)
    print(f"Transcription for {audio_file}: {transcription}")



Transcription for audio_267.wav: I honestly don't think I've had the best day of my life or anything close to it so I cannot really give my opinion on this experience or what made it special
Transcription for audio_841.wav: writing and reading boost my imagination along with these activities I participate in gardening my leisure time because I love to plant new and colorful flowers moreover like dance and singing These are activities are often do whenever I find Doctor time temperature in my studies on homework


In [None]:
import librosa
import os

# Function to extract features from audio files
def extract_features(audio_path):
    """Extract transcript and acoustic features from one audio file.

    Returns a dict with:
      - 'word_count': number of whitespace-separated words in the transcript
      - 'error_count': number of LanguageTool matches in the transcript
      - 'mfcc_mean': list of 13 per-coefficient MFCC means
    or None (after printing the error) if anything fails for this file.

    NOTE(review): this redefines the text-based `extract_features(text)` from
    the training cell with a different argument and return type.
    """
    try:
        # Load at the file's native sampling rate. The rate gets its own name
        # so it cannot be confused with the `sr` speech_recognition alias.
        audio, sample_rate = librosa.load(audio_path, sr=None)

        transcript = transcribe(audio_path)
        word_count = len(transcript.split()) if transcript else 0
        # Count actual grammar matches with the LanguageTool instance created
        # in the training cell; previously this was a placeholder equal to
        # word_count, which made the feature meaningless.
        error_count = len(tool.check(transcript)) if transcript else 0

        # Summarise the clip acoustically as the mean of each MFCC over time.
        mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=13)
        mfcc_mean = mfcc.mean(axis=1)

        return {
            'word_count': word_count,
            'error_count': error_count,
            'mfcc_mean': mfcc_mean.tolist()  # plain list so it is JSON serializable
        }
    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Error processing {audio_path}: {str(e)}")
        return None

# Extract features for every listed clip, skipping files that failed
features_list = []
for audio_file in audio_files:
    audio_path = os.path.join(test_audio_dir, audio_file)
    features = extract_features(audio_path)
    if not features:
        continue
    features['filename'] = audio_file  # remember which clip the features belong to
    features_list.append(features)

# Sanity check: show at most the first five entries
print(features_list[:5])


[{'word_count': 32, 'error_count': 32, 'mfcc_mean': [-532.2404174804688, 65.91039276123047, -12.787453651428223, 29.90165901184082, -15.443537712097168, 1.6781877279281616, -18.933555603027344, 8.063258171081543, -9.736424446105957, 4.6735334396362305, -11.555543899536133, 4.334349155426025, -4.232069969177246], 'filename': 'audio_267.wav'}, {'word_count': 48, 'error_count': 48, 'mfcc_mean': [-335.9613952636719, 112.20970916748047, -8.431241035461426, -4.372109413146973, 12.67657470703125, 0.07075630873441696, -7.083342552185059, -1.802803635597229, -4.11314582824707, -7.2942795753479, -8.324100494384766, -8.256834030151367, -10.555163383483887], 'filename': 'audio_841.wav'}]


In [None]:

#  Re-import necessary libraries

import os
import pandas as pd
import joblib
from google.colab import files


#  Set paths (adjust as needed)
# All three locations hang off the same Drive data folder.

drive_root = "/content/drive/MyDrive/data"
test_csv_path, test_audio_dir, model_save_path = (
    os.path.join(drive_root, leaf)
    for leaf in ("test.csv", "audios/test", "grammar_model.pkl")
)


#  Load the trained model
# This is the file written by train_and_save_model(); joblib files are pickles,
# so only load ones you produced yourself.

model = joblib.load(model_save_path)


#  Load test audio and extract features

def extract_test_features(audio_dir, csv_path):
    """Compute the model's input features for every clip listed in the test CSV.

    Uses the same pipeline as training: the module-level Whisper model
    (`asr_model`) produces the transcript and the module-level LanguageTool
    instance (`tool`) counts grammar matches. These are called directly
    because the names `transcribe` and `extract_features` are redefined by
    later cells (Google Web Speech transcription, and an audio-path based
    extractor returning a dict). Going through those names made every row
    fail here and left the result empty.

    Rows whose processing raises are reported on stdout and left out.

    Returns a DataFrame with the columns filename, num_errors, error_rate and
    word_count; the columns are present even when no row succeeded.
    """
    columns = ['filename', 'num_errors', 'error_rate', 'word_count']
    df = pd.read_csv(csv_path)
    features = []

    for _, row in df.iterrows():
        audio_path = os.path.join(audio_dir, row['filename'])
        try:
            text = asr_model.transcribe(audio_path)["text"]
            num_errors = len(tool.check(text))
            word_count = len(text.split())
            # An empty transcript has no words; define its error rate as 0.
            error_rate = num_errors / word_count if word_count > 0 else 0
            features.append({
                'filename': row['filename'],
                'num_errors': num_errors,
                'error_rate': error_rate,
                'word_count': word_count
            })
            print(f"✅ Processed test: {row['filename']}")
        except Exception as e:
            # Best effort: one bad clip must not abort the whole run.
            print(f"❌ Error with {row['filename']}: {e}")

    return pd.DataFrame(features, columns=columns)


# Features for every test clip that could be processed
test_features_df = extract_test_features(test_audio_dir, test_csv_path)

# Same three feature columns, in the same order, as used for training
X_test = test_features_df.loc[:, ["num_errors", "error_rate", "word_count"]]
predictions = model.predict(X_test)

# One row per processed clip: its file name and the predicted score.
# NOTE(review): the submission file uploaded later in this notebook has the
# header `filename,label` — confirm which column name the competition expects.
submission_df = test_features_df[['filename']].assign(**{'score': predictions})

# Write the CSV into the runtime and trigger a browser download of it
submission_df.to_csv("submission.csv", index=False)
files.download("submission.csv")



Uploading to Kaggle using the Kaggle API

In [None]:
from google.colab import files
# Upload kaggle.json (the Kaggle API token). files.upload() returns a dict
# holding the raw bytes of each uploaded file; as the last expression of the
# cell that dict would be echoed into the saved output, exposing the API key.
# The trailing semicolon suppresses that echo.
files.upload();


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<redacted>","key":"<redacted>"}'}

In [None]:
# Put the uploaded token where the Kaggle CLI looks for it (~/.kaggle/kaggle.json)
# and restrict the file to owner read/write.
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
# Install the Kaggle command-line client (quiet output).
!pip install -q kaggle


In [None]:
# Upload the submission file from the local machine. The trailing semicolon
# keeps the returned dict (the file's entire contents) from being echoed into
# the cell output.
# NOTE(review): a `submission.csv` already exists in the runtime, so the
# recorded output shows this upload being saved as `submission (1).csv`.
files.upload();


Saving submission.csv to submission (1).csv


{'submission (1).csv': b'filename,label\naudio_804.wav,5\naudio_1028.wav,3\naudio_865.wav,3\naudio_774.wav,0\naudio_1138.wav,3\naudio_278.wav,0\naudio_1212.wav,2\naudio_178.wav,5\naudio_542.wav,5\naudio_248.wav,3\naudio_872.wav,0\naudio_954.wav,2\naudio_853.wav,4\naudio_171.wav,3\naudio_922.wav,2\naudio_915.wav,5\naudio_1220.wav,1\naudio_1225.wav,4\naudio_903.wav,2\naudio_748.wav,2\naudio_284.wav,1\naudio_200.wav,5\naudio_1255.wav,0\naudio_1246.wav,4\naudio_1102.wav,0\naudio_1300.wav,3\naudio_80.wav,0\naudio_1041.wav,0\naudio_256.wav,2\naudio_328.wav,4\naudio_301.wav,2\naudio_938.wav,5\naudio_1055.wav,2\naudio_664.wav,1\naudio_89.wav,0\naudio_407.wav,3\naudio_1025.wav,5\naudio_72.wav,5\naudio_1316.wav,1\naudio_661.wav,0\naudio_822.wav,0\naudio_956.wav,2\naudio_1329.wav,5\naudio_1218.wav,5\naudio_1286.wav,2\naudio_1166.wav,2\naudio_1231.wav,0\naudio_501.wav,3\naudio_188.wav,5\naudio_746.wav,2\naudio_218.wav,4\naudio_168.wav,2\naudio_689.wav,0\naudio_234.wav,2\naudio_670.wav,4\naudio_241

In [None]:
# Submit to the competition through the Kaggle CLI.
# NOTE(review): this sends `submission.csv`; the file uploaded in the previous
# cell was saved as `submission (1).csv` (see its output), so confirm the
# intended file is the one being submitted.
!kaggle competitions submit -c shl-intern-hiring-assessment -f submission.csv -m "SHL hiring Assessment submission"


100% 3.44k/3.44k [00:00<00:00, 3.86kB/s]
Successfully submitted to SHL Hiring Assessment