In [2]:
#  1. Installing dependencies

!pip install -q openai-whisper language-tool-python joblib

# Fixing Java issue (needed for language_tool_python)
!sudo apt-get install openjdk-17-jre -y
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-17-openjdk-amd64"





#  3. Import libraries

import whisper
import language_tool_python
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import joblib


#  4. Set paths

# Root of the dataset; the training CSV and audio clips live beneath it.
drive_root = "/kaggle/input/shl-hiring-assessment/Dataset"
csv_path, audio_dir = (
    os.path.join(drive_root, leaf) for leaf in ("train.csv", "audios/train")
)
# Destination of the fitted regressor (written with joblib.dump).
model_save_path = "/kaggle/working/grammar_model.pkl"


#  5. Load models
# Both objects are module-level globals: transcribe() reads `asr_model` and
# extract_features() reads `tool`, so this must run before either is called.

# Whisper "base" checkpoint used for speech-to-text.
asr_model = whisper.load_model("base")
# LanguageTool checker configured for 'en-US'; its matches are counted as errors.
tool = language_tool_python.LanguageTool('en-US')


#  6. Transcribe + Feature Extraction

def transcribe(audio_path):
    """Return the transcript text of the audio file at ``audio_path``.

    Delegates to the module-level ``asr_model`` and returns the ``"text"``
    entry of its result.
    """
    return asr_model.transcribe(audio_path)["text"]

def extract_features(text):
    """Compute grammar features for a transcript.

    Args:
        text: Transcript string to analyse.

    Returns:
        A tuple ``(num_errors, error_rate, total_words)``: the number of
        matches reported by the module-level ``tool``, that count divided by
        the number of whitespace-separated words (``0`` when there are no
        words), and the word count itself.
    """
    issue_count = len(tool.check(text))
    word_count = len(text.split())
    if word_count > 0:
        rate = issue_count / word_count
    else:
        # Avoid dividing by zero on an empty transcript.
        rate = 0
    return issue_count, rate, word_count

#  7. Load Dataset

def load_data(audio_dir, csv_path):
    """Build the training feature table from the labelled audio clips.

    Each row of the CSV is transcribed with ``transcribe`` and summarised with
    ``extract_features``. Clips that raise during processing are reported on
    stdout and skipped, so one bad file does not abort the whole run.

    Args:
        audio_dir: Directory containing the audio files named in the CSV.
        csv_path: CSV file with ``filename`` and ``label`` columns.

    Returns:
        A DataFrame with columns ``num_errors``, ``error_rate``,
        ``word_count`` and ``grammar_score``, one row per successfully
        processed clip. The columns are present even when no clip succeeds,
        so callers can always select them.
    """
    columns = ["num_errors", "error_rate", "word_count", "grammar_score"]
    df = pd.read_csv(csv_path)
    features = []

    for _, row in df.iterrows():
        audio_path = os.path.join(audio_dir, row['filename'])
        try:
            text = transcribe(audio_path)
            num_errors, error_rate, word_count = extract_features(text)
            features.append({
                'num_errors': num_errors,
                'error_rate': error_rate,
                'word_count': word_count,
                'grammar_score': row['label']
            })
            print(f"✅ Processed: {row['filename']}")
        except Exception as e:
            # Deliberate best-effort: report the failing clip and keep going.
            print(f"❌ Error with {row['filename']}: {e}")

    # Passing `columns` keeps the schema stable when `features` is empty;
    # without it an all-failures run yields a frame with no columns at all.
    return pd.DataFrame(features, columns=columns)


#  8. Train and Save Model

def train_and_save_model():
    """Fit the grammar-score regressor on the training clips and save it.

    Reads the module-level ``audio_dir``, ``csv_path`` and ``model_save_path``.
    Builds the feature table with ``load_data``, fits a
    ``RandomForestRegressor`` on ``num_errors``, ``error_rate`` and
    ``word_count``, writes the model with ``joblib.dump``, and prints the
    mean absolute error measured on the same data it was trained on.

    Raises:
        ValueError: If no training sample could be extracted.
    """
    data = load_data(audio_dir, csv_path)
    if data.empty:
        # Fail with an actionable message instead of an opaque KeyError or
        # estimator error further down.
        raise ValueError(
            "No training samples were extracted; check the audio directory "
            "and the per-file errors printed above."
        )

    X = data[["num_errors", "error_rate", "word_count"]]
    y = data["grammar_score"]

    # Fixed random_state keeps the fitted forest reproducible between runs.
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)

    joblib.dump(model, model_save_path)
    print(f"✅ Model saved to: {model_save_path}")

    # This is a training-set score (no hold-out), so it is optimistic.
    preds = model.predict(X)
    mae = mean_absolute_error(y, preds)
    print(f"📉 MAE on training set: {mae:.2f}")


# 9. Running the model
# Runs the whole training pipeline: transcribes every training clip, fits the
# regressor, saves it to model_save_path and prints the training-set MAE.

train_and_save_model()

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
openjdk-17-jre is already the newest version (17.0.14+7-1~22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 122 not upgraded.


  checkpoint = torch.load(fp, map_location=device)


✅ Processed: audio_710.wav
✅ Processed: audio_1265.wav
✅ Processed: audio_1114.wav
✅ Processed: audio_946.wav
✅ Processed: audio_1127.wav
✅ Processed: audio_669.wav
✅ Processed: audio_1029.wav
✅ Processed: audio_713.wav
✅ Processed: audio_845.wav
✅ Processed: audio_150.wav
✅ Processed: audio_764.wav
✅ Processed: audio_1253.wav
✅ Processed: audio_1271.wav
✅ Processed: audio_638.wav
✅ Processed: audio_755.wav
✅ Processed: audio_950.wav
✅ Processed: audio_1232.wav
✅ Processed: audio_346.wav
✅ Processed: audio_730.wav
✅ Processed: audio_783.wav
✅ Processed: audio_39.wav
✅ Processed: audio_292.wav
✅ Processed: audio_880.wav
✅ Processed: audio_921.wav
✅ Processed: audio_68.wav
✅ Processed: audio_905.wav
✅ Processed: audio_1035.wav
✅ Processed: audio_1290.wav
✅ Processed: audio_1221.wav
✅ Processed: audio_824.wav
✅ Processed: audio_743.wav
✅ Processed: audio_676.wav
✅ Processed: audio_1111.wav
✅ Processed: audio_354.wav
✅ Processed: audio_134.wav
✅ Processed: audio_881.wav
✅ Processed: audio_

In [4]:
# --- Predict on test set and save submission.csv --- #

import joblib
import pandas as pd
import os

# Load model
# This is the same path the training cell writes via model_save_path.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model = joblib.load("/kaggle/working/grammar_model.pkl")

# Load and process test data
def load_test_data(audio_dir, csv_path):
    """Transcribe every test clip and compute its grammar features.

    The speech and grammar models are imported and loaded inside the
    function, so it does not rely on objects created by another cell.

    Args:
        audio_dir: Directory containing the audio files named in the CSV.
        csv_path: CSV file with a ``filename`` column.

    Returns:
        A DataFrame with columns ``filename``, ``num_errors``, ``error_rate``
        and ``word_count``, one row per successfully processed clip. Clips
        that raise are reported on stdout and skipped. The columns are
        present even when no clip succeeds, so callers can always select them.
    """
    import whisper
    import language_tool_python

    columns = ["filename", "num_errors", "error_rate", "word_count"]

    asr_model = whisper.load_model("base")
    tool = language_tool_python.LanguageTool('en-US')

    def transcribe(audio_path):
        """Return the transcript text for one audio file."""
        result = asr_model.transcribe(audio_path)
        return result["text"]

    def extract_features(text):
        """Return (num_errors, error_rate, total_words) for a transcript."""
        matches = tool.check(text)
        num_errors = len(matches)
        total_words = len(text.split())
        # Guard against dividing by zero on an empty transcript.
        error_rate = num_errors / total_words if total_words > 0 else 0
        return num_errors, error_rate, total_words

    df = pd.read_csv(csv_path)
    features = []

    for _, row in df.iterrows():
        audio_path = os.path.join(audio_dir, row['filename'])
        try:
            text = transcribe(audio_path)
            num_errors, error_rate, word_count = extract_features(text)
            features.append({
                'filename': row['filename'],
                'num_errors': num_errors,
                'error_rate': error_rate,
                'word_count': word_count,
            })
            print(f"✅ Test: {row['filename']}")
        except Exception as e:
            # Deliberate best-effort: report the failing clip and keep going.
            # Skipped clips get no row, so they are absent from the result.
            print(f"❌ Error in test: {row['filename']} — {e}")

    # Passing `columns` keeps the schema stable when `features` is empty;
    # without it an all-failures run yields a frame with no columns at all.
    return pd.DataFrame(features, columns=columns)

# Test-set locations (redefined here so this cell does not rely on the training cell)
drive_root = "/kaggle/input/shl-hiring-assessment/Dataset"
test_audio_dir = os.path.join(drive_root, "audios/test")
test_csv_path = os.path.join(drive_root, "test.csv")

# Build the feature table, then score each clip with the loaded model
test_df = load_test_data(test_audio_dir, test_csv_path)
X_test = test_df.loc[:, ["num_errors", "error_rate", "word_count"]]
test_df["label"] = model.predict(X_test)

# Write the two submission columns to disk
submission_path = "/kaggle/working/submission.csv"
submission_df = test_df.loc[:, ["filename", "label"]]
submission_df.to_csv(path_or_buf=submission_path, index=False)
print(f"📁 Saved submission to: {submission_path}")


  checkpoint = torch.load(fp, map_location=device)


✅ Test: audio_804.wav
✅ Test: audio_1028.wav
✅ Test: audio_865.wav
✅ Test: audio_774.wav
✅ Test: audio_1138.wav
✅ Test: audio_278.wav
✅ Test: audio_1212.wav
✅ Test: audio_178.wav
✅ Test: audio_542.wav
✅ Test: audio_248.wav
✅ Test: audio_872.wav
✅ Test: audio_954.wav
✅ Test: audio_853.wav
✅ Test: audio_171.wav
✅ Test: audio_922.wav
✅ Test: audio_915.wav
✅ Test: audio_1220.wav
✅ Test: audio_1225.wav
✅ Test: audio_903.wav
✅ Test: audio_748.wav
✅ Test: audio_284.wav
✅ Test: audio_200.wav
✅ Test: audio_1255.wav
✅ Test: audio_1246.wav
✅ Test: audio_1102.wav
✅ Test: audio_1300.wav
✅ Test: audio_80.wav
✅ Test: audio_1041.wav
✅ Test: audio_256.wav
✅ Test: audio_328.wav
✅ Test: audio_301.wav
✅ Test: audio_938.wav
✅ Test: audio_1055.wav
✅ Test: audio_664.wav
✅ Test: audio_89.wav
✅ Test: audio_407.wav
✅ Test: audio_1025.wav
✅ Test: audio_72.wav
✅ Test: audio_1316.wav
✅ Test: audio_661.wav
✅ Test: audio_822.wav
✅ Test: audio_956.wav
✅ Test: audio_1329.wav
✅ Test: audio_1218.wav
✅ Test: audio_1286.w