Installing the Dependencies

In [6]:
!pip install --upgrade pip
!pip install kaggle        # only needed if downloading dataset via API
!pip install openai-whisper==20250625  # official whisper (or 'whisper' package)
!pip install faster-whisper  # optional GPU-friendly alternative
!pip install librosa soundfile pydub
!pip install language_tool_python

Collecting openai-whisper==20250625
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m803.2/803.2 kB[0m [31m15.0 MB/s[0m  [33m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: openai-whisper
  Building wheel for openai-whisper (pyproject.toml) ... [?25l[?25hdone
  Created wheel for openai-whisper: filename=openai_whisper-20250625-py3-none-any.whl size=803979 sha256=7fd486d925c2e12b2525a7f39fc188027848e63f057503442ed0c2a4e9778785
  Stored in directory: /root/.cache/pip/wheels/61/d2/20/09ec9bef734d126cba375b15898010b6cc28578d8afdde5869
Successfully built openai-whisper
Installing collected packages: openai-whisper
Successfully installed openai-whisper-20250625


In [4]:
!pip install transformers torch sentencepiece  # for GPT-2 perplexity or other models
!pip install textstat
!pip install scikit-learn xgboost pandas numpy matplotlib seaborn
!pip install pyannote.audio==3.4.0

Collecting pyannote.audio==3.4.0
  Downloading pyannote_audio-3.4.0-py2.py3-none-any.whl.metadata (11 kB)
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio==3.4.0)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting lightning>=2.0.1 (from pyannote.audio==3.4.0)
  Downloading lightning-2.5.6-py3-none-any.whl.metadata (42 kB)
Collecting pyannote.core<6.0,>=5.0.0 (from pyannote.audio==3.4.0)
  Downloading pyannote.core-5.0.0-py3-none-any.whl.metadata (1.4 kB)
Collecting pyannote.database<6.0,>=5.0.1 (from pyannote.audio==3.4.0)
  Downloading pyannote.database-5.1.3-py3-none-any.whl.metadata (1.1 kB)
Collecting pyannote.metrics<4.0,>=3.2 (from pyannote.audio==3.4.0)
  Downloading pyannote.metrics-3.2.1-py3-none-any.whl.metadata (1.3 kB)
Collecting pyannote.pipeline<4.0,>=3.0.1 (from pyannote.audio==3.4.0)
  Downloading pyannote.pipeline-3.0.1-py3-none-any.whl.metadata (897 bytes)
Collecting pytorch_metric_learning>=2.1.0 (from pyannote.audio==3.4.

In [9]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [17]:
!ls /content/drive/MyDrive/grammar_voice_samples

000010-0013.txt  000010-0014.wav  000050-0049.txt  000050-0050.wav
000010-0013.wav  000010-0015.txt  000050-0049.wav
000010-0014.txt  000010-0015.wav  000050-0050.txt


In [18]:
# Notebook cell
import os, glob, json, math, re
import pandas as pd, numpy as np
from pathlib import Path
from tqdm.auto import tqdm

# Audio
import librosa, soundfile as sf
from pydub import AudioSegment

# ASR: whisper (or faster-whisper)
import whisper

# Grammar checking
import language_tool_python

# Text features
import textstat
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import joblib


  m = re.match('([su]([0-9]{1,2})p?) \(([0-9]{1,2}) bit\)$', token)
  m2 = re.match('([su]([0-9]{1,2})p?)( \(default\))?$', token)
  elif re.match('(flt)p?( \(default\))?$', token):
  elif re.match('(dbl)p?( \(default\))?$', token):


Utility: audio loading & normalization

In [19]:
# notebook cell
def load_audio(path, sr=16000, mono=True, normalize=True):
    y, sr_ret = librosa.load(path, sr=sr, mono=mono)
    if normalize:
        # RMS normalize (avoid divide by zero)
        rms = np.sqrt(np.mean(y**2)) + 1e-9
        y = y / rms
    return y, sr

def ensure_wav(path_in, path_out, target_sr=16000):
    # convert mp3/ogg to wav 16k mono using pydub
    audio = AudioSegment.from_file(path_in)
    audio = audio.set_frame_rate(target_sr).set_channels(1).set_sample_width(2)
    audio.export(path_out, format="wav")
    return path_out


ASR transcription (Whisper) ‚Äî generate transcripts from audio

In [20]:
# Notebook cell
ASR_MODEL_NAME = "small"  # choose tiny, base, small, medium, large
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)
asr_model = whisper.load_model(ASR_MODEL_NAME).to(device)

def transcribe_file_whisper(path, language=None):
    # returns a dict with 'text' and other metadata
    result = asr_model.transcribe(path, language=language)
    return result  # result['text'] is the transcript

# Example:
# r = transcribe_file_whisper("data/audio/sample1.wav")
# print(r['text'])


Device: cpu


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 461M/461M [00:03<00:00, 127MiB/s]


Grammar error extraction using LanguageTool

[31mERROR: Could not find a version that satisfies the requirement java (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for java[0m[31m
[0m

In [25]:
!apt-get install -y openjdk-17-jre-headless
!pip install language_tool_python


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Suggested packages:
  libnss-mdns fonts-dejavu-extra fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei | fonts-wqy-zenhei fonts-indic
The following NEW packages will be installed:
  openjdk-17-jre-headless
0 upgraded, 1 newly installed, 0 to remove and 41 not upgraded.
Need to get 48.3 MB of archives.
After this operation, 193 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 openjdk-17-jre-headless amd64 17.0.16+8~us1-0ubuntu1~22.04.1 [48.3 MB]
Fetched 48.3 MB in 5s (9,792 kB/s)
Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... 125446 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jre-headless_17.0.16+8~us1-0ubuntu1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.16+8~us1-0ubuntu1~22.04.1) ...
Setting up openjdk-17-jre-headl

In [26]:
# Notebook cell
import language_tool_python
tool = language_tool_python.LanguageTool('en-US')

def grammar_stats_from_text(text):
    matches = tool.check(text)
    total_errors = len(matches)
    # categorize errors by ruleId or category
    rule_counts = {}
    for m in matches:
        rid = m.ruleId or "UNKNOWN"
        rule_counts[rid] = rule_counts.get(rid, 0) + 1
    # gather unique messages (optional)
    messages = [m.message for m in matches]
    return {
        "total_errors": total_errors,
        "rule_counts": rule_counts,
        "messages": messages,
        "matches": matches
    }


Downloading LanguageTool latest: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 255M/255M [00:20<00:00, 12.2MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmp073gxzrq.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


Audio/prosodic features (pitch, speaking rate, duration, energy)

In [27]:
# Notebook cell
import numpy as np
import librosa
import math

def prosody_features(y, sr):
    # duration
    duration = len(y)/sr
    # energy
    energy = np.mean(y**2)
    # speaking rate approx: words / duration (but need words from transcript)
    # pitch (f0) via librosa.pyin (if available)
    try:
        f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C2'),
                                                     fmax=librosa.note_to_hz('C7'))
        # filter out nans
        f0_clean = f0[~np.isnan(f0)]
        if len(f0_clean) > 0:
            f0_mean = float(np.mean(f0_clean))
            f0_std = float(np.std(f0_clean))
        else:
            f0_mean, f0_std = 0.0, 0.0
    except Exception as e:
        f0_mean, f0_std = 0.0, 0.0
    # zero crossing rate
    zcr = float(np.mean(librosa.feature.zero_crossing_rate(y)))
    # tempo (rough)
    tempo, _ = librosa.beat.beat_track(y, sr=sr, trim=False)
    return {
        "duration": duration,
        "energy": float(energy),
        "f0_mean": f0_mean,
        "f0_std": f0_std,
        "zcr": zcr,
        "tempo": float(tempo)
    }


Full pipeline to create dataset (transcribe audio ‚Üí text features ‚Üí grammar features ‚Üí audio features)

In [28]:
AUDIO_DIR = "/content/drive/MyDrive/grammar_voice_samples"
labels_path = None  # no labels

In [4]:
# ======================================================
# üì¶ Install dependencies
# ======================================================
!apt-get install -y openjdk-17-jre-headless -qq
!pip install -q language_tool_python transformers librosa textstat tqdm pandas numpy torch

# ======================================================
# üìö Imports
# ======================================================
import os, glob, warnings
import pandas as pd
import numpy as np
from tqdm import tqdm
import librosa
import textstat
import language_tool_python
import torch
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

warnings.filterwarnings("ignore")

# ======================================================
# üß† Initialize tools
# ======================================================
tool = language_tool_python.LanguageTool('en-US')
gpt_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model.eval()

# ======================================================
# ‚öôÔ∏è Helper Functions
# ======================================================

def load_audio(path, sr=16000):
    """Safely load audio"""
    try:
        y, sr = librosa.load(path, sr=sr)
        return y, sr
    except Exception as e:
        print(f"[Audio Error] {path}: {e}")
        return np.array([]), sr

def gpt2_perplexity(text):
    """Compute GPT-2 perplexity safely"""
    try:
        if not text.strip():
            return np.nan
        encodings = gpt_tokenizer(text, return_tensors="pt")
        with torch.no_grad():
            max_length = gpt2_model.config.n_positions
            stride = 512
            nlls = []
            for i in range(0, encodings.input_ids.size(1), stride):
                begin_loc = max(i + stride - max_length, 0)
                end_loc = i + stride
                input_ids = encodings.input_ids[:, begin_loc:end_loc]
                target_ids = input_ids.clone()
                target_ids[:, :-stride] = -100
                outputs = gpt2_model(input_ids, labels=target_ids)
                neg_log_likelihood = outputs.loss * stride
                nlls.append(neg_log_likelihood)
            ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
        return float(ppl)
    except Exception:
        return np.nan

def grammar_stats_from_text(text):
    """Detect grammar errors using LanguageTool"""
    try:
        matches = tool.check(text)
        return len(matches)
    except:
        return np.nan

def readability_scores(text):
    """Compute readability metrics"""
    try:
        return {
            "flesch_kincaid": textstat.flesch_kincaid_grade(text),
            "flesch_reading_ease": textstat.flesch_reading_ease(text)
        }
    except:
        return {"flesch_kincaid": np.nan, "flesch_reading_ease": np.nan}

def prosody_features(y, sr):
    """Extract prosodic/audio features"""
    try:
        if len(y) == 0:
            return {k: np.nan for k in ["duration", "energy", "f0_mean", "f0_std", "zcr", "tempo"]}
        duration = librosa.get_duration(y=y, sr=sr)
        energy = np.mean(y ** 2)
        f0 = librosa.yin(y, fmin=50, fmax=300)
        f0_mean = np.mean(f0)
        f0_std = np.std(f0)
        zcr = np.mean(librosa.feature.zero_crossing_rate(y))
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        return {
            "duration": duration,
            "energy": energy,
            "f0_mean": f0_mean,
            "f0_std": f0_std,
            "zcr": zcr,
            "tempo": tempo
        }
    except Exception as e:
        print(f"[Prosody Error] {e}")
        return {k: np.nan for k in ["duration", "energy", "f0_mean", "f0_std", "zcr", "tempo"]}

def safe_read_text(txt_path):
    """Read text safely"""
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            return f.read().strip()
    except:
        return ""

# ======================================================
# üöÄ Main Feature Extraction Loop
# ======================================================

AUDIO_DIR = "/content/drive/MyDrive/grammar_voice_samples"
OUTPUT_DIR = "/content/drive/MyDrive/grammar_voice_samples_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)

rows = []
audio_paths = sorted(glob.glob(os.path.join(AUDIO_DIR, "*.wav")))

print(f"üéß Found {len(audio_paths)} audio files")

for p in tqdm(audio_paths, desc="Extracting features"):
    filename = os.path.basename(p)
    txt_path = os.path.join(AUDIO_DIR, filename.replace(".wav", ".txt"))

    # Load transcript text
    text = safe_read_text(txt_path)

    # Text and grammar features
    total_errors = grammar_stats_from_text(text)
    token_count = len(gpt_tokenizer.tokenize(text)) if text else 0
    ppl = gpt2_perplexity(text) if len(text.split()) > 3 else np.nan
    read_scores = readability_scores(text)

    # Audio features
    y, sr = load_audio(p, sr=16000)
    pros = prosody_features(y, sr)

    # Combine all features
    row = {
        "filename": filename,
        "transcript": text,
        "total_errors": total_errors,
        "token_count": token_count,
        "ppl": ppl,
        "flesch_kincaid": read_scores['flesch_kincaid'],
        "flesch_reading": read_scores['flesch_reading_ease'],
        "duration": pros['duration'],
        "energy": pros['energy'],
        "f0_mean": pros['f0_mean'],
        "f0_std": pros['f0_std'],
        "zcr": pros['zcr'],
        "tempo": pros['tempo']
    }

    rows.append(row)

# ======================================================
# üíæ Save extracted features
# ======================================================
output_path = os.path.join(OUTPUT_DIR, "features.csv")
df_features = pd.DataFrame(rows)

# Handle NaN safely
df_features.fillna("", inplace=True)
df_features.to_csv(output_path, index=False)

print(f"\n‚úÖ Features saved successfully to: {output_path}")
df_features.head()


Selecting previously unselected package openjdk-17-jre-headless:amd64.
(Reading database ... (Reading database ... 5%(Reading database ... 10%(Reading database ... 15%(Reading database ... 20%(Reading database ... 25%(Reading database ... 30%(Reading database ... 35%(Reading database ... 40%(Reading database ... 45%(Reading database ... 50%(Reading database ... 55%(Reading database ... 60%(Reading database ... 65%(Reading database ... 70%(Reading database ... 75%(Reading database ... 80%(Reading database ... 85%(Reading database ... 90%(Reading database ... 95%(Reading database ... 100%(Reading database ... 125446 files and directories currently installed.)
Preparing to unpack .../openjdk-17-jre-headless_17.0.16+8~us1-0ubuntu1~22.04.1_amd64.deb ...
Unpacking openjdk-17-jre-headless:amd64 (17.0.16+8~us1-0ubuntu1~22.04.1) ...
Setting up openjdk-17-jre-headless:amd64 (17.0.16+8~us1-0ubuntu1~22.04.1) ...
update-alternatives: using /usr/lib/jvm/java-17-openjdk-amd64/bin

Downloading LanguageTool latest: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 255M/255M [00:11<00:00, 22.2MB/s]
INFO:language_tool_python.download_lt:Unzipping /tmp/tmpv8zuegpx.zip to /root/.cache/language_tool_python.
INFO:language_tool_python.download_lt:Downloaded https://internal1.languagetool.org/snapshots/LanguageTool-latest-snapshot.zip to /root/.cache/language_tool_python.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

üéß Found 5 audio files


Extracting features:   0%|          | 0/5 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.
Extracting features: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:52<00:00, 10.52s/it]


‚úÖ Features saved successfully to: /content/drive/MyDrive/grammar_voice_samples_output/features.csv





Unnamed: 0,filename,transcript,total_errors,token_count,ppl,flesch_kincaid,flesch_reading,duration,energy,f0_mean,f0_std,zcr,tempo
0,000010-0013.wav,We'll also be holding a special webinar on tha...,0,18,42.2425,8.541538,56.978462,5.290875,0.009675,199.628403,69.475356,0.115326,[133.92857142857142]
1,000010-0014.wav,"Overall, it's just important that you know tha...",1,22,10.333729,6.310588,80.097647,4.620813,0.01731,204.256936,69.144918,0.088335,[133.92857142857142]
2,000010-0015.wav,"That said, now I'd like to introduce a couple ...",0,23,34.938625,5.9675,66.5275,6.990875,0.010751,178.459519,57.707044,0.103984,[66.96428571428571]
3,000050-0049.wav,to when you see it happen on the screen and ju...,1,15,47.053768,3.633333,95.73,3.324313,0.004157,211.861588,65.947779,0.105826,[144.23076923076923]
4,000050-0050.wav,you don't need an external monitor or TV to us...,1,23,79.904961,8.008182,76.832273,5.649063,0.005374,225.753037,60.590099,0.11463,[117.1875]


Modeling: train/test split, feature selection, training XGBoost / RandomForest

In [5]:
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv("/content/drive/MyDrive/grammar_voice_samples_output/features.csv")

# Basic EDA
print(df.info())
print(df.describe())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   filename        5 non-null      object 
 1   transcript      5 non-null      object 
 2   total_errors    5 non-null      int64  
 3   token_count     5 non-null      int64  
 4   ppl             5 non-null      float64
 5   flesch_kincaid  5 non-null      float64
 6   flesch_reading  5 non-null      float64
 7   duration        5 non-null      float64
 8   energy          5 non-null      float64
 9   f0_mean         5 non-null      float64
 10  f0_std          5 non-null      float64
 11  zcr             5 non-null      float64
 12  tempo           5 non-null      object 
dtypes: float64(8), int64(2), object(3)
memory usage: 652.0+ bytes
None
       total_errors  token_count        ppl  flesch_kincaid  flesch_reading  \
count      5.000000     5.000000   5.000000        5.000000        5.0000

In [7]:
df

Unnamed: 0,filename,transcript,total_errors,token_count,ppl,flesch_kincaid,flesch_reading,duration,energy,f0_mean,f0_std,zcr,tempo,grammar_score_auto
0,000010-0013.wav,We'll also be holding a special webinar on tha...,0,18,42.2425,8.541538,56.978462,5.290875,0.009675,199.628403,69.475356,0.115326,[133.92857143],10.0
1,000010-0014.wav,"Overall, it's just important that you know tha...",1,22,10.333729,6.310588,80.097647,4.620813,0.01731,204.256936,69.144918,0.088335,[133.92857143],9.565217
2,000010-0015.wav,"That said, now I'd like to introduce a couple ...",0,23,34.938625,5.9675,66.5275,6.990875,0.010751,178.459519,57.707044,0.103984,[66.96428571],10.0
3,000050-0049.wav,to when you see it happen on the screen and ju...,1,15,47.053768,3.633333,95.73,3.324313,0.004157,211.861588,65.947779,0.105826,[144.23076923],9.375
4,000050-0050.wav,you don't need an external monitor or TV to us...,1,23,79.904961,8.008182,76.832273,5.649063,0.005374,225.753037,60.590099,0.11463,[117.1875],9.583333


In [11]:
import numpy as np
import pandas as pd

# Load the extracted feature dataset
df = pd.read_csv("/content/drive/MyDrive/grammar_voice_samples_output/features.csv")

# Replace NaN with 0 for safe numeric operations
df = df.fillna(0)

# -----------------------------
# üìä Generate Grammar Quality Score (Heuristic Label)
# -----------------------------

# Step 1: Base grammar score from grammar errors
# Fewer total_errors = higher base score
grammar_component = np.exp(-0.2 * df["total_errors"])

# Step 2: Readability score (scaled)
# Flesch reading ease: higher = easier to read ‚Üí better grammar
readability_component = np.clip(df["flesch_reading"] / 100, 0, 1)

# Step 3: Speech smoothness
# Lower pitch variance (f0_std) = more consistent speech
f0_component = np.exp(-0.1 * df["f0_std"])

# Step 4: Combine into one score
# Weighted average
df["label"] = (
    0.6 * grammar_component +
    0.3 * readability_component +
    0.1 * f0_component
)

# Normalize label to 0‚Äì10 range
df["label"] = 10 * (df["label"] - df["label"].min()) / (df["label"].max() - df["label"].min())

# Save new dataset
df.to_csv("/content/drive/MyDrive/grammar_voice_samples_output/features_with_labels.csv", index=False)

print("‚úÖ Labels generated and saved.")
print(df[["filename", "total_errors", "flesch_reading", "f0_std", "label"]].head())


‚úÖ Labels generated and saved.
          filename  total_errors  flesch_reading     f0_std      label
0  000010-0013.wav             0       56.978462  69.475356   6.296098
1  000010-0014.wav             1       80.097647  69.144918   1.239877
2  000010-0015.wav             0       66.527500  57.707044  10.000000
3  000050-0049.wav             1       95.730000  65.947779   7.262885
4  000050-0050.wav             1       76.832273  60.590099   0.000000


In [9]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Load your feature dataset
df = pd.read_csv("/content/drive/MyDrive/grammar_voice_samples_output/features.csv")

# üß© Check what columns exist
print(df.columns)

# üéØ Define your target (replace 'label' with the correct column name)
if "label" in df.columns:
    y = df["label"].fillna(0)
else:
    # If you don't have labels yet, create dummy labels just to test the pipeline
    print("‚ö†Ô∏è No label column found, creating dummy labels for testing.")
    y = [0] * len(df)  # temporary placeholder

# üßÆ Define feature matrix
X = df[[
    "total_errors", "token_count", "ppl",
    "flesch_kincaid", "flesch_reading",
    "duration", "energy", "f0_mean", "f0_std", "zcr"
]].fillna(0)

# ‚úÖ Ensure lengths match
print(f"X shape: {X.shape}, y length: {len(y)}")

# üîÄ Split dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# üöÄ Train model
model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=4,
    random_state=42
)
model.fit(X_train, y_train)

# üìà Evaluate
y_pred = model.predict(X_test)
print("‚úÖ Model Performance:")
print("MSE:", mean_squared_error(y_test, y_pred))
print("R¬≤:", r2_score(y_test, y_pred))


Index(['filename', 'transcript', 'total_errors', 'token_count', 'ppl',
       'flesch_kincaid', 'flesch_reading', 'duration', 'energy', 'f0_mean',
       'f0_std', 'zcr', 'tempo'],
      dtype='object')
‚ö†Ô∏è No label column found, creating dummy labels for testing.
X shape: (5, 10), y length: 5
‚úÖ Model Performance:
MSE: 0.0
R¬≤: nan


In [12]:
import joblib
model_path = "/content/drive/MyDrive/grammar_voice_samples_output/grammar_score_model.pkl"
joblib.dump(model, model_path)
print("‚úÖ Model saved to:", model_path)



‚úÖ Model saved to: /content/drive/MyDrive/grammar_voice_samples_output/grammar_score_model.pkl


In [13]:
import joblib
import librosa
import numpy as np
import textstat
import language_tool_python
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

# Load model
model_path = "/content/drive/MyDrive/grammar_voice_samples_output/grammar_score_model.pkl"
model = joblib.load(model_path)

# Initialize tools (load once)
tool = language_tool_python.LanguageTool('en-US')
gpt_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_model.eval()

# Helper functions reused
def gpt2_perplexity(text):
    if not text.strip():
        return 0
    encodings = gpt_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = gpt2_model(**encodings, labels=encodings["input_ids"])
        loss = outputs.loss
        return float(torch.exp(loss))

def grammar_stats_from_text(text):
    matches = tool.check(text)
    return len(matches)

def readability_scores(text):
    return {
        "flesch_kincaid": textstat.flesch_kincaid_grade(text),
        "flesch_reading": textstat.flesch_reading_ease(text)
    }

def prosody_features(y, sr):
    duration = librosa.get_duration(y=y, sr=sr)
    energy = np.mean(y ** 2)
    f0 = librosa.yin(y, fmin=50, fmax=300)
    f0_mean = np.mean(f0)
    f0_std = np.std(f0)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    return duration, energy, f0_mean, f0_std, zcr

def predict_grammar_score(audio_path, text_path=None):
    # Load audio
    y, sr = librosa.load(audio_path, sr=16000)
    duration, energy, f0_mean, f0_std, zcr = prosody_features(y, sr)

    # Load or transcribe text
    text = ""
    if text_path and os.path.exists(text_path):
        with open(text_path, "r") as f:
            text = f.read().strip()

    # Features
    total_errors = grammar_stats_from_text(text)
    token_count = len(gpt_tokenizer.tokenize(text)) if text else 0
    ppl = gpt2_perplexity(text)
    read_scores = readability_scores(text)

    # Feature vector for model
    features = np.array([[
        total_errors, token_count, ppl,
        read_scores["flesch_kincaid"], read_scores["flesch_reading"],
        duration, energy, f0_mean, f0_std, zcr
    ]])

    # Predict
    pred = model.predict(features)[0]
    print(f"üéØ Predicted Grammar Score for {os.path.basename(audio_path)}: {pred:.2f}/10")
    return pred


In [16]:
predict_grammar_score(
    "/content/drive/MyDrive/grammar_voice_samples/000010-0014.wav",
   "/content/drive/MyDrive/grammar_voice_samples/000010-0014.txt"
)


üéØ Predicted Grammar Score for 000010-0014.wav: 0.00/10


np.float32(0.0)