In [8]:
import os, io, math, json, tempfile
from dataclasses import dataclass
from typing import Dict, Tuple
import numpy as np
import librosa
import soundfile as sf
from IPython.display import Audio, display, Markdown
import ipywidgets as widgets
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
def load_audio(path: str, sr: int = 16000) -> Tuple[np.ndarray, int]:
    """Load an audio file as a mono waveform.

    Args:
        path: Path of the audio file to read.
        sr: Sample rate requested from ``librosa.load``. Per the librosa
            documentation, ``None`` keeps the file's native rate.

    Returns:
        A ``(wav, rate)`` tuple: the samples returned by ``librosa.load`` and
        the sample rate it reports for them.
    """
    # Return the rate librosa reports rather than echoing the argument, so a
    # caller passing sr=None gets a usable rate instead of None.
    wav, loaded_sr = librosa.load(path, sr=sr, mono=True)
    return wav, loaded_sr

# simple token overlap
def token_overlap_ratio(a: str, b: str) -> float:
    """Return the share of distinct tokens that two strings have in common.

    Each string is split on whitespace; every token is lower-cased and has
    leading/trailing ``.,!?;:()[]`` characters removed. Tokens that are empty
    after this normalisation (for example a bare ``...``) are discarded so
    that punctuation alone never counts as a shared word.

    Args:
        a: First text.
        b: Second text.

    Returns:
        ``len(intersection) / len(union)`` of the two token sets. ``1.0`` when
        both sets are empty, ``0.0`` when exactly one of them is empty.
    """
    punctuation = '.,!?;:()[]'

    def tokens(text: str) -> set:
        # Filter AFTER stripping: a punctuation-only word becomes '' here.
        stripped = (word.lower().strip(punctuation) for word in text.split())
        return {tok for tok in stripped if tok}

    ta, tb = tokens(a), tokens(b)
    if not ta and not tb:
        return 1.0
    if not ta or not tb:
        return 0.0
    # Both sets are non-empty here, so the union cannot be empty.
    return len(ta & tb) / len(ta | tb)

# Simple readability / length metrics
def simple_quality_metrics(transcript: str, audio_seconds: float) -> Dict:
    """Compute basic length statistics for a transcript.

    Args:
        transcript: Transcribed text; words are obtained by whitespace split.
        audio_seconds: Duration of the audio. Values below ``1e-6`` are
            replaced by ``1e-6`` in the rate computation to avoid dividing
            by zero.

    Returns:
        Dict with ``num_words``, ``words_per_sec`` and ``avg_word_length``
        (``0.0`` for an empty transcript).
    """
    tokens = transcript.split()
    n_tokens = len(tokens)
    total_chars = sum(map(len, tokens))
    return {
        'num_words': n_tokens,
        'words_per_sec': n_tokens / max(1e-6, audio_seconds),
        'avg_word_length': total_chars / max(1, n_tokens),
    }

In [10]:
# Model identifiers passed to `pipeline` and `SentenceTransformer` below.
ASR_MODEL = "facebook/wav2vec2-large-960h"
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# Build the speech-recognition pipeline once at module level; later cells call it as `asr`.
print('Loading ASR pipeline (this will download model weights the first time)...')
# NOTE(review): chunk_length_s=30 presumably enables chunked inference over long
# recordings - confirm against the transformers pipeline documentation.
asr = pipeline('automatic-speech-recognition', model=ASR_MODEL, chunk_length_s=30)
print('ASR pipeline ready.')

# Sentence-embedding model; later cells call `embedder.encode` on the question and transcript.
print('Loading sentence-transformers embedding model...')
embedder = SentenceTransformer(EMBEDDING_MODEL)
print('Embedding model ready.')

Loading ASR pipeline (this will download model weights the first time)...


Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cpu


ASR pipeline ready.
Loading sentence-transformers embedding model...
Embedding model ready.


In [None]:
# Input controls for the interactive assessment form.
upload = widgets.FileUpload(accept='.wav', multiple=False)
question_text = widgets.Textarea(
    value='',
    placeholder='Paste the interview question here (e.g. What is your name?)',
    description='Question:',
    layout=widgets.Layout(width='100%', height='80px'),
)
# NOTE(review): nothing visible in this notebook reads this slider or adds it
# to the displayed layout - confirm whether it is still needed.
topk_widget = widgets.IntSlider(
    value=5,
    min=1,
    max=15,
    step=1,
    description='Top keywords:',
)
run_button = widgets.Button(
    description='Transcribe & Assess',
    button_style='primary',
)

# Bordered output area; the click handler prints and displays into it.
out = widgets.Output(layout={'border': '1px solid black'})

def on_run_clicked(b):
    """Transcribe the uploaded answer and report how relevant it is to the question.

    Click handler for ``run_button``. Everything it prints or displays goes to
    the ``out`` widget. Steps: copy the uploaded file into the system temp
    directory, load it at 16 kHz mono, transcribe it with ``asr``, score the
    transcript against the text in ``question_text``, and write the results to
    ``transcription_relevance_result.json`` in the temp directory.

    Args:
        b: Argument supplied by the button's ``on_click`` callback; not used.
    """
    with out:
        out.clear_output()

        # upload.value is handled as either a dict of file records or a tuple
        # of them; both are reduced to a plain list. (The two shapes presumably
        # correspond to different ipywidgets versions - confirm.)
        files = upload.value
        if not files:
            print('Please upload a .wav file using the upload widget above.')
            return
        if isinstance(files, dict):
            files = list(files.values())
        elif isinstance(files, tuple):
            files = list(files)

        fileinfo = files[0]
        data = fileinfo['content']
        # The name is supplied by the client: keep only its last path component
        # so the copy cannot be written outside the temp directory.
        raw_name = fileinfo.get('name') or 'uploaded.wav'
        name = os.path.basename(str(raw_name))
        if name in ('', '.', '..'):
            name = 'uploaded.wav'
        tmp_dir = tempfile.gettempdir()
        tmp_path = os.path.join(tmp_dir, name)

        with open(tmp_path, 'wb') as f:
            f.write(data)

        print(f"Saved uploaded file to {tmp_path}")

        # Decode once and reuse the waveform for playback, duration and ASR.
        wav, sr = load_audio(tmp_path, sr=16000)
        audio_seconds = len(wav) / sr
        display(Audio(data=wav, rate=sr))
        print('\nTranscribing audio (this can take a few seconds)...')

        result = asr({"array": wav, "sampling_rate": sr})
        transcript = result['text'].strip()
        print('\n--- TRANSCRIPT ---\n')
        print(transcript if transcript else '[empty transcript]')
        print('\n--- METRICS ---\n')

        # The question is validated only after the transcript has been shown,
        # so the user still sees the transcription when the field is empty.
        q_text = question_text.value.strip()
        if not q_text:
            print('No question text provided. Please paste the interview question into the Question field.')
            return

        embeddings = embedder.encode([q_text, transcript], convert_to_numpy=True)
        cos = float(cosine_similarity([embeddings[0]], [embeddings[1]])[0][0])
        overlap = token_overlap_ratio(q_text, transcript)
        quality = simple_quality_metrics(transcript, audio_seconds)

        # Weighted combination: 70% semantic (cos), 20% token overlap, 10% length.
        # The length term is answer words / question words, capped at 1.0.
        length_score = min(1.0, quality['num_words'] / max(1, len(q_text.split())))
        raw_score = 0.7 * cos + 0.2 * overlap + 0.1 * length_score
        score_0_100 = round(float(raw_score * 100), 2)

        print(f'Cosine similarity (semantic): {cos:.4f}')
        print(f'Token overlap ratio: {overlap:.4f}')
        # Double-quoted f-string: reusing the outer quote inside the braces is
        # only valid syntax from Python 3.12 onwards.
        print(f"Answer words: {quality['num_words']}, words/sec: {quality['words_per_sec']:.2f}")
        print(f'Length-based score: {length_score:.4f}')
        print(f'--> Aggregate relevance score (0-100): {score_0_100}')

        # Save results
        out_json = {
            'question': q_text,
            'transcript': transcript,
            'cosine_similarity': cos,
            'token_overlap': overlap,
            'quality': quality,
            'aggregate_relevance_score': score_0_100
        }
        out_path = os.path.join(tmp_dir, "transcription_relevance_result.json")

        with open(out_path, "w", encoding="utf-8") as fh:
            json.dump(out_json, fh, ensure_ascii=False, indent=2)

        print("Full results saved to:", out_path)
        
# Register the handler so that clicking the button calls on_run_clicked.
run_button.on_click(on_run_clicked)

# Render the form: label, upload control, question box, run button and output area.
# NOTE(review): topk_widget is not among the children, so that slider is not shown here.
display(widgets.VBox([widgets.Label('Upload a .wav voice note of the interview answer:'), upload, question_text, run_button, out]))

VBox(children=(Label(value='Upload a .wav voice note of the interview answer:'), FileUpload(value=(), accept='…

In [None]:

def assess_file(path: str, question: str) -> Dict:
    """Transcribe an audio file and score the transcript against a question.

    Non-interactive counterpart of the widget click handler: it loads the
    file at 16 kHz mono, runs ``asr`` on the waveform, and combines cosine
    similarity, token overlap and a length term into one score. Progress and
    the transcript are printed as a side effect.

    Args:
        path: Path of the audio file to assess.
        question: Interview question the answer is compared with.

    Returns:
        Dict with the keys ``question``, ``transcript``, ``cosine_similarity``,
        ``token_overlap``, ``quality`` (the ``simple_quality_metrics`` dict)
        and ``aggregate_relevance_score`` (weighted score times 100, rounded
        to two decimals).
    """
    wav, sr = load_audio(path, sr=16000)
    audio_seconds = len(wav) / sr
    print(f'Audio length: {audio_seconds:.2f}s — running ASR...')

    # Pass the already-decoded waveform instead of the path, so the file is not
    # decoded a second time by the pipeline and the input matches what the
    # widget handler feeds to `asr`.
    res = asr({"array": wav, "sampling_rate": sr})
    transcript = res.get('text', '').strip()
    print('Transcript:', transcript)

    emb = embedder.encode([question, transcript], convert_to_numpy=True)
    cos = float(cosine_similarity([emb[0]], [emb[1]])[0][0])
    overlap = token_overlap_ratio(question, transcript)
    quality = simple_quality_metrics(transcript, audio_seconds)

    # Weights: 70% semantic similarity, 20% token overlap, 10% length term
    # (answer words / question words, capped at 1.0).
    length_score = min(1.0, quality['num_words'] / max(1, len(question.split())))
    raw_score = 0.7 * cos + 0.2 * overlap + 0.1 * length_score
    score_0_100 = round(float(raw_score * 100), 2)
    return {
        'question': question,
        'transcript': transcript,
        'cosine_similarity': cos,
        'token_overlap': overlap,
        'quality': quality,
        'aggregate_relevance_score': score_0_100
    }
