In [6]:
import torch
import librosa
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

model_id = "facebook/wav2vec2-lv-60-espeak-cv-ft"
processor = Wav2Vec2Processor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

Loading weights:   0%|          | 0/424 [00:00<?, ?it/s]

Phonemized Output: ɑ


In [8]:
def phonemize_audio(audio_path):
    speech, sr = librosa.load(audio_path, sr=16000) # resamples to 16,000Hz
    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values

    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    phonemes = processor.batch_decode(predicted_ids)
    return phonemes

In [13]:
import os
import sounddevice as sd
from scipy.io.wavfile import write

def record_audio(filename, duration=3, sample_rate=16000):
    directory = os.path.dirname(filename)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
        print(f"Created directory: {directory}")

    print(f"--- Recording started for {duration} seconds ---")
    
    recording = sd.rec(int(duration * sample_rate), samplerate=sample_rate, channels=1)
    
    sd.wait()
    
    print("--- Recording finished ---")
    
    write(filename, sample_rate, recording)
    return filename

# record_audio("scope_phoneme_data/test/test_record.wav")

Created directory: scope_phoneme_data/test
--- Recording started for 3 seconds ---
--- Recording finished ---


'scope_phoneme_data/test/test_record.wav'

## Record & Run Recognizer Realtime

In [14]:
file_name = "scope_phoneme_data/test/test_record.wav"
record_audio(file_name)
print(phonemize_audio(file_name))

--- Recording started for 3 seconds ---
--- Recording finished ---
['aː s s d']


## Run Recognizer for all SCOPE Phoneme Data

In [9]:
import os
import pandas as pd

data_path = "data/scope_phoneme_data"
data_records = []

for folder_name in os.listdir(data_path):
    folder_path = os.path.join(data_path, folder_name)
    
    if os.path.isdir(folder_path):
        for file_name in os.listdir(folder_path):
            if file_name.endswith(".wav"):
                file_path = os.path.join(folder_path, file_name)
                phonemes = phonemize_audio(file_path)
                
                data_records.append({
                    "Folder/Label": folder_name,
                    "File Name": file_name,
                    "Predicted Phonemes": phonemes
                })

df = pd.DataFrame(data_records)
print(df.head())

df.to_csv("phoneme_results.csv", index=False)

  Folder/Label     File Name Predicted Phonemes
0       O long  O long 4.wav                [ɑ]
1       O long  O long 1.wav                [ɑ]
2       O long  O long 3.wav                [ɑ]
3       O long  O long 2.wav                [ɑ]
4            J       J 3.wav             [tʃ ɑ]
