In [8]:
import os
from pocketsphinx import AudioFile, get_model_path, Decoder
from pydub import AudioSegment
from collections import defaultdict

In [None]:
def extract_word_audio(input_wav, output_wav, target_word="that", extra_time=0):
    # Set up paths for the models
    model_path = get_model_path()
    
    # Create a list to hold the start and end times of the target word
    word_times = []

    # Create a decoder with the appropriate configuration
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),  # Path to the acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # Path to the language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict'),  # Path to the dictionary
        'frate': 100,
        'samprate': 16000
    }
    
    # Initialize the decoder
    decoder = Decoder(config)

    # Start decoding the audio file
    decoder.start_utt()
    
    # Read the audio file in binary mode and process it
    with open(input_wav, 'rb') as audio_file:
        while True:
            buf = audio_file.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)

    decoder.end_utt()

    # Check for the target word in the recognized segments
    recognized_words = []
    for seg in decoder.seg():
        word = seg.word.split('(')[0]  # Get the word before the hypothesis index
        recognized_words.append(word)  # Collect all recognized words
        # Check if the segment matches the target word
        if word.lower() == target_word.lower():
            # Append start and end times (in milliseconds)
            start_time = int(seg.start_frame * (1000 / 100))  # Assuming 100 frames per second
            end_time = int(seg.end_frame * (1000 / 100))      # Adjust frame rate if necessary
            # Adjust start and end times for extra recording
            adjusted_start = max(0, start_time - extra_time)  # One second before
            adjusted_end = end_time + extra_time               # One second after
            word_times.append((adjusted_start, adjusted_end))

    # Print recognized words
    print("Recognized words:", ' '.join(recognized_words))

    # Check if the word was found
    if not word_times:
        print(f"The word '{target_word}' was not found in the audio {input_wav}.")
        return

    # Load the original audio file
    original_audio = AudioSegment.from_wav(input_wav)

    # Extract segments for each occurrence of the target word
    for i, (start, end) in enumerate(word_times):
        # Ensure the start and end times are within the audio length
        start = max(0, start)
        end = min(len(original_audio), end)
        
        # Extract the audio segment
        word_audio = original_audio[start:end]
        
        # Save the extracted segment using the original sample rate
        name_without_extension = os.path.splitext(output_wav)[0]
        output_file = f"{name_without_extension}_{i + 1}.wav"
        word_audio.export(output_file, format="wav")
        print(f"Extracted '{target_word}' to '{output_file}' from {start}ms to {end}ms.")

In [3]:
def count_word_occurrences(file_paths):
    word_count = defaultdict(lambda: [0, set()])  # Default dictionary to hold (count, set of identifiers)
    
    # Read each file and count words
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                identifier, text = line.split(' ', 1)
                words = text.split()
                
                for word in words:
                    # Normalize words to lowercase for consistent counting
                    normalized_word = word.lower()
                    word_count[normalized_word][0] += 1  # Increment word count
                    word_count[normalized_word][1].add(identifier)  # Add identifier

    # Create a sorted list from the dictionary
    sorted_word_counts = [
        (count_info[0], word, list(count_info[1])) 
        for word, count_info in word_count.items()
    ]
    sorted_word_counts.sort(key=lambda x: x[0], reverse=True)  # Sort by count (descending)

    return sorted_word_counts

def count_speaker_occurrences(identifiers):
    speaker_count = defaultdict(int)  # Default dictionary to hold counts of each speaker

    # Count occurrences of each speaker
    for identifier in identifiers:
        speaker_number = identifier.split('-')[0]  # Extract speaker number
        speaker_count[speaker_number] += 1  # Increment the count for the speaker

    # Convert to a regular dictionary for better readability
    return dict(speaker_count)

def generate_path(filename, path_prefix):
    reader, chapter, trial = filename.split('-')
    return f"{path_prefix}/{reader}/{chapter}/{reader}-{chapter}-{trial}.wav"


def word_extraction_wrapper(files_to_search, save_to_path, word, time_padding):
    for file in files_to_search:
        _, _, _, _, _, _, _, speaker, chapter, file_name = file.split('/')

        save_to = f"{save_to_path}/{word}_no_extra_time/{speaker}/{file_name}"
        extract_word_audio(file, save_to, word, time_padding)

In [12]:
# Example usage:
# file_paths = [r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\198\19-198.trans.txt', 
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\227\19-227.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\495\26-495.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\496\26-496.trans.txt']  # List of your txt files

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\27\123349\27-123349.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\27\124992\27-124992.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121914\39-121914.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121915\39-121915.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121916\39-121916.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\40\222\40-222.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\40\121026\40-121026.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt",
# ]#              r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]
file_paths = [r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\198\19-198.trans.txt']
spoken_words = count_word_occurrences(file_paths)

for word in spoken_words:
    print(word)


(49, 'her', ['19-198-0035', '19-198-0028', '19-198-0031', '19-198-0034', '19-198-0032', '19-198-0015', '19-198-0012', '19-198-0013', '19-198-0037', '19-198-0014', '19-198-0029', '19-198-0020', '19-198-0016', '19-198-0022', '19-198-0019', '19-198-0009', '19-198-0005', '19-198-0021', '19-198-0018', '19-198-0004', '19-198-0007', '19-198-0030', '19-198-0011', '19-198-0006'])
(48, 'and', ['19-198-0035', '19-198-0028', '19-198-0034', '19-198-0032', '19-198-0015', '19-198-0003', '19-198-0012', '19-198-0027', '19-198-0013', '19-198-0014', '19-198-0029', '19-198-0020', '19-198-0016', '19-198-0022', '19-198-0019', '19-198-0009', '19-198-0036', '19-198-0005', '19-198-0001', '19-198-0025', '19-198-0017', '19-198-0021', '19-198-0018', '19-198-0023', '19-198-0007', '19-198-0026', '19-198-0011', '19-198-0006'])
(44, 'the', ['19-198-0002', '19-198-0035', '19-198-0031', '19-198-0034', '19-198-0003', '19-198-0027', '19-198-0033', '19-198-0013', '19-198-0014', '19-198-0020', '19-198-0010', '19-198-0019',

In [None]:
for word in spoken_words:
    if word[1] == "that":
        filenames = word[2]


path_prefix = "C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/cleaned"
# path_prefix = "C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/as_wav"

files_to_search = [generate_path(filename, path_prefix) for filename in filenames]

save_to_path = f"C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words"

# please note word_extraction_wrapper will save to path/{word}_uncleaned/path/file 
word_extraction_wrapper(files_to_search, save_to_path, "that", 50)


Recognized words: <s> and from shakespeare she gave a great store information <sil> among the rest that <sil> truffles light as air <sil> are <sil> to the jealous confirmation strong as proofs of holy writ </s>
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that_no_extra_time/19/19-198-0026_1.wav' from 5150ms to 5540ms.
Recognized words: <s> that [NOISE] <sil> that for a beetle which we tread upon in a court for all suffer and steals a pang as great as like a giant guys <sil> and that a young woman in love always looks </s>
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that_no_extra_time/19/19-198-0027_1.wav' from 490ms to 960ms.
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that_no_extra_time/19/19-198-0027_2.wav' from 1960ms to 2310ms.
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data

In [9]:
input = r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\as_wav\87\121553\87-121553-0008.wav'
extract_word_audio(input, "./", "that", 10)

Recognized words: <s> when it was created was his mind to plead with such a living get a jade <sil> had in his mother heard a prophetic <sil> as soon as he espouses were completed between him and the face of holy followed <sil> with a with mutual safety dowry to other </s>
The word 'that' was not found in the audio C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\as_wav\87\121553\87-121553-0008.wav.
