In [2]:
import os
from pocketsphinx import AudioFile, get_model_path, Decoder
from pydub import AudioSegment
from collections import defaultdict



In [3]:
def extract_word_audio(input_wav, output_wav, target_word="which", extra_time=0):
    # Set up paths for the models
    model_path = get_model_path()
    
    # Create a list to hold the start and end times of the target word
    word_times = []

    # Create a decoder with the appropriate configuration
    config = {
        'verbose': False,
        'hmm': os.path.join(model_path, 'en-us'),  # Path to the acoustic model
        'lm': os.path.join(model_path, 'en-us.lm.bin'),  # Path to the language model
        'dict': os.path.join(model_path, 'cmudict-en-us.dict')  # Path to the dictionary
    }
    
    # Initialize the decoder
    decoder = Decoder(config)

    # Start decoding the audio file
    decoder.start_utt()
    
    # Read the audio file in binary mode and process it
    with open(input_wav, 'rb') as audio_file:
        while True:
            buf = audio_file.read(1024)
            if not buf:
                break
            decoder.process_raw(buf, False, False)

    decoder.end_utt()

    # Check for the target word in the recognized segments
    recognized_words = []
    for seg in decoder.seg():
        word = seg.word.split('(')[0]  # Get the word before the hypothesis index
        recognized_words.append(word)  # Collect all recognized words
        # Check if the segment matches the target word
        if word.lower() == target_word.lower():
            # Append start and end times (in milliseconds)
            start_time = int(seg.start_frame * (1000 / 100))  # Assuming 100 frames per second
            end_time = int(seg.end_frame * (1000 / 100))      # Adjust frame rate if necessary
            # Adjust start and end times for extra recording
            adjusted_start = max(0, start_time - extra_time)  # One second before
            adjusted_end = end_time + extra_time                # One second after
            word_times.append((adjusted_start, adjusted_end))

    # Print recognized words
    print("Recognized words:", ' '.join(recognized_words))

    # Check if the word was found
    if not word_times:
        print(f"The word '{target_word}' was not found in the audio.")
        return

    # Load the original audio file
    original_audio = AudioSegment.from_wav(input_wav)

    # Extract segments for each occurrence of the target word
    for i, (start, end) in enumerate(word_times):
        # Ensure the start and end times are within the audio length
        start = max(0, start)
        end = min(len(original_audio), end)
        
        # Extract the audio segment
        word_audio = original_audio[start:end]
        
        # Save the extracted segment using the original sample rate
        name_without_extension = os.path.splitext(output_wav)[0]
        output_file = f"{name_without_extension}_{i + 1}.wav"
        word_audio.export(output_file, format="wav")
        print(f"Extracted '{target_word}' to '{output_file}' from {start}ms to {end}ms.")

In [4]:
def count_word_occurrences(file_paths):
    word_count = defaultdict(lambda: [0, set()])  # Default dictionary to hold (count, set of identifiers)
    
    # Read each file and count words
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                identifier, text = line.split(' ', 1)
                words = text.split()
                
                for word in words:
                    # Normalize words to lowercase for consistent counting
                    normalized_word = word.lower()
                    word_count[normalized_word][0] += 1  # Increment word count
                    word_count[normalized_word][1].add(identifier)  # Add identifier

    # Create a sorted list from the dictionary
    sorted_word_counts = [
        (count_info[0], word, list(count_info[1])) 
        for word, count_info in word_count.items()
    ]
    sorted_word_counts.sort(key=lambda x: x[0], reverse=True)  # Sort by count (descending)

    return sorted_word_counts

def count_speaker_occurrences(identifiers):
    speaker_count = defaultdict(int)  # Default dictionary to hold counts of each speaker

    # Count occurrences of each speaker
    for identifier in identifiers:
        speaker_number = identifier.split('-')[0]  # Extract speaker number
        speaker_count[speaker_number] += 1  # Increment the count for the speaker

    # Convert to a regular dictionary for better readability
    return dict(speaker_count)

def generate_path(filename):
    path_prefix = "C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/cleaned"
    reader, chapter, trial = filename.split('-')
    return f"{path_prefix}/{reader}/{chapter}/{reader}-{chapter}-{trial}.wav"


def word_extraction_wrapper(files_to_search, word, time_padding):
    for file in files_to_search:
        _, _, _, _, _, _, _, speaker, chapter, file_name = file.split('/')
        save_to = f"C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/{word}/{speaker}/{file_name}"
        extract_word_audio(file, save_to, word, time_padding)

In [7]:
# Example usage:
# file_paths = [r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\198\19-198.trans.txt', 
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\227\19-227.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\495\26-495.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\496\26-496.trans.txt']  # List of your txt files

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\27\123349\27-123349.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\27\124992\27-124992.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121914\39-121914.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121915\39-121915.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\39\121916\39-121916.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\40\222\40-222.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\40\121026\40-121026.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt"]

file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt"]

spoken_words = count_word_occurrences(file_paths)

for word in spoken_words:
    print(word)


(351, 'the', ['311-124404-0092', '311-124404-0004', '311-124404-0114', '311-124404-0010', '311-124404-0060', '311-124404-0033', '311-124404-0116', '311-124404-0047', '311-124404-0088', '311-124404-0034', '311-124404-0068', '311-124404-0077', '311-124404-0061', '311-124404-0081', '311-124404-0058', '311-124404-0095', '311-124404-0030', '311-124404-0109', '311-124404-0017', '311-124404-0055', '311-124404-0115', '311-124404-0012', '311-124404-0102', '311-124404-0062', '311-124404-0005', '311-124404-0039', '311-124404-0118', '311-124404-0040', '311-124404-0094', '311-124404-0059', '311-124404-0089', '311-124404-0110', '311-124404-0112', '311-124404-0066', '311-124404-0042', '311-124404-0046', '311-124404-0074', '311-124404-0048', '311-124404-0073', '311-124404-0013', '311-124404-0099', '311-124404-0072', '311-124404-0024', '311-124404-0093', '311-124404-0009', '311-124404-0119', '311-124404-0076', '311-124404-0008', '311-124404-0025', '311-124404-0014', '311-124404-0090', '311-124404-0086'

In [9]:
for word in spoken_words:
    if word[1] == "that":
        filenames = word[2]

files_to_search = [generate_path(filename) for filename in filenames]

word_extraction_wrapper(files_to_search, "that", 100)


Recognized words: <s> in the most direct lights and that <sil> according to the rules of mechanics <sil> which are the same with those of nature [NOISE] <sil> what many objects to end at once <sil> same point <sil> </s>
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that/311/311-124404-0092_1.wav' from 1790ms to 2220ms.
Recognized words: <s> that <sil> <sil> though there are many animals which manifest more <sil> industry that week in certain of their actions <sil> same animals for yet observed sure not all <sil> in many others <sil> <sil> so that the circumstance that they do better and </s>
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that/311/311-124404-0114_1.wav' from 40ms to 640ms.
Extracted 'that' to 'C:/Computer Science Programs/Fall_2024/EE502_BioMed/project/data/extracted_words/that/311/311-124404-0114_2.wav' from 3920ms to 4360ms.
Extracted 'that' to 'C:/Computer S