In [4]:
import os
from collections import defaultdict

In [2]:
def count_word_occurrences(file_paths):
    word_count = defaultdict(lambda: [0, set()])  # Default dictionary to hold (count, set of identifiers)
    
    # Read each file and count words
    for file_path in file_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                identifier, text = line.split(' ', 1)
                words = text.split()
                
                for word in words:
                    # Normalize words to lowercase for consistent counting
                    normalized_word = word.lower()
                    word_count[normalized_word][0] += 1  # Increment word count
                    word_count[normalized_word][1].add(identifier)  # Add identifier

    # Create a sorted list from the dictionary
    sorted_word_counts = [
        (count_info[0], word, list(count_info[1])) 
        for word, count_info in word_count.items()
    ]
    sorted_word_counts.sort(key=lambda x: x[0], reverse=True)  # Sort by count (descending)

    return sorted_word_counts

def count_speaker_occurrences(identifiers):
    speaker_count = defaultdict(int)  # Default dictionary to hold counts of each speaker

    # Count occurrences of each speaker
    for identifier in identifiers:
        speaker_number = identifier.split('-')[0]  # Extract speaker number
        speaker_count[speaker_number] += 1  # Increment the count for the speaker

    # Convert to a regular dictionary for better readability
    return dict(speaker_count)

In [5]:
# file_paths = [r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\198\19-198.trans.txt', 
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\19\227\19-227.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\495\26-495.trans.txt',
#               r'C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\26\496\26-496.trans.txt']

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\87\121553\87-121553.trans.txt"]

# file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\122255\201-122255.trans.txt",
#               r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\201\127786\201-127786.trans.txt"]

file_paths = [r"C:\Computer Science Programs\Fall_2024\EE502_BioMed\project\data\LibriSpeech\train-clean-100\311\124404\311-124404.trans.txt"]

result = count_word_occurrences(file_paths)

In [6]:
result = count_word_occurrences(file_paths)
for word in result:
    print(word)

(351, 'the', ['311-124404-0023', '311-124404-0097', '311-124404-0081', '311-124404-0019', '311-124404-0020', '311-124404-0030', '311-124404-0099', '311-124404-0110', '311-124404-0112', '311-124404-0008', '311-124404-0055', '311-124404-0115', '311-124404-0009', '311-124404-0074', '311-124404-0100', '311-124404-0021', '311-124404-0000', '311-124404-0077', '311-124404-0109', '311-124404-0070', '311-124404-0118', '311-124404-0060', '311-124404-0033', '311-124404-0117', '311-124404-0005', '311-124404-0083', '311-124404-0014', '311-124404-0102', '311-124404-0075', '311-124404-0106', '311-124404-0089', '311-124404-0064', '311-124404-0059', '311-124404-0029', '311-124404-0058', '311-124404-0069', '311-124404-0098', '311-124404-0068', '311-124404-0040', '311-124404-0036', '311-124404-0046', '311-124404-0088', '311-124404-0111', '311-124404-0034', '311-124404-0079', '311-124404-0071', '311-124404-0120', '311-124404-0002', '311-124404-0080', '311-124404-0006', '311-124404-0086', '311-124404-0001'

In [7]:
for word in result:
    if word[1] == 'which':
        print(word[1])
        print(word[2])
        result_speaker = count_speaker_occurrences(word[2])
        print(result_speaker)

which
['87-121553-0025', '87-121553-0092', '87-121553-0014', '87-121553-0007', '87-121553-0101', '87-121553-0067', '87-121553-0089', '87-121553-0023', '87-121553-0004', '87-121553-0057', '87-121553-0049', '87-121553-0070', '87-121553-0077', '87-121553-0061', '87-121553-0106', '87-121553-0031', '87-121553-0028', '87-121553-0075', '87-121553-0036', '87-121553-0071', '87-121553-0006', '87-121553-0029', '87-121553-0048', '87-121553-0012', '87-121553-0053', '87-121553-0058', '87-121553-0026', '87-121553-0035', '87-121553-0017', '87-121553-0060', '87-121553-0000', '87-121553-0084', '87-121553-0016', '87-121553-0104']
{'87': 34}
