# Generate features (acoustic, phonetic, and lexical) for stimuli
## 1. Phonemic feature matrix
### 1.1. Get MFA dictionary

In [2]:
def read_syllable_dict(file_path):
    """
    Parses a tab-separated pronunciation dictionary into {word: [phonemes]}.

    Only two row layouts are accepted:
      * 2 columns: word, phonemes
      * 6 columns: word, four columns that are ignored, phonemes
    Rows with any other number of columns (including blank lines) are skipped
    silently. In both accepted layouts the phonemes are the last column, given
    as a space-separated string.

    Single quotes are stripped from the word before it is used as a key. When
    the same key occurs on several rows, the last row wins.

    Args:
        file_path (str): Path to the dictionary file (read as UTF-8)

    Returns:
        dict: {word: [phonemes]}
    """
    pronunciations = {}

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            fields = stripped.split('\t')

            # Accept only the 2-column and 6-column layouts
            if len(fields) not in (2, 6):
                continue

            # The phoneme string is always the final column of an accepted row
            phones = fields[-1].split()
            if not phones:
                print(f"No phonemes found in line: {stripped}")
                continue

            # Key is the first column with single quotes removed
            pronunciations[fields[0].replace("'", "")] = phones

    return pronunciations

In [3]:
# Pronunciation dictionary file, resolved relative to the working directory
file_path = "english_us_lr.dict"
# Word -> phoneme-list lookup.
# NOTE: the name `dict` shadows the built-in type from here on; it is kept
# because a later cell looks up stimuli through this name.
dict = read_syllable_dict(file_path)
# Sanity check: show the first ten entries
for word, syllables in list(dict.items())[:10]:
    print(word, syllables, sep=": ")

d: ['D']
ll: ['L']
re: ['R', 'EY1']
s: ['EH1', 'S']
ve: ['V', 'IY1', 'IY1']
[bracketed]: ['spn']
<cutoff>: ['spn']
a: ['EY1']
as: ['EH1', 'Z']
abody: ['EY1', 'B', 'AA2', 'D', 'IY0']


### 1.2. Get the stimuli phonemic dictionary for the Lexical Delay Experiment

In [4]:
import os

def get_wav_files(directory):
    """
    Collects the base names of the .wav files located directly in a directory.

    The extension check is case-insensitive (".wav", ".WAV", ...) and the
    directory is not searched recursively.

    Args:
        directory (str): Path of the folder to scan

    Returns:
        dict: {file name without its ".wav" extension: None}. The None values
        are placeholders for the caller to fill in. If `directory` does not
        exist or is not a directory, a message is printed and an empty dict is
        returned, so the return type is the same in every case.
    """
    # Guard with isdir (not just exists) so that a path pointing at a regular
    # file is reported instead of crashing inside os.listdir
    if not os.path.isdir(directory):
        print(f"Directory '{directory}' does not exist.")
        return {}

    # Strip the 4-character ".wav" suffix to obtain the stimulus name
    return {
        filename[:-4]: None
        for filename in os.listdir(directory)
        if filename.lower().endswith('.wav')
    }

In [5]:
# Folder that holds the stimulus recordings (one .wav file per stimulus)
directory = r"C:\Users\bl314\Box\CoganLab\task_stimuli\LexicalDecRepDelay"
syllables = get_wav_files(directory)
# Fill in each stimulus with its phoneme list from the pronunciation
# dictionary; a stimulus with no dictionary entry is stored as None
for syllable in syllables.keys():
    syllables[syllable] = dict.get(syllable)
    print(syllable, syllables[syllable], sep=": ")

bacon: ['B', 'EY1', 'K', 'AH0', 'N']
bagel: ['B', 'EY1', 'G', 'AH0', 'L']
banel: ['B', 'AE1', 'N', 'AH0', 'L']
banic: ['B', 'AE1', 'N', 'IH0', 'K']
baron: ['B', 'AE1', 'R', 'AH0', 'N']
basin: ['B', 'EY1', 'S', 'AH0', 'N']
belet: ['B', 'EH1', 'L', 'EH0', 'T']
berin: ['B', 'AE1', 'R', 'IH0', 'N']
beris: ['B', 'AE1', 'R', 'IH0', 'S']
bison: ['B', 'AY1', 'S', 'AH0', 'N']
bonus: ['B', 'OW1', 'N', 'AH0', 'S']
boreb: ['B', 'AO1', 'R', 'AH0', 'B']
boril: ['B', 'AO1', 'R', 'AH0', 'L']
cabin: ['K', 'AE1', 'B', 'AH0', 'N']
camel: ['K', 'AE1', 'M', 'AH0', 'L']
caris: ['K', 'AE1', 'R', 'AH0', 'S']
casef: ['K', 'AA1', 'S', 'IH0', 'F']
cazel: ['K', 'AA1', 'Z', 'AH0', 'L']
civic: ['S', 'IH1', 'V', 'IH0', 'K']
civil: ['S', 'IH1', 'V', 'AH0', 'L']
cobin: ['K', 'AA1', 'B', 'IH0', 'N']
colon: ['K', 'OW1', 'L', 'AH0', 'N']
comet: ['K', 'AA1', 'M', 'AH0', 'T']
comic: ['K', 'AA1', 'M', 'IH0', 'K']
coral: ['K', 'AO1', 'R', 'AH0', 'L']
davel: ['D', 'EY1', 'V', 'AH0', 'L']
delin: ['D', 'EH1', 'L', 'IH0', 'N']
d

### 1.3. One-hot encoding for each unique phoneme

In [6]:
# Phoneme inventory: every distinct phoneme used by any stimulus, sorted so
# that a phoneme always lands at the same vector position.
# NOTE: a stimulus whose value is None (no dictionary entry) makes this step
# raise TypeError.
all_phonemes = sorted(
    {phoneme for phonemes in syllables.values() for phoneme in phonemes}
)
phoneme_to_index = {phoneme: idx for idx, phoneme in enumerate(all_phonemes)}
vector_length = len(all_phonemes)

# One vector per word: 1 at the position of every phoneme that occurs in the
# word, 0 elsewhere (repetitions and phoneme order are not encoded)
phoneme_one_hot_dict = {}
for word, phonemes in syllables.items():
    present = set(phonemes)
    phoneme_one_hot_dict[word] = [
        1 if phoneme in present else 0 for phoneme in all_phonemes
    ]

print(phoneme_one_hot_dict)

# Persist the word -> vector mapping for later use
import pickle
phoneme_one_hot_dict_path = "phoneme_one_hot_dict.pickle"
with open(phoneme_one_hot_dict_path, 'wb') as handle:
    pickle.dump(phoneme_one_hot_dict, handle)

{'bacon': [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'bagel': [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'banel': [0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'banic': [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'baron': [0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'basin': [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0], 'belet': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], 'berin': [0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'beris': [0, 0, 0, 1, 0

In [7]:
# # Read the saved one hot dict. Just for demonstration
# import pandas as pd
# d= pd.read_pickle("phoneme_one_hot_dict.pickle")

## 2. Acoustic feature (Power envelope at five frequency bins)
### 2.1. Read normalized binned envelope
(The matrix was generated by `get_stims_envelope.m`)

In [9]:
# Stimulus name -> list of envelope values, read from a tab-separated text
# file: the first column is the key, every remaining column is parsed as float
envelope_feature_dict = {}

with open("envelope_power_bins.txt", "r") as file:
    for line in file:
        stripped = line.strip()
        if not stripped:
            # A blank line would otherwise be stored as a spurious '' key
            # with an empty value list
            continue
        parts = stripped.split("\t")
        key = parts[0]
        values = [float(value) for value in parts[1:]]
        envelope_feature_dict[key] = values

# Print a sample of the dictionary
for key, values in list(envelope_feature_dict.items())[:5]:  # Show first 5 entries
    print(key, ":", values)

# Save dictionary
import pickle
envelope_feature_dict_path = "envelope_feature_dict.pickle"
with open(envelope_feature_dict_path, 'wb') as handle:
    pickle.dump(envelope_feature_dict, handle)

bacon : [0.74574, 0.637592, 0.459571, 0.799834, 0.249223]
bagel : [0.790974, 0.632659, 0.508791, 0.898081, 0.243668]
banel : [0.584685, 0.661659, 0.721426, 0.813079, 0.46464]
banic : [0.595901, 0.744453, 0.693261, 0.731105, 0.465257]
baron : [0.744477, 0.96389, 0.91162, 0.67948, 0.462083]
