# Pipeline

In [None]:
# install all packages needed
#install = "sudo apt-get install python3-pip python3-tk python3-pil python3-pil.imagetk python3-pandas python3-numpy python3-matplotlib python3-scipy python3-sklearn python3-sklearn-lib python3-textgrid"
#import os
#os.system(install)

In [None]:
# calculate the duration of each cue
# import all libraries
import os
import numpy as np
from scipy.io.wavfile import read
import pandas as pd
import re
import matplotlib.pyplot as plt
import textgrid
import time
import subprocess
import shutil
import sys
from tqdm.notebook import tqdm 
# Root folder that holds one sub-folder per patient. Set the MFA_ROOT_DIRECTORY
# environment variable to point the notebook at another dataset without editing
# this cell; the original location is kept as the default.
root_directory = os.environ.get(
    "MFA_ROOT_DIRECTORY",
    "C:/Users/Nabiya/Box/Academic-Duke/CoganLab/test_4/",
)
# Patient folders are named "D" followed by one or more digits (e.g. D21).
pattern = re.compile(r'^D\d+$')

def calculate_dur(wav):
    """Return the duration, in seconds, of the WAV file at ``wav``.

    The duration is the number of samples along the first axis of the data
    returned by ``scipy.io.wavfile.read`` divided by the reported sample rate.
    """
    sample_rate, samples = read(wav)
    n_samples = np.asarray(samples, dtype=float).shape[0]
    return n_samples / sample_rate


In [None]:
# Transcript spoken in each stimulus, keyed by the cue name used in the event files.
speech_text_dict = {
    'hut': 'hut',
    'heat': 'heat',
    'hoot': 'hoot',
    'hot': 'hot',
    'mice': 'there was once a house that was overrun with mice',
    'dog': 'the dog was very proud of the bell',
    'fame': 'notoriety is often mistaken for fame',
}
speech_text_dict


In [None]:
# Duration in seconds of every stimulus recording, keyed by cue name.
# The WAV paths are relative to the notebook's working directory.
duration_dict = {
    cue_name: calculate_dur(wav_path)
    for cue_name, wav_path in (
        ('heat', 'SentenceRep_Stim/heat.wav'),
        ('hut', 'SentenceRep_Stim/hut.wav'),
        ('hoot', 'SentenceRep_Stim/hoot.wav'),
        ('hot', 'SentenceRep_Stim/hot.wav'),
        ('mice', 'SentenceRep_Stim/HouseMice3Secs.wav'),
        ('dog', 'SentenceRep_Stim/DogBell18Sec.wav'),
        ('fame', 'SentenceRep_Stim/NotorietyFame.wav'),
    )
}
duration_dict

In [None]:
def _unique_event_values(events_path):
    """Return the distinct label suffixes found in a tab-separated events file.

    For every non-blank line the third tab-separated field is split on '_' and
    everything after the first piece is kept as a tuple. Blank lines (such as a
    trailing newline at the end of the file) are skipped instead of raising
    IndexError.
    """
    # The context manager closes the file even if a malformed line raises.
    with open(events_path, 'r') as events_file:
        return set(
            tuple(line.strip().split('\t')[2].split('_')[1:])
            for line in events_file
            if line.strip()
        )


# Write the per-patient summary to unique_values.txt and echo it to the notebook.
with open('unique_values.txt', 'w') as f:
    for item in os.listdir(root_directory):
        patient_path = os.path.join(root_directory, item)

        # Only folders named like a patient ID are inspected.
        if os.path.isdir(patient_path) and pattern.match(item):
            condition_values = _unique_event_values(os.path.join(patient_path, 'condition_events.txt'))
            cue_values = _unique_event_values(os.path.join(patient_path, 'cue_events.txt'))
            f.write(f'Patient {item} has the following unique values for conditions: {condition_values}\n')
            f.write(f'Patient {item} has the following unique values for cues: {cue_values}\n')
            f.write('\n')
            print(f'Patient {item} has the following unique values for conditions: {condition_values}')
            print(f'Patient {item} has the following unique values for cues: {cue_values}')
            print('\n')



### Judging from the unique values in the condition and cue files, you can decide which trials to exclude (for example, trials that are noisy or have no stimulus).

In [None]:
def annotate(patient_path):
    '''
    Build 'annotated_events.txt' for one patient folder.

    Reads 'condition_events.txt' and 'cue_events.txt' from patient_path; both
    are tab-separated with three fields per line (start, end, label), and line
    i of each file is treated as describing the same trial.

    We assume that we get the conditions from the task for free
    and we assume that we get the cue start for free too.
    Cue end is recomputed as cue start + duration_dict[cue].

    For every trial whose condition label ends with 'Listen', one line is
    written: (cue end + 0.25) <tab> (next cue start - 0.25) <tab> cue text.
    For the last trial the "next cue start" is the cue end read from the file
    plus 6. Trials whose cue label has a third '_'-separated part equal to
    'noisy' or 'noStim' are skipped.

    Returns the path of the written file.
    Raises KeyError if a cue name is missing from duration_dict or
    speech_text_dict, and IndexError if the cue file has fewer lines than the
    condition file.
    '''
    cond_path = os.path.join(patient_path, 'condition_events.txt')
    cue_path = os.path.join(patient_path, 'cue_events.txt')
    output_path = os.path.join(patient_path, 'annotated_events.txt')

    # Context managers guarantee both input files are closed again.
    with open(cond_path, 'r') as condition_file:
        condition_lines = condition_file.readlines()
    with open(cue_path, 'r') as cue_file:
        cue_lines = cue_file.readlines()

    with open(output_path, 'w') as file:
        for i in range(len(condition_lines)):
            # cue_s is cue start, cue_e is cue end, cond_s is condition start, cond_e is condition end
            cue_s1, cue_e1, cue = cue_lines[i].strip().split('\t')
            if i == len(condition_lines) - 1:
                # No following cue: use the recorded cue end plus 6 as the upper bound.
                cue_s2 = f"{float(cue_e1) + 6}"
            else:
                cue_s2, _, _ = cue_lines[i + 1].strip().split('\t')

            # Noisy / no-stimulus trials are left out of the annotated events.
            cue_parts = cue.split('_')
            if len(cue_parts) == 3 and cue_parts[2] in ('noisy', 'noStim'):
                continue
            cue = cue_parts[1]

            cue_e1 = float(cue_s1) + float(duration_dict[cue])
            cue_text = speech_text_dict[cue]
            cond_s, cond_e, cond = condition_lines[i].strip().split('\t')
            # we are only writing the text if condition is listen, otherwise skip
            if cond.endswith(':=:'):
                continue
            elif cond.endswith('Listen'):
                # add buffer to cue_e1 and subtract buffer from cue_s2
                cue_e1 = float(cue_e1) + 0.25
                cue_s2 = float(cue_s2) - 0.25
                file.write(f'{cue_e1}\t{cue_s2}\t{cue_text}\n')

    # Report how many lines each input file had, as a quick sanity check.
    len_cue, len_cond = len(cue_lines), len(condition_lines)
    # basename/normpath works for both '/' and '\\' separators and trailing slashes.
    patient_name = os.path.basename(os.path.normpath(patient_path))
    print(f'Patient {patient_name} has {len_cue} cue lines and {len_cond} condition lines')

    return output_path

    
# create function that converts annotations to TextGrid
def convert_to_textgrid(patient_path):
    '''
    Convert 'annotated_events.txt' in patient_path into 'allblocks.TextGrid'
    so that it can be loaded into MFA.

    Each non-blank line of the annotated file must hold three tab-separated
    fields (start, end, label); every line becomes one interval on a single
    tier named 'words'.

    Returns the path of the TextGrid file that was written.
    '''
    from textgrid import TextGrid, IntervalTier

    file_path = os.path.join(patient_path, 'annotated_events.txt')
    export_path = os.path.join(patient_path, 'allblocks.TextGrid')

    # Load the text file, ignoring blank lines
    entries = []
    with open(file_path, 'r') as file:
        for line in file:
            if not line.strip():
                continue
            start, end, label = line.strip().split('\t')
            entries.append((float(start), float(end), label))

    # Create a TextGrid object and an interval tier
    tg = TextGrid()
    tier = IntervalTier(name='words')

    # Add intervals from the text file to the tier
    for start, end, label in entries:
        tier.add(start, end, label)

    # Add the tier to the TextGrid
    tg.append(tier)

    # Save the TextGrid file and return the path that was actually written
    # (previously '.TextGrid' was appended a second time to the returned path).
    tg.write(export_path)
    return export_path


# create function that extracts annotations from TextGrid
def extract_annotations(patient_path):
    '''
    Extract the 'words' and 'phones' tiers from the aligned TextGrid at
    <patient_path>/output_mfa/allblocks.TextGrid.

    One tab-separated file per tier is written into patient_path
    ('mfa_words.txt' and 'mfa_phones.txt'), with one line per interval:
    start time, end time, label.

    Returns a tuple (words_path, phones_path).
    Raises ValueError if either tier is missing from the TextGrid.
    '''
    from textgrid import TextGrid
    # Load the TextGrid file produced by MFA
    file_path = os.path.join(patient_path, 'output_mfa/allblocks.TextGrid')
    tg = TextGrid.fromFile(f'{file_path}')

    # Specify the tier names you want to extract intervals from
    tier_names = ['words', 'phones']  # Adjust this to match the tier names in your TextGrid

    export_paths = []
    for tier_name in tier_names:
        # Find the first tier with the requested name
        selected_tier = next((tier for tier in tg.tiers if tier.name == tier_name), None)
        if selected_tier is None:
            raise ValueError("Tier named '{}' not found.".format(tier_name))

        # Write to the per-tier export file. The TextGrid itself (file_path) must
        # not be opened for writing here, or the alignment would be overwritten.
        export_mfa_path = os.path.join(patient_path, f'mfa_{tier_name.lower()}.txt')
        with open(export_mfa_path, 'w', encoding='utf-8') as file:
            for interval in selected_tier:
                start = interval.minTime
                end = interval.maxTime
                label = interval.mark
                # Write the start time, end time, and label separated by tabs
                file.write(f'{start}\t{end}\t{label}\n')
        export_paths.append(export_mfa_path)

    # Same order as tier_names: words first, then phones.
    words_path, phones_path = export_paths
    return words_path, phones_path




### Dear reader, 

### MFA needs two folders to operate: `input_mfa` and `output_mfa`. `input_mfa` must contain a WAV file named `allblocks.wav` (the name is modifiable) and a TextGrid file with the same base name, `allblocks.TextGrid`.

In [None]:
def prepare_for_mfa(participant_path):
    """Create the MFA working folders for one participant and stage the inputs.

    'input_mfa' and 'output_mfa' are created inside participant_path if they
    do not exist yet. 'allblocks.wav' and 'allblocks.TextGrid' are then copied
    from participant_path into 'input_mfa'; a file that is not present is
    silently skipped.
    """
    # Working folders, created in the same order as before (input, then output)
    for folder_name in ('input_mfa', 'output_mfa'):
        os.makedirs(os.path.join(participant_path, folder_name), exist_ok=True)

    destination = os.path.join(participant_path, 'input_mfa')

    # Stage the audio first, then the TextGrid, when each one exists
    for file_name in ('allblocks.wav', 'allblocks.TextGrid'):
        candidate = os.path.join(participant_path, file_name)
        if os.path.isfile(candidate):
            shutil.copy(candidate, destination)




def run_mfa(patient_path):

    """
    Run Montreal Forced Aligner on one participant's data.

    Aligns <patient_path>/input_mfa with the 'english_us_mfa' dictionary and
    the 'english_mfa' acoustic model and writes the result into
    <patient_path>/output_mfa. The command is executed inside the 'aligner'
    conda environment.

    Failures (non-zero exit status, or conda not being found) are printed
    rather than raised, so the function always returns None.
    """
    # Paths to the input_mfa and output_mfa directories
    input_mfa_path = os.path.join(patient_path, 'input_mfa')
    output_mfa_path = os.path.join(patient_path, 'output_mfa')

    # `conda activate` only affects the shell it runs in, so activating in one
    # subprocess and calling mfa in another never used the environment.
    # `conda run -n aligner ...` executes the command inside the environment
    # in a single process instead.
    conda_executable = shutil.which('conda') or 'conda'

    # Passing the arguments as a list (no shell) keeps paths that contain spaces intact.
    mfa_command = [
        conda_executable, 'run', '--no-capture-output', '-n', 'aligner',
        'mfa', 'align', '--clean',
        input_mfa_path, 'english_us_mfa', 'english_mfa', output_mfa_path,
    ]

    try:
        subprocess.run(mfa_command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the conda executable cannot be started.
        print(f"An error occurred while running MFA: {e}")
        


In [None]:


# Count the folders under root_directory whose name looks like a patient ID.
pattern = re.compile(r'^D\d+$')
counter = 0
for item in os.listdir(root_directory):
    patient_path = os.path.join(root_directory, item)
    # Anything that is not a directory, or is not named D<digits>, is ignored
    if not os.path.isdir(patient_path) or not pattern.match(item):
        continue
    counter = counter + 1
print(f'The number of patients in the test folder is {counter}')

## Run the code below to run MFA either on every patient folder in the directory or on only a few of them. Answer `yes` at the prompt to choose specific patients; any other answer runs the pipeline on the whole directory.

In [None]:
# Example patient list. Each name is stripped so that the spaces after the
# commas do not end up inside the folder names (' D33' is not a valid folder).
patients = 'D21, D33, D55, D23, D22'
patients = [name.strip() for name in patients.split(',') if name.strip()]
print(f'The patients in the test folder are {patients}')

In [None]:
def _run_pipeline_for_patient(patient_path):
    """Run every pipeline stage, in order, on one patient folder."""
    annotate(patient_path)
    convert_to_textgrid(patient_path)
    prepare_for_mfa(patient_path)
    run_mfa(patient_path)
    extract_annotations(patient_path)


def _report_elapsed(dur, scope):
    """Print the elapsed time `dur` (seconds) as seconds, minutes or hours."""
    if dur < 60:
        amount, unit = dur, 'seconds'
    elif dur < 3600:
        amount, unit = dur / 60, 'minutes'
    else:
        amount, unit = dur / 3600, 'hours'
    print(f'The time taken to run the entire pipeline is for {scope} is {amount} {unit}')


wanna_run_this = input('Do you want to run the pipeline on specific patients? (yes/no): ')
# Accept 'Yes', ' yes ' etc.; any other answer runs every patient folder.
if wanna_run_this.strip().lower() == 'yes':
    patients = input('Enter the patients you want to run the pipeline on separated by a comma and NO SPACES: ')
    # Strip stray whitespace and drop empty entries so 'D21, D33' still resolves to real folders.
    patients = [name.strip() for name in patients.split(',') if name.strip()]
    start = time.time()
    for patient in tqdm(patients, desc='Running MFA', ascii=False, ncols=1000, bar_format='{l_bar}{bar}{r_bar}'):
        patient_path = os.path.join(root_directory, patient)
        print(f'Running MFA for patient {patient}')
        _run_pipeline_for_patient(patient_path)
    end = time.time()
    dur = end - start
    _report_elapsed(dur, 'the selected patients')

else:
    # run the pipeline on all patients
    start = time.time()
    for item in tqdm(os.listdir(root_directory), desc= 'Running MFA', ascii=False, ncols=1000, bar_format='{l_bar}{bar}{r_bar}'):
        patient_path = os.path.join(root_directory, item)

        if os.path.isdir(patient_path) and pattern.match(item):
            print(f'Running MFA for patient {item}')
            _run_pipeline_for_patient(patient_path)
            # Only report completion for entries that were actually processed.
            print(f'MFA for patient {item} has been completed')
    end = time.time()
    dur = end - start
    _report_elapsed(dur, 'all patients')

In [None]:
# Scratch arithmetic: divides 3133.59... by 60 - presumably converting a previously
# measured run time from seconds to minutes (TODO confirm; nothing else uses the result).
3133.5939497947693/60

# Congrats!!!

### Now you have two files, mfa_words and mfa_phones that have the time stamps for start and end of single utterances (words/phonemes). 

### Please load these to Audacity along with your audio file to see the sound-word or sound-phoneme alignment.