In [66]:
# import all libraries

import glob
import re
import numpy as np
import pandas as pd
import os

In [108]:
# all the functions needed 

def get_df_from_textgrid(file, threshold): # change threshold as needed
    
    try:
        with open(file, 'r', encoding='utf-8') as f:
            lines = [line.rstrip() for line in f]
    except:
        with open(file, 'r', encoding = 'utf-16-be') as f:
            lines = [line.rstrip() for line in f]
    text_list = []
    xmin_list = []
    xmax_list = []
    interval_list = []
    layer_list = []
    layer = 1
    for line in lines[14:]:  #informations needed begin on the 9th lines
        if "item [" in line:
            # update layer number
            layer = layer + 1
        
        if 'text =' in line:
            line = line.split('"')[1]
            if line != "":
                text_list.append(line)
                layer_list.append(layer)
                #print (len(text_list), len(xmin_list), len(xmax_list), len(interval_list), line, interval_list[-2:], layer)
            else:
                xmin_list = xmin_list[:-1]
                xmax_list = xmax_list[:-1]
                interval_list = interval_list[:-1]
        if 'xmin' in line:
            time_min = line.split('=')[1].strip(' ')
            xmin_list.append(time_min)
        if 'xmax' in line:
            time_max = line.split('=')[1].strip(' ')
            xmax_list.append(time_max)
        if 'intervals [' in line:
            interval = line.split('[')[1].strip(']:')
            interval_list.append(interval)
        if 'intervals: size' in line:
            xmin_list = xmin_list[:-1]
            xmax_list = xmax_list[:-1]
    xmin_list = [float(item) for item in xmin_list]
    xmax_list = [float(item) for item in xmax_list]
    interval_list = [int(item) for item in interval_list]

    df = pd.DataFrame()
    df['text'] = text_list
    df['xmin'] = xmin_list
    df['xmax'] = xmax_list
    df['interval'] = interval_list
    df['layer'] = layer_list
    
    # keep layer 1 and 2
    df_sub = df[df['layer'].isin([1,2])]
    df_sub['overlap'] = df_sub.apply(lambda x: overlap_record(df_sub, x.layer, x.interval), axis = 1)
    df_sub['overlap_max'] = df_sub['overlap'].apply(lambda x: get_max_overlap_percent(x))
    df_sub_keep = df_sub[df_sub['overlap_max'] < threshold]
    
    return df_sub_keep

def overlap_judge(x1_min, x1_max, x2_min, x2_max):
    overlap_length = 0
    overlap_perc_x1 = 0
    overlap_perc_x2 = 0
    if x1_min <= x2_min:
        if x1_max > x2_min:
            overlap = 1
            overlap_length = min(x2_max, x1_max) - x2_min
        else:
            overlap = 0
    elif x1_min < x2_max:
        overlap = 1
        overlap_length = min(x2_max, x1_max) - x1_min
    elif x1_min >= x2_max:
        overlap = 0
    
    # get overlap percentage
    if overlap == 1:
        #print (overlap_length)
        overlap_perc_x1 = round(overlap_length / (x1_max - x1_min), 2)
        overlap_perc_x2 = round(overlap_length / (x2_max - x2_min), 2)
    return overlap, overlap_length, overlap_perc_x1, overlap_perc_x2


def overlap_record(df_sub, curr_layer, curr_interval):
    df = df_sub
    if curr_layer == 1:
        goal_layer = 2
    if curr_layer == 2:
        goal_layer = 1
    x1_min = df[(df['layer'] == curr_layer) & (df['interval'] == curr_interval)]['xmin'].tolist()[0]
    x1_max = df[(df['layer'] == curr_layer) & (df['interval'] == curr_interval)]['xmax'].tolist()[0]
    goal_layer_number = len(df[df['layer'] == goal_layer]) # number of intervals to be compared in total
    
    goal_interval_list = df[df['layer'] == goal_layer]['interval'].tolist()
    goal_xmin_list = df[df['layer'] == goal_layer]['xmin'].tolist()
    goal_xmax_list = df[df['layer'] == goal_layer]['xmax'].tolist()
    
    overlap_info = []
    i = 0
    #print (goal_layer_number)
    while i < goal_layer_number:
        #print (i)
        x2_min = goal_xmin_list[i]
        x2_max = goal_xmax_list[i]
        if overlap_judge(x1_min, x1_max, x2_min, x2_max)[0] == 1:
            overlap_info.append([goal_interval_list[i], overlap_judge(x1_min, x1_max, x2_min, x2_max)])
        i = i + 1
    return overlap_info

def get_max_overlap_percent(overlap_list):
    if overlap_list == []:
        return 0
    else:
        #print ([x[1][2] for x in overlap_list])
        max_overlap_percent = np.max([x[1][2] for x in overlap_list])
        
        return max_overlap_percent
    
def generate_textgrid_output(df_sub_keep, file):
    # generate textgrif from the datafrom

    xmax_all = np.max(df_sub_keep['xmax'])

    tiers = ['text', 'text']
    tiers_name = ['INT01', 'DAM01'] # rename layer
    tiers_sub = [1,2]


    lines = []
    lines_head = ['File type = "ooTextFile"',
    'Object class = "TextGrid"', '']
    xmin_all = 'xmin = ' + str(0)
    xmax_all = 'xmax = ' + str(xmax_all) # change xmax



    tier_info_general = ['tiers? <exists>', 
    'size = 2', 'item []:'] # change tier number

    sec_all = []
    for i in range(2): # change tier number
        df_curr = df_sub_keep[df_sub_keep['layer'] == tiers_sub[i]] # keep layer 1 and 2
        text_curr = df_curr[tiers[i]].tolist()
    
        xmin_list = df_curr['xmin'].tolist()
        xmax_list = df_curr['xmax'].tolist()

    
        #print (tiers[i])
        tier_info = [ 
        '    item [' + str(i + 1) +']:',
        '        class = "IntervalTier"', 
        '        name = "'+ tiers_name[i] +'"'             
        ]
        interval = 'intervals: size = ' + str(len(text_curr))
        sec_all = sec_all + tier_info + [xmin_all,xmax_all] + [interval]
        for j in range(len(text_curr)):
            interval_curr = 'intervals [' + str(j + 1) + ']:'
            xmin = 'xmin = ' + str(xmin_list[j])
            xmax = 'xmax = ' + str(xmax_list[j])
            text = 'text = "' + str(text_curr[j]) + '"'
            sec_all = sec_all + [interval_curr, xmin, xmax, text]

    lines = lines_head + [xmin_all,xmax_all] + tier_info_general + sec_all
    with open(output_dir + file.split('/')[-1].split('.')[0] + '_overlap_removed_15.Textgrid', 'w') as f: # 15 here indicates the overlap percentage, can change accordingly to remind the threshold choosen
        for line in lines:
            f.write(f"{line}\n")
            
def get_processed_result(input_dir, output_dir):
    # get all textgrid files
    
    files = glob.glob(input_dir + "*.TextGrid")
    for file in files:
        print (file) # print file name
        # change threshold as needed
        df_sub_keep = get_df_from_textgrid(str(file), threshold = 0.15)
        generate_textgrid_output(df_sub_keep, file)
        

In [100]:
file = '/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm001-2022-01-16-Trviž_02.TextGrid'
with open(file, 'r', encoding='utf-8') as file:
    lines = [line.rstrip() for line in file]
lines

['File type = "ooTextFile"',
 'Object class = "TextGrid"',
 '',
 'xmin = 0',
 'xmax = 337.482875',
 'tiers? <exists>',
 'size = 4',
 'item []:',
 '    item [1]:',
 '        class = "IntervalTier"',
 '        name = "TEXT@INT01"',
 '        xmin = 0',
 '        xmax = 337.482875',
 '        intervals: size = 32',
 '        intervals [1]:',
 '            xmin = 0',
 '            xmax = 1.9827386500133033',
 '            text = "Dobro. Ča se domišjaš od djetinjstva, da?"',
 '        intervals [2]:',
 '            xmin = 1.9827386500133033',
 '            xmax = 2.6743558054163663',
 '            text = ""',
 '        intervals [3]:',
 '            xmin = 2.6743558054163663',
 '            xmax = 7.306489540514602',
 '            text = "Dobro, od fameje rekla si da vas- @ da si bila (.) z nonići u hiže."',
 '        intervals [4]:',
 '            xmin = 7.306489540514602',
 '            xmax = 7.86632775927532',
 '            text = "((breath))"',
 '        intervals [5]:',
 '            

In [109]:

input_dir = "/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/" # folder name that contains all the textgrid files
output_dir = "/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008_overlap_removed/" # output dir that stores all the new textgrid files

if not os.path.exists(output_dir):
    os.makedirs(output_dir) # make output dir if not exist
threshold = 0.15        
get_processed_result(input_dir, output_dir)    # will print out the files that have been processed


/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm006-2023-05-28-Kostrena_07.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm006-2023-05-28-Kostrena_06.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm001-2022-01-16-Trviž_02.TextGrid


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['overlap'] = df_sub.apply(lambda x: overlap_record(df_sub, x.layer, x.interval), axis = 1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_sub['overlap_max'] = df_sub['overlap'].apply(lambda x: get_max_overlap_percent(x))


/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm001-2022-01-16-Trviž_03.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm005-2022-11-26-Vrbnik_06.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm002-2022-01-04-Crikvenica_01.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm005-2022-11-26-Vrbnik_01.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm002-2022-01-04-Crikvenica_06.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm002-2022-01-04-Crikvenica_07.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm006-2023-05-28-Kostrena_01.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm001-2022-01-16-Trviž_05.TextGrid
/Users/irisz/downloads/2023_ELIC/croatian/croatian_manual_annotated_1008/ckm001-2022-01-16-Trviž_04.TextG