Skip to content

Commit

Permalink
Merge pull request #1 from themadafrican/fixWinCfg
Browse files Browse the repository at this point in the history
Windows_error
  • Loading branch information
lingdoc committed Mar 18, 2016
2 parents 1da8457 + ce5e744 commit 95b067b
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 36 deletions.
20 changes: 14 additions & 6 deletions trs2txt.cfg
@@ -1,7 +1,15 @@
This document configures the 'trs2txt' converter. Create a backup of this file for future use. Only change the markers.
This document configures the 'trs2txt' converter.
Create a backup of this file for future use.
Only change the markers.

reference number: \ref
timecode beginning: \ELANBegin
timecode ending: \ELANEnd
text for language subtitles: \t
text for translation: \f


reference number: \ref

timecode beginning: \ELANBegin

timecode ending: \ELANEnd

text for language subtitles: \tx

text for translation: \ft
82 changes: 52 additions & 30 deletions trs2txt.py
Expand Up @@ -6,8 +6,9 @@
import glob
import datetime
import re
import io

config = open('trs2txt.cfg', 'r') # open the config file located in the same directory
config = io.open('trs2txt.cfg', 'r') # open the config file located in the same directory

ref = '' # create a blank string to store the 'ref' tag from the config file for processing
tbeg = '' # create a blank string to store the 'tbeg' tag from the config file for processing
Expand Down Expand Up @@ -41,7 +42,7 @@
ver += line[29:-1]+' ' # append it to the 'ver' string
if line[0:22] == 'text for translation: ': # read the tag that identifies the translation subtitles
trans += line[22:-1]+' ' # append it to the 'trans' string

filenames = [] # create an empty list called 'filenames' to keep track of the .trs files in the directory

for index, file in enumerate(glob.glob("*.trs")): # use glob to create an enumerated list of the .trs files in the directory
Expand All @@ -51,9 +52,9 @@
trsfile = open(infile,'r') # Open each .trs file in 'read' mode

textfile = open(str(infile[0:-3])+'txt','w') # create a corresponding .txt file in 'write' mode to store the values we want from the .trs file

count = 0 # create a count value to keep track of lists

timecodes = [] # create a list value to keep track of timecodes
speaker = [] # create a list value to keep track of speakers
speakturn = '' # create a string value to keep track of speaker turns
Expand All @@ -62,78 +63,99 @@
lines = [] # create a list value to keep track of text lines

textfile.write(str('\\_sh v3.0 400 ELAN\n\\_DateStampHasFourDigitYear\n\n')) # write the header of the Toolbox file
for s in trsfile: # get all the lines we want from the .trs file and write them into a corresponding (new) .txt file

for line in trsfile: # get all the lines we want from the .trs file and write them into a corresponding (new) .txt file
try: # these 'try' blocks ensure that any lines that don't exist in the .trs file get ignored. Without them, those .trs files where different speakers weren't annotated would break the program.
if audiostart in s: # get the filename for the audio
result = re.search('%s(.*)%s' % (audiostart, audioend), s).group(1)
if audiostart in line: # get the filename for the audio
result = re.search('%s(.*)%s' % (audiostart, audioend), line).group(1)
audioname = result.replace(' ', '_') # replace spaces in the audio filename with underscores
textfile.write('\\id '+result+'\n') # write the \id of the file using the audio filename
except:
pass
try:
if section in s: # get the endtime of the sound file
complete = re.search('%s(.*)%s' % (endtime, endings), s).group(1) # store it in the 'complete' variable
if section in line: # get the endtime of the sound file
complete = re.search('%s(.*)%s' % (endtime, endings), line).group(1) # store it in the 'complete' variable
except:
pass
try:
if turnbeg+starttime in s: # get the first speaker's turn
spone = re.search('%s(.*)%s' % (starttime, endings+' '), s).group(1) # get the start of the speaker's turn
if turnbeg+starttime in line: # get the first speaker's turn
spone = re.search('%s(.*)%s' % (starttime, endings+' '), line).group(1) # get the start of the speaker's turn
speakvalstart.append(spone) # store it in a list
sptwo = re.search('%s(.*)%s' % (endtime, endings+' '), s).group(1) # get the end of the speaker's turn
sptwo = re.search('%s(.*)%s' % (endtime, endings+' '), line).group(1) # get the end of the speaker's turn
speakvalend.append(sptwo) # store it in another list
speak = re.search('%s(.*)%s' % (speakstart, endings+'>\n'), s).group(1) # get the name of the speaker
speak = re.search('%s(.*)%s' % (speakstart, endings+'>\n'), line).group(1) # get the name of the speaker
speakturn = speak # set the current value of the string variable 'speakturn' to the speaker name
except:
pass
try:
if turnbeg+speakstart in s: # get a non-first speaker's turn
speak = re.search('%s(.*)%s' % (speakstart, endings+' start'), s).group(1) # get the name of the speaker
if turnbeg+speakstart in line: # get a non-first speaker's turn
speak = re.search('%s(.*)%s' % (speakstart, endings+' start'), line).group(1) # get the name of the speaker
speakturn = speak # set the current value of the string variable 'speakturn' to the speaker name
spone = re.search('%s(.*)%s' % (starttime, endings+' '), s).group(1) # get the start of the speaker's turn
spone = re.search('%s(.*)%s' % (starttime, endings+' '), line).group(1) # get the start of the speaker's turn
speakvalstart.append(spone) # store it in a list
sptwo = re.search('%s(.*)%s' % (endtime, endings), s).group(1) # get the end of the speaker's turn
sptwo = re.search('%s(.*)%s' % (endtime, endings), line).group(1) # get the end of the speaker's turn
speakvalend.append(sptwo) # store it in another list
except:
pass
try:
if '<Sync' in s: # if there is a sync point
if '<Sync' in line: # if there is a sync point
speaker.append(speakturn) # add the name of the speaker to the 'speaker' list - this ensures that every turn has a corresponding speaker name
except:
pass
try:
if syncbeg in s: # if there is a sync point
sync = re.search('%s(.*)%s' % (syncbeg, syncend), s).group(1) # get the timecode
if syncbeg in line: # if there is a sync point
sync = re.search('%s(.*)%s' % (syncbeg, syncend), line).group(1) # get the timecode
timecodes.append(sync) # store it in the 'timecodes' list
except:
pass
try:
if '<' not in s: # if there is non-html tagged text, this is the text associated with a turn
lines.append(s) # add it to the 'lines' list
if '<' not in line: # if there is non-html tagged text, this is the text associated with a turn
lines.append(line) # add it to the 'lines' list
except:
pass


blue = len(timecodes)
# count = 0

for line in timecodes: # for each of the time-coded segments identified in the 'timecodes' list, do the following
if len(timecodes) != count+1: # if the counter number doesn't correspond to the number of the last timecode
# count += 1

# if blue != count+1:
# # print blue
# print count
# print timecodes[count]
# print len(lines)
# print ver
# print lines[count]
# count += 1
# elif blue == count+1:
# print count
# print timecodes[count]
# print len(lines)
# print ver
# print lines[count]
# print timecodes[blue]

if blue != count+1: # if the counter number doesn't correspond to the number of the last timecode
textfile.write(ref+audioname+'_'+str(count+1).zfill(3)+'\n') # write the 'ref' string and the modified file name
textfile.write(tbeg+str(timecodes[count])+'\n') # write the 'tbeg' string and the first timecode
textfile.write(tend+str(timecodes[count+1])+'\n') # write the 'tend' string and the second timecode
textfile.write(tpart+speaker[count]+'\n') # write the 'tpart' string and the name of the speaker in this turn
textfile.write(sound+audioname+'.wav '+str(timecodes[count])+' '+str(timecodes[count+1])+'\n') # write the 'sound' string and the name of the linked .wav file along with in and out points of the linked segment
textfile.write(ver+lines[count]) # write the 'ver' string and the corresponding line of text
textfile.write(str(ver)+str(lines[count])) # write the 'ver' string and the corresponding line of text
textfile.write(trans+'\n\n') # write the 'trans' line - this line will always be empty. If you have a corresponding free translation, you can copy it here in Toolbox
count += 1 # advance the counter by 1
elif len(timecodes) == count+1: # if the counter number corresponds to the number of the last timecode
elif blue == count+1: # if the counter number corresponds to the number of the last timecode
textfile.write(ref+audioname+'_'+str(count+1).zfill(3)+'\n') # write the 'ref' string and the modified file name
textfile.write(tbeg+str(timecodes[count])+'\n') # write the 'tbeg' string and the first timecode
textfile.write(tend+str(complete)+'\n') # write the 'tend' string and the second timecode
textfile.write(tpart+speaker[count-1]+'\n') # write the 'tpart' string and the name of the speaker in this turn
textfile.write(sound+audioname+'.wav '+str(timecodes[count])+' '+str(complete)+'\n') # write the 'sound' string and the name of the linked .wav file along with in and out points of the linked segment
textfile.write(ver+lines[count]) # write the 'ver' string and the corresponding line of text
textfile.write(str(ver)+str(lines[count])) # write the 'ver' string and the corresponding line of text
textfile.write(trans+'\n\n') # write the 'trans' line - this line will always be empty. If you have a corresponding free translation, you can copy it here in Toolbox
textfile.write('\ELANMediaURL '+audioname+'.wav\n'+'\ELANMediaMIME audio/x-wav'+'\n') # write the footer

trsfile.close() # close the trsfile now that all the data has been written to the 'textlines' lists/database
textfile.close() # close the textfile as well

# return to the head of the for loop and continue as long as there is a .trs file in the 'filenames' list

0 comments on commit 95b067b

Please sign in to comment.