In [1]:
# Imports
import os

# Regular expressions
import glob

In [2]:
# Couple of file writing helper functions

def import_textfile_to_linelist(filepath):
    with open (filepath, "r") as f:
        content = f.readlines()
    content = [x.strip('\n') for x in content] 
    return content

def output_linelist_to_textfile(filepath, linelist):
    with open (filepath, "w") as f:
        for currline in linelist:
            f.write(currline)
            f.write('\n')

In [4]:
def convert_linelist_to_speechonlylinelist(linelist):
    newlinelist = []
    for currline in linelist:
        
        # trim whitespace to better judge line length.
        currline = currline.strip()
        
        # if we have a non-blank line, process it further
        if len(currline) > 0:
            try:
                speaker, linetxt = currline.split(':', 1)
                speaker = speaker.strip()
                linetxt = linetxt.strip()
            except:
                print("ERROR: convert_linelist_to_speechonlylinelist({})".format(currline))
        else:
            speaker = 'none'
            linetxt = ''
        newlinelist.append(linetxt)
    return newlinelist

In [5]:
# An exchange is a piece of dialog between (normally) two people. Then a blank line. Then another exhange. 
# If we're converting dialog into Q's and A's, then we don't want the last line of one exchange to be a Q, 
# and the first line of the next to be the A.

def convert_linelist_to_exchangelist(linelist):
    currexchange = []
    exchangelist = []
    
    # loop through all the input lines
    for currline in linelist:
        
        # trim whitespace to better judge line length.
        currline = currline.strip()

        # if we have a non-blank line, add it to the exchange
        if len(currline) > 0:
            currexchange.append(currline)
        # else we have a blank line. if the curr exchange has content, add it to the returned exchange list and reset
        else:
            if (len(currexchange) > 0):
                exchangelist.append(currexchange)
                currexchange = []
    
    exchangelist.append(currexchange)
    currexchange = []
    
    return exchangelist

In [6]:
def create_qandalists_from_exchangelist(exchangelist):
    qlist = []
    alist = []
    for currexchange in exchangelist:
        if len(currexchange) > 1:
            for i in range(len(currexchange)-1):
                qlist.append(currexchange[i])
                alist.append(currexchange[i+1])
    return qlist, alist

In [7]:
# Fundamental parameters / setup
inputdir  = '../001-CorpusData/004-Dialog/001-Scripts/'
outputdir = '../001-CorpusData/004-Dialog/002-Exchanges/'
    
inputfile_list = glob.glob(inputdir + '*.txt')
print ('{} files'.format(len(inputfile_list)))
print (inputfile_list[0])

test_linelist = []
speechonly_linelist = []
test_exchlist = []
qlist = []
alist = []


187 files
../001-CorpusData/004-Dialog/001-Scripts\dialog_esl_conversations.txt


In [8]:
# Run a test on one of the smaller files to check everything is working

test_linelist = import_textfile_to_linelist(inputdir + 'st-ng-103-sample.txt')
print (test_linelist)
speechonly_linelist = convert_linelist_to_speechonlylinelist(test_linelist)
print ('\n', speechonly_linelist)
test_exchlist = convert_linelist_to_exchangelist(speechonly_linelist)
print ('\n', test_exchlist)
qlist, alist = create_qandalists_from_exchangelist(test_exchlist)
print ('\n', qlist)
print ('\n', alist)

output_linelist_to_textfile(outputdir + 'st-ng-103-sample-q.txt', qlist)
output_linelist_to_textfile(outputdir + 'st-ng-103-sample-a.txt', alist)
output_linelist_to_textfile(outputdir + 'st-ng-103-sample-all.txt', speechonly_linelist)

['DATA: SS Tsiolkovsky, repeat your message. ', 'WOMAN: Well hello, Enterprise. Welcome. I hope you have a lot of pretty boys on board. ', 'MAN: Do it! Yeah, go ahead. Do it!', 'DATA: Captain, what we just heard is impossible. ', '', 'RIKER: Cover the ship as planned. Move out. ', 'DATA: Indications of what humans would call a wild party? ', 'RIKER: Their Bridge. If this thing works, be sure to record everything. ', '', 'RIKER: You were right. Somebody blew the hatch. They were all sucked out into space. ', "DATA: Correction, sir, that's blown out. ", 'RIKER: Thank you, Data. ', '', 'RIKER: Apparently some of them were apparently blown out the emergency hatches. ', 'PICARD: But there were eighty people on that ship, Number One. ', 'RIKER: Yes sir.']

 ['SS Tsiolkovsky, repeat your message.', 'Well hello, Enterprise. Welcome. I hope you have a lot of pretty boys on board.', 'Do it! Yeah, go ahead. Do it!', 'Captain, what we just heard is impossible.', '', 'Cover the ship as planned. Mov

In [9]:
# Run the conversion over all of the files. 
# Note that it currently breaks, need to finish getting punctuation character our of the source files and make this more robust.

for currfile in inputfile_list:
    
    # import the file
    input_linelist      = import_textfile_to_linelist(currfile)
    print("Processing: {}".format(currfile))
    
    # Stages of conversion to q and a lists
    speechonly_linelist = convert_linelist_to_speechonlylinelist(input_linelist)
    test_exchlist       = convert_linelist_to_exchangelist(speechonly_linelist)
    qlist, alist        = create_qandalists_from_exchangelist(test_exchlist)
    
    # work out filenames to save to
    filename  = os.path.basename(currfile)
    noextname = os.path.splitext(filename)[0]
    
    q_fname   = outputdir + noextname + "-q.txt"
    a_fname   = outputdir + noextname + "-a.txt"
    all_fname = outputdir + noextname + "-all.txt"
    
    print (q_fname)
    print (a_fname)
    print (all_fname)
    print('\n')
    
    output_linelist_to_textfile(q_fname,   qlist)
    output_linelist_to_textfile(a_fname,   alist)
    output_linelist_to_textfile(all_fname, speechonly_linelist)
    

Processing: ../001-CorpusData/004-Dialog/001-Scripts\dialog_esl_conversations.txt
../001-CorpusData/004-Dialog/002-Exchanges/dialog_esl_conversations-q.txt
../001-CorpusData/004-Dialog/002-Exchanges/dialog_esl_conversations-a.txt
../001-CorpusData/004-Dialog/002-Exchanges/dialog_esl_conversations-all.txt


Processing: ../001-CorpusData/004-Dialog/001-Scripts\simpledialog.txt
../001-CorpusData/004-Dialog/002-Exchanges/simpledialog-q.txt
../001-CorpusData/004-Dialog/002-Exchanges/simpledialog-a.txt
../001-CorpusData/004-Dialog/002-Exchanges/simpledialog-all.txt


Processing: ../001-CorpusData/004-Dialog/001-Scripts\st-ng-102.txt
../001-CorpusData/004-Dialog/002-Exchanges/st-ng-102-q.txt
../001-CorpusData/004-Dialog/002-Exchanges/st-ng-102-a.txt
../001-CorpusData/004-Dialog/002-Exchanges/st-ng-102-all.txt


Processing: ../001-CorpusData/004-Dialog/001-Scripts\st-ng-103-sample.txt
../001-CorpusData/004-Dialog/002-Exchanges/st-ng-103-sample-q.txt
../001-CorpusData/004-Dialog/002-Exchanges/s

UnboundLocalError: local variable 'linetxt' referenced before assignment