In [114]:
import pandas as pd
import re
from json import dump, load
from collections import Counter

## 1. Clean Transcript

In [2]:
# Speaker labels that clean_transcript looks for (as `LABEL:`) inside each
# transcript line.
# NOTE(review): the O'ROURKE entry looks like a mis-decoded curly apostrophe;
# it is kept byte-for-byte here -- confirm it matches the text as read from
# the transcript file.
headers = {
    'ANNOUNCER',
    'BIDEN',
    'WARREN',
    'SANDERS',
    'HARRIS',
    'YANG',
    'BOOKER',
    'Oâ€™ROURKE',
    'KLOBUCHAR',
    'CASTRO',
    'STEPHANOPOULOS',
    'RAMOS',
    'DAVIS',
    'MUIR',
    'BUTTIGIEG',
}

In [3]:
def clean_transcript(transcript_file, headers, include_comment=False):
    """
    Clean the ABC transcript. This function saves the cleaned transcript and a
    list (in json format), where each entry is the speaker of the
    corresponding line in the cleaned transcript.

    Two files are written next to the input, named after the input path with
    its '.txt' removed: '<prefix>_cleaned.txt' and '<prefix>_cleaned.json'.

    Args:
        transcript_file(string): path to the transcript file (txt format)
        headers(iterable of string): speaker labels; a line containing
            '<label>:' switches the current speaker to that label
        include_comment(bool): whether to include comment such as (APPLAUSE).
            When False, every line containing '(' is dropped entirely.

    Returns:
        None. The results are written to disk.
    """

    file_prefix = re.sub(r'(.+)\.txt', r'\1', transcript_file)

    # Collect lines in a list and join once at the end instead of growing a
    # string inside the loop.
    cleaned_lines = []
    speakers = []
    # Lines seen before the first speaker label are attributed to ''.
    cur_speaker = ''

    with open(transcript_file, 'r') as fp:
        for line in fp:
            line = line.replace('\n', '')

            if line == '':
                continue

            # If ignore comment
            # NOTE(review): this drops the whole line, including any speaker
            # label on it, whenever a '(' appears anywhere -- confirm that
            # comments always sit on their own line in the transcript.
            if not include_comment and '(' in line:
                continue

            for header in headers:
                if header + ':' in line:
                    cur_speaker = header
                    line = line.replace(header + ': ', '')
                    break

            cleaned_lines.append(line)
            speakers.append(cur_speaker)

    # Output the cleaned transcript and speaker information
    with open(file_prefix + '_cleaned.txt', 'w') as fp:
        fp.write(''.join(line + '\n' for line in cleaned_lines))

    # Use a context manager so the JSON file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(file_prefix + '_cleaned.json', 'w') as fp:
        dump(speakers, fp, ensure_ascii=False, indent=2)

In [4]:
# `headers` is a required positional argument of clean_transcript; omitting it
# raises TypeError on a fresh kernel, so pass the speaker set defined above.
clean_transcript('full.txt', headers)

## 2. Audio Segmentation

### 2.1. Sentencify Transcript

We want to split the transcript so that each sentence occupies its own row.

In [112]:
# Parallel output lists: sentenced_speaker[k] is the speaker of
# sentenced_transcript[k]. Re-created here so re-running the cell starts clean.
sentenced_transcript = []
sentenced_speaker = []

# One speaker label per line of ./full_cleaned.txt. Open the file with a
# context manager so the handle is closed once the JSON has been read.
with open('./full_cleaned.json', 'r') as speaker_fp:
    speaker_list = load(speaker_fp)

def switch_special(line, hide):
    """
    Mask or unmask periods that do not end a sentence.

    Args:
        line(string): text to transform
        hide(bool): if True, replace each known abbreviation / ellipsis with
            its '+'-masked form; if False, turn every '+' back into '.'

    Returns:
        string: the transformed text
    """
    if not hide:
        # Unmasking is a blanket substitution: every '+' becomes '.'.
        return line.replace('+', '.')

    replacements = (
        ('...', '+++'),
        ('U.S.', 'U+S+'),
        ('p.m.', 'p+m+'),
        ('U.N.', 'U+N+'),
        ('D.A.', 'D+A+'),
        ('D.C.', 'D+C+'),
        ('Mr.', 'Mr+'),
        ('.com', '+com'),
    )

    masked = line
    for token, mask in replacements:
        # Presence is checked against the untouched input, not the
        # partially masked text.
        if token in line:
            masked = masked.replace(token, mask)
    return masked
            

# Split every cleaned transcript line into sentences. Each sentence is appended
# to sentenced_transcript and the speaker of its source line to
# sentenced_speaker, so the two lists stay aligned.
with open('./full_cleaned.txt', 'r') as fp:
    lines = fp.readlines()

for i, raw_line in enumerate(lines):
    cur_speaker = speaker_list[i]

    # Mask abbreviations / ellipses so their periods are not treated as
    # sentence ends.
    cur_line = switch_special(raw_line, True)

    while '.' in cur_line or '?' in cur_line:
        # Cut at whichever terminator comes first. Searching only for '.'
        # whenever one exists would skip over an earlier '?', merging a
        # question with the sentence that follows it.
        terminator_positions = [pos for pos in (cur_line.find('.'), cur_line.find('?'))
                                if pos != -1]
        dot_index = min(terminator_positions)

        # Slices (rather than indexing) yield '' at the end of the string
        # instead of raising IndexError.
        next_char = cur_line[dot_index + 1: dot_index + 2]

        # Skip number: a period followed by a digit is a decimal point. Mask
        # it with '+' (restored by switch_special(..., False)) and rescan.
        # Only '.' is masked; a masked '?' would come back as '.'.
        if cur_line[dot_index] == '.' and next_char.isdigit():
            cur_line = cur_line[:dot_index] + '+' + cur_line[dot_index + 1:]
            continue

        # Include ending quote
        if next_char == '"':
            dot_index += 1
        if cur_line[dot_index + 1: dot_index + 3] == '\'"':
            dot_index += 2

        cur_sentense = switch_special(cur_line[:dot_index + 1], False)
        sentenced_transcript.append(cur_sentense)
        sentenced_speaker.append(cur_speaker)

        # Continue with the remainder, dropping the single space that
        # separates it from the sentence just emitted.
        cur_line = cur_line[dot_index + 1:]
        if cur_line.startswith(' '):
            cur_line = cur_line[1:]

    # Whatever is left has no '.' or '?'. Keep it as its own row unless it is
    # empty or only whitespace (e.g. the trailing newline).
    if cur_line.strip():
        cur_line = switch_special(cur_line, False)
        sentenced_transcript.append(cur_line)
        sentenced_speaker.append(cur_speaker)

# Sanity check: print every sentence that does not start with an ASCII capital
# letter or an ellipsis, together with up to two preceding sentences.
leters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
for i, sentence in enumerate(sentenced_transcript):
    # Slicing keeps an empty sentence from raising IndexError; an empty
    # sentence is reported as suspicious too.
    first_char = sentence[:1]
    starts_with_capital = first_char != '' and first_char in leters

    if not starts_with_capital and sentence[0:3] != '...':
        # Clamp the window at 0 so the first two sentences do not pull in
        # "context" from the end of the list via negative indices.
        for context_sentence in sentenced_transcript[max(0, i - 2): i + 1]:
            print(context_sentence)
        print('\n')

In [118]:
# Assemble the sentence/speaker table (one row per sentence) and write it to
# full_sentence.csv without the index column.
sentence_columns = {
    'sentense': sentenced_transcript,
    'speaker': sentenced_speaker,
}
df = pd.DataFrame(sentence_columns)
df.to_csv('full_sentence.csv', index=False)