# Cleaning of the Santa Barbara Corpus of Spoken American English

In [None]:
import numpy as np
import re
from os import listdir
from os.path import isfile, join

# Helper functions to clean the data:

In [213]:
# Characters deleted from every utterance: ASCII punctuation and digits.
# The apostrophe is not part of the class, so contractions such as "don't"
# keep their shape.
regex = r"[!\"#$%&()*+,-.\/:;<=>?@[\]^_`{|}~\d]"
# Any run of whitespace; collapsed to a single space before splitting.
regex_white = r"[\s]+"


def clean_data(s):
    """Strip transcription tags and punctuation from one piece of text.

    Characters matched by ``regex`` are deleted, whitespace runs are collapsed
    and the text is split into words. A word that equals its own upper-case
    form is treated as a transcription tag (e.g. ``(SNIFF)``) and dropped,
    unless its lower-case form is ``i``, ``a`` or ``hiv``. This also drops
    words without any cased character, such as a lone apostrophe. The tag
    ``Hx`` is dropped as well.

    Args:
        s: Raw transcription text.

    Returns:
        The surviving words joined by single spaces ("" if none survive).
    """
    without_punct = re.sub(regex, "", s)
    kept = []
    for word in re.sub(regex_white, " ", without_punct).split():
        # lower() must be *called*: comparing the bound method itself against
        # the list never matches, which would discard "I", "A" and "HIV".
        is_tag = word == word.upper() and word.lower() not in ("i", "a", "hiv")
        if is_tag or word == "Hx":
            continue
        kept.append(word)
    return " ".join(kept)
        
def clean_file(elem):
    """Collapse a transcription into one cleaned string per speaker turn.

    Each item of ``elem`` is expected to hold three tab-separated fields:
    the start/end times, the speaker name and the uttered text. A line with a
    non-empty name opens a new turn; lines with an empty name continue the
    current turn. ``>ENV`` lines (which explain situations and are not actual
    utterances) are removed, as are lines that do not split into exactly three
    fields. Continuation lines seen before the first named speaker are
    discarded.

    Args:
        elem: Iterable of transcription lines (strings).

    Returns:
        A list with one string per turn, the cleaned lines of the turn joined
        by single spaces. The final turn is included.
    """
    cleaned = []
    person = []
    seen_speaker = False
    for line in elem:
        try:
            _, name, text = line.split("\t")
        except ValueError:
            # Not exactly three tab-separated fields: not an utterance line.
            continue

        speaker = name.strip()
        if speaker == ">ENV":
            continue

        utterance = clean_data(text.strip())
        if speaker:
            # A named line closes the previous turn (if any) and opens a new one.
            if seen_speaker:
                cleaned.append(" ".join(person))
            seen_speaker = True
            person = [utterance]
        else:
            person.append(utterance)

    # Flush the last open turn; without this the final utterance of every
    # transcription would be lost.
    if seen_speaker:
        cleaned.append(" ".join(person))
    return cleaned

def pickUp(filename, cleanname):
    """Replace every 'pick up' with 'pickup' for consistency.

    The input is read line by line and split on whitespace; each adjacent
    word pair ``pick`` ``up`` is merged into the single word ``pickup``. All
    other words, including an ``up`` that does not follow ``pick``, are kept.
    Words are re-joined with single spaces, so whitespace runs are normalised
    and blank lines stay blank.

    Args:
        filename: Path of the text file to read.
        cleanname: Path of the file to write. Lines are joined with a newline
            and no trailing newline is written.
    """
    # Read everything before opening the output so both paths may be the same.
    with open(filename, "r") as in_file:
        source_lines = in_file.readlines()

    merged_lines = []
    for line in source_lines:
        words = line.split()
        merged = []
        i = 0
        while i < len(words):
            if words[i] == "pick" and i + 1 < len(words) and words[i + 1] == "up":
                merged.append("pickup")
                i += 2  # consume both halves of the pair
            else:
                merged.append(words[i])
                i += 1
        merged_lines.append(" ".join(merged))

    with open(cleanname, "w") as out_file:
        out_file.write("\n".join(merged_lines))

### We go through all files in the SBCorpus transcription directory (`SBCorpus/TRN`), clean them, and add them to a list:

In [216]:
# Accumulators: every cleaned speaker turn of the corpus, and the list that
# receives the turns once leading filler words have been removed.
cleaned_text = []
clean_SBC = []
path_files = "SBCorpus/TRN"

# Keep regular files only; sub-directories of the transcription folder are skipped.
files = []
for entry in listdir(path_files):
    if isfile(join(path_files, entry)):
        files.append(entry)

# Load each transcription as an array of raw lines and clean it turn by turn.
# NOTE(review): newer NumPy releases may reject a newline as `delimiter` --
# confirm against the installed version.
for trn_name in files:
    SBC = np.loadtxt("SBCorpus/TRN/{}".format(trn_name), dtype="str", delimiter="\n")
    cleaned_text.extend(clean_file(SBC))

# Remove a leading filler word that our model was over-predicting.
# The whole first word is compared, never a raw character prefix: slicing by
# prefix would turn "now" into "w", "nobody" into "body" and "okay" into "ay".
# NOTE(review): the comparison is case-sensitive although the text is only
# lower-cased when it is written out; confirm whether capitalised fillers
# ("Yeah", "Okay") should be removed as well.
overpredicted_starts = {"no", "ok", "oo", "yes", "yep", "and", "yeah", "okay", "sure"}
for line in cleaned_text:
    first_word, sep, remainder = line.partition(" ")
    if first_word in overpredicted_starts:
        # Keep only what follows the filler (empty if the filler stood alone).
        clean_SBC.append(remainder)
    else:
        clean_SBC.append(line)

In [219]:
# Dump the cleaned turns to a .txt file, one lower-cased turn per line;
# turns that are empty or whitespace-only are left out.
with open("SBCorpus.train.txt", "w") as outfile:
    for turn in clean_SBC:
        if not turn.strip():
            continue
        outfile.write(turn.lower() + "\n")

In [None]:
# Replace all instances of 'pick up' with 'pickup' for consistency:
# reads SBCorpus.train.txt and writes the result to SBCpup.txt.
pickUp("SBCorpus.train.txt","SBCpup.txt")