Maya Asher, 2/13/24
# Processing Santa Barbara Corpus of Spoken American English
## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from scipy import stats
from statistics import mean 
import re
import io
%pprint

Pretty printing has been turned OFF


## Read in correct files

In [2]:
# Load every .trn transcript in the corpus directory into a dict that maps
# file name -> full transcript text.
path = "/Users/mayaasher/data_science/Stance-Taking-in-Spontaneous-Speech/data/utf-16"
os.chdir(path)  # the listing below is relative to the current working directory
txt_dict = {}
for file in os.listdir():
    if file.endswith(".trn"):
        filename = file
        # The files have to be decoded as UTF-16 big-endian. This codec does not
        # strip the byte-order mark, so each transcript still starts with '\ufeff'.
        # `with` guarantees the handle is closed even if reading/decoding raises.
        with open(os.path.join(path, filename), 'r', encoding='utf-16-be') as f:
            text = f.read()
        txt_dict[filename] = text

In [3]:
len(txt_dict)  # number of .trn transcripts that were loaded

43

In [4]:
txt_dict.keys() # keys are the .trn file names, in os.listdir() order (not sorted) -- looks good

dict_keys(['SBC034.trn', 'SBC008.trn', 'SBC009.trn', 'SBC035.trn', 'SBC023.trn', 'SBC037.trn', 'SBC036.trn', 'SBC022.trn', 'SBC032.trn', 'SBC033.trn', 'SBC019.trn', 'SBC031.trn', 'SBC024.trn', 'SBC018.trn', 'SBC057.trn', 'SBC043.trn', 'SBC042.trn', 'SBC056.trn', 'SBC045.trn', 'SBC051.trn', 'SBC050.trn', 'SBC044.trn', 'SBC047.trn', 'SBC049.trn', 'SBC048.trn', 'SBC060.trn', 'SBC058.trn', 'SBC059.trn', 'SBC015.trn', 'SBC001.trn', 'SBC029.trn', 'SBC014.trn', 'SBC002.trn', 'SBC016.trn', 'SBC017.trn', 'SBC003.trn', 'SBC007.trn', 'SBC013.trn', 'SBC006.trn', 'SBC010.trn', 'SBC004.trn', 'SBC005.trn', 'SBC011.trn'])

In [5]:
# Preview the first 1000 characters of one transcript as rendered text.
print(txt_dict['SBC034.trn'][:1000])

﻿0.000	4.475	>ENV:	((DOOR_OPENING_AND_CLOSING))
4.475	5.077	KAREN:	Hi sweetie.
5.077	5.908	SCOTT:	... Hey.
5.908	6.489		.. (THROAT)
6.489	7.630	KAREN:	<X Sweetie frumptions X>.
7.630	10.399		... This is kinda open.
10.399	11.071	SCOTT:	... Yep.
11.071	12.008		... How was work?
12.008	13.158	>ENV:	((CLOSET))
13.158	13.889	KAREN:	I'm so tired.
13.889	15.155	SCOTT:	... Ti=red.
15.155	15.743	KAREN:	It was % --
15.743	16.277		It was okay,
16.277	16.939		I left my bag there.
16.939	17.809		... <VOX I left my bag,
17.809	18.523		and all my money,
18.523	19.574		and all my things= VOX>.
19.574	21.174	SCOTT:	... N[ow the gho]sts'll get it.
20.268	20.558	KAREN:	[They ca-] --
21.174	22.600		... <@ Ghosts'll get it @>,
22.600	23.958		... that's okay.
23.958	24.901		... (TSK) (H)
24.901	25.505		... Um,
25.505	27.246		... (TSK) (H)
27.246	27.954		... ten thirty,
27.954	29.582		there were probably .. thirty people in the store.
29.582	30.762	SCOTT:	... Good grief.
30.762	31.063	KAREN:	We're like,
31.

In [6]:
# Same slice as a raw repr: this exposes the \t / \n delimiters and the leading
# '\ufeff' byte-order mark that print() hides -- gotta clean this up...
txt_dict['SBC034.trn'][:1000]

"\ufeff0.000\t4.475\t>ENV:\t((DOOR_OPENING_AND_CLOSING))\n4.475\t5.077\tKAREN:\tHi sweetie.\n5.077\t5.908\tSCOTT:\t... Hey.\n5.908\t6.489\t\t.. (THROAT)\n6.489\t7.630\tKAREN:\t<X Sweetie frumptions X>.\n7.630\t10.399\t\t... This is kinda open.\n10.399\t11.071\tSCOTT:\t... Yep.\n11.071\t12.008\t\t... How was work?\n12.008\t13.158\t>ENV:\t((CLOSET))\n13.158\t13.889\tKAREN:\tI'm so tired.\n13.889\t15.155\tSCOTT:\t... Ti=red.\n15.155\t15.743\tKAREN:\tIt was % --\n15.743\t16.277\t\tIt was okay,\n16.277\t16.939\t\tI left my bag there.\n16.939\t17.809\t\t... <VOX I left my bag,\n17.809\t18.523\t\tand all my money,\n18.523\t19.574\t\tand all my things= VOX>.\n19.574\t21.174\tSCOTT:\t... N[ow the gho]sts'll get it.\n20.268\t20.558\tKAREN:\t[They ca-] --\n21.174\t22.600\t\t... <@ Ghosts'll get it @>,\n22.600\t23.958\t\t... that's okay.\n23.958\t24.901\t\t... (TSK) (H)\n24.901\t25.505\t\t... Um,\n25.505\t27.246\t\t... (TSK) (H)\n27.246\t27.954\t\t... ten thirty,\n27.954\t29.582\t\tthere were prob

## Clean up text
### Toy data 
We basically want to get rid of all non-alphabetic characters in each 'value' of the dictionary (each string/transcript). Let's work with a toy string first...

In [7]:
# Fixed sample string for experimenting: it matches the transcript preview shown
# above, including the leading '\ufeff' and the tab/newline delimiters.
toy = "\ufeff0.000\t4.475\t>ENV:\t((DOOR_OPENING_AND_CLOSING))\n4.475\t5.077\tKAREN:\tHi sweetie.\n5.077\t5.908\tSCOTT:\t... Hey.\n5.908\t6.489\t\t.. (THROAT)\n6.489\t7.630\tKAREN:\t<X Sweetie frumptions X>.\n7.630\t10.399\t\t... This is kinda open.\n10.399\t11.071\tSCOTT:\t... Yep.\n11.071\t12.008\t\t... How was work?\n12.008\t13.158\t>ENV:\t((CLOSET))\n13.158\t13.889\tKAREN:\tI'm so tired.\n13.889\t15.155\tSCOTT:\t... Ti=red.\n15.155\t15.743\tKAREN:\tIt was % --\n15.743\t16.277\t\tIt was okay,\n16.277\t16.939\t\tI left my bag there.\n16.939\t17.809\t\t... <VOX I left my bag,\n17.809\t18.523\t\tand all my money,\n18.523\t19.574\t\tand all my things= VOX>.\n19.574\t21.174\tSCOTT:\t... N[ow the gho]sts'll get it.\n20.268\t20.558\tKAREN:\t[They ca-] --\n21.174\t22.600\t\t... <@ Ghosts'll get it @>,\n22.600\t23.958\t\t... that's okay.\n23.958\t24.901\t\t... (TSK) (H)\n24.901\t25.505\t\t... Um,\n25.505\t27.246\t\t... (TSK) (H)\n27.246\t27.954\t\t... ten thirty,\n27.954\t29.582\t\tthere were probably .. thirty people in the store.\n29.582\t30.762\tSCOTT:\t... Good grief.\n30.762\t31.063\tKAREN:\tWe're like,\n31."
toy

"\ufeff0.000\t4.475\t>ENV:\t((DOOR_OPENING_AND_CLOSING))\n4.475\t5.077\tKAREN:\tHi sweetie.\n5.077\t5.908\tSCOTT:\t... Hey.\n5.908\t6.489\t\t.. (THROAT)\n6.489\t7.630\tKAREN:\t<X Sweetie frumptions X>.\n7.630\t10.399\t\t... This is kinda open.\n10.399\t11.071\tSCOTT:\t... Yep.\n11.071\t12.008\t\t... How was work?\n12.008\t13.158\t>ENV:\t((CLOSET))\n13.158\t13.889\tKAREN:\tI'm so tired.\n13.889\t15.155\tSCOTT:\t... Ti=red.\n15.155\t15.743\tKAREN:\tIt was % --\n15.743\t16.277\t\tIt was okay,\n16.277\t16.939\t\tI left my bag there.\n16.939\t17.809\t\t... <VOX I left my bag,\n17.809\t18.523\t\tand all my money,\n18.523\t19.574\t\tand all my things= VOX>.\n19.574\t21.174\tSCOTT:\t... N[ow the gho]sts'll get it.\n20.268\t20.558\tKAREN:\t[They ca-] --\n21.174\t22.600\t\t... <@ Ghosts'll get it @>,\n22.600\t23.958\t\t... that's okay.\n23.958\t24.901\t\t... (TSK) (H)\n24.901\t25.505\t\t... Um,\n25.505\t27.246\t\t... (TSK) (H)\n27.246\t27.954\t\t... ten thirty,\n27.954\t29.582\t\tthere were prob

`.splitlines()` will put each line of the string into a list, and
`.split('\t')` will break each line into its parts.

In [8]:
# Try the two-step split on the second line of the toy string:
# first break the string into lines, then break one line on tabs.
toy_rows = toy.splitlines()
line = toy_rows[1]
line_fields = line.split('\t')
print(line_fields)
line

['4.475', '5.077', 'KAREN:', 'Hi sweetie.']


'4.475\t5.077\tKAREN:\tHi sweetie.'

### Trying it for real
Creating a dictionary that maps each file name (the key) to a list of lists (the value). The outer list holds the entire transcript, and each inner list holds a single line split into its fields. I will then have to go in and remove whatever needs to go... the good news is that I can automatically drop indices 0, 1, and 2, because they are just the time stamps and the speaker name. Or maybe I'll put it in a DataFrame... hmm.

In [9]:
# Replace each raw transcript string with a list of lines, where every line is
# itself a list of its tab-separated fields.
for fn, text in txt_dict.items():  # iterate through each transcript
    # This cell overwrites txt_dict in place, so on a second run the values are
    # already lists and .splitlines() would raise AttributeError. Skip those
    # entries so the cell can be re-run safely.
    if not isinstance(text, str):
        continue
    # Only existing keys are reassigned, so mutating during iteration is safe.
    txt_dict[fn] = [line.split('\t') for line in text.splitlines()]

In [10]:
txt_dict['SBC034.trn'][1]  # second line of the transcript, now a list of its tab-separated fields

['4.475', '5.077', 'KAREN:', 'Hi sweetie.']

In [11]:
# First 10 split lines. Still to clean up: the '\ufeff' stuck to the first
# time stamp, and the empty speaker field on lines that continue a turn.
txt_dict['SBC034.trn'][:10]

[['\ufeff0.000', '4.475', '>ENV:', '((DOOR_OPENING_AND_CLOSING))'], ['4.475', '5.077', 'KAREN:', 'Hi sweetie.'], ['5.077', '5.908', 'SCOTT:', '... Hey.'], ['5.908', '6.489', '', '.. (THROAT)'], ['6.489', '7.630', 'KAREN:', '<X Sweetie frumptions X>.'], ['7.630', '10.399', '', '... This is kinda open.'], ['10.399', '11.071', 'SCOTT:', '... Yep.'], ['11.071', '12.008', '', '... How was work?'], ['12.008', '13.158', '>ENV:', '((CLOSET))'], ['13.158', '13.889', 'KAREN:', "I'm so tired."]]

### The specific things we want to remove

In [12]:
# Read the annotation legend into a single string.
# `with` closes the file even if read() raises, unlike a manual open()/close() pair.
with open("/Users/mayaasher/data_science/metadata/annotations.txt", "r") as f:
    removals = f.read()

In [13]:
print(removals) # the annotation legend: the transcription symbols we want to remove from the text

Santa Barbara Corpus of Spoken American English Part-II

  

Units 
    Intonation Unit			RETURN 
    Truncated intonation unit		-- 
    word				SPACE 
    truncated word			_ 

Speakers 
    Speaker identity/turn start		: 
    Speech overlap			[ ] 

Transitional Continuity 
    Final				. 
    Continuing				, 
    Appeal				? 

Terminal Pitch Direction 
    Fall				\ 
    Rise				/ 
    Level				- 

Accent and Lengthening 
    Primary accent			^ 
    Secondary accent			' 
    Booster				! 
    Lengthening				= 

Tone 
    Fall				\ 
    Rise				/ 
    Fall-rise				\/ 
    Rise-fall				/\ 
    Level				- 

Pause 
    Long				...(N) 
    Medium				... 
    Short				.. 
    Latching				(0) 

Vocal Noises 
    Vocal noises			( ) 
    Inhalation				(H) 
    Exhalation				(Hx) 
    Glottal stop			% 
    Laughter				@ 

Quality 
    Quality				<Y  Y> 
    Laugh quality			<@  @> 
    Quotation quality			<Q  Q> 
    Multiple quality features		<Y  <Z  Z>  Y> 

Phonetics 
    Phonetic transcripti