In [1]:
import os

def merge_text_files(input_dir, output_file):
    with open(output_file, 'w') as output:
        for file_name in os.listdir(input_dir):
            if file_name.endswith('.txt'):  
                with open(os.path.join(input_dir, file_name), 'r') as input_file:
                    output.write(input_file.read())
                    output.write('\n')  


In [3]:
input_dir = 'C:/Users/kalap/Downloads/Total'

In [4]:
output_file = 'speech_words.txt'

In [5]:
merge_text_files(input_dir, output_file)

In [7]:
import re

def preprocess_text(text):
    patterns = [
        r'<SIL>',          
        r'<NOISE>',        
        r'<IVER>',        
        r'<VOCNOISE>',     
        r'<EXT-Ive>',      
        r'<HES-I>',        
        r'<EXCLUDE-name>'  
    ]
    
    
    regex_patterns = [re.compile(pattern) for pattern in patterns]
    

    for pattern in regex_patterns:
        text = re.sub(pattern, '', text)
    
   
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text


input_file_path = "speech_words.txt"
output_file_path = "output.txt"

with open(input_file_path, "r") as file:
    input_text = file.read()


clean_text = preprocess_text(input_text)

with open(output_file_path, "w") as file:
    file.write(clean_text)

print("Preprocessing complete. Clean text saved to:", output_file_path)


Preprocessing complete. Clean text saved to: output.txt


In [8]:
import nltk
from nltk.corpus import cmudict

In [9]:
nltk.download('cmudict')

[nltk_data] Downloading package cmudict to
[nltk_data]     C:\Users\kalap\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\cmudict.zip.


True

In [10]:
arpabet = nltk.corpus.cmudict.dict()

In [11]:
def extract_words(text, num_sentences=50):
    sentences = nltk.sent_tokenize(text) 
    sentences = sentences[:num_sentences]
    words = [word.lower() for sentence in sentences for word in nltk.word_tokenize(sentence)]    
    words = [word for word in words if word.isalnum()]
    
   
    unique_words = list(set(words))
    return unique_words

In [12]:
def convert_to_arpabet(word):
    if word in arpabet:
        return arpabet[word][0]
    else:
        return None

In [15]:
file_text = 'output.txt'

In [16]:
with open(file_text, "r") as file:
    processed_text = file.read()


In [17]:
unique_words = extract_words(processed_text)


In [18]:
unique_words = unique_words[:50]

In [19]:
arpabet_words = {}
for word in unique_words:
    arpabet_transcription = convert_to_arpabet(word)
    if arpabet_transcription:
        arpabet_words[word] = arpabet_transcription

for word, transcription in arpabet_words.items():
    print(f"{word}: {' '.join(transcription)}")

informed: IH0 N F AO1 R M D
pretty: P R IH1 T IY0
onto: AA1 N T UW0
i: AY1
felt: F EH1 L T
learning: L ER1 N IH0 NG
program: P R OW1 G R AE2 M
sporting: S P AO1 R T IH0 NG
read: R EH1 D
general: JH EH1 N ER0 AH0 L
feel: F IY1 L
let: L EH1 T
attached: AH0 T AE1 CH T
care: K EH1 R
or: AO1 R
world: W ER1 L D
still: S T IH1 L
african: AE1 F R AH0 K AH0 N
she: SH IY1
couple: K AH1 P AH0 L
swimsuit: S W IH1 M S UW2 T
house: HH AW1 S
scout: S K AW1 T
else: EH1 L S
promoting: P R AH0 M OW1 T IH0 NG
treadmill: T R EH1 D M IH2 L
what: W AH1 T
customs: K AH1 S T AH0 M Z
stories: S T AO1 R IY0 Z
happen: HH AE1 P AH0 N
we: W IY1
recognition: R EH2 K AH0 G N IH1 SH AH0 N
parent: P EH1 R AH0 N T
often: AO1 F AH0 N
forward: F AO1 R W ER0 D
laugh: L AE1 F
sister: S IH1 S T ER0
amount: AH0 M AW1 N T
myself: M AY2 S EH1 L F
computer: K AH0 M P Y UW1 T ER0
wow: W AW1
joey: JH OW1 IY0
houses: HH AW1 S AH0 Z
going: G OW1 IH0 NG
hilliard: HH IH1 L Y AA0 R D
news: N UW1 Z
lake: L EY1 K
people: P IY1 P AH0 L
t