Katherine Kairis, kak275@pitt.edu, 11/2/2017

NEW CONTINUING -- This file continues upon the first progress report

In [1]:
from bs4 import BeautifulSoup
import glob
import re
import nltk

In [2]:
transcripts = glob.glob('data/VOICE/VOICE2.0XML/XML/*.xml')
del transcripts[0]

tagged_transcripts = glob.glob('data/VOICE/VOICEPOSXML2.0/XML/*.xml')
del transcripts[0]

In [3]:
#Create three dictionaries
participants = {}
conversations = {}
tagged_convs = {}

## Getting info about the participants

In [4]:
def participant_info(contents):
    
    #Get all of the participants in the given conversation
    people = contents.find('listPerson', {'type': 'identified'}).findAll('person')
    
    for p in people:
        #info is a subdirector that contains a single participant's information. It will be 
        info = {}
        info['role'] = p['role']
        info['age'] = p.age.get_text()
        info['sex'] = p.sex.get_text()
        
        #In some cases, the occupation isn't listed. If it is included, get the text of the occupation field.
        #If it isn't included, "None" will be stored as the occupation, since p.occupation would return "None."
        try:
            info['occupation'] = p.occupation.get_text()
        except AttributeError:
            info['occupation'] = p.occupation
        
        #Get a list of the languages that the participant speaks. Iterate through the list, and add them to the
        #dictionary according to the speaker's level (ie. L1).
        languages = p.findAll('langKnown')
        for l in languages:
            level = l['level']
            language = l['tag']
        
            if level in info:
                info[level].append(language)
            else:
                info[level] = [language]
    
        #Get the participant's ID number, and make it a key in the participants dictionary. The value will be
        #the info dictionary
        name = p['xml:id']
        participants[name] = info

## Getting lines of the conversation from the file

In [5]:
def conversation_lines(file, contents, li):
    file_name = file.split("/")[-1]
    li[file_name] = contents

In [6]:
conversations = {}
for t in transcripts:
    file = open(t, 'r')
    text = file.read()
    xml_contents = BeautifulSoup(text, 'xml')
    conversation_lines(t, xml_contents, conversations)
    participant_info(xml_contents)

In [7]:
len(conversations)

150

In [8]:
#Get native English speakers
native_speakers = []

#There are multiple ways that English is listed as an L1 ("eng", "eng-US", "eng-CA", "eng-GB", "eng-GY", "eng-AU", etc)
#I used a regular expression to find all of these instances
r = re.compile("eng.*")

for person in participants:
    
    #returns a list of all languages that contain "eng.*" The length of this list should be 1 or 0. If it's 1, the
    #participant has English listed as an L1.
    english = list(filter(r.match, participants[person]['L1']))
    
    if len(english) != 0:
        #print(person, ':', participants[person])
        native_speakers.append(person)

In [9]:
bilinguals = []
L1_counts = {}
#participants[native_speakers[0]]
for p in participants:
    #print(participants[p]['L1'])
    languages = participants[p]['L1']
    if len(languages) > 1:
        bilinguals.append(p)
        if p in native_speakers:
            if 'eng' not in L1_counts:
                L1_counts['eng'] = 1
            else:
                L1_counts['eng'] += 1
        continue
    
    
    for l in languages:
        L1 = l.split("-")[0]
        if L1 not in L1_counts:
            L1_counts[L1] = 1
        else:
            L1_counts[L1] += 1
        #print(l.split("-")[0])

In [10]:
modified_conversations = {}
lines = {}

for file in conversations:
    conv_lines = {}
    
    c = conversations[file].findAll('u')

In [11]:
#Checks to make sure the line can be added to the dictionary.
#A line must meet the following criteria: 
#the participant cannot be a native speaker of English
#the participant must be listed in the participant directory
#the participant cannot be bilingual
#the line cannot contain any non-English words
#the line cannot contain the speaker reading anything out loud
def valid_utterance(participant, line):
    if participant in native_speakers:
        return False
    if participant not in participants:
        return False
    #if len(text) == 0:
    #    return False
    if line.foreign != None:
        return False
    if line.unclear != None: 
        return False
    if line.reading_aloud != None:
        return False
    if line.reading != None :
        return False
    return True

In [12]:
#Iterate through all of the files in VOICE to a nested dictionary that contains the word tokens of the
#conversations.
#The keys of the dictionaries are the file names. The values of these entries are subdictionaries. The keys of the
#subdictionary are (participant, line_number) tuples, and the values are lists of tokens.
tokenized_conversations = {}

for file in conversations:
    conv_lines = {}
    
    c = conversations[file].findAll('u')
    
    for l in c:
        participant = l['who'].replace("#", "")
        line_id = l['xml:id']
        text = l.get_text()
        tokens = nltk.word_tokenize(text)
        
        if len(text) != 0 and valid_utterance(participant, l) == True:
            key = (line_id, participant)
            conv_lines[key] = tokens
    
    tokenized_conversations[file] = conv_lines


In [13]:
#Get the text from the pos-tagged files
tagged_conv_lines = {}
for t in tagged_transcripts:
    file = open(t, 'r')
    text = file.read()
    xml_contents = BeautifulSoup(text, 'xml')
    conversation_lines(t, xml_contents, tagged_conv_lines)

In [14]:
#Iterate through all of the files in VOICE to a nested dictionary that contains the (word, tag) tuples from the
#conversations.
#The keys of the dictionaries are the file names. The values of these entries are subdictionaries. The keys of the
#subdictionary are (participant, line_number) tuples, and the values are lists of (word, tag) tuples.
tagged_conversations = {}

for file in tagged_conv_lines:
    conv_lines = {}
    c = tagged_conv_lines[file].findAll('u')
    
    for l in c:
        utterance = []
        
        participant = l['who'].replace("#", "")
        line_id = l['xml:id']
        key = (participant, line_id)
        
        if valid_utterance(participant, l) == True:            
            tags = l.findAll('w')
            for t in tags:
                word = t.text
                ana = str(t).split()[1]
                ana = ana.split("=")
                tag = ana[1][2:]
                tag = tag.split('"')[0]
                #print(word, tag)
            
                utterance.append((word, tag))
                
            conv_lines[key] = utterance
        
    tagged_conversations[file] = conv_lines

In [15]:
#Save the two dictionaries as pickle files
import pickle

In [17]:
f = open('VOICE_tokenized.p', 'wb')
pickle.dump(tokenized_conversations, f, -1)
f.close()

In [18]:
f = open('VOICE_tagged.p', 'wb')
pickle.dump(tagged_conversations, f, -1)
f.close()