Katherine Kairis, kak275@pitt.edu, 10/12/2017

In [1]:
from bs4 import BeautifulSoup
import glob
import re
import nltk

In [2]:
transcripts = glob.glob('data/VOICE/VOICE2.0XML/XML/*.xml')
del transcripts[0]

In [3]:
#Create two dictionaries: one containing information about the participants, and one containing the conversations
participants = {}
conversations = {}

# Getting info about the participants
The participant_info function extracts information about the participants and stores it in the "participants" dictionary. The keys of the dictionary are the participants' ID numbers. The values are sub-dictionaries that include the participant's role, age, sex, and occupation (if listed). The sub-dictionaries also include the participants' L1s, which are stored in lists (since some participants have multiple L1s).

In [4]:
def participant_info(contents):
    
    #Get all of the participants in the given conversation
    people = contents.find('listPerson', {'type': 'identified'}).findAll('person')
    
    for p in people:
        #info is a subdirector that contains a single participant's information. It will be 
        info = {}
        info['role'] = p['role']
        info['age'] = p.age.get_text()
        info['sex'] = p.sex.get_text()
        
        #In some cases, the occupation isn't listed. If it is included, get the text of the occupation field.
        #If it isn't included, "None" will be stored as the occupation, since p.occupation would return "None."
        try:
            info['occupation'] = p.occupation.get_text()
        except AttributeError:
            info['occupation'] = p.occupation
        
        #Get a list of the languages that the participant speaks. Iterate through the list, and add them to the
        #dictionary according to the speaker's level (ie. L1).
        languages = p.findAll('langKnown')
        for l in languages:
            level = l['level']
            language = l['tag']
        
            if level in info:
                info[level].append(language)
            else:
                info[level] = [language]
    
        #Get the participant's ID number, and make it a key in the participants dictionary. The value will be
        #the info dictionary
        name = p['xml:id']
        participants[name] = info

# Getting lines of the conversation from the file
The conversation_lines function gets each line from the current conversation. The lines are stored as lists in the "conversations" dictionary, whose keys are the names of the XML files. For now, I decided to keep the lines in their XML format; there are a lot of annotations in the XML format that could be useful later on, such as the speaker, pauses, and intonation markings. Converting the XML lines into text/getting rid of the tags is simple, so I could change this later on.

In [5]:
def conversation_lines(file, contents):
    file_name = file.split("/")[-1]
    text_body = contents.body
    xml_lines = text_body.findAll('u')
    conversations[file_name] = xml_lines

# Processing the XML files
This section iterates through all of the files (except for corpus-header.xml) in the VOICE1.0XML/XML directory. It calls conversation_lines and participant_info to extract some important parts of the data from the corpus.

In [6]:
for t in transcripts:
    file = open(t, 'r')
    text = file.read()
    xml_contents = BeautifulSoup(text, 'xml')
    conversation_lines(t, xml_contents)
    participant_info(xml_contents)

In [7]:
conversations['EDcon496.xml'][0]

<u who="#EDcon496_S1" xml:id="EDcon496_u_1"> e<c type="lengthening"/>r leads so <pause/> ma<c type="lengthening"/>n i'm still stuck on lead du<c type="lengthening"/>de <pause dur="PT3S"/></u>

In [8]:
participants['EDcon250_S2']

{'L1': ['ger-AT', 'eng-US'],
 'age': '25-34',
 'occupation': None,
 'role': 'participant',
 'sex': 'female'}

In [9]:
#Get native English speakers
native_speakers = []

#There are multiple ways that English is listed as an L1 ("eng", "eng-US", "eng-CA", "eng-GB", "eng-GY", "eng-AU", etc)
#I used a regular expression to find all of these instances
r = re.compile("eng.*")

for person in participants:
    
    #returns a list of all languages that contain "eng.*" The length of this list should be 1 or 0. If it's 1, the
    #participant has English listed as an L1.
    english = list(filter(r.match, participants[person]['L1']))
    
    if len(english) != 0:
        #print(person, ':', participants[person])
        native_speakers.append(person)

In [10]:
participant = native_speakers[0]
languages = participants[participant]['L1']

In [11]:
english_dialects = {}
for p in native_speakers:
    languages = participants[p]['L1']
    for l in languages:
        if 'eng' not in l:
            continue
        #print(l)
        
        if l not in english_dialects:
            english_dialects[l] = 1
        else:
            english_dialects[l] += 1

In [12]:
english_dialects

{'eng': 8,
 'eng-AU': 3,
 'eng-CA': 6,
 'eng-GB': 39,
 'eng-GY': 1,
 'eng-IE': 3,
 'eng-MT': 7,
 'eng-US': 20}

In [13]:
bilinguals = []
L1_counts = {}
#participants[native_speakers[0]]
for p in participants:
    #print(participants[p]['L1'])
    languages = participants[p]['L1']
    if len(languages) > 1:
        bilinguals.append(p)
        if p in native_speakers:
            if 'eng' not in L1_counts:
                L1_counts['eng'] = 1
            else:
                L1_counts['eng'] += 1
        continue
    
    
    for l in languages:
        L1 = l.split("-")[0]
        if L1 not in L1_counts:
            L1_counts[L1] = 1
        else:
            L1_counts[L1] += 1
        #print(l.split("-")[0])

In [14]:
len(bilinguals)

44

In [15]:
for p in bilinguals:
    print(participants[p]['L1'])

['ger-AT', 'eng-US']
['eng-GY', 'dut-NL']
['fre-FR', 'eng']
['mlt-MT', 'eng-MT']
['mlt-MT', 'eng-MT']
['mlt-MT', 'eng-MT']
['mlt-MT', 'ger-AT']
['ger-AT', 'eng-US']
['ukr-UA', 'rus']
['ger-DE', 'ind-ID']
['ger-AT', 'eng-US']
['eng', 'heb', 'dut']
['slo-SK', 'ger']
['eng-CA', 'chi-CN']
['eng-CA', 'chi-CN']
['fre-CH', 'ger-CH']
['dut-NL', 'eng']
['eng-GB', 'spa-ES']
['fre-CH', 'ger-CH']
['eng-GB', 'spa-ES']
['eng-GB', 'spa-ES']
['eng-GB', 'spa-ES']
['fre-CH', 'ger-CH']
['eng-GB', 'spa-ES']
['eng-GB', 'spa-ES']
['fre-CH', 'ger-CH']
['eng-GB', 'spa-ES']
['fre-CH', 'ger-CH']
['eng-GB', 'spa-ES']
['mlt-MT', 'eng-MT']
['cat-ES', 'spa-ES']
['mlt-MT', 'eng-MT']
['mlt-MT', 'eng-MT']
['mlt-MT', 'eng-MT']
['spa-ES', 'cat-ES']
['dut', 'ger-AT']
['spa-ES', 'cat']
['por-PT', 'ger-AT']
['por-PT', 'ger-AT']
['dut', 'ger-AT']
['ger', 'ita']
['ger-AT', 'pol-PL']
['ara-PS', 'ger-AT']
['per-IR', 'eng-US']


In [16]:
L1_counts

{'alb': 11,
 'ara': 4,
 'arm': 6,
 'bos': 5,
 'bul': 13,
 'cat': 6,
 'chi': 7,
 'cze': 18,
 'dan': 35,
 'dut': 72,
 'eng': 87,
 'est': 8,
 'fin': 51,
 'fre': 63,
 'ger': 303,
 'gre': 14,
 'hin': 4,
 'hun': 13,
 'ice': 3,
 'ind': 2,
 'ita': 54,
 'jpn': 5,
 'kaz': 2,
 'kir': 2,
 'kor': 14,
 'lav': 19,
 'lit': 9,
 'mac': 14,
 'mlt': 22,
 'nor': 34,
 'per': 2,
 'pol': 35,
 'por': 21,
 'rum': 30,
 'rus': 22,
 'scc': 24,
 'scr': 13,
 'slo': 29,
 'slv': 16,
 'spa': 72,
 'swe': 16,
 'tgl': 1,
 'tur': 15,
 'ukr': 4,
 'und': 37,
 'urd': 2,
 'vie': 1,
 'yor': 1}

In [17]:
modified_conversations = {}
lines = {}

for file in conversations:
    conv_lines = {}
    
    c = conversations[file]
    
    for l in c:
        participant = l['who'].replace("#", "")
        line_id = l['xml:id']
        text = l.get_text()
        tokens = nltk.word_tokenize(text)
        
        if participant in native_speakers:
            continue
            
        if participant not in participants:
            #print(participant)
            continue
        
        elif len(text) == 0:
            continue
    
        elif l.foreign != None:     #returns the line if has "foreign" tag; None if it doesn't contain the tag
            #print(l.foreign.get_text())   #text of line  
            #print(l.foreign['xml:lang'])  #language used
            #print(l)                      #line (in XML format)
            continue
        
        elif l.unclear != None:     #returns the line if has "unclear" tag; None if it doesn't contain the tag
            #    print(l)
            continue
    
        elif l.reading_aloud != None:
            continue
            #print(l)
        
        elif l.reading != None :
            continue
            #print(l)
        
        else:
            key = (line_id, participant)
            #conv_lines[key] = text
            conv_lines[key] = tokens
    
    modified_conversations[file] = conv_lines
        
        

In [18]:
#%pprint
#modified_conversations['EDcon4.xml']
#modified_conversations['EDsed364.xml']
#modified_conversations['EDsed362.xml']

In [19]:
##Processing Part of speech tagged corpus

In [20]:
transcripts = glob.glob('data/VOICE/VOICEPOSXML2.0/XML/*.xml')
del transcripts[0]

In [21]:
tagged_conversations = {}
def xml_conversation_lines(file, contents):
    file_name = file.split("/")[-1]
    tagged_conversations[file_name] = contents

In [22]:
for t in transcripts:
    file = open(t, 'r')
    text = file.read()
    xml_contents = BeautifulSoup(text, 'xml')
    #conversation_lines(t, xml_contents)
    #participant_info(xml_contents)
    xml_conversation_lines(t, xml_contents)

In [23]:
utterances = xml_contents.body.findAll('u')

for u in utterances[:5]:
    participant = u['who'].replace("#", "")
    line_id = u['xml:id']
    
    key = (participant, line_id)
    print(key)
    
    tags = u.findAll('w')
   
    for t in tags:
        word = t.get_text()
        tag = t['ana'].replace("#", "")
        tu = (word, tag)
        print('\t', tu)
        

('PRwgd537_S10', 'PRwgd537_u_1')
	 ('er', 'UHfUH')
	 ('so', 'DMfDM')
	 ('c-', 'XXfXX')
	 ('c-', 'XXfXX')
	 ('can', 'MDfMD')
	 ('we', 'PPfPP')
	 ('prove', 'VVfVV')
	 ('that', 'INfIN')
	 ('if', 'INfIN')
	 ('er', 'UHfUH')
	 ('three', 'CDfCD')
	 ('form', 'NNfNN')
	 ('omega', 'NNfNN')
	 ('alpha', 'NNfNN')
	 ('define', 'VVPfVVP')
	 ('er', 'UHfUH')
	 ('this', 'DTfDT')
	 ('er', 'UHfUH')
	 ('distribution', 'NNfNN')
	 ('s_h', 'SYMfSYM')
	 ('_0', 'PAfPA')
	 ('and', 'CCfCC')
	 ('satisfies', 'VVZfVVZ')
	 ('this', 'DTfDT')
	 ('condition', 'NNfNN')
	 ('so', 'DMfDM')
	 ('in', 'INfIN')
	 ('fact', 'NNfNN')
	 ('er', 'UHfUH')
	 ('we', 'PPfPP')
	 ('have', 'VHPfVHP')
	 ('definition', 'NNfNN')
	 ('that', 'INfIN')
	 ('_0', 'PAfPA')
	 ('you', 'PPfPP')
	 ('say', 'VVPfVVP')
	 ('that', 'INfIN')
('PRwgd537_S11', 'PRwgd537_u_2')
	 ('yeah', 'REfRE')
('PRwgd537_S10', 'PRwgd537_u_3')
	 ('er', 'UHfUH')
	 ('three', 'CDfCD')
	 ('er', 'UHfUH')
	 ('_0', 'PAfPA')
	 ('okay', 'REfRE')
	 ('triple', 'NNfNN')
	 ('_1', 'PAfPA')
	

In [24]:
pos_tagged = {}
for conv in tagged_conversations:
    conv_lines = {}
    
    for line in tagged_conversations[conv].body.findAll('u'):
        utterance = []
   
        participant = line['who'].replace("#", "")
        line_id = line['xml:id']
        
        key = (participant, line_id)
        
        tags = line.findAll('w')
        #print(tags)
   
        for t in tags:
            ##print(t)
            word = t.get_text()
            #tag = t['ana'].replace("#", "")
            #print(tag)
            #pair = (word, tag)
            ##print('\t', tu)
            utterance.append((word, tag))
        
        conv_lines[key] = utterance

    print(conv)
    pos_tagged[conv] = conv_lines
        
        
        


EDcon250.xml
EDcon4.xml
EDcon496.xml
EDcon521.xml
EDint328.xml
EDint330.xml
EDint331.xml
EDint604.xml
EDint605.xml
EDsed251.xml
EDsed301.xml
EDsed31.xml
EDsed362.xml
EDsed363.xml
EDsed364.xml
EDsve421.xml
EDsve422.xml
EDsve423.xml
EDsve451.xml
EDsve452.xml
EDwgd241.xml
EDwgd305.xml
EDwgd497.xml
EDwgd5.xml
EDwgd6.xml
EDwsd15.xml
EDwsd242.xml
EDwsd302.xml
EDwsd303.xml
EDwsd304.xml
EDwsd306.xml
EDwsd464.xml
EDwsd499.xml
EDwsd590.xml
EDwsd9.xml
LEcon227.xml
LEcon228.xml
LEcon229.xml
LEcon329.xml
LEcon351.xml
LEcon352.xml
LEcon353.xml
LEcon405.xml
LEcon417.xml
LEcon418.xml
LEcon420.xml
LEcon545.xml
LEcon547.xml
LEcon548.xml
LEcon560.xml
LEcon562.xml
LEcon565.xml
LEcon566.xml
LEcon573.xml
LEcon575.xml
LEcon8.xml
LEint551.xml
LEint552.xml
LEint553.xml
LEint554.xml
LEint555.xml
PBcon594.xml
PBmtg269.xml
PBmtg27.xml
PBmtg280.xml
PBmtg3.xml
PBmtg300.xml
PBmtg414.xml
PBmtg462.xml
PBmtg463.xml
PBpan10.xml
PBpan25.xml
PBpan28.xml
PBpan581.xml
PBqas410.xml
PBqas411.xml
PBqas412.xml
PBqas523.xml
PBsv

In [25]:
pos_tagged['EDcon250.xml']

{('EDcon250_S1', 'EDcon250_u_1'): [('f_prego', 'PAfPA'), ('_0', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_10'): [('f_passt', 'PAfPA'), ('_2', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_13'): [('yeah', 'PAfPA'),
  ('yes', 'PAfPA'),
  ('it', 'PAfPA'),
  ("'s", 'PAfPA'),
  ('not', 'PAfPA'),
  ('very', 'PAfPA'),
  ('big', 'PAfPA'),
  ('but', 'PAfPA'),
  ('_1', 'PAfPA'),
  ('we', 'PAfPA'),
  ('have', 'PAfPA'),
  ('only', 'PAfPA'),
  ('_0', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_15'): [('big', 'PAfPA'),
  ('pizza', 'PAfPA'),
  ('we', 'PAfPA'),
  ('have', 'PAfPA'),
  ('no', 'PAfPA'),
  ('small', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_19'): [('one', 'PAfPA'),
  ('okay', 'PAfPA'),
  ('one', 'PAfPA'),
  ('f_toscana', 'PAfPA'),
  ('medium', 'PAfPA'),
  ('okay', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_27'): [('for', 'PAfPA'), ('you', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_29'): [('f_margherita', 'PAfPA')],
 ('EDcon250_S1', 'EDcon250_u_3'): [('f_gemuese', 'PAfPA'),
  ('f_lasagne', 'PAfPA'),
  ('f_ger

In [26]:
pos_tagged.keys()

dict_keys(['EDcon250.xml', 'EDcon4.xml', 'EDcon496.xml', 'EDcon521.xml', 'EDint328.xml', 'EDint330.xml', 'EDint331.xml', 'EDint604.xml', 'EDint605.xml', 'EDsed251.xml', 'EDsed301.xml', 'EDsed31.xml', 'EDsed362.xml', 'EDsed363.xml', 'EDsed364.xml', 'EDsve421.xml', 'EDsve422.xml', 'EDsve423.xml', 'EDsve451.xml', 'EDsve452.xml', 'EDwgd241.xml', 'EDwgd305.xml', 'EDwgd497.xml', 'EDwgd5.xml', 'EDwgd6.xml', 'EDwsd15.xml', 'EDwsd242.xml', 'EDwsd302.xml', 'EDwsd303.xml', 'EDwsd304.xml', 'EDwsd306.xml', 'EDwsd464.xml', 'EDwsd499.xml', 'EDwsd590.xml', 'EDwsd9.xml', 'LEcon227.xml', 'LEcon228.xml', 'LEcon229.xml', 'LEcon329.xml', 'LEcon351.xml', 'LEcon352.xml', 'LEcon353.xml', 'LEcon405.xml', 'LEcon417.xml', 'LEcon418.xml', 'LEcon420.xml', 'LEcon545.xml', 'LEcon547.xml', 'LEcon548.xml', 'LEcon560.xml', 'LEcon562.xml', 'LEcon565.xml', 'LEcon566.xml', 'LEcon573.xml', 'LEcon575.xml', 'LEcon8.xml', 'LEint551.xml', 'LEint552.xml', 'LEint553.xml', 'LEint554.xml', 'LEint555.xml', 'PBcon594.xml', 'PBmtg269

In [27]:
pos_tagged['EDcon4.xml']

{('EDcon4_S1', 'EDcon4_u_1'): [('running', 'PAfPA'),
  ('we', 'PAfPA'),
  ('got', 'PAfPA'),
  ('_0', 'PAfPA')],
 ('EDcon4_S2', 'EDcon4_u_2'): [('with', 'PAfPA'), ('whom', 'PAfPA')],
 ('EDcon4_S1', 'EDcon4_u_3'): [('a_[firstname1]', 'PAfPA'),
  ('_0', 'PAfPA'),
  ('the', 'PAfPA'),
  ('belgium', 'PAfPA'),
  ('_0', 'PAfPA')],
 ('EDcon4_S2', 'EDcon4_u_4'): [('okay', 'PAfPA'),
  ('@@', 'PAfPA'),
  ('_0', 'PAfPA')],
 ('EDcon4_S1', 'EDcon4_u_5'): [('and', 'PAfPA'),
  ('@@', 'PAfPA'),
  ('and', 'PAfPA'),
  ('er', 'PAfPA'),
  ('_0', 'PAfPA')],
 ('EDcon4_S3', 'EDcon4_u_6'): [('oh', 'PAfPA'),
  ('_0', 'PAfPA'),
  ('sorry', 'PAfPA'),
  ('_1', 'PAfPA')],
 ('EDcon4_S1', 'EDcon4_u_7'): [('and', 'PAfPA'),
  ('the', 'PAfPA'),
  ('problem', 'PAfPA'),
  ('was', 'PAfPA'),
  ('that', 'PAfPA'),
  ('she', 'PAfPA'),
  ('was', 'PAfPA'),
  ('like', 'PAfPA'),
  ('running', 'PAfPA')],
 ('EDcon4_S2', 'EDcon4_u_8'): [('@@', 'PAfPA'), ('@', 'PAfPA')],
 ('EDcon4_S1', 'EDcon4_u_9'): [('the', 'PAfPA'),
  ('whole', 'PAf

In [28]:
for l in pos_tagged['EDcon4.xml']:
    print(pos_tagged['EDcon4.xml'][l])

[('running', 'PAfPA'), ('we', 'PAfPA'), ('got', 'PAfPA'), ('_0', 'PAfPA')]
[('with', 'PAfPA'), ('whom', 'PAfPA')]
[('a_[firstname1]', 'PAfPA'), ('_0', 'PAfPA'), ('the', 'PAfPA'), ('belgium', 'PAfPA'), ('_0', 'PAfPA')]
[('okay', 'PAfPA'), ('@@', 'PAfPA'), ('_0', 'PAfPA')]
[('and', 'PAfPA'), ('@@', 'PAfPA'), ('and', 'PAfPA'), ('er', 'PAfPA'), ('_0', 'PAfPA')]
[('oh', 'PAfPA'), ('_0', 'PAfPA'), ('sorry', 'PAfPA'), ('_1', 'PAfPA')]
[('and', 'PAfPA'), ('the', 'PAfPA'), ('problem', 'PAfPA'), ('was', 'PAfPA'), ('that', 'PAfPA'), ('she', 'PAfPA'), ('was', 'PAfPA'), ('like', 'PAfPA'), ('running', 'PAfPA')]
[('@@', 'PAfPA'), ('@', 'PAfPA')]
[('the', 'PAfPA'), ('whole', 'PAfPA'), ('city', 'PAfPA'), ('and', 'PAfPA'), ('i', 'PAfPA'), ('was', 'PAfPA'), ('like', 'PAfPA'), ('_0', 'PAfPA')]
[('@@', 'PAfPA'), ('@', 'PAfPA')]
[('i', 'PAfPA'), ("'ll", 'PAfPA'), ('be', 'PAfPA'), ('back', 'PAfPA'), ('and', 'PAfPA'), ('we', 'PAfPA'), ('saw', 'PAfPA'), ('the', 'PAfPA'), ('people', 'PAfPA'), ('because', 'PAfPA

[('i', 'PAfPA'), ("'m", 'PAfPA'), ('going', 'PAfPA'), ('to', 'PAfPA'), ('the', 'PAfPA'), ('crazy', 'PAfPA'), ('world', 'PAfPA'), ('where', 'PAfPA'), ('the', 'PAfPA'), ('wh-', 'PAfPA'), ('where', 'PAfPA'), ('the', 'PAfPA'), ('weather', 'PAfPA'), ('xxx', 'PAfPA'), ('with', 'PAfPA'), ('the', 'PAfPA'), ('same', 'PAfPA'), ('people', 'PAfPA'), ('_0', 'PAfPA')]
[('it', 'PAfPA'), ("'s", 'PAfPA'), ('nice', 'PAfPA'), ('_1', 'PAfPA')]
[('is', 'PAfPA'), ('it', 'PAfPA'), ('_1', 'PAfPA')]
[('it', 'PAfPA'), ("'s", 'PAfPA'), ('get', 'PAfPA'), ('come', 'PAfPA'), ('on', 'PAfPA'), ('it', 'PAfPA'), ("'s", 'PAfPA'), ('nicer', 'PAfPA'), ('than', 'PAfPA'), ('this', 'PAfPA'), ('morning', 'PAfPA'), ('_0', 'PAfPA')]
[('yeah', 'PAfPA'), ('but', 'PAfPA'), ('still', 'PAfPA'), ('it', 'PAfPA'), ("'s", 'PAfPA'), ('freezing', 'PAfPA'), ('_37', 'PAfPA')]
[('go', 'PAfPA'), ('upstairs', 'PAfPA'), ('to', 'PAfPA'), ('take', 'PAfPA'), ('this', 'PAfPA'), ('xx', 'PAfPA'), ('xx', 'PAfPA'), ('x', 'PAfPA'), ('_3', 'PAfPA'), ('wh