In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
# English stop words from NLTK's stopwords corpus, held in a set.
# NOTE(review): not referenced in the cells visible here — confirm it is used later.
stopWords = set(stopwords.words('english'))

# Show the value of every top-level expression in a cell, not only the last one
# (a later cell lists several bare len(...) expressions and relies on this).
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Load the three pickled VOICE artifacts. Each file is opened in a `with` block
# so the handle is closed even if unpickling raises.
# NOTE(security): pickle.load can execute arbitrary code while loading; only
# open pickle files that come from a trusted source.
with open('VOICE_tokenized.p', 'rb') as f:
    VOICE_toks = pickle.load(f)

# Used below as: conversation -> {utterance key -> tagged utterance}.
with open('VOICE_tagged.p', 'rb') as f:
    VOICE_tags = pickle.load(f)

# Used below as: participant ID -> info dict that has an 'L1' entry.
with open('VOICE_participant_info.p', 'rb') as f:
    participants = pickle.load(f)

In [13]:
def get_monolingual_speakers(participant_dict, language):
    """Return the IDs of participants whose single listed L1 is `language`.

    Args:
        participant_dict: Mapping of participant ID -> info dict; each info
            dict must provide an 'L1' entry that supports len() and `in`.
        language: The L1 value to look for (the notebook passes codes such
            as 'eng').

    Returns:
        List of participant IDs, in the mapping's iteration order, whose
        'L1' entry has exactly one element and contains `language`.
    """
    return [
        speaker_id
        for speaker_id, info in participant_dict.items()
        if len(info['L1']) == 1 and language in info['L1']
    ]

In [46]:
# Build one list of monolingual speakers per L1 code. The targets on the left
# and the codes on the right are listed in the same order.
(
    eng_speakers,
    pol_speakers,
    kor_speakers,
    fin_speakers,
    dan_speakers,
    tur_speakers,
    hun_speakers,
    por_speakers,
    rus_speakers,
    mlt_speakers,
    lav_speakers,
) = (
    get_monolingual_speakers(participants, l1_code)
    for l1_code in (
        'eng',
        'pol',
        'kor',
        'fin',
        'dan',
        'tur',
        'hun',
        'por',
        'rus',
        'mlt',
        'lav',
    )
)

In [48]:
# Report the number of monolingual speakers in each L1 group, one line per group.
print(
    *(
        label + " " + str(len(group))
        for label, group in (
            ("English speakers:", eng_speakers),
            ("Polish speakers:", pol_speakers),
            ("Korean speakers:", kor_speakers),
            ("Finnish speakers:", fin_speakers),
            ("Danish speakers:", dan_speakers),
            ("Turkish speakers:", tur_speakers),
            ("Hungarian speakers:", hun_speakers),
            ("Portugese speakers:", por_speakers),
            ("Russian speakers:", rus_speakers),
            ("Maltese speakers:", mlt_speakers),
            ("Latvian speakers:", lav_speakers),
        )
    ),
    sep="\n",
)

English speakers: 62
Polish speakers: 35
Korean speakers: 14
Finnish speakers: 51
Danish speakers: 35
Turkish speakers: 14
Hungarian speakers: 13
Portugese speakers: 21
Russian speakers: 22
Maltese speakers: 22
Latvian speakers: 19


In [None]:
# Need to include English.
# Also use Korean, Finnish, Turkish,
# Danish, Portuguese, and Polish.

In [91]:
def get_tagged_utterances(tokens, speakers):
    """Collect every utterance whose key starts with one of the given speakers.

    Despite the parameter name, the notebook passes the tagged corpus here;
    the function itself does not care what the utterance values contain.

    Args:
        tokens: Mapping of conversation -> mapping of utterance key ->
            utterance. Each utterance key must be indexable; only its first
            element is inspected and compared against `speakers`.
        speakers: Iterable of hashable speaker IDs to keep.

    Returns:
        List of the matching utterance values, conversation by conversation,
        in each mapping's iteration order.
    """
    # The membership test runs once per utterance in the corpus; a frozenset
    # makes each test O(1) instead of a scan through the speaker list.
    wanted = frozenset(speakers)
    return [
        utterance
        for conversation in tokens.values()
        for key, utterance in conversation.items()
        if key[0] in wanted
    ]
                
    
    

In [94]:
# Pull the tagged utterances for each of the seven L1 groups under study.
# Targets and speaker lists are given in the same order.
(
    eng_speech,
    kor_speech,
    fin_speech,
    tur_speech,
    dan_speech,
    por_speech,
    pol_speech,
) = (
    get_tagged_utterances(VOICE_tags, group)
    for group in (
        eng_speakers,
        kor_speakers,
        fin_speakers,
        tur_speakers,
        dan_speakers,
        por_speakers,
        pol_speakers,
    )
)

In [100]:
def get_len_dists(tok_list):
    """Return the relative frequency of each utterance length.

    Args:
        tok_list: Iterable of utterances; each must support len().

    Returns:
        Dict mapping utterance length -> share of utterances with that
        length (count divided by the total number of utterances). Keys
        appear in order of first occurrence. Empty input gives {}.
    """
    counts = {}
    for utterance in tok_list:
        size = len(utterance)
        counts[size] = counts.get(size, 0) + 1

    # Every utterance was counted exactly once, so the counts sum to the total.
    n_utterances = sum(counts.values())
    return {size: hits / n_utterances for size, hits in counts.items()}

In [111]:
def get_top_dists(len_dists, top_n=10):
    """Print the most frequent entries of a length distribution.

    Despite the name, nothing is returned: each entry is printed on its own
    line as a tab, the key, ': ' and the value.

    Args:
        len_dists: Dict mapping a key (here an utterance length) to its
            frequency.
        top_n: How many entries to print, highest frequency first.
            Defaults to 10, the value that was previously hard-coded.
    """
    # sorted() is stable, so entries with equal frequency keep dict order.
    ranked = sorted(len_dists, key=len_dists.get, reverse=True)
    for length in ranked[:top_n]:
        print(f"\t{length}: {len_dists[length]}")

In [112]:
# Utterance-length distribution for each L1 group.
# Targets and inputs are given in the same order.
(
    eng_lens,
    kor_lens,
    fin_lens,
    tur_lens,
    dan_lens,
    por_lens,
    pol_lens,
) = (
    get_len_dists(speech)
    for speech in (
        eng_speech,
        kor_speech,
        fin_speech,
        tur_speech,
        dan_speech,
        por_speech,
        pol_speech,
    )
)

In [115]:
# For each L1 group, print its heading followed by its most frequent utterance lengths.
for l1_heading, l1_len_dist in (
    ("English:", eng_lens),
    ("Korean:", kor_lens),
    ("Finnish:", fin_lens),
    ("Turkish:", tur_lens),
    ("Danish:", dan_lens),
    ("Portugese:", por_lens),
    ("Polish:", pol_lens),
):
    print(l1_heading)
    get_top_dists(l1_len_dist)

English:
	1: 0.272583559168925
	2: 0.0948509485094851
	3: 0.06097560975609756
	4: 0.05984643179765131
	5: 0.04968383017163505
	6: 0.047651309846431796
	7: 0.04245709123757904
	8: 0.037037037037037035
	9: 0.03071364046973803
	10: 0.026196928635953028
Korean:
	1: 0.26313813813813813
	2: 0.16554054054054054
	3: 0.08108108108108109
	4: 0.05593093093093093
	6: 0.05067567567567568
	5: 0.04804804804804805
	8: 0.03490990990990991
	7: 0.03303303303303303
	10: 0.02702702702702703
	9: 0.025525525525525526
Finnish:
	1: 0.32821368948247076
	2: 0.11619365609348915
	3: 0.06310517529215359
	4: 0.051419031719532556
	5: 0.04040066777963272
	6: 0.038731218697829715
	7: 0.032387312186978295
	9: 0.02904841402337229
	8: 0.02671118530884808
	10: 0.019031719532554257
Turkish:
	1: 0.14722222222222223
	2: 0.11388888888888889
	3: 0.05
	4: 0.044444444444444446
	5: 0.041666666666666664
	6: 0.03333333333333333
	8: 0.03333333333333333
	9: 0.030555555555555555
	16: 0.025
	10: 0.022222222222222223
Danish:
	1: 0.271252

In [None]:
# Next step: analyze the tags in utterances of one fixed length. The length should be common across all L1 groups
# (short utterances are very common in every group), but long enough to give some interesting results.

# Try utterances of length 3 first.

In [135]:
def tags_in_utterance_length(tok_list, utterance_len):
    """Return the tag sequences of all utterances with exactly the given length.

    Args:
        tok_list: Iterable of utterances; each utterance is a sequence of
            indexable items whose element at index 1 is taken as the tag.
        utterance_len: Required number of items in an utterance.

    Returns:
        List with one list of tags per matching utterance, in input order.
    """
    return [
        [item[1] for item in utterance]
        for utterance in tok_list
        if len(utterance) == utterance_len
    ]

In [136]:
# Tag sequences of every 3-token utterance, per L1 group.
# Targets and inputs are given in the same order.
(
    eng3_tags,
    kor3_tags,
    fin3_tags,
    tur3_tags,
    dan3_tags,
    por3_tags,
    pol3_tags,
) = (
    tags_in_utterance_length(speech, 3)
    for speech in (
        eng_speech,
        kor_speech,
        fin_speech,
        tur_speech,
        dan_speech,
        por_speech,
        pol_speech,
    )
)

In [142]:
# Number of 3-token utterances per L1 group (English, Korean, Finnish, Turkish,
# Danish, Portuguese, Polish). Every bare expression is displayed because
# ast_node_interactivity is set to "all" in the first cell.
len(eng3_tags)
len(kor3_tags)
len(fin3_tags)
len(tur3_tags)
len(dan3_tags)
len(por3_tags)
len(pol3_tags)

270

216

189

18

226

75

280