In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
stopWords = set(stopwords.words('english'))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
f = open('VOICE_tokenized.p', 'rb')
VOICE_toks = pickle.load(f)
f.close()

f = open('VOICE_tagged.p', 'rb')
VOICE_tags = pickle.load(f)
f.close()

f = open('VOICE_native_tagged.p', 'rb')
VOICE_native_tags = pickle.load(f)
f.close()

f = open('VOICE_participant_info.p', 'rb')
participants = pickle.load(f)
f.close()

In [3]:
def get_monolingual_speakers(participant_dict, language):
    speakers = []
    for p in participant_dict.keys():
        L1s = participant_dict[p]['L1']
        if(len(L1s) == 1 and language in L1s):
            speakers.append(p)
            
    return speakers

In [4]:
eng_speakers = get_monolingual_speakers(participants, 'eng')
pol_speakers = get_monolingual_speakers(participants, 'pol')
kor_speakers = get_monolingual_speakers(participants, 'kor')
fin_speakers = get_monolingual_speakers(participants, 'fin')
dan_speakers = get_monolingual_speakers(participants, 'dan')
tur_speakers = get_monolingual_speakers(participants, 'tur')
hun_speakers = get_monolingual_speakers(participants, 'hun')
por_speakers = get_monolingual_speakers(participants, 'por')
rus_speakers = get_monolingual_speakers(participants, 'rus')
mlt_speakers = get_monolingual_speakers(participants, 'mlt')
lav_speakers = get_monolingual_speakers(participants, 'lav')

In [5]:
print("English speakers:", len(eng_speakers))
print("Polish speakers:", len(pol_speakers))
print("Korean speakers:", len(kor_speakers))
print("Finnish speakers:", len(fin_speakers))
print("Danish speakers:", len(dan_speakers))
print("Turkish speakers:", len(tur_speakers))
print("Hungarian speakers:", len(hun_speakers))
print("Portuguese speakers:", len(por_speakers))
print("Russian speakers:", len(rus_speakers))
print("Maltese speakers:", len(mlt_speakers))
print("Latvian speakers:", len(lav_speakers))

English speakers: 62
Polish speakers: 35
Korean speakers: 14
Finnish speakers: 51
Danish speakers: 35
Turkish speakers: 14
Hungarian speakers: 13
Portuguese speakers: 21
Russian speakers: 22
Maltese speakers: 22
Latvian speakers: 19


In [6]:
#Need to use English
#Use Korean, Finnish, Turkish
#Danish, Portuguese, Polish

In [7]:
#List of speakers
def get_tagged_utterances(tokens, speakers): 
    utterances = []
    for conversation in tokens.keys():
        for pair in tokens[conversation]:
            if(pair[1] in speakers):
                utterances.append(tokens[conversation][pair])
                
    return utterances
                
    
    

In [8]:
eng_speech = get_tagged_utterances(VOICE_native_tags, eng_speakers)
kor_speech = get_tagged_utterances(VOICE_tags, kor_speakers)
fin_speech = get_tagged_utterances(VOICE_tags, fin_speakers)
tur_speech = get_tagged_utterances(VOICE_tags, tur_speakers)
dan_speech = get_tagged_utterances(VOICE_tags, dan_speakers)
por_speech = get_tagged_utterances(VOICE_tags, por_speakers)
pol_speech = get_tagged_utterances(VOICE_tags, pol_speakers)

In [9]:
def get_len_dists(tok_list):
    len_dict = {}
    total = 0
    for l in tok_list:
        utterance_length = len(l)
        
        if(utterance_length not in len_dict):
            len_dict[utterance_length] = 1
        else:
            len_dict[utterance_length] += 1
            
        total += 1
            
    for l in len_dict.keys():
        len_dict[l] = len_dict[l]/total
        
    return len_dict

In [10]:
def get_top_dists(len_dists):
    for s in sorted(len_dists, key=len_dists.get, reverse=True)[:10]:
        print("\t" + str(s) + ": " + str(len_dists[s]))

In [11]:
eng_lens = get_len_dists(eng_speech)
kor_lens = get_len_dists(kor_speech)
fin_lens = get_len_dists(fin_speech)
tur_lens = get_len_dists(tur_speech)
dan_lens = get_len_dists(dan_speech)
por_lens = get_len_dists(por_speech)
pol_lens = get_len_dists(pol_speech)

In [12]:
print("English:")
get_top_dists(eng_lens)
print("Korean:")
get_top_dists(kor_lens)
print("Finnish:")
get_top_dists(fin_lens)
print("Turkish:")
get_top_dists(tur_lens)
print("Danish:")
get_top_dists(dan_lens)
print("Portuguese:")
get_top_dists(por_lens)
print("Polish:")
get_top_dists(pol_lens)

English:
	1: 0.272583559168925
	2: 0.0948509485094851
	3: 0.06097560975609756
	4: 0.05984643179765131
	5: 0.04968383017163505
	6: 0.047651309846431796
	7: 0.04245709123757904
	8: 0.037037037037037035
	9: 0.03071364046973803
	10: 0.026196928635953028
Korean:
	1: 0.26313813813813813
	2: 0.16554054054054054
	3: 0.08108108108108109
	4: 0.05593093093093093
	6: 0.05067567567567568
	5: 0.04804804804804805
	8: 0.03490990990990991
	7: 0.03303303303303303
	10: 0.02702702702702703
	9: 0.025525525525525526
Finnish:
	1: 0.32821368948247076
	2: 0.11619365609348915
	3: 0.06310517529215359
	4: 0.051419031719532556
	5: 0.04040066777963272
	6: 0.038731218697829715
	7: 0.032387312186978295
	9: 0.02904841402337229
	8: 0.02671118530884808
	10: 0.019031719532554257
Turkish:
	1: 0.14722222222222223
	2: 0.11388888888888889
	3: 0.05
	4: 0.044444444444444446
	5: 0.041666666666666664
	6: 0.03333333333333333
	8: 0.03333333333333333
	9: 0.030555555555555555
	16: 0.025
	10: 0.022222222222222223
Danish:
	1: 0.271252

In [13]:
#Will analyze tags in sentences of a fixed length. Want to pick an utterance length that's common across all L1s
# (short utterances are very common in all groups), but want to pick length long enough to get some interesting results

#Will try utterances of length 3 first

In [22]:
def tags_in_utterance_length(tok_list, utterance_len):
    utterances = []
    for u in tok_list:
        if(len(u) == utterance_len):
            tags = [pair[1] for pair in u]
            utterances.append(tuple(tags))
            
    return utterances

In [23]:
eng3_tags = tags_in_utterance_length(eng_speech, 3)
kor3_tags = tags_in_utterance_length(kor_speech, 3)
fin3_tags = tags_in_utterance_length(fin_speech, 3)
tur3_tags = tags_in_utterance_length(tur_speech, 3)
dan3_tags = tags_in_utterance_length(dan_speech, 3)
por3_tags = tags_in_utterance_length(por_speech, 3)
pol3_tags = tags_in_utterance_length(pol_speech, 3)

In [24]:
len(eng3_tags)
len(kor3_tags)
len(fin3_tags)
len(tur3_tags)
len(dan3_tags)
len(por3_tags)
len(pol3_tags)

270

216

189

18

226

75

280

In [29]:
def get_freq_dict(li):
    freqs = nltk.FreqDist(li)
    return freqs

In [44]:
#freqs = nltk.FreqDist(eng3_tags)
#freqs.most_common(10)
eng3_tags_freqs = get_freq_dict(eng3_tags)
kor3_tags_freqs = get_freq_dict(kor3_tags)
fin3_tags_freqs = get_freq_dict(fin3_tags)
dan3_tags_freqs = get_freq_dict(dan3_tags)
pol3_tags_freqs = get_freq_dict(pol3_tags)

In [50]:
common_seqs = []
for pair in eng3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in kor3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in fin3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in dan3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in pol3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])

common_seqs = set(common_seqs)

31

In [54]:
for seq in common_seqs:
    print(seq)
    print("\tEnglish: " + str(eng3_tags_freqs[seq] / len(eng3_tags)))
    print("\tKorean: "+ str(kor3_tags_freqs[seq] / len(kor3_tags)))
    print("\tFinnish: " + str(fin3_tags_freqs[seq] / len(fin3_tags)))
    print("\tDanish: " + str(dan3_tags_freqs[seq] / len(dan3_tags)))
    print("\tPolish: " + str(pol3_tags_freqs[seq] / len(pol3_tags)))

('REfRE', 'JJfJJ', 'NNfNN')
	English: 0.0
	Korean: 0.009259259259259259
	Finnish: 0.0
	Danish: 0.0
	Polish: 0.0
('RBfRB', 'RBfRB', 'PAfPA')
	English: 0.003703703703703704
	Korean: 0.004629629629629629
	Finnish: 0.0
	Danish: 0.004424778761061947
	Polish: 0.007142857142857143
('FWfFW', 'FWfFW', 'PAfPA')
	English: 0.0
	Korean: 0.004629629629629629
	Finnish: 0.0
	Danish: 0.008849557522123894
	Polish: 0.010714285714285714
('PPfPP', 'MDfMD', 'VBfVB')
	English: 0.0
	Korean: 0.0
	Finnish: 0.010582010582010581
	Danish: 0.004424778761061947
	Polish: 0.0
('NNfNN', 'CCfCC', 'NNfNN')
	English: 0.007407407407407408
	Korean: 0.0
	Finnish: 0.021164021164021163
	Danish: 0.004424778761061947
	Polish: 0.007142857142857143
('REfRE', 'PAfPA', 'UNIfUNI')
	English: 0.0
	Korean: 0.0
	Finnish: 0.005291005291005291
	Danish: 0.008849557522123894
	Polish: 0.0
('NNfNN', 'NNfNN', 'PAfPA')
	English: 0.007407407407407408
	Korean: 0.009259259259259259
	Finnish: 0.005291005291005291
	Danish: 0.004424778761061947
	Polis

In [63]:
eng_tags = [[t[1] for t in u] for u in eng_speech]
eng_toks = [[t[0] for t in u] for u in eng_speech]

kor_tags = [[t[1] for t in u] for u in kor_speech]
kor_toks = [[t[0] for t in u] for u in kor_speech]

fin_tags = [[t[1] for t in u] for u in fin_speech]
fin_toks = [[t[0] for t in u] for u in fin_speech]

tur_tags = [[t[1] for t in u] for u in tur_speech]
tur_toks = [[t[0] for t in u] for u in tur_speech]

dan_tags = [[t[1] for t in u] for u in dan_speech]
dan_toks = [[t[0] for t in u] for u in dan_speech]

por_tags = [[t[1] for t in u] for u in por_speech]
por_toks = [[t[0] for t in u] for u in por_speech]

pol_tags = [[t[1] for t in u] for u in pol_speech]
pol_toks = [[t[0] for t in u] for u in pol_speech]

In [82]:
def get_trigrams(tags):
    trigram_list = []
    for u in tags:
        for trigram in set(nltk.trigrams(u)):
            if len(trigram) > 0:
                trigram_list.append(trigram)
    return trigram_list

In [86]:
eng_tag_trigrams = get_trigrams(eng_tags)
kor_tag_trigrams = get_trigrams(kor_tags)
fin_tag_trigrams = get_trigrams(fin_tags)
tur_tag_trigrams = get_trigrams(tur_tags)
dan_tag_trigrams = get_trigrams(dan_tags)
por_tag_trigrams = get_trigrams(por_tags)
pol_tag_trigrams = get_trigrams(pol_tags)

In [99]:
eng_freq = nltk.FreqDist(eng_tag_trigrams)
kor_freq = nltk.FreqDist(kor_tag_trigrams)
fin_freq = nltk.FreqDist(fin_tag_trigrams)
tur_freq = nltk.FreqDist(tur_tag_trigrams)
dan_freq = nltk.FreqDist(dan_tag_trigrams)
por_freq = nltk.FreqDist(por_tag_trigrams)
pol_freq = nltk.FreqDist(pol_tag_trigrams)

In [100]:
for trigram in eng_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	English: 0.009448325273150636
	Korean: 0.007010456273764259
	Finnish: 0.009592150545507798
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('DTfDT', 'NNfNN', 'INfIN')
	English: 0.007119828049435787
	Korean: 0.004039923954372623
	Finnish: 0.007029362231822508
	Turkish: 0.004239321708526636
	Danish: 0.008397239263803681
	Portuguese: 0.006132902008356921
	Polish: 0.006493675837788499
('DTfDT', 'JJfJJ', 'NNfNN')
	English: 0.006985491671144546
	Korean: 0.0051093155893536125
	Finnish: 0.006663249615581753
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('PPfPP', 'MDfMD', 'VVfVV')
	English: 0.004992835393157801
	Korean: 0.00374287072243346
	Finnish: 0.00388079373215201
	Turkish: 0.0029595264757638776
	Danish: 0.003853527607361963
	Portuguese: 0.00491980051219841
	Polish: 0.005320119963489372
('NNfNN', 'INfIN', 'D

In [101]:
for trigram in kor_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	Korean: 0.007010456273764259
	English: 0.009448325273150636
	Finnish: 0.009592150545507798
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('NNfNN', 'NNfNN', 'PAfPA')
	Korean: 0.006297528517110266
	English: 0.0016568153322586423
	Finnish: 0.0024895657904371385
	Turkish: 0.0017597184450487923
	Danish: 0.002032208588957055
	Portuguese: 0.0018870467718021297
	Polish: 0.002529664884600339
('DTfDT', 'NNfNN', 'PAfPA')
	Korean: 0.00528754752851711
	English: 0.0038061973849185028
	Finnish: 0.004466573918137219
	Turkish: 0.0035994240921452566
	Danish: 0.004236963190184049
	Portuguese: 0.005526351260277665
	Polish: 0.0049550136914852
('DTfDT', 'JJfJJ', 'NNfNN')
	Korean: 0.0051093155893536125
	English: 0.006985491671144546
	Finnish: 0.006663249615581753
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('JJfJJ', 'NNfNN

In [102]:
for trigram in fin_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	Finnish: 0.009592150545507798
	English: 0.009448325273150636
	Korean: 0.007010456273764259
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('DTfDT', 'NNfNN', 'INfIN')
	Finnish: 0.007029362231822508
	English: 0.007119828049435787
	Korean: 0.004039923954372623
	Turkish: 0.004239321708526636
	Danish: 0.008397239263803681
	Portuguese: 0.006132902008356921
	Polish: 0.006493675837788499
('DTfDT', 'JJfJJ', 'NNfNN')
	Finnish: 0.006663249615581753
	English: 0.006985491671144546
	Korean: 0.0051093155893536125
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('NNfNN', 'INfIN', 'DTfDT')
	Finnish: 0.00611408069122062
	English: 0.004813720222102812
	Korean: 0.003208174904942966
	Turkish: 0.0035994240921452566
	Danish: 0.006077453987730061
	Portuguese: 0.004245855236554792
	Polish: 0.005528752118920329
('DTfDT', 'NNfNN', 

In [127]:
for trigram in eng_freq.most_common(100):
    avg = 0
    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams)) 
    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))
    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams)) 
    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams)

SyntaxError: unexpected EOF while parsing (<ipython-input-127-4e5b80d92e6a>, line 6)

In [162]:
more_common = {}
less_common = {}

for trigram in eng_freq:
    #trigram = trigram[0]
    
    avg = 0
    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams))
    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))
    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams))
    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams))
    avg = avg + (por_freq[trigram] / len(por_tag_trigrams))
    avg = avg + (pol_freq[trigram] / len(pol_tag_trigrams))
    
    avg /= 6
    
    eng_percent = (eng_freq[trigram] / len(eng_tag_trigrams))
    if eng_percent > (avg * 2) and eng_freq[trigram] > 5:
        more_common[trigram] = eng_freq[trigram]
        
        
    if eng_percent < (avg * 0.5) and eng_freq[trigram] > 5:
        less_common[trigram] = eng_freq[trigram]

In [171]:
for trigram in sorted(more_common, key=more_common.get, reverse=True)[:10]:
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('PPfPP', 'VHPfVHP', 'VVNfVVN')
	English: 0.0018807092960773778
	Finnish: 0.0007322252324815113
	Korean: 0.0004752851711026616
	Turkish: 0.0005599104143337066
	Danish: 0.0010927914110429448
	Portuguese: 0.0003369726378218089
	Polish: 0.001147476854870257
('PPfPP', 'VBSfVBS', 'DTfDT')
	English: 0.0018807092960773778
	Finnish: 0.0010251153254741158
	Korean: 0.000594106463878327
	Turkish: 0.00047992321228603423
	Danish: 0.0017446319018404907
	Portuguese: 0.0009435233859010648
	Polish: 0.0006780545051506063
('VVGfVVG', 'TOfTO', 'VVfVV')
	English: 0.0016568153322586423
	Finnish: 0.0010251153254741158
	Korean: 0.0001782319391634981
	Turkish: 0.001039833626619741
	Danish: 0.0009969325153374234
	Portuguese: 0.0007413398032079795
	Polish: 0.0009388446994393011
('INfIN', 'PPfPP', 'VBSfVBS')
	English: 0.0013657531792942862
	Finnish: 0.0008786702789778135
	Korean: 0.0002376425855513308
	Turkish: 0.00023996160614301711
	Danish: 0.0006901840490797546
	Portuguese: 0.000876128858336703
	Polish: 0.0008

In [173]:
for trigram in sorted(less_common, key=less_common.get, reverse=True)[:10]:
    print(trigram)
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('DTfDT', 'NNSfNNS', 'PAfPA')
	English: 0.000738850080601827
	Finnish: 0.0014644504649630227
	Korean: 0.0008911596958174905
	Turkish: 0.0019996800511918092
	Danish: 0.0009585889570552147
	Portuguese: 0.002426202992317024
	Polish: 0.0015125831268744294
('PAfPA', 'NNfNN', 'PAfPA')
	English: 0.0006045137023105857
	Finnish: 0.0007688364941055869
	Korean: 0.0022576045627376424
	Turkish: 0.0016797312430011197
	Danish: 0.0007860429447852761
	Portuguese: 0.0010109179134654266
	Polish: 0.0010170817577259096
('PAfPA', 'RBfRB', 'RBfRB')
	English: 0.0005821243059287122
	Finnish: 0.0015742842498352493
	Korean: 0.0013070342205323193
	Turkish: 0.001039833626619741
	Danish: 0.0015145705521472392
	Portuguese: 0.0013478905512872355
	Polish: 0.0009388446994393011
('PAfPA', 'JJfJJ', 'NNfNN')
	English: 0.0005597349095468387
	Finnish: 0.0009885040638500403
	Korean: 0.00219819391634981
	Turkish: 0.0014397696368581027
	Danish: 0.0009010736196319018
	Portuguese: 0.0010109179134654266
	Polish: 0.000886686660581

In [295]:
def get_discourse_markers(speech):
    dm_list = []
    markers = [[t for t in u if t[1] == "DMfDM"] for u in speech]
    for m in markers:
        dm_list.extend(m)
        
    #return dm_list
    return dm_list

In [296]:
eng_dm = get_discourse_markers(eng_speech)
kor_dm = get_discourse_markers(kor_speech)
fin_dm = get_discourse_markers(fin_speech)
tur_dm = get_discourse_markers(tur_speech)
dan_dm = get_discourse_markers(dan_speech)
por_dm = get_discourse_markers(por_speech)
pol_dm = get_discourse_markers(pol_speech)

In [297]:
def get_dm_percent(speech, dm_list):
    total = 0
    for u in speech:
        total += len(u)
    
    return len(dm_list)/total

In [298]:
get_dm_percent(eng_speech, eng_dm)
get_dm_percent(kor_speech, kor_dm)
get_dm_percent(fin_speech, fin_dm)
get_dm_percent(tur_speech, tur_dm)
get_dm_percent(dan_speech, dan_dm)
get_dm_percent(por_speech, por_dm)
get_dm_percent(pol_speech, pol_dm)

0.01425064047822374

0.01322858819811278

0.01228812339923451

0.010293102366158288

0.012870197598201042

0.008287292817679558

0.017388458155186876

In [299]:
eng_dm_words = [dm[0].replace('\n', '') for dm in eng_dm]
kor_dm_words = [dm[0].replace('\n', '') for dm in kor_dm]
fin_dm_words = [dm[0].replace('\n', '') for dm in fin_dm]
tur_dm_words = [dm[0].replace('\n', '') for dm in tur_dm]
dan_dm_words = [dm[0].replace('\n', '') for dm in dan_dm]
por_dm_words = [dm[0].replace('\n', '') for dm in por_dm]
pol_dm_words = [dm[0].replace('\n', '') for dm in pol_dm]

In [300]:
eng_dm_freqs = nltk.FreqDist(eng_dm_words)
eng_dm_freqs

FreqDist({'like': 202,
          'look': 1,
          'right': 50,
          'so': 419,
          'well': 116,
          'whatever': 13})

In [301]:
kor_dm_freqs = nltk.FreqDist(kor_dm_words)
kor_dm_freqs

FreqDist({'like': 55, 'right': 33, 'so': 174, 'well': 27, 'whatever': 4})

In [302]:
fin_dm_freqs = nltk.FreqDist(fin_dm_words)
fin_dm_freqs

FreqDist({'like': 111, 'right': 10, 'so': 226, 'well': 72, 'whatever': 8})

In [303]:
tur_dm_freqs = nltk.FreqDist(tur_dm_words)
tur_dm_freqs

FreqDist({'like': 16, 'right': 15, 'so': 119, 'well': 13, 'whatever': 1})

In [304]:
dan_dm_freqs = nltk.FreqDist(dan_dm_words)
dan_dm_freqs

FreqDist({'like': 125, 'right': 30, 'so': 499, 'well': 137, 'whatever': 16})

In [305]:
por_dm_freqs = nltk.FreqDist(por_dm_words)
por_dm_freqs

FreqDist({'like': 18, 'right': 3, 'so': 114, 'well': 15, 'whatever': 3})

In [306]:
pol_dm_freqs = nltk.FreqDist(pol_dm_words)
pol_dm_freqs

FreqDist({'like': 265,
          'look': 1,
          'right': 40,
          'so': 450,
          'well': 66,
          'whatever': 28})

In [309]:
def get_word_count(speech):
    total = 0
    for u in speech:
        total += len(u)
    return total

In [313]:
for dm in eng_dm_freqs:
    print(dm)
    print("\tEnglish: " + str(eng_dm_freqs[dm] / get_word_count(eng_speech)))
    print("\tFinnish: " + str(fin_dm_freqs[dm] / get_word_count(fin_speech)))
    print("\tKorean: " + str(kor_dm_freqs[dm] / get_word_count(kor_speech)))
    print("\tTurkish: " + str(tur_dm_freqs[dm] / get_word_count(tur_speech)))
    print("\tDanish: " + str(dan_dm_freqs[dm] / get_word_count(dan_speech)))
    print("\tPortuguese: " + str(por_dm_freqs[dm] / get_word_count(por_speech)))
    print("\tPolish: " + str(pol_dm_freqs[dm] / get_word_count(pol_speech)))

so
	English: 0.00745445488186735
	Finnish: 0.0065037842815620596
	Korean: 0.00785588514154138
	Turkish: 0.007468775497395343
	Danish: 0.007958151922555539
	Portuguese: 0.006174845628859279
	Polish: 0.009205654317451874
well
	English: 0.00206376316538571
	Finnish: 0.002072002072002072
	Korean: 0.001219016659894352
	Turkish: 0.0008159166509759618
	Danish: 0.002184903433647513
	Portuguese: 0.0008124796880077998
	Polish: 0.001350162633226275
like
	English: 0.0035937944776544264
	Finnish: 0.003194336527669861
	Korean: 0.002483182084969976
	Turkish: 0.0010042051088934914
	Danish: 0.0019935250307002855
	Portuguese: 0.0009749756256093598
	Polish: 0.005421107542499437
right
	English: 0.0008895530885283234
	Finnish: 0.00028777806555584334
	Korean: 0.0014899092509819856
	Turkish: 0.0009414422895876483
	Danish: 0.0004784460073680685
	Portuguese: 0.00016249593760155997
	Polish: 0.0008182803837735
whatever
	English: 0.00023128380301736407
	Finnish: 0.00023022245244467467
	Korean: 0.00018059506072508

In [315]:
for dm in eng_dm_freqs:
    print(dm)
    print("\tEnglish: " + str(eng_dm_freqs[dm] / len(eng_dm_words)))
    print("\tFinnish: " + str(fin_dm_freqs[dm] / len(fin_dm_words)))
    print("\tKorean: " + str(kor_dm_freqs[dm] / len(kor_dm_words)))
    print("\tTurkish: " + str(tur_dm_freqs[dm] / len(tur_dm_words)))
    print("\tDanish: " + str(dan_dm_freqs[dm] / len(dan_dm_words)))
    print("\tPortuguese: " + str(por_dm_freqs[dm] / len(por_dm_words)))
    print("\tPolish: " + str(pol_dm_freqs[dm] / len(pol_dm_words)))

so
	English: 0.5230961298377028
	Finnish: 0.5292740046838408
	Korean: 0.5938566552901023
	Turkish: 0.725609756097561
	Danish: 0.6183395291201983
	Portuguese: 0.7450980392156863
	Polish: 0.5294117647058824
well
	English: 0.14481897627965043
	Finnish: 0.1686182669789227
	Korean: 0.09215017064846416
	Turkish: 0.07926829268292683
	Danish: 0.1697645600991326
	Portuguese: 0.09803921568627451
	Polish: 0.07764705882352942
like
	English: 0.25218476903870163
	Finnish: 0.25995316159250587
	Korean: 0.18771331058020477
	Turkish: 0.0975609756097561
	Danish: 0.15489467162329615
	Portuguese: 0.11764705882352941
	Polish: 0.31176470588235294
right
	English: 0.062421972534332085
	Finnish: 0.0234192037470726
	Korean: 0.11262798634812286
	Turkish: 0.09146341463414634
	Danish: 0.03717472118959108
	Portuguese: 0.0196078431372549
	Polish: 0.047058823529411764
whatever
	English: 0.016229712858926344
	Finnish: 0.01873536299765808
	Korean: 0.013651877133105802
	Turkish: 0.006097560975609756
	Danish: 0.0198265179