In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
stopWords = set(stopwords.words('english'))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
f = open('VOICE_tokenized.p', 'rb')
VOICE_toks = pickle.load(f)
f.close()

f = open('VOICE_tagged.p', 'rb')
VOICE_tags = pickle.load(f)
f.close()

f = open('VOICE_native_tagged.p', 'rb')
VOICE_native_tags = pickle.load(f)
f.close()

f = open('VOICE_participant_info.p', 'rb')
participants = pickle.load(f)
f.close()

In [3]:
def get_monolingual_speakers(participant_dict, language):
    speakers = []
    for p in participant_dict.keys():
        L1s = participant_dict[p]['L1']
        if(len(L1s) == 1 and language in L1s):
            speakers.append(p)
            
    return speakers

In [4]:
eng_speakers = get_monolingual_speakers(participants, 'eng')
pol_speakers = get_monolingual_speakers(participants, 'pol')
kor_speakers = get_monolingual_speakers(participants, 'kor')
fin_speakers = get_monolingual_speakers(participants, 'fin')
dan_speakers = get_monolingual_speakers(participants, 'dan')
tur_speakers = get_monolingual_speakers(participants, 'tur')
hun_speakers = get_monolingual_speakers(participants, 'hun')
por_speakers = get_monolingual_speakers(participants, 'por')
rus_speakers = get_monolingual_speakers(participants, 'rus')
mlt_speakers = get_monolingual_speakers(participants, 'mlt')
lav_speakers = get_monolingual_speakers(participants, 'lav')

In [5]:
print("English speakers:", len(eng_speakers))
print("Polish speakers:", len(pol_speakers))
print("Korean speakers:", len(kor_speakers))
print("Finnish speakers:", len(fin_speakers))
print("Danish speakers:", len(dan_speakers))
print("Turkish speakers:", len(tur_speakers))
print("Hungarian speakers:", len(hun_speakers))
print("Portuguese speakers:", len(por_speakers))
print("Russian speakers:", len(rus_speakers))
print("Maltese speakers:", len(mlt_speakers))
print("Latvian speakers:", len(lav_speakers))

English speakers: 62
Polish speakers: 35
Korean speakers: 14
Finnish speakers: 51
Danish speakers: 35
Turkish speakers: 14
Hungarian speakers: 13
Portuguese speakers: 21
Russian speakers: 22
Maltese speakers: 22
Latvian speakers: 19


In [6]:
#Need to use English
#Use Korean, Finnish, Turkish
#Danish, Portuguese, Polish

In [7]:
#List of speakers
def get_tagged_utterances(tokens, speakers): 
    utterances = []
    for conversation in tokens.keys():
        for pair in tokens[conversation]:
            if(pair[1] in speakers):
                utterances.append(tokens[conversation][pair])
                
    return utterances

In [8]:
eng_speech = get_tagged_utterances(VOICE_native_tags, eng_speakers)
kor_speech = get_tagged_utterances(VOICE_tags, kor_speakers)
fin_speech = get_tagged_utterances(VOICE_tags, fin_speakers)
tur_speech = get_tagged_utterances(VOICE_tags, tur_speakers)
dan_speech = get_tagged_utterances(VOICE_tags, dan_speakers)
por_speech = get_tagged_utterances(VOICE_tags, por_speakers)
pol_speech = get_tagged_utterances(VOICE_tags, pol_speakers)

In [9]:
def get_len_dists(tok_list):
    len_dict = {}
    total = 0
    for l in tok_list:
        utterance_length = len(l)
        
        if(utterance_length not in len_dict):
            len_dict[utterance_length] = 1
        else:
            len_dict[utterance_length] += 1
            
        total += 1
            
    for l in len_dict.keys():
        len_dict[l] = len_dict[l]/total
        
    return len_dict

In [10]:
def get_top_dists(len_dists):
    for s in sorted(len_dists, key=len_dists.get, reverse=True)[:10]:
        print("\t" + str(s) + ": " + str(len_dists[s]))

In [11]:
eng_lens = get_len_dists(eng_speech)
kor_lens = get_len_dists(kor_speech)
fin_lens = get_len_dists(fin_speech)
tur_lens = get_len_dists(tur_speech)
dan_lens = get_len_dists(dan_speech)
por_lens = get_len_dists(por_speech)
pol_lens = get_len_dists(pol_speech)

In [12]:
print("English:")
get_top_dists(eng_lens)
print("Korean:")
get_top_dists(kor_lens)
print("Finnish:")
get_top_dists(fin_lens)
print("Turkish:")
get_top_dists(tur_lens)
print("Danish:")
get_top_dists(dan_lens)
print("Portuguese:")
get_top_dists(por_lens)
print("Polish:")
get_top_dists(pol_lens)

English:
	1: 0.272583559168925
	2: 0.0948509485094851
	3: 0.06097560975609756
	4: 0.05984643179765131
	5: 0.04968383017163505
	6: 0.047651309846431796
	7: 0.04245709123757904
	8: 0.037037037037037035
	9: 0.03071364046973803
	10: 0.026196928635953028
Korean:
	1: 0.26313813813813813
	2: 0.16554054054054054
	3: 0.08108108108108109
	4: 0.05593093093093093
	6: 0.05067567567567568
	5: 0.04804804804804805
	8: 0.03490990990990991
	7: 0.03303303303303303
	10: 0.02702702702702703
	9: 0.025525525525525526
Finnish:
	1: 0.32821368948247076
	2: 0.11619365609348915
	3: 0.06310517529215359
	4: 0.051419031719532556
	5: 0.04040066777963272
	6: 0.038731218697829715
	7: 0.032387312186978295
	9: 0.02904841402337229
	8: 0.02671118530884808
	10: 0.019031719532554257
Turkish:
	1: 0.14722222222222223
	2: 0.11388888888888889
	3: 0.05
	4: 0.044444444444444446
	5: 0.041666666666666664
	6: 0.03333333333333333
	8: 0.03333333333333333
	9: 0.030555555555555555
	16: 0.025
	10: 0.022222222222222223
Danish:
	1: 0.271252

In [13]:
#Will analyze tags in sentences of a fixed length. Want to pick an utterance length that's common across all L1s
# (short utterances are very common in all groups), but want to pick length long enough to get some interesting results

#Will try utterances of length 3 first

In [14]:
def tags_in_utterance_length(tok_list, utterance_len):
    utterances = []
    for u in tok_list:
        if(len(u) == utterance_len):
            tags = [pair[1] for pair in u]
            utterances.append(tuple(tags))
            
    return utterances

In [15]:
eng3_tags = tags_in_utterance_length(eng_speech, 3)
kor3_tags = tags_in_utterance_length(kor_speech, 3)
fin3_tags = tags_in_utterance_length(fin_speech, 3)
tur3_tags = tags_in_utterance_length(tur_speech, 3)
dan3_tags = tags_in_utterance_length(dan_speech, 3)
por3_tags = tags_in_utterance_length(por_speech, 3)
pol3_tags = tags_in_utterance_length(pol_speech, 3)

In [16]:
len(eng3_tags)
len(kor3_tags)
len(fin3_tags)
len(tur3_tags)
len(dan3_tags)
len(por3_tags)
len(pol3_tags)

270

216

189

18

226

75

280

In [17]:
def get_freq_dict(li):
    freqs = nltk.FreqDist(li)
    return freqs

In [18]:
#freqs = nltk.FreqDist(eng3_tags)
#freqs.most_common(10)
eng3_tags_freqs = get_freq_dict(eng3_tags)
kor3_tags_freqs = get_freq_dict(kor3_tags)
fin3_tags_freqs = get_freq_dict(fin3_tags)
dan3_tags_freqs = get_freq_dict(dan3_tags)
pol3_tags_freqs = get_freq_dict(pol3_tags)

In [19]:
common_seqs = []
for pair in eng3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in kor3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in fin3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in dan3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])
    
for pair in pol3_tags_freqs.most_common(10):
    common_seqs.append(pair[0])

common_seqs = set(common_seqs)

In [20]:
for seq in common_seqs:
    print(seq)
    print("\tEnglish: " + str(eng3_tags_freqs[seq] / len(eng3_tags)))
    print("\tKorean: "+ str(kor3_tags_freqs[seq] / len(kor3_tags)))
    print("\tFinnish: " + str(fin3_tags_freqs[seq] / len(fin3_tags)))
    print("\tDanish: " + str(dan3_tags_freqs[seq] / len(dan3_tags)))
    print("\tPolish: " + str(pol3_tags_freqs[seq] / len(pol3_tags)))

('PPfPP', 'MDfMD', 'PAfPA')
	English: 0.011111111111111112
	Korean: 0.0
	Finnish: 0.0
	Danish: 0.0
	Polish: 0.0
('NNfNN', 'NNfNN', 'PAfPA')
	English: 0.007407407407407408
	Korean: 0.009259259259259259
	Finnish: 0.005291005291005291
	Danish: 0.004424778761061947
	Polish: 0.0
('PPfPP', 'VBSfVBS', 'RBfRB')
	English: 0.007407407407407408
	Korean: 0.004629629629629629
	Finnish: 0.0
	Danish: 0.017699115044247787
	Polish: 0.0
('REfRE', 'PAfPA', 'REfRE')
	English: 0.04814814814814815
	Korean: 0.018518518518518517
	Finnish: 0.021164021164021163
	Danish: 0.01327433628318584
	Polish: 0.02142857142857143
('UHfUH', 'REfRE', 'REfRE')
	English: 0.003703703703703704
	Korean: 0.009259259259259259
	Finnish: 0.021164021164021163
	Danish: 0.008849557522123894
	Polish: 0.0035714285714285713
('NNfNN', 'REfRE', 'PAfPA')
	English: 0.0
	Korean: 0.013888888888888888
	Finnish: 0.0
	Danish: 0.0
	Polish: 0.0
('NPfNP', 'REfRE', 'PAfPA')
	English: 0.0
	Korean: 0.013888888888888888
	Finnish: 0.0
	Danish: 0.0
	Polish:

In [21]:
eng_tags = [[t[1] for t in u] for u in eng_speech]
eng_toks = [[t[0] for t in u] for u in eng_speech]

kor_tags = [[t[1] for t in u] for u in kor_speech]
kor_toks = [[t[0] for t in u] for u in kor_speech]

fin_tags = [[t[1] for t in u] for u in fin_speech]
fin_toks = [[t[0] for t in u] for u in fin_speech]

tur_tags = [[t[1] for t in u] for u in tur_speech]
tur_toks = [[t[0] for t in u] for u in tur_speech]

dan_tags = [[t[1] for t in u] for u in dan_speech]
dan_toks = [[t[0] for t in u] for u in dan_speech]

por_tags = [[t[1] for t in u] for u in por_speech]
por_toks = [[t[0] for t in u] for u in por_speech]

pol_tags = [[t[1] for t in u] for u in pol_speech]
pol_toks = [[t[0] for t in u] for u in pol_speech]

## Trigrams

In [22]:
def get_trigrams(tags):
    trigram_list = []
    for u in tags:
        for trigram in set(nltk.trigrams(u)):
            if len(trigram) > 0:
                trigram_list.append(trigram)
    return trigram_list

In [23]:
eng_tag_trigrams = get_trigrams(eng_tags)
kor_tag_trigrams = get_trigrams(kor_tags)
fin_tag_trigrams = get_trigrams(fin_tags)
tur_tag_trigrams = get_trigrams(tur_tags)
dan_tag_trigrams = get_trigrams(dan_tags)
por_tag_trigrams = get_trigrams(por_tags)
pol_tag_trigrams = get_trigrams(pol_tags)

### Most common part-of-speech trigrams in Engish. Compare with these frequencies with other L1s

In [24]:
eng_freq = nltk.FreqDist(eng_tag_trigrams)
kor_freq = nltk.FreqDist(kor_tag_trigrams)
fin_freq = nltk.FreqDist(fin_tag_trigrams)
tur_freq = nltk.FreqDist(tur_tag_trigrams)
dan_freq = nltk.FreqDist(dan_tag_trigrams)
por_freq = nltk.FreqDist(por_tag_trigrams)
pol_freq = nltk.FreqDist(pol_tag_trigrams)

In [25]:
for trigram in eng_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	English: 0.009448325273150636
	Korean: 0.007010456273764259
	Finnish: 0.009592150545507798
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('DTfDT', 'NNfNN', 'INfIN')
	English: 0.007119828049435787
	Korean: 0.004039923954372623
	Finnish: 0.007029362231822508
	Turkish: 0.004239321708526636
	Danish: 0.008397239263803681
	Portuguese: 0.006132902008356921
	Polish: 0.006493675837788499
('DTfDT', 'JJfJJ', 'NNfNN')
	English: 0.006985491671144546
	Korean: 0.0051093155893536125
	Finnish: 0.006663249615581753
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('PPfPP', 'MDfMD', 'VVfVV')
	English: 0.004992835393157801
	Korean: 0.00374287072243346
	Finnish: 0.00388079373215201
	Turkish: 0.0029595264757638776
	Danish: 0.003853527607361963
	Portuguese: 0.00491980051219841
	Polish: 0.005320119963489372
('NNfNN', 'INfIN', 'D

### Most common part-of-speech trigrams for Korean speakers. Compare with these frequencies with other L1s

In [26]:
for trigram in kor_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	Korean: 0.007010456273764259
	English: 0.009448325273150636
	Finnish: 0.009592150545507798
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('NNfNN', 'NNfNN', 'PAfPA')
	Korean: 0.006297528517110266
	English: 0.0016568153322586423
	Finnish: 0.0024895657904371385
	Turkish: 0.0017597184450487923
	Danish: 0.002032208588957055
	Portuguese: 0.0018870467718021297
	Polish: 0.002529664884600339
('DTfDT', 'NNfNN', 'PAfPA')
	Korean: 0.00528754752851711
	English: 0.0038061973849185028
	Finnish: 0.004466573918137219
	Turkish: 0.0035994240921452566
	Danish: 0.004236963190184049
	Portuguese: 0.005526351260277665
	Polish: 0.0049550136914852
('DTfDT', 'JJfJJ', 'NNfNN')
	Korean: 0.0051093155893536125
	English: 0.006985491671144546
	Finnish: 0.006663249615581753
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('JJfJJ', 'NNfNN

### Most common part-of-speech trigrams for Finnish speakers. Compare with these frequencies with other L1s

In [27]:
for trigram in fin_freq.most_common(20):
    trigram = trigram[0]
    print(trigram)
    
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	Finnish: 0.009592150545507798
	English: 0.009448325273150636
	Korean: 0.007010456273764259
	Turkish: 0.005439129739241722
	Danish: 0.009394171779141104
	Portuguese: 0.009030866693624478
	Polish: 0.009049419741817707
('DTfDT', 'NNfNN', 'INfIN')
	Finnish: 0.007029362231822508
	English: 0.007119828049435787
	Korean: 0.004039923954372623
	Turkish: 0.004239321708526636
	Danish: 0.008397239263803681
	Portuguese: 0.006132902008356921
	Polish: 0.006493675837788499
('DTfDT', 'JJfJJ', 'NNfNN')
	Finnish: 0.006663249615581753
	English: 0.006985491671144546
	Korean: 0.0051093155893536125
	Turkish: 0.004239321708526636
	Danish: 0.008052147239263804
	Portuguese: 0.005121984094891494
	Polish: 0.005502673099491459
('NNfNN', 'INfIN', 'DTfDT')
	Finnish: 0.00611408069122062
	English: 0.004813720222102812
	Korean: 0.003208174904942966
	Turkish: 0.0035994240921452566
	Danish: 0.006077453987730061
	Portuguese: 0.004245855236554792
	Polish: 0.005528752118920329
('DTfDT', 'NNfNN', 

In [28]:
"""
for trigram in eng_freq.most_common(100):
    avg = 0
    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams)) 
    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))
    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams)) 
    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams))
    
""" 

'\nfor trigram in eng_freq.most_common(100):\n    avg = 0\n    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams)) \n    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))\n    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams)) \n    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams))\n    \n'

### Get native speaker trigram outliers (in terms of frequency vs. other L1s)

In [29]:
more_common = {}
less_common = {}

for trigram in eng_freq:
    #trigram = trigram[0]
    
    avg = 0
    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams))
    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))
    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams))
    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams))
    avg = avg + (por_freq[trigram] / len(por_tag_trigrams))
    avg = avg + (pol_freq[trigram] / len(pol_tag_trigrams))
    
    avg /= 6
    
    eng_percent = (eng_freq[trigram] / len(eng_tag_trigrams))
    if eng_percent > (avg * 2) and eng_freq[trigram] > 5:
        more_common[trigram] = eng_freq[trigram]
        
        
    if eng_percent < (avg * 0.5) and eng_freq[trigram] > 5:
        less_common[trigram] = eng_freq[trigram]

#### Native trigram outliers -- more frequent that other L1s.

In [30]:
for trigram in sorted(more_common, key=more_common.get, reverse=True)[:10]:
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('PPfPP', 'VHPfVHP', 'VVNfVVN')
	English: 0.0018807092960773778
	Finnish: 0.0007322252324815113
	Korean: 0.0004752851711026616
	Turkish: 0.0005599104143337066
	Danish: 0.0010927914110429448
	Portuguese: 0.0003369726378218089
	Polish: 0.001147476854870257
('PPfPP', 'VBSfVBS', 'DTfDT')
	English: 0.0018807092960773778
	Finnish: 0.0010251153254741158
	Korean: 0.000594106463878327
	Turkish: 0.00047992321228603423
	Danish: 0.0017446319018404907
	Portuguese: 0.0009435233859010648
	Polish: 0.0006780545051506063
('VVGfVVG', 'TOfTO', 'VVfVV')
	English: 0.0016568153322586423
	Finnish: 0.0010251153254741158
	Korean: 0.0001782319391634981
	Turkish: 0.001039833626619741
	Danish: 0.0009969325153374234
	Portuguese: 0.0007413398032079795
	Polish: 0.0009388446994393011
('INfIN', 'PPfPP', 'VBSfVBS')
	English: 0.0013657531792942862
	Finnish: 0.0008786702789778135
	Korean: 0.0002376425855513308
	Turkish: 0.00023996160614301711
	Danish: 0.0006901840490797546
	Portuguese: 0.000876128858336703
	Polish: 0.0008

#### Native trigram outliers -- less frequent that other L1s. (Maybe try this again, but excluding pause and hesitation tags)

In [31]:
for trigram in sorted(less_common, key=less_common.get, reverse=True)[:10]:
    print(trigram)
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('DTfDT', 'NNSfNNS', 'PAfPA')
	English: 0.000738850080601827
	Finnish: 0.0014644504649630227
	Korean: 0.0008911596958174905
	Turkish: 0.0019996800511918092
	Danish: 0.0009585889570552147
	Portuguese: 0.002426202992317024
	Polish: 0.0015125831268744294
('PAfPA', 'NNfNN', 'PAfPA')
	English: 0.0006045137023105857
	Finnish: 0.0007688364941055869
	Korean: 0.0022576045627376424
	Turkish: 0.0016797312430011197
	Danish: 0.0007860429447852761
	Portuguese: 0.0010109179134654266
	Polish: 0.0010170817577259096
('PAfPA', 'RBfRB', 'RBfRB')
	English: 0.0005821243059287122
	Finnish: 0.0015742842498352493
	Korean: 0.0013070342205323193
	Turkish: 0.001039833626619741
	Danish: 0.0015145705521472392
	Portuguese: 0.0013478905512872355
	Polish: 0.0009388446994393011
('PAfPA', 'JJfJJ', 'NNfNN')
	English: 0.0005597349095468387
	Finnish: 0.0009885040638500403
	Korean: 0.00219819391634981
	Turkish: 0.0014397696368581027
	Danish: 0.0009010736196319018
	Portuguese: 0.0010109179134654266
	Polish: 0.000886686660581

## Comparing discourse markers

In [32]:
def get_discourse_markers(speech):
    dm_list = []
    markers = [[t for t in u if t[1] == "DMfDM"] for u in speech]
    for m in markers:
        dm_list.extend(m)
        
    #return dm_list
    return dm_list

In [33]:
eng_dm = get_discourse_markers(eng_speech)
kor_dm = get_discourse_markers(kor_speech)
fin_dm = get_discourse_markers(fin_speech)
tur_dm = get_discourse_markers(tur_speech)
dan_dm = get_discourse_markers(dan_speech)
por_dm = get_discourse_markers(por_speech)
pol_dm = get_discourse_markers(pol_speech)

In [34]:
def get_dm_percent(speech, dm_list):
    total = 0
    for u in speech:
        total += len(u)
    
    return len(dm_list)/total

### Percent of discourse markers across L1s

In [109]:
dm_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

dm_df['L1=English'] = get_dm_percent(eng_speech, eng_dm)
dm_df['L1=Korean'] = get_dm_percent(kor_speech, kor_dm)
dm_df['L1=Finnish'] = get_dm_percent(fin_speech, fin_dm)
dm_df['L1=Turkish'] = get_dm_percent(tur_speech, tur_dm)
dm_df['L1=Danish'] = get_dm_percent(dan_speech, dan_dm)
dm_df['L1=Portuguese'] = get_dm_percent(por_speech, por_dm)
dm_df['L1=Polish'] = get_dm_percent(pol_speech, pol_dm)

dm_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
determiner proportions,0.014251,0.013229,0.012288,0.010293,0.01287,0.008287,0.017388


In [36]:
eng_dm_words = [dm[0].replace('\n', '') for dm in eng_dm]
kor_dm_words = [dm[0].replace('\n', '') for dm in kor_dm]
fin_dm_words = [dm[0].replace('\n', '') for dm in fin_dm]
tur_dm_words = [dm[0].replace('\n', '') for dm in tur_dm]
dan_dm_words = [dm[0].replace('\n', '') for dm in dan_dm]
por_dm_words = [dm[0].replace('\n', '') for dm in por_dm]
pol_dm_words = [dm[0].replace('\n', '') for dm in pol_dm]

In [37]:
eng_dm_freqs = nltk.FreqDist(eng_dm_words)
eng_dm_freqs

FreqDist({'like': 202,
          'look': 1,
          'right': 50,
          'so': 419,
          'well': 116,
          'whatever': 13})

In [38]:
kor_dm_freqs = nltk.FreqDist(kor_dm_words)
kor_dm_freqs

FreqDist({'like': 55, 'right': 33, 'so': 174, 'well': 27, 'whatever': 4})

In [39]:
fin_dm_freqs = nltk.FreqDist(fin_dm_words)
fin_dm_freqs

FreqDist({'like': 111, 'right': 10, 'so': 226, 'well': 72, 'whatever': 8})

In [40]:
tur_dm_freqs = nltk.FreqDist(tur_dm_words)
tur_dm_freqs

FreqDist({'like': 16, 'right': 15, 'so': 119, 'well': 13, 'whatever': 1})

In [41]:
dan_dm_freqs = nltk.FreqDist(dan_dm_words)
dan_dm_freqs

FreqDist({'like': 125, 'right': 30, 'so': 499, 'well': 137, 'whatever': 16})

In [42]:
por_dm_freqs = nltk.FreqDist(por_dm_words)
dm_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

dm_df['L1=English'] = get_dm_percent(eng_speech, eng_dm)
dm_df['L1=Korean'] = get_dm_percent(kor_speech, kor_dm)
dm_df['L1=Finnish'] = get_dm_percent(fin_speech, fin_dm)
dm_df['L1=Turkish'] = get_dm_percent(tur_speech, tur_dm)
dm_df['L1=Danish'] = get_dm_percent(dan_speech, dan_dm)
dm_df['L1=Portuguese'] = get_dm_percent(por_speech, por_dm)
dm_df['L1=Polish'] = get_dm_percent(pol_speech, pol_dm)

dm_dfpor_dm_freqs

FreqDist({'like': 18, 'right': 3, 'so': 114, 'well': 15, 'whatever': 3})

In [43]:
pol_dm_freqs = nltk.FreqDist(pol_dm_words)
pol_dm_freqs

FreqDist({'like': 265,
          'look': 1,
          'right': 40,
          'so': 450,
          'well': 66,
          'whatever': 28})

In [127]:
discourse_markers = eng_dm_freqs.most_common()
discourse_markers = [b[0] for b in discourse_markers]

dm_words_df = pd.DataFrame(index = discourse_markers, columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

for word in discourse_markers:
    dm_words_df['L1=English'][word] = eng_dm_freqs[word]/sum(eng_dm_freqs.values())
    dm_words_df['L1=Korean'][word] = kor_dm_freqs[word]/sum(kor_dm_freqs.values())
    dm_words_df['L1=Finnish'][word] = fin_dm_freqs[word]/sum(fin_dm_freqs.values())
    dm_words_df['L1=Turkish'][word] = tur_dm_freqs[word]/sum(tur_dm_freqs.values())
    dm_words_df['L1=Danish'][word] = dan_dm_freqs[word]/sum(dan_dm_freqs.values())
    dm_words_df['L1=Portuguese'][word] = por_dm_freqs[word]/sum(por_dm_freqs.values())
    dm_words_df['L1=Polish'][word] = pol_dm_freqs[word]/sum(pol_dm_freqs.values())
    
dm_words_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
so,0.523096,0.593857,0.529274,0.72561,0.61834,0.745098,0.529412
like,0.252185,0.187713,0.259953,0.097561,0.154895,0.117647,0.311765
well,0.144819,0.0921502,0.168618,0.0792683,0.169765,0.0980392,0.0776471
right,0.062422,0.112628,0.0234192,0.0914634,0.0371747,0.0196078,0.0470588
whatever,0.0162297,0.0136519,0.0187354,0.00609756,0.0198265,0.0196078,0.0329412
look,0.00124844,0.0,0.0,0.0,0.0,0.0,0.00117647


### Getting distribution of specific discourse markers across specific L1s

In [44]:
def get_word_count(speech):
    total = 0
    for u in speech:
        total += len(u)
    return total

In [45]:
for dm in eng_dm_freqs:
    print(dm)
    print("\tEnglish: " + str(eng_dm_freqs[dm] / len(eng_dm_words)))
    print("\tFinnish: " + str(fin_dm_freqs[dm] / len(fin_dm_words)))
    print("\tKorean: " + str(kor_dm_freqs[dm] / len(kor_dm_words)))
    print("\tTurkish: " + str(tur_dm_freqs[dm] / len(tur_dm_words)))
    print("\tDanish: " + str(dan_dm_freqs[dm] / len(dan_dm_words)))
    print("\tPortuguese: " + str(por_dm_freqs[dm] / len(por_dm_words)))
    print("\tPolish: " + str(pol_dm_freqs[dm] / len(pol_dm_words)))

so
	English: 0.5230961298377028
	Finnish: 0.5292740046838408
	Korean: 0.5938566552901023
	Turkish: 0.725609756097561
	Danish: 0.6183395291201983
	Portuguese: 0.7450980392156863
	Polish: 0.5294117647058824
well
	English: 0.14481897627965043
	Finnish: 0.1686182669789227
	Korean: 0.09215017064846416
	Turkish: 0.07926829268292683
	Danish: 0.1697645600991326
	Portuguese: 0.09803921568627451
	Polish: 0.07764705882352942
like
	English: 0.25218476903870163
	Finnish: 0.25995316159250587
	Korean: 0.18771331058020477
	Turkish: 0.0975609756097561
	Danish: 0.15489467162329615
	Portuguese: 0.11764705882352941
	Polish: 0.31176470588235294
right
	English: 0.062421972534332085
	Finnish: 0.0234192037470726
	Korean: 0.11262798634812286
	Turkish: 0.09146341463414634
	Danish: 0.03717472118959108
	Portuguese: 0.0196078431372549
	Polish: 0.047058823529411764
whatever
	English: 0.016229712858926344
	Finnish: 0.01873536299765808
	Korean: 0.013651877133105802
	Turkish: 0.006097560975609756
	Danish: 0.0198265179

## Comparing Article/Determiner Use

In [46]:
#eng_speech -- (word, tag) tupes
#eng_tags -- list of tags

### Proportion of determiners

In [47]:
def num_words(li):
    count = 0
    for u in li:
        count += len(u)
    
    return count

In [48]:
eng_words = num_words(eng_speech)

In [49]:
def num_determiners(li):
    count = 0
    dets = ([[t for t in li if t == "DTfDT" ] for li in eng_tags])
    #print(dets)
    for u in dets:
        count += len(u)
        
    return count

In [50]:
#"DTfDT" = determiner
#len([t for t in eng_tags if t == "REfRE"]) / len(eng_tags)
dets = ([[t for t in u if t == "DTfDT" ] for u in eng_tags])

In [105]:
eng_dets = num_determiners(eng_speech)
eng_dets/eng_words

kor_words = num_words(kor_speech)
kor_dets = num_determiners(kor_speech)
kor_dets/kor_words

fin_words = num_words(fin_speech)
fin_dets = num_determiners(fin_speech)
fin_dets/fin_words

tur_words = num_words(tur_speech)
tur_dets = num_determiners(tur_speech)
tur_dets/tur_words

dan_words = num_words(dan_speech)
dan_dets = num_determiners(dan_speech)
dan_dets/dan_words

por_words = num_words(por_speech)
por_dets = num_determiners(por_speech)
por_dets/por_words

pol_words = num_words(pol_speech)
pol_dets = num_determiners(pol_speech)
pol_dets/pol_words

0.07924138912610304

0.20109260011738678

0.1281763503985726

0.2795455971882257

0.07103328389391257

0.24125230202578268

0.09111552073317922

In [106]:
det_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

det_df['L1=English'] = eng_dets/eng_words
det_df['L1=Korean'] = kor_dets/kor_words
det_df['L1=Finnish'] = fin_dets/fin_words
det_df['L1=Turkish'] = tur_dets/tur_words
det_df['L1=Danish'] = dan_dets/dan_words
det_df['L1=Portuguese'] = por_dets/por_words
det_df['L1=Polish'] = pol_dets/pol_words

det_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
determiner proportions,0.079241,0.201093,0.128176,0.279546,0.071033,0.241252,0.091116


### Comparing specfic determiner words

In [107]:
def determiner_words(li):
    words = []
    
    for utterance in li:
        for pair in utterance:
            if pair[1] == "DTfDT":
                words.append(pair[0].replace("\n", ""))
    
    return words

In [108]:
det_words = eng_det_words.most_common()
det_words = [b[0] for b in det_words]

det_words_df = pd.DataFrame(index = det_words, columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

for word in det_words:
    det_words_df['L1=English'][word] = eng_det_words[word]/eng_words
    det_words_df['L1=Korean'][word] = kor_det_words[word]/kor_words
    det_words_df['L1=Finnish'][word] = fin_det_words[word]/fin_words
    det_words_df['L1=Turkish'][word] = tur_det_words[word]/tur_words
    det_words_df['L1=Danish'][word] = dan_det_words[word]/dan_words
    det_words_df['L1=Portuguese'][word] = por_det_words[word]/por_words
    det_words_df['L1=Polish'][word] = pol_det_words[word]/pol_words
det_words_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
the,0.0346926,0.0298885,0.0440013,0.0504613,0.0409869,0.0441447,0.0419573
a,0.0152114,0.00957154,0.0123745,0.0112973,0.0179098,0.013108,0.00961479
that,0.0125605,0.00708836,0.0113385,0.00916337,0.0124874,0.0053082,0.00525745
this,0.00583547,0.00663687,0.00834556,0.00483274,0.00677799,0.0110497,0.0115787
some,0.00217051,0.00501151,0.00184178,0.00263604,0.0022168,0.00346658,0.00202524
an,0.00209935,0.000316041,0.00164033,0.000941442,0.00271119,0.0012458,0.000900108
all,0.00167236,0.00130931,0.00164033,0.00156907,0.0017862,0.00216661,0.00149336
these,0.00154782,0.000677231,0.00218711,0.00125526,0.00102068,0.00113747,0.00061371
any,0.000818389,0.000948124,0.000863334,0.00100421,0.000781462,0.0012458,0.00108422
those,0.000693851,0.000496636,0.000949668,0.00106697,0.000653876,0.00102914,0.00114559
