In [1]:
import pickle
import nltk
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt
stopWords = set(stopwords.words('english'))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
f = open('VOICE_tokenized.p', 'rb')
VOICE_toks = pickle.load(f)
f.close()

f = open('VOICE_tagged.p', 'rb')
VOICE_tags = pickle.load(f)
f.close()

f = open('VOICE_native_tagged.p', 'rb')
VOICE_native_tags = pickle.load(f)
f.close()

f = open('VOICE_participant_info.p', 'rb')
participants = pickle.load(f)
f.close()

In [3]:
def get_monolingual_speakers(participant_dict, language):
    speakers = []
    for p in participant_dict.keys():
        L1s = participant_dict[p]['L1']
        if(len(L1s) == 1 and language in L1s):
            speakers.append(p)
            
    return speakers

In [4]:
eng_speakers = get_monolingual_speakers(participants, 'eng')
pol_speakers = get_monolingual_speakers(participants, 'pol')
kor_speakers = get_monolingual_speakers(participants, 'kor')
fin_speakers = get_monolingual_speakers(participants, 'fin')
dan_speakers = get_monolingual_speakers(participants, 'dan')
tur_speakers = get_monolingual_speakers(participants, 'tur')
hun_speakers = get_monolingual_speakers(participants, 'hun')
por_speakers = get_monolingual_speakers(participants, 'por')
rus_speakers = get_monolingual_speakers(participants, 'rus')
mlt_speakers = get_monolingual_speakers(participants, 'mlt')
lav_speakers = get_monolingual_speakers(participants, 'lav')

In [5]:
print("English speakers:", len(eng_speakers))
print("Polish speakers:", len(pol_speakers))
print("Korean speakers:", len(kor_speakers))
print("Finnish speakers:", len(fin_speakers))
print("Danish speakers:", len(dan_speakers))
print("Turkish speakers:", len(tur_speakers))
print("Hungarian speakers:", len(hun_speakers))
print("Portuguese speakers:", len(por_speakers))
print("Russian speakers:", len(rus_speakers))
print("Maltese speakers:", len(mlt_speakers))
print("Latvian speakers:", len(lav_speakers))

English speakers: 62
Polish speakers: 35
Korean speakers: 14
Finnish speakers: 51
Danish speakers: 35
Turkish speakers: 14
Hungarian speakers: 13
Portuguese speakers: 21
Russian speakers: 22
Maltese speakers: 22
Latvian speakers: 19


In [6]:
#Need to use English
#Use Korean, Finnish, Turkish
#Danish, Portuguese, Polish

In [7]:
#List of speakers
def get_tagged_utterances(tokens, speakers): 
    utterances = []
    for conversation in tokens.keys():
        for pair in tokens[conversation]:
            if(pair[1] in speakers):
                utterances.append(tokens[conversation][pair])
                
    return utterances

In [8]:
eng_speech = get_tagged_utterances(VOICE_native_tags, eng_speakers)
kor_speech = get_tagged_utterances(VOICE_tags, kor_speakers)
fin_speech = get_tagged_utterances(VOICE_tags, fin_speakers)
tur_speech = get_tagged_utterances(VOICE_tags, tur_speakers)
dan_speech = get_tagged_utterances(VOICE_tags, dan_speakers)
por_speech = get_tagged_utterances(VOICE_tags, por_speakers)
pol_speech = get_tagged_utterances(VOICE_tags, pol_speakers)

In [9]:
def get_pairs(li):
    tokens = []
    for u in li:
        for w in u:
            tokens.append(w)
    return tokens

In [10]:
def remove_tags(li):
    unwanted_tags = ["BRfBR", "PAfPA", "UHfUH", "UNIfUNI", "UNKfNN", "LAfLA", "XXfXX"]
    return [pair for pair in li if pair[1] not in unwanted_tags]

In [11]:
def get_tags(li):
    return[pair[1] for pair in li]

In [12]:
eng_toks = get_pairs(eng_speech)
eng_toks = remove_tags(eng_toks)
eng_tags = get_tags(eng_toks)

kor_toks = get_pairs(kor_speech)
kor_toks = remove_tags(kor_toks)
kor_tags = get_tags(kor_toks)

fin_toks = get_pairs(fin_speech)
fin_toks = remove_tags(fin_toks)
fin_tags = get_tags(fin_toks)

tur_toks = get_pairs(tur_speech)
tur_toks = remove_tags(tur_toks)
tur_tags = get_tags(tur_toks)

dan_toks = get_pairs(dan_speech)
dan_toks = remove_tags(dan_toks)
dan_tags = get_tags(dan_toks)

por_toks = get_pairs(por_speech)
por_toks = remove_tags(por_toks)
por_tags = get_tags(por_toks)

pol_toks = get_pairs(pol_speech)
pol_toks = remove_tags(pol_toks)
pol_tags = get_tags(pol_toks)

## Trigrams

In [13]:
def get_trigrams(li):
    trigram_list = []
    for trigram in list(nltk.trigrams(li)):
        if len(trigram) > 0:
            trigram_list.append(trigram)
            
    return trigram_list
        

In [14]:
eng_tag_trigrams = get_trigrams(eng_tags)
kor_tag_trigrams = get_trigrams(kor_tags)
fin_tag_trigrams = get_trigrams(fin_tags)
tur_tag_trigrams = get_trigrams(tur_tags)
dan_tag_trigrams = get_trigrams(dan_tags)
por_tag_trigrams = get_trigrams(por_tags)
pol_tag_trigrams = get_trigrams(pol_tags)

In [15]:
len(eng_tag_trigrams)

47990

### Most common part-of-speech trigrams in Engish. Compare with these frequencies with other L1s

In [16]:
eng_freq = nltk.FreqDist(eng_tag_trigrams)
kor_freq = nltk.FreqDist(kor_tag_trigrams)
fin_freq = nltk.FreqDist(fin_tag_trigrams)
tur_freq = nltk.FreqDist(tur_tag_trigrams)
dan_freq = nltk.FreqDist(dan_tag_trigrams)
por_freq = nltk.FreqDist(por_tag_trigrams)
pol_freq = nltk.FreqDist(pol_tag_trigrams)

In [17]:
eng_freq.most_common(20)

[(('INfIN', 'DTfDT', 'NNfNN'), 719),
 (('DTfDT', 'NNfNN', 'INfIN'), 580),
 (('DTfDT', 'JJfJJ', 'NNfNN'), 450),
 (('NNfNN', 'INfIN', 'DTfDT'), 366),
 (('REfRE', 'REfRE', 'REfRE'), 365),
 (('PPfPP', 'MDfMD', 'VVfVV'), 270),
 (('INfIN', 'DTfDT', 'JJfJJ'), 230),
 (('NNfNN', 'INfIN', 'NNfNN'), 204),
 (('JJfJJ', 'NNfNN', 'INfIN'), 202),
 (('PPfPP', 'VVPfVVP', 'PPfPP'), 195),
 (('PPfPP', 'VVPfVVP', 'RBfRB'), 188),
 (('DTfDT', 'NNfNN', 'NNfNN'), 186),
 (('PPfPP', 'VBSfVBS', 'RBfRB'), 186),
 (('INfIN', 'DTfDT', 'NNSfNNS'), 176),
 (('RBfRB', 'INfIN', 'DTfDT'), 158),
 (('PPfPP', 'VBPfVBP', 'VVGfVVG'), 157),
 (('DTfDT', 'NNfNN', 'CCfCC'), 154),
 (('TOfTO', 'VVfVV', 'INfIN'), 151),
 (('INfIN', 'PPfPP', 'VVPfVVP'), 148),
 (('DTfDT', 'NNfNN', 'PPfPP'), 144)]

In [18]:
for trigram in eng_freq.most_common(10):
    trigram = trigram[0]
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	English: 0.014982287976661804
	Korean: 0.010785081641260469
	Finnish: 0.015523567598080722
	Turkish: 0.019967241244832696
	Danish: 0.015679111009628193
	Portuguese: 0.01752612066059993
	Polish: 0.01934010278308796
('DTfDT', 'NNfNN', 'INfIN')
	English: 0.012085851219003959
	Korean: 0.008134000120503705
	Finnish: 0.012489415749364945
	Turkish: 0.01536541611418766
	Danish: 0.013871446512158922
	Portuguese: 0.014897202561509943
	Polish: 0.011594130936716403
('DTfDT', 'JJfJJ', 'NNfNN')
	English: 0.00937695353198583
	Korean: 0.008977526058926311
	Finnish: 0.011184024837708157
	Turkish: 0.013571484283597224
	Danish: 0.012273090535449253
	Portuguese: 0.0100438153016515
	Polish: 0.009185928151145759
('NNfNN', 'INfIN', 'DTfDT')
	English: 0.007626588872681809
	Korean: 0.006627703801891908
	Finnish: 0.010160880609652836
	Turkish: 0.011699555416894158
	Danish: 0.009209574913422384
	Portuguese: 0.010178631614425346
	Polish: 0.010352789294669679
('REfRE', 'REfRE', 'REfRE'

### Most common part-of-speech trigrams for Korean speakers. Compare with these frequencies with other L1s

In [19]:
for trigram in kor_freq.most_common(10):
    trigram = trigram[0]
    print(trigram)
    
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('REfRE', 'REfRE', 'REfRE')
	Korean: 0.020485629933120444
	English: 0.007605751198166285
	Finnish: 0.01203076488851256
	Turkish: 0.001949925902815693
	Danish: 0.011645164973170453
	Portuguese: 0.003302999662959218
	Polish: 0.012612031083194716
('INfIN', 'DTfDT', 'NNfNN')
	Korean: 0.010785081641260469
	English: 0.014982287976661804
	Finnish: 0.015523567598080722
	Turkish: 0.019967241244832696
	Danish: 0.015679111009628193
	Portuguese: 0.01752612066059993
	Polish: 0.01934010278308796
('DTfDT', 'JJfJJ', 'NNfNN')
	Korean: 0.008977526058926311
	English: 0.00937695353198583
	Finnish: 0.011184024837708157
	Turkish: 0.013571484283597224
	Danish: 0.012273090535449253
	Portuguese: 0.0100438153016515
	Polish: 0.009185928151145759
('DTfDT', 'NNfNN', 'INfIN')
	Korean: 0.008134000120503705
	English: 0.012085851219003959
	Finnish: 0.012489415749364945
	Turkish: 0.01536541611418766
	Danish: 0.013871446512158922
	Portuguese: 0.014897202561509943
	Polish: 0.011594130936716403
('NNfNN', 'INfIN', 'DTfDT')

### Most common part-of-speech trigrams for Finnish speakers. Compare with these frequencies with other L1s

In [20]:
for trigram in fin_freq.most_common(10):
    trigram = trigram[0]
    print(trigram)
    
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('INfIN', 'DTfDT', 'NNfNN')
	Finnish: 0.015523567598080722
	English: 0.014982287976661804
	Korean: 0.010785081641260469
	Turkish: 0.019967241244832696
	Danish: 0.015679111009628193
	Portuguese: 0.01752612066059993
	Polish: 0.01934010278308796
('DTfDT', 'NNfNN', 'INfIN')
	Finnish: 0.012489415749364945
	English: 0.012085851219003959
	Korean: 0.008134000120503705
	Turkish: 0.01536541611418766
	Danish: 0.013871446512158922
	Portuguese: 0.014897202561509943
	Polish: 0.011594130936716403
('REfRE', 'REfRE', 'REfRE')
	Finnish: 0.01203076488851256
	English: 0.007605751198166285
	Korean: 0.020485629933120444
	Turkish: 0.001949925902815693
	Danish: 0.011645164973170453
	Portuguese: 0.003302999662959218
	Polish: 0.012612031083194716
('DTfDT', 'JJfJJ', 'NNfNN')
	Finnish: 0.011184024837708157
	English: 0.00937695353198583
	Korean: 0.008977526058926311
	Turkish: 0.013571484283597224
	Danish: 0.012273090535449253
	Portuguese: 0.0100438153016515
	Polish: 0.009185928151145759
('NNfNN', 'INfIN', 'DTfDT')

### Get native speaker trigram outliers (in terms of frequency vs. other L1s)
Some tags to get rid of: BR(breathing), PA(pause), UH(interjections and hesitations), UNI(unintelligible), UNK(unknown)
https://www.univie.ac.at/voice/page/documents/VOICE_tagging_manual.pdf

In [21]:
more_common = {}
less_common = {}

for trigram in eng_freq:
    #trigram = trigram[0]
    
    avg = 0
    avg = avg + (kor_freq[trigram] / len(kor_tag_trigrams))
    avg = avg + (fin_freq[trigram] / len(fin_tag_trigrams))
    avg = avg + (tur_freq[trigram] / len(tur_tag_trigrams))
    avg = avg + (dan_freq[trigram] / len(dan_tag_trigrams))
    avg = avg + (por_freq[trigram] / len(por_tag_trigrams))
    avg = avg + (pol_freq[trigram] / len(pol_tag_trigrams))
    
    avg /= 6
    
    eng_percent = (eng_freq[trigram] / len(eng_tag_trigrams))
    if eng_percent > (avg * 2) and eng_freq[trigram] > 5:
        more_common[trigram] = eng_freq[trigram]
        
        
    if eng_percent < (avg * 0.5) and eng_freq[trigram] > 5:
        less_common[trigram] = eng_freq[trigram]

#### Native trigram outliers -- more frequent that other L1s.

In [22]:
for trigram in sorted(more_common, key=more_common.get, reverse=True)[:10]:
    print(trigram)
    
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('PPfPP', 'VHPfVHP', 'VVNfVVN')
	English: 0.002208793498645551
	Finnish: 0.00091730172170477
	Korean: 0.0006627703801891908
	Turkish: 0.0006239762889010217
	Danish: 0.0012748791718993797
	Portuguese: 0.0006066734074823053
	Polish: 0.001390302639092331
('PPfPP', 'VBDfVBD', 'VVGfVVG')
	English: 0.0013752865180245884
	Finnish: 0.0003175275190516511
	Korean: 6.0251852744471894e-05
	Turkish: 0.00015599407222525544
	Danish: 0.0004186170415191993
	Portuguese: 0.0013481631277384564
	Polish: 0.0004965366568186896
('VBPfVBP', 'VVGfVVG', 'INfIN')
	English: 0.0013336111689935403
	Finnish: 0.0009525825571549535
	Korean: 6.0251852744471894e-05
	Turkish: 0.0007019733250136495
	Danish: 0.0006088975149370171
	Portuguese: 0.0009437141894169194
	Polish: 0.0007199781523870999
('DTfDT', 'VBSfVBS', 'RBfRB')
	English: 0.0010418837257762034
	Finnish: 0.0007056167090036692
	Korean: 0.00030125926372235944
	Turkish: 7.799703611262772e-05
	Danish: 0.000799177988354835
	Portuguese: 0.0
	Polish: 0.00039722932545495

#### Native trigram outliers -- less frequent that other L1s. (Maybe try this again, but excluding pause and hesitation tags)

In [23]:
for trigram in sorted(less_common, key=less_common.get, reverse=True)[:10]:
    print(trigram)
    print("\tEnglish: " + str(eng_freq[trigram] / len(eng_tag_trigrams)))
    print("\tFinnish: " + str(fin_freq[trigram] / len(fin_tag_trigrams)))
    print("\tKorean: " + str(kor_freq[trigram] / len(kor_tag_trigrams)))
    print("\tTurkish: " + str(tur_freq[trigram] / len(tur_tag_trigrams)))
    print("\tDanish: " + str(dan_freq[trigram] / len(dan_tag_trigrams)))
    print("\tPortuguese: " + str(por_freq[trigram] / len(por_tag_trigrams)))
    print("\tPolish: " + str(pol_freq[trigram] / len(pol_tag_trigrams)))

('PPfPP', 'VHPfVHP', 'DTfDT')
	English: 0.0006876432590122942
	Finnish: 0.0010231442280553204
	Korean: 0.0014460444658673255
	Turkish: 0.001403946650027299
	Danish: 0.0018266925448110515
	Portuguese: 0.0019548365352207615
	Polish: 0.0008192854837508379
('PPfPP', 'PPfPP', 'PPfPP')
	English: 0.0004584288393415295
	Finnish: 0.000458650860852385
	Korean: 0.0007230222329336627
	Turkish: 0.00023399110833788317
	Danish: 0.001788636450127488
	Portuguese: 0.0018874283788338389
	Polish: 0.0005710171553414931
('NPfNP', 'CCfCC', 'NPfNP')
	English: 0.0004375911648260054
	Finnish: 0.0005644933672029354
	Korean: 0.0025908296680122915
	Turkish: 0.0012479525778020435
	Danish: 0.0005137572782281082
	Portuguese: 0.0010785305021907652
	Polish: 0.0005958439881824276
('CCfCC', 'DTfDT', 'JJfJJ')
	English: 0.0003959158157949573
	Finnish: 0.0011289867344058708
	Korean: 0.00030125926372235944
	Turkish: 0.001949925902815693
	Danish: 0.0007611218936712714
	Portuguese: 0.0009437141894169194
	Polish: 0.000446882991

## Comparing discourse markers

In [24]:
def get_discourse_markers(speech):
    dm_list = []
    markers = [[t for t in u if t[1] == "DMfDM"] for u in speech]
    for m in markers:
        dm_list.extend(m)
        
    #return dm_list
    return dm_list

In [25]:
eng_dm = get_discourse_markers(eng_speech)
kor_dm = get_discourse_markers(kor_speech)
fin_dm = get_discourse_markers(fin_speech)
tur_dm = get_discourse_markers(tur_speech)
dan_dm = get_discourse_markers(dan_speech)
por_dm = get_discourse_markers(por_speech)
pol_dm = get_discourse_markers(pol_speech)

In [26]:
def get_dm_percent(speech, dm_list):
    total = 0
    for u in speech:
        total += len(u)
    
    return len(dm_list)/total

### Percent of discourse markers across L1s

In [27]:
dm_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

dm_df['L1=English'] = get_dm_percent(eng_speech, eng_dm)
dm_df['L1=Korean'] = get_dm_percent(kor_speech, kor_dm)
dm_df['L1=Finnish'] = get_dm_percent(fin_speech, fin_dm)
dm_df['L1=Turkish'] = get_dm_percent(tur_speech, tur_dm)
dm_df['L1=Danish'] = get_dm_percent(dan_speech, dan_dm)
dm_df['L1=Portuguese'] = get_dm_percent(por_speech, por_dm)
dm_df['L1=Polish'] = get_dm_percent(pol_speech, pol_dm)

dm_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
determiner proportions,0.014251,0.013229,0.012288,0.010293,0.01287,0.008287,0.017388


In [28]:
eng_dm_words = [dm[0].replace('\n', '') for dm in eng_dm]
kor_dm_words = [dm[0].replace('\n', '') for dm in kor_dm]
fin_dm_words = [dm[0].replace('\n', '') for dm in fin_dm]
tur_dm_words = [dm[0].replace('\n', '') for dm in tur_dm]
dan_dm_words = [dm[0].replace('\n', '') for dm in dan_dm]
por_dm_words = [dm[0].replace('\n', '') for dm in por_dm]
pol_dm_words = [dm[0].replace('\n', '') for dm in pol_dm]

In [29]:
eng_dm_freqs = nltk.FreqDist(eng_dm_words)
eng_dm_freqs

FreqDist({'like': 202,
          'look': 1,
          'right': 50,
          'so': 419,
          'well': 116,
          'whatever': 13})

In [30]:
kor_dm_freqs = nltk.FreqDist(kor_dm_words)
kor_dm_freqs

FreqDist({'like': 55, 'right': 33, 'so': 174, 'well': 27, 'whatever': 4})

In [31]:
fin_dm_freqs = nltk.FreqDist(fin_dm_words)
fin_dm_freqs

FreqDist({'like': 111, 'right': 10, 'so': 226, 'well': 72, 'whatever': 8})

In [32]:
tur_dm_freqs = nltk.FreqDist(tur_dm_words)
tur_dm_freqs

FreqDist({'like': 16, 'right': 15, 'so': 119, 'well': 13, 'whatever': 1})

In [33]:
dan_dm_freqs = nltk.FreqDist(dan_dm_words)
dan_dm_freqs

FreqDist({'like': 125, 'right': 30, 'so': 499, 'well': 137, 'whatever': 16})

In [34]:
por_dm_freqs = nltk.FreqDist(por_dm_words)
dm_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

dm_df['L1=English'] = get_dm_percent(eng_speech, eng_dm)
dm_df['L1=Korean'] = get_dm_percent(kor_speech, kor_dm)
dm_df['L1=Finnish'] = get_dm_percent(fin_speech, fin_dm)
dm_df['L1=Turkish'] = get_dm_percent(tur_speech, tur_dm)
dm_df['L1=Danish'] = get_dm_percent(dan_speech, dan_dm)
dm_df['L1=Portuguese'] = get_dm_percent(por_speech, por_dm)
dm_df['L1=Polish'] = get_dm_percent(pol_speech, pol_dm)

dm_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
determiner proportions,0.014251,0.013229,0.012288,0.010293,0.01287,0.008287,0.017388


In [35]:
pol_dm_freqs = nltk.FreqDist(pol_dm_words)
pol_dm_freqs

FreqDist({'like': 265,
          'look': 1,
          'right': 40,
          'so': 450,
          'well': 66,
          'whatever': 28})

In [36]:
discourse_markers = eng_dm_freqs.most_common()
discourse_markers = [b[0] for b in discourse_markers]

dm_words_df = pd.DataFrame(index = discourse_markers, columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

for word in discourse_markers:
    dm_words_df['L1=English'][word] = eng_dm_freqs[word]/sum(eng_dm_freqs.values())
    dm_words_df['L1=Korean'][word] = kor_dm_freqs[word]/sum(kor_dm_freqs.values())
    dm_words_df['L1=Finnish'][word] = fin_dm_freqs[word]/sum(fin_dm_freqs.values())
    dm_words_df['L1=Turkish'][word] = tur_dm_freqs[word]/sum(tur_dm_freqs.values())
    dm_words_df['L1=Danish'][word] = dan_dm_freqs[word]/sum(dan_dm_freqs.values())
    dm_words_df['L1=Portuguese'][word] = por_dm_freqs[word]/sum(por_dm_freqs.values())
    dm_words_df['L1=Polish'][word] = pol_dm_freqs[word]/sum(pol_dm_freqs.values())
    
dm_words_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
so,0.523096,0.593857,0.529274,0.72561,0.61834,0.745098,0.529412
like,0.252185,0.187713,0.259953,0.097561,0.154895,0.117647,0.311765
well,0.144819,0.0921502,0.168618,0.0792683,0.169765,0.0980392,0.0776471
right,0.062422,0.112628,0.0234192,0.0914634,0.0371747,0.0196078,0.0470588
whatever,0.0162297,0.0136519,0.0187354,0.00609756,0.0198265,0.0196078,0.0329412
look,0.00124844,0.0,0.0,0.0,0.0,0.0,0.00117647


### Getting distribution of specific discourse markers across specific L1s

In [37]:
def get_word_count(speech):
    total = 0
    for u in speech:
        total += len(u)
    return total

In [38]:
for dm in eng_dm_freqs:
    print(dm)
    print("\tEnglish: " + str(eng_dm_freqs[dm] / len(eng_dm_words)))
    print("\tFinnish: " + str(fin_dm_freqs[dm] / len(fin_dm_words)))
    print("\tKorean: " + str(kor_dm_freqs[dm] / len(kor_dm_words)))
    print("\tTurkish: " + str(tur_dm_freqs[dm] / len(tur_dm_words)))
    print("\tDanish: " + str(dan_dm_freqs[dm] / len(dan_dm_words)))
    print("\tPortuguese: " + str(por_dm_freqs[dm] / len(por_dm_words)))
    print("\tPolish: " + str(pol_dm_freqs[dm] / len(pol_dm_words)))

so
	English: 0.5230961298377028
	Finnish: 0.5292740046838408
	Korean: 0.5938566552901023
	Turkish: 0.725609756097561
	Danish: 0.6183395291201983
	Portuguese: 0.7450980392156863
	Polish: 0.5294117647058824
well
	English: 0.14481897627965043
	Finnish: 0.1686182669789227
	Korean: 0.09215017064846416
	Turkish: 0.07926829268292683
	Danish: 0.1697645600991326
	Portuguese: 0.09803921568627451
	Polish: 0.07764705882352942
like
	English: 0.25218476903870163
	Finnish: 0.25995316159250587
	Korean: 0.18771331058020477
	Turkish: 0.0975609756097561
	Danish: 0.15489467162329615
	Portuguese: 0.11764705882352941
	Polish: 0.31176470588235294
right
	English: 0.062421972534332085
	Finnish: 0.0234192037470726
	Korean: 0.11262798634812286
	Turkish: 0.09146341463414634
	Danish: 0.03717472118959108
	Portuguese: 0.0196078431372549
	Polish: 0.047058823529411764
whatever
	English: 0.016229712858926344
	Finnish: 0.01873536299765808
	Korean: 0.013651877133105802
	Turkish: 0.006097560975609756
	Danish: 0.0198265179

## Comparing Article/Determiner Use

In [39]:
#eng_speech -- (word, tag) tupes
#eng_tags -- list of tags

### Proportion of determiners

In [41]:
def num_words(li):
    count = 0
    for u in li:
        count += len(u)
    
    return count

In [42]:
def determiner_words(li):
    words = []
    
    for utterance in li:
        for pair in utterance:
            if pair[1] == "DTfDT":
                words.append(pair[0].replace("\n", ""))
    
    return words

In [43]:
det_df = pd.DataFrame(index = ['determiner proportions'], columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

eng_dets = determiner_words(eng_speech)
kor_dets = determiner_words(kor_speech)
fin_dets = determiner_words(fin_speech)
tur_dets = determiner_words(tur_speech)
dan_dets = determiner_words(dan_speech)
por_dets = determiner_words(por_speech)
pol_dets = determiner_words(pol_speech)

det_df['L1=English'] = len(eng_dets)/len(eng_toks)
det_df['L1=Korean'] = len(kor_dets)/len(kor_toks)
det_df['L1=Finnish'] = len(fin_dets)/len(fin_toks)
det_df['L1=Turkish'] = len(tur_dets)/len(tur_toks)
det_df['L1=Danish'] = len(dan_dets)/len(dan_toks)
det_df['L1=Portuguese'] = len(por_dets)/len(por_toks)
det_df['L1=Polish'] = len(pol_dets)/len(pol_toks)

det_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
determiner proportions,0.092807,0.085427,0.107176,0.106683,0.106743,0.107973,0.094908


### Comparing specfic determiner words

In [44]:
eng_det_words = nltk.FreqDist(eng_dets)
kor_det_words = nltk.FreqDist(kor_dets)
fin_det_words = nltk.FreqDist(fin_dets)
tur_det_words = nltk.FreqDist(tur_dets)
dan_det_words = nltk.FreqDist(dan_dets)
por_det_words = nltk.FreqDist(por_dets)
pol_det_words = nltk.FreqDist(pol_dets)

In [45]:

det_words = eng_det_words.most_common()
det_words = [b[0] for b in det_words]

det_words_df = pd.DataFrame(index = det_words, columns = ['L1=English', 'L1=Korean', 'L1=Finnish', 'L1=Turkish', 'L1=Danish', 'L1=Portuguese', 'L1=Polish'])

for word in det_words:
    det_words_df['L1=English'][word] = eng_det_words[word]/len(eng_toks)
    det_words_df['L1=Korean'][word] = kor_det_words[word]/len(kor_toks)
    det_words_df['L1=Finnish'][word] = fin_det_words[word]/len(fin_toks)
    det_words_df['L1=Turkish'][word] = tur_det_words[word]/len(tur_toks)
    det_words_df['L1=Danish'][word] = dan_det_words[word]/len(dan_toks)
    det_words_df['L1=Portuguese'][word] = por_det_words[word]/len(por_toks)
    det_words_df['L1=Polish'][word] = pol_det_words[word]/len(pol_toks)
det_words_df

Unnamed: 0,L1=English,L1=Korean,L1=Finnish,L1=Turkish,L1=Danish,L1=Portuguese,L1=Polish
the,0.0406318,0.0398819,0.0539406,0.0626998,0.0489002,0.0549302,0.0509173
a,0.0178155,0.0127719,0.0151697,0.0140373,0.0213677,0.0163106,0.011668
that,0.0147108,0.0094584,0.0138997,0.0113858,0.0148984,0.00660511,0.00638018
this,0.00683447,0.00885596,0.0102307,0.00600484,0.00808661,0.0137494,0.0140513
some,0.00254209,0.00668715,0.00225781,0.00327536,0.0026448,0.00431354,0.00245773
an,0.00245874,0.000421712,0.00201087,0.00116977,0.00323464,0.00155018,0.00109233
all,0.00195866,0.00174709,0.00201087,0.00194962,0.00213106,0.00269596,0.00181227
these,0.0018128,0.000903669,0.00268115,0.0015597,0.00121775,0.00141538,0.000744768
any,0.000958493,0.00126514,0.00105835,0.00124776,0.000932339,0.00155018,0.00131576
those,0.000812635,0.000662691,0.00116419,0.00132574,0.00078012,0.00128058,0.00139023
