In [19]:
# Imports for the whole notebook.
import pickle
import nltk
from nltk.corpus import stopwords
# English stop words from NLTK, held in a set; read by repeated_stopwords below.
stopWords = set(stopwords.words('english'))

from IPython.core.interactiveshell import InteractiveShell
# "all" makes a cell echo every top-level expression value, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Get the tokens and the tagged tokens for each corpus from their respective
# pickle files.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    The file is opened in a ``with`` block so the handle is closed even when
    unpickling raises an exception.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


VOICE_toks = _load_pickle('VOICE_tokenized.p')
BNC_toks = _load_pickle('BNC_tokenized.p')
VOICE_tags = _load_pickle('VOICE_tagged.p')
BNC_tags = _load_pickle('BNC_tagged.p')

In [3]:
def get_bigrams(dictionary):
    """Collect every adjacent-item pair found in a two-level mapping.

    Args:
        dictionary: mapping whose values are themselves mappings; every inner
            value is a sequence handed to ``nltk.bigrams``.
            (presumably file -> conversation -> tokens; confirm against the
            pickled data.)

    Returns:
        One flat list holding the bigrams of every inner sequence, in
        iteration order.
    """
    collected = []
    for per_file in dictionary.values():
        for tokens in per_file.values():
            # list += iterable extends in place, consuming the bigram generator
            collected += nltk.bigrams(tokens)
    return collected

In [4]:
def repeated_words(bigrams):
    """Count the bigrams whose two words are identical.

    Args:
        bigrams: iterable of bigrams, e.g. ``('i', 'i')`` or ``('the', 'cat')``.
            Each bigram must be hashable because it is used as a dict key.

    Returns:
        dict mapping every repeated-word bigram (e.g. ``('i', 'i')``,
        ``('the', 'the')``) to the number of times it occurs in ``bigrams``.
        Bigrams whose two words differ are ignored; empty input gives ``{}``.
    """
    repeated = {}
    for b in bigrams:
        if b[0] == b[1]:
            repeated[b] = repeated.get(b, 0) + 1
    return repeated

"\nTakes a list of bigrams, returns dictionary whose keys are bigrams containing duplicate words (e.g ('i', 'i',), \n'the', 'the') and whose values are the frequencies of each bigram\n"

In [5]:
def repeated_stopwords(bigram_dict, stop_words=None):
    """Sum the frequencies of repeated-word bigrams whose word is a stop word.

    Args:
        bigram_dict: dict mapping repeated-word bigrams (e.g. ``('i', 'i')``,
            ``('the', 'the')``) to their frequencies.
        stop_words: optional collection of stop words to test against. When
            omitted, the notebook-level ``stopWords`` set is used.

    Returns:
        The total frequency of the bigrams whose first word is a stop word
        (only the first word is checked). An empty dict gives 0.
    """
    if stop_words is None:
        stop_words = stopWords
    return sum(
        count for bigram, count in bigram_dict.items() if bigram[0] in stop_words
    )

"\nTakes a dictionary of bigrams whose keys are bigrams containing duplicate words (e.g ('i', 'i',), \n'the', 'the') and whose values are the frequencies of each bigram. Returns a dictionary of the same format, but\nonly includes bigrams that contain a stopword.\n"

In [6]:
def tag_counts(conv_dictionary, tag):
    """Count how often each item carrying a given tag occurs.

    Args:
        conv_dictionary: two-level mapping whose innermost values are
            sequences of pairs; the second element of each pair is compared
            with ``tag``. (presumably file -> conversation -> (word, tag)
            pairs; confirm against the pickled data.)
        tag: the tag to select.

    Returns:
        dict keyed by the whole matching pair (e.g. ``('er', 'UHfUH')``), not
        by the bare word, with the number of occurrences as the value.
    """
    tags = {}
    for conversations in conv_dictionary.values():
        for pairs in conversations.values():
            for pair in pairs:
                if pair[1] == tag:
                    tags[pair] = tags.get(pair, 0) + 1
    return tags

'\nTakes a dictionary containing conversations and a user-provided tag. Returns a dictionary who keys are words that are \nassocited with the tag, and whose values are the frequencies of each word\n'

In [7]:
def frequent_bigrams(bigrams, percent):
    """Return the most frequent bigrams that stay below a share of all occurrences.

    Bigrams are visited from most to least frequent while a running total of
    their counts is kept. A bigram is included only while the running total,
    including that bigram, is still below ``len(bigrams) * percent``.
    For example, the BNC has 9378696 bigram occurrences, so ``percent=0.2``
    puts the cut-off at about 1875739 occurrences (occurrences, not entries).

    Args:
        bigrams: list of bigrams; repeats are counted, so items must be
            hashable.
        percent: fraction of the total number of occurrences, normally
            between 0 and 1.

    Returns:
        A list of ``(bigram, count)`` tuples in descending order of count
        (a list, not a dictionary). Empty input gives ``[]``; if the cut-off
        is never reached (e.g. ``percent`` greater than 1) every bigram is
        returned.
    """
    limit = len(bigrams) * percent
    num_occurrences = 0
    top_bigrams = []

    # most_common() sorts the whole distribution, so call it only once.
    for entry in nltk.FreqDist(bigrams).most_common():
        num_occurrences += entry[1]
        if num_occurrences >= limit:
            break
        top_bigrams.append(entry)
    return top_bigrams

'\nTakes a list of bigrams and a percentage between 0 and 1. Returns a dictionary whose keys are bigrams and whose values\nare the frequencies of the bigrams. The bigrams included in the dictionary are only the ones that occur in the top \ngiven percent. For example, if the user inputs BNC_bigrams 0.3, the returned dictionary will occur the top 30% of \nbigrams in terms of their frequence. The BNC has 9378696 bigram occurrences, so the dictionary will contain around \n1875739 entries.\n'

In [36]:
# Most frequent VOICE bigrams, selected by frequent_bigrams with a 0.2 share.
# NOTE(review): VOICE_bigrams is only assigned in a cell further down this
# notebook (In [8]); that cell has to run before this one.
VOICE_top_bigrams = frequent_bigrams(VOICE_bigrams, .2)

In [38]:
# Most frequent BNC bigrams, selected by frequent_bigrams with a 0.2 share.
# NOTE(review): BNC_bigrams is only assigned in a cell further down this
# notebook (In [9]); that cell has to run before this one.
BNC_top_bigrams = frequent_bigrams(BNC_bigrams, .2)

In [39]:
# Keep only the bigram from each (bigram, count) entry.
BNC_top = [bigram for bigram, _count in BNC_top_bigrams]
VOICE_top = [bigram for bigram, _count in VOICE_top_bigrams]

In [40]:
# Bigrams that are in one corpus's top list but not in the other's.
# Membership is tested against sets so each lookup does not scan the other
# list; the order of the resulting lists is unchanged.
VOICE_top_set = set(VOICE_top)
BNC_top_set = set(BNC_top)
BNC_not_VOICE = [b for b in BNC_top if b not in VOICE_top_set]
VOICE_not_BNC = [b for b in VOICE_top if b not in BNC_top_set]

In [41]:
# Show the bigrams in the VOICE top list that are absent from the BNC top list.
VOICE_not_BNC

[('yeah', 'yeah'),
 ('er', 'er'),
 ('we', 'are'),
 ('to', 'to'),
 ('in', 'in'),
 ('[', 'first'),
 ('mhm', 'mhm'),
 ('kind', 'of'),
 ('they', 'are'),
 ('but', 'er'),
 ('that', 'er'),
 ('er', 'we'),
 ('of', 'er'),
 ('er', 'in'),
 ('yes', 'yes'),
 ('hh', 'er'),
 ('is', 'not'),
 ('for', 'example'),
 ('we', 'we'),
 (']', '['),
 ('so', 'it'),
 ('hh', 'and'),
 ('you', 'are'),
 ('yeah', 'but'),
 ('[', 'org1'),
 ('org1', ']'),
 ('should', 'be'),
 ('er', 'it'),
 ('so', 'we'),
 ('have', 'the'),
 ('is', 'er'),
 ('er', '['),
 ("'s", 'er'),
 ('they', 'have'),
 ('we', 'will'),
 ('yah', 'yah'),
 ('the', '['),
 ('let', "'s"),
 ('x', 'x'),
 (']', 'and'),
 ('and', 'this'),
 ('because', 'i'),
 ('what', 'is'),
 ('in', 'this'),
 ('then', 'we'),
 ('er', 'and'),
 ('a', 'a'),
 ('we', 'should'),
 ('and', 'so'),
 ('[', 'org2'),
 ('org2', ']'),
 ('er', 'you'),
 ('it', 'it'),
 ('to', 'make'),
 ('you', 'you'),
 ('of', 'of'),
 ('er', 'to')]

In [42]:
# Show the bigrams in the BNC top list that are absent from the VOICE top list.
BNC_not_VOICE

[("'ve", 'got'),
 ('i', "'ve"),
 ('he', "'s"),
 ('you', "'ve"),
 ('i', "'ll"),
 ('gon', 'na'),
 ('we', "'ve"),
 ('well', 'i'),
 ('she', "'s"),
 ("n't", 'it'),
 ('is', "n't"),
 ('used', 'to'),
 ('what', "'s"),
 ('to', 'get'),
 ('i', 'said'),
 ('a', 'bit'),
 ('to', 'go'),
 ('you', 'see'),
 ('and', 'he'),
 ('have', "n't"),
 ('got', 'a'),
 ('that', 'was'),
 ('one', 'of'),
 ('was', 'a'),
 ('he', 'was'),
 ("n't", 'you'),
 ('they', 'were'),
 ('there', 'was'),
 ('got', 'to'),
 ('i', "'d"),
 ('are', 'you'),
 ('that', 'i'),
 ('i', 'did'),
 ('for', 'a'),
 ('did', 'you'),
 ('would', "n't"),
 ('they', "'ve"),
 ("'s", 'got'),
 ('we', "'ll"),
 ('wo', "n't"),
 ("'s", 'right'),
 ('have', 'you'),
 ('what', 'you'),
 ('i', 'thought'),
 ('of', 'a'),
 ('out', 'of'),
 ('look', 'at'),
 ('was', "n't"),
 ('he', 'said'),
 ('oh', 'i'),
 ("n't", 'think'),
 ('had', 'a'),
 ("'re", 'not'),
 ("'ll", 'be'),
 ('if', 'i'),
 ('you', 'get'),
 ('by', 'the'),
 ('when', 'i'),
 ('of', 'them'),
 ("'m", 'not'),
 ('erm', 'i'),
 (

In [8]:
# Flat list of all bigrams built from the VOICE token data.
# NOTE(review): the frequent_bigrams cell (In [36]) that appears earlier in
# this notebook reads VOICE_bigrams, so this cell has to run first.
VOICE_bigrams = get_bigrams(VOICE_toks)

In [9]:
# Flat list of all bigrams built from the BNC token data.
# NOTE(review): the frequent_bigrams cell (In [38]) that appears earlier in
# this notebook reads BNC_bigrams, so this cell has to run first.
BNC_bigrams = get_bigrams(BNC_toks)

In [10]:
# Total number of bigram occurrences in each corpus (VOICE first, then BNC).
len(VOICE_bigrams)
len(BNC_bigrams)

555247

9378696

In [13]:
# Frequency of each repeated-word bigram (such as ('i', 'i')) per corpus.
VOICE_repeated_words = repeated_words(VOICE_bigrams)
BNC_repeated_words = repeated_words(BNC_bigrams)

In [15]:
# Total number of repeated-word bigram occurrences per corpus.
VOICE_total_repetitions =  sum(VOICE_repeated_words.values())
BNC_total_repetitions = sum(BNC_repeated_words.values())

In [16]:
# Show the repetition totals (VOICE first, then BNC).
VOICE_total_repetitions
BNC_total_repetitions

16153

101555

In [17]:
# Share of all bigram occurrences that are repeated-word bigrams, per corpus.
VOICE_total_repetitions/len(VOICE_bigrams)
BNC_total_repetitions/len(BNC_bigrams)

0.02909155745100829

0.010828264398376917

In [23]:
# Print the 20 most frequent repeated-word bigrams in VOICE with their counts.
for bigram, frequency in sorted(VOICE_repeated_words.items(), key=lambda item: item[1], reverse=True)[:20]:
    print(bigram, frequency)

('yeah', 'yeah') 1264
('the', 'the') 1206
('er', 'er') 1123
('i', 'i') 1042
('no', 'no') 867
('to', 'to') 693
('in', 'in') 622
('mhm', 'mhm') 563
('and', 'and') 518
('yes', 'yes') 487
('we', 'we') 434
('yah', 'yah') 384
('x', 'x') 362
('a', 'a') 320
('it', 'it') 298
('you', 'you') 294
('of', 'of') 294
('that', 'that') 216
('xx', 'xx') 191
('for', 'for') 182


In [24]:
# Print the 20 most frequent repeated-word bigrams in the BNC with their counts.
for bigram, frequency in sorted(BNC_repeated_words.items(), key=lambda item: item[1], reverse=True)[:20]:
    print(bigram, frequency)

('i', 'i') 8598
('the', 'the') 6722
('that', 'that') 3953
('no', 'no') 3948
('and', 'and') 3511
('er', 'er') 3448
('a', 'a') 3228
('mm', 'mm') 3197
('it', 'it') 2940
('you', 'you') 2922
('in', 'in') 2486
('yeah', 'yeah') 2248
('to', 'to') 2015
('we', 'we') 2004
('yes', 'yes') 1861
('they', 'they') 1742
('he', 'he') 1670
('is', 'is') 1571
('very', 'very') 1437
('what', 'what') 1342


In [20]:
# Total frequency of repeated-word bigrams whose word is a stop word, per corpus.
VOICE_repeated_stopwords = repeated_stopwords(VOICE_repeated_words)
BNC_repeated_stopwords = repeated_stopwords(BNC_repeated_words)

In [25]:
# Share of repeated-word occurrences in which the repeated word is a stop word.
VOICE_repeated_stopwords/VOICE_total_repetitions
BNC_repeated_stopwords/BNC_total_repetitions

0.6039125858973565

0.6675889911870415

In [31]:
# Show the hesitation totals (VOICE first, then BNC).
# NOTE(review): both names are assigned in a cell that appears further down
# this notebook (In [30]); that cell has to run before this one.
VOICE_num_hesitations
BNC_num_hesitations

44012

231847

In [27]:
# Tag passed to tag_counts to select hesitations in each corpus:
#   VOICE: UHfUH
#   BNC:   UNC
# NOTE(review): the UNC keys shown in this notebook's output include items such
# as 'creme', 'per' and 'cent', so UNC looks broader than hesitations - confirm.
VOICE_hesitations = tag_counts(VOICE_tags, "UHfUH")
BNC_hesitations = tag_counts(BNC_tags, "UNC")

In [30]:
# Total number of occurrences counted under the selected tag, per corpus.
VOICE_num_hesitations = sum(VOICE_hesitations.values())
BNC_num_hesitations = sum(BNC_hesitations.values())

In [28]:
# Show the distinct (word, tag) pairs counted for VOICE.
VOICE_hesitations.keys()

dict_keys([('er', 'UHfUH'), ('oh', 'UHfUH'), ('ah', 'UHfUH'), ('erm', 'UHfUH'), ('pf', 'UHfUH'), ('oops', 'UHfUH'), ('haeh', 'UHfUH'), ('wow', 'UHfUH'), ('sh', 'UHfUH'), ('ooph', 'UHfUH'), ('ur', 'UHfUH'), ('yo', 'UHfUH'), ('whoohoo', 'UHfUH'), ('yuck', 'UHfUH'), ('huh', 'UHfUH'), ('oh-oh', 'UHfUH'), ('poah', 'UHfUH'), ('ts', 'UHfUH'), ('ow', 'UHfUH'), ('oow', 'UHfUH'), ('innit', 'UHfUH'), ('yipee', 'UHfUH'), ('mm', 'UHfUH'), ('ha', 'UHfUH'), ('yay', 'UHfUH'), ('uh', 'UHfUH'), ('ouch', 'UHfUH'), ('psh', 'UHfUH'), ('eh', 'UHfUH')])

In [29]:
# Show the distinct (word, tag) pairs counted for the BNC.
BNC_hesitations.keys()

dict_keys([('erm', 'UNC'), ('er', 'UNC'), ("'s", 'UNC'), ('be', 'UNC'), ('th-', 'UNC'), ('com', 'UNC'), ('gu', 'UNC'), ('di', 'UNC'), ('creme', 'UNC'), ('non', 'UNC'), ('lieu', 'UNC'), ('en', 'UNC'), ('int', 'UNC'), ('ma', 'UNC'), ('te', 'UNC'), ('cur', 'UNC'), ('in', 'UNC'), ('s', 'UNC'), ('pa', 'UNC'), ('si', 'UNC'), ('wh', 'UNC'), ('ac', 'UNC'), ('thi', 'UNC'), ('st', 'UNC'), ("'", 'UNC'), ('papier', 'UNC'), ('mache', 'UNC'), ('mark', 'UNC'), ('p', 'UNC'), ('walk', 'UNC'), ('na', 'UNC'), ('theat', 'UNC'), ('counc', 'UNC'), ('six', 'UNC'), ('per', 'UNC'), ('cent', 'UNC'), ('org', 'UNC'), ('oth', 'UNC'), ('ver', 'UNC'), ('let', 'UNC'), ('i', 'UNC'), ('wro', 'UNC'), ('t', 'UNC'), ('pet', 'UNC'), ('even', 'UNC'), ('pub', 'UNC'), ('ev', 'UNC'), ('dor', 'UNC'), ('re', 'UNC'), ('pay', 'UNC'), ('criminalisa', 'UNC'), ('met', 'UNC'), ('execu', 'UNC'), ('kath', 'UNC'), ('ha', 'UNC'), ('go', 'UNC'), ('corres', 'UNC'), ('wom', 'UNC'), ('sep', 'UNC'), ('va', 'UNC'), ('mon', 'UNC'), ('mem', 'UNC'

In [32]:
# Print every VOICE (word, tag) pair with its count, most frequent first.
for tagged_word, frequency in sorted(VOICE_hesitations.items(), key=lambda item: item[1], reverse=True):
    print(tagged_word, frequency)

('er', 'UHfUH') 33806
('erm', 'UHfUH') 7176
('oh', 'UHfUH') 1226
('ah', 'UHfUH') 882
('huh', 'UHfUH') 345
('haeh', 'UHfUH') 141
('wow', 'UHfUH') 117
('ooph', 'UHfUH') 79
('pf', 'UHfUH') 44
('ur', 'UHfUH') 32
('mm', 'UHfUH') 31
('poah', 'UHfUH') 22
('oow', 'UHfUH') 20
('oops', 'UHfUH') 15
('uh', 'UHfUH') 13
('whoohoo', 'UHfUH') 12
('ha', 'UHfUH') 12
('oh-oh', 'UHfUH') 10
('ts', 'UHfUH') 6
('sh', 'UHfUH') 5
('ow', 'UHfUH') 5
('yay', 'UHfUH') 4
('ouch', 'UHfUH') 2
('psh', 'UHfUH') 2
('yo', 'UHfUH') 1
('yuck', 'UHfUH') 1
('innit', 'UHfUH') 1
('yipee', 'UHfUH') 1
('eh', 'UHfUH') 1


In [34]:
# Print the 20 most frequent BNC (word, tag) pairs with their counts.
for tagged_word, frequency in sorted(BNC_hesitations.items(), key=lambda item: item[1], reverse=True)[:20]:
    print(tagged_word, frequency)

('er', 'UNC') 88354
('erm', 'UNC') 62352
("'s", 'UNC') 16580
('th', 'UNC') 3771
('i', 'UNC') 3204
('ai', 'UNC') 2291
('a', 'UNC') 2061
('s', 'UNC') 1836
("'", 'UNC') 1451
('w', 'UNC') 1364
('yo', 'UNC') 1059
('o', 'UNC') 836
('per', 'UNC') 830
('y', 'UNC') 806
('t', 'UNC') 804
('la', 'UNC') 767
('an', 'UNC') 762
('we', 'UNC') 761
('cent', 'UNC') 722
('wh', 'UNC') 693
