In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from collections import Counter

In [None]:
# Load full_data.csv, keep only the columns used below, and add
# `diff` = lstm_surp - ngram_surp (positive when the LSTM surprisal is the
# larger of the two).
# The column is added with `.assign` inside one chain rather than written into
# the result of a column selection, which avoids pandas'
# SettingWithCopyWarning and keeps the cell idempotent on re-run.
data = (
    pd.read_csv("full_data.csv")
    .loc[:, ['code', 'token', 'lstm_surp', 'ngram_surp', 'mean_rt']]
    .assign(diff=lambda frame: frame['lstm_surp'] - frame['ngram_surp'])
)
data.head()

Unnamed: 0,code,token,lstm_surp,ngram_surp,mean_rt,diff
0,17000,In,5.506053,4.57937,380.275294,0.926683
1,17002,County,12.514248,12.6541,296.042941,-0.139852
2,17004,near,13.308077,12.2238,403.553529,1.084277
3,17005,the,1.662673,1.98095,306.075882,-0.318277
4,17006,River,17.830992,15.709,289.048235,2.121992


In [None]:
# Flag rows whose `diff` lies more than three standard deviations away from
# the mean `diff`, separately on each side.
diff_mean = data['diff'].mean()
diff_std = data['diff'].std()
lower_bound = diff_mean - 3 * diff_std
upper_bound = diff_mean + 3 * diff_std
# Below the lower bound: n-gram surprisal is far larger than LSTM surprisal.
ngram_high = data.loc[data['diff'].lt(lower_bound)]
# Above the upper bound: LSTM surprisal is far larger than n-gram surprisal.
lstm_high = data.loc[data['diff'].gt(upper_bound)]

In [None]:
# Tab-separated surprisal table; the preview below shows one row per token
# with its sentence_id and token_id.
# NOTE(review): absolute path, presumably a Colab working directory - confirm
# before running elsewhere.
surp = pd.read_csv("/content/5gram_surprisals.tsv", sep="\t")
surp.head()

Unnamed: 0,sentence_id,token_id,token,surprisal
0,1,1,In,4.57937
1,1,2,<unk>,7.45049
2,1,3,County,12.6541
3,1,4,<unk>,6.11317
4,1,5,near,12.2238


In [None]:
# Attach sentence_id / token_id to each flagged row by matching on the token
# string together with its n-gram surprisal value.
# NOTE(review): (token, surprisal) is not a unique key - the outputs below show
# the same flagged row matched to more than one sentence (e.g. 'eyes' in
# sentences 10 and 36). drop_duplicates() only removes fully identical rows.
join_keys = {
    'left_on': ['token', 'ngram_surp'],
    'right_on': ['token', 'surprisal'],
}
merged_lstm_high = lstm_high.merge(surp, **join_keys).drop_duplicates()
merged_ngram_high = ngram_high.merge(surp, **join_keys).drop_duplicates()

In [None]:
# Show the rows with `diff` above the upper bound, joined with their
# sentence_id / token_id from `surp`.
merged_lstm_high

Unnamed: 0,code,token,lstm_surp,ngram_surp,mean_rt,diff,sentence_id,token_id,surprisal
0,17112,Today,19.301353,10.21250,355.517647,9.088853,7,1,10.21250
1,17116,may,18.108437,9.87549,291.488235,8.232947,7,5,9.87549
2,17127,corps,15.566020,8.12025,443.555294,7.445770,7,16,8.12025
3,17186,eyes,14.340526,6.21132,346.834706,8.129206,10,7,6.21132
4,17186,eyes,14.340526,6.21132,346.834706,8.129206,36,11,6.21132
...,...,...,...,...,...,...,...,...,...
940,33713,By,15.465153,8.26237,250.293000,7.202783,405,1,8.26237
941,33713,By,15.465153,8.26237,250.293000,7.202783,450,1,8.26237
942,33806,I,16.159555,7.84603,295.226500,8.313525,411,3,7.84603
943,35566,South,10.930376,1.42276,294.574583,9.507616,453,14,1.42276


In [None]:
# Show the rows with `diff` below the lower bound, joined with their
# sentence_id / token_id from `surp`.
merged_ngram_high

Unnamed: 0,code,token,lstm_surp,ngram_surp,mean_rt,diff,sentence_id,token_id,surprisal
0,26125,feet,6.801941,15.4321,285.367619,-8.630159,193,12,15.4321
1,26456,hundred,9.388918,16.301,298.812857,-6.912082,214,15,16.301
2,29500,Guard,10.594954,17.6245,375.209375,-7.029546,301,10,17.6245
3,30790,keep,7.966421,14.9286,282.741875,-6.962179,326,9,14.9286
4,30819,As,7.996505,16.3629,336.030625,-8.366395,327,22,16.3629
5,32061,The,3.103844,13.6888,265.5175,-10.584956,339,32,13.6888
6,32061,The,3.103844,13.6888,265.5175,-10.584956,440,26,13.6888


In [None]:
def get_sentences(diff, data=None):
    """Collect the sentences that contain the most flagged tokens.

    Counts how many rows of ``diff`` belong to each sentence, keeps the
    sentences whose count is at least the mean count, and rebuilds each kept
    sentence by joining its tokens from ``data`` with single spaces.

    Parameters
    ----------
    diff : pandas.DataFrame
        Flagged rows; needs ``sentence_id`` and ``token`` columns.
    data : pandas.DataFrame, optional
        Full token table with ``sentence_id`` and ``token`` columns. When
        omitted, the notebook-level ``surp`` frame is used.

    Returns
    -------
    sentences : list of str
        One space-joined sentence per selected sentence id.
    problematic_words : dict
        Maps each selected sentence id to a one-element list holding the set
        of flagged tokens in that sentence. Keys are in the same order as
        ``sentences``, so the two can be paired by position.
    """
    if data is None:
        # Looked up at call time rather than bound as a default at definition
        # time, so a re-loaded `surp` is picked up and the function can be
        # defined before the `surp` cell has run.
        data = surp

    occurrences = Counter(diff.sentence_id.tolist())
    if not occurrences:
        # Nothing flagged: avoid taking the mean of an empty collection.
        return [], {}

    mean_occ = sum(occurrences.values()) / len(occurrences)
    # Counter preserves first-seen order, so the selection follows the order
    # in which sentences appear in `diff` instead of arbitrary set order.
    selected_ids = [
        sentence_id
        for sentence_id, count in occurrences.items()
        if count >= mean_occ
    ]

    sentences = []
    problematic_words = {}
    for sentence_id in selected_ids:
        flagged_tokens = diff.loc[diff.sentence_id == sentence_id, 'token']
        # Kept as a list wrapping a single set to preserve the return shape
        # that print_sentences displays.
        problematic_words[sentence_id] = [set(flagged_tokens)]
        sentence_tokens = data.loc[data.sentence_id == sentence_id, 'token']
        sentences.append(' '.join(sentence_tokens))
    return sentences, problematic_words

In [None]:
def print_sentences(sentences, problematic_words):
    """Print every sentence under a ruler line, with its flagged words.

    The values of ``problematic_words`` are paired with ``sentences`` by
    position: the k-th sentence is printed with the k-th dictionary value.
    Sentences are numbered starting from 1.
    """
    flagged_by_position = [*problematic_words.values()]
    for number, sentence in enumerate(sentences, start=1):
        print("_______________________________________________________")
        print(f"Surprising Words: {flagged_by_position[number - 1]}")
        print(f"{number}) {sentence}")

In [None]:
# Build example sentences (and their flagged tokens) for each side; the token
# table defaults to `surp` via get_sentences' `data` parameter.
ngram_higher_surp, ngram_words = get_sentences(merged_ngram_high)
lstm_higher_surp, lstm_words = get_sentences(merged_lstm_high)


In [None]:
# Number of sentences selected from the rows below the lower bound.
len(ngram_higher_surp)

7

In [None]:
# Number of sentences selected from the rows above the upper bound.
len(lstm_higher_surp)

63

In [None]:
# Sentences containing tokens whose n-gram surprisal far exceeds the LSTM's.
print_sentences(ngram_higher_surp, ngram_words)

_______________________________________________________
Surprising Words: [{'feet'}]
1) <unk> a tower and a <unk> on the church a million feet <unk> </s>
_______________________________________________________
Surprising Words: [{'keep'}]
2) She is a closed <unk> a picture I keep on my <unk> but never look <unk> </s>
_______________________________________________________
Surprising Words: [{'As'}]
3) If <unk> kept on as <unk> been <unk> the story <unk> told <unk> would probably have been true by <unk> <unk> As <unk> <unk> bright smile greeted me at the breakfast <unk> </s>
_______________________________________________________
Surprising Words: [{'Guard'}]
4) Though the <unk> airports were <unk> the York State Guard <unk> proved unable to keep any kind of mail <unk> </s>
_______________________________________________________
Surprising Words: [{'The'}]
5) <unk> roaring up in great <unk> clouds of smoke and <unk> the fierce heat quickly drove us to the <unk> where we huddled like <un

In [None]:
# Sentences containing tokens whose LSTM surprisal far exceeds the n-gram's.
print_sentences(lstm_higher_surp, lstm_words)

_______________________________________________________
Surprising Words: [{'He'}]
1) He knew just the thing for her -- a treatment from his <unk> light ozone <unk> <unk> </s>
_______________________________________________________
Surprising Words: [{'He'}]
2) He tilted his <unk> face toward the dry bed of the <unk> </s>
_______________________________________________________
Surprising Words: [{'She'}]
3) She <unk> </s>
_______________________________________________________
Surprising Words: [{'She'}]
4) She held out her hand to show that she had <unk> </s>
_______________________________________________________
Surprising Words: [{'She'}]
5) She <unk> </s>
_______________________________________________________
Surprising Words: [{'He'}]
6) He urged her to buy one of his machines -- for <unk> </s>
_______________________________________________________
Surprising Words: [{'He'}]
7) He then sold her minerals to cure her kidney <unk> a can of <unk> <unk> make her look like a girl <un