In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
import seaborn as sns
from scipy.stats import pearsonr
import time
import string
from convokit import Corpus, download
import ast
from datasets import load_dataset

In [None]:
# URL's
# https://www.cs.cornell.edu/~cristian/Cornell_Movie-Dialogs_Corpus.html
# https://convokit.cornell.edu/documentation/tutorial.html

In [None]:
NUM_OF_EPOCHS = 40

In [None]:
start = time.time()

# Closed RNN:

## Data

In [None]:
# NOTE(review): GDRIVE_DIR is not referenced by any cell visible in this notebook, and the
# cells below write to the relative "gdrive/" directory rather than "/gdrive" — confirm
# whether this constant is still needed.
GDRIVE_DIR = "/gdrive"

In [None]:
%%bash
git clone https://github.com/omershubi/neural-complexity.git rnn
mkdir -p rnn/data/ptb

In [None]:
%%bash
mkdir -p gdrive/
mkdir -p gdrive/corpus_data/

In [None]:
!wget -qO rnn/data/ptb/ptb_tok_train.txt https://gist.githubusercontent.com/omershubi/cdd4231472d6188f03ab21e2b2729fee/raw/e1b4c764561fd038470830534baaa220b0eb4c6d/ptb_tok_train.txt
!wget -qO rnn/data/ptb/ptb_tok_dev.txt https://gist.githubusercontent.com/omershubi/31eff71b74dfb8cfe93d1a9acf8ab523/raw/094d3094b06beb92cd7fd0496710cf43273f8c64/ptb_tok_dev.txt

In [None]:
!cp rnn/data/ptb/* "gdrive/corpus_data/"

## OG RNN

In [None]:
checkpoint_to_use = f"../gdrive/ptb_model.pt"

In [None]:
!cd rnn && python main.py --cuda --model_file "../gdrive/ptb_model.pt" \
    --epochs "$NUM_OF_EPOCHS" \
    --vocab_file "../gdrive/ptb_vocab.txt" \
    --tied --data_dir "../gdrive/corpus_data" --trainfname ptb_tok_train.txt --validfname ptb_tok_dev.txt

In [None]:
!cd rnn && python main.py --cuda --model_file "$checkpoint_to_use" \
    --vocab_file "../gdrive/ptb_vocab.txt" --data_dir './data' \
    --testfname 'brown.txt' --test --words --nopp > "../gdrive/rnn_surprisals.tsv"

## Harmonizing:

In [None]:
# 5-gram surprisals (tab-separated) downloaded from a pinned gist revision.
ngram = pd.read_csv('https://gist.githubusercontent.com/omershubi/f19f77f5157f7ba7ea1adf72a72847da/raw/d5d553b1217ea70fe3261ce5d9a0532f29769817/5gram_surprisals.tsv', index_col=False, sep='\t')
# Self-paced-reading records, sorted by `code` because harmonize() pairs rows by position.
readingTimes = pd.read_csv('https://gist.githubusercontent.com/omershubi/01b55eab89b81dc882055e0d27d61016/raw/046dbb7f0586b5dc1a368ee882f2cb923caad3df/brown-spr-data-for-pset.csv', index_col=0).sort_values(by='code')
# RNN output written by the `--test --words` run above (space-separated).
# NOTE(review): pandas' default NA parsing turns literal words such as "null" or "NA" into NaN —
# confirm the test text contains none, otherwise harmonize() will fail on them.
rnn = pd.read_csv(f'./gdrive/rnn_surprisals.tsv',sep=' ')
# Use the same column names as the n-gram table so both can go through harmonize().
rnn.rename(columns={'word': 'token'}, inplace=True)
rnn.rename(columns={'surp': 'surprisal'}, inplace=True)

In [None]:
def remove_punctuations(text: str) -> str:
    """Return ``text`` with every ASCII punctuation character removed.

    The out-of-vocabulary marker ``'<unk>'`` is passed through unchanged so it
    can still be recognised after cleaning.
    """
    is_oov_marker = text == '<unk>'
    if not is_oov_marker:
        punctuation_table = str.maketrans('', '', string.punctuation)
        text = text.translate(punctuation_table)
    return text

def harmonize(rt_data: pd.DataFrame, surprs_data: pd.DataFrame) -> pd.DataFrame:
    """Attach model surprisals to the self-paced-reading records.

    The reading-time table is reduced to one row per distinct word entry,
    multi-word entries are split into single tokens, and the result is paired
    with the surprisal table *by row position*.  Rows that came from multi-word
    entries and rows whose token is ``'<unk>'`` are then discarded, and the
    surviving rows are joined back onto every reading-time record that shares
    the same ``code``.

    Args:
        rt_data: Reading-time records containing at least the columns
            ``subject``, ``word_in_exp``, ``time``, ``word``, ``code``,
            ``text_id`` and ``text_pos``.  The frame is not modified.
        surprs_data: Per-token model output with a ``token`` column and either
            ``sentence_id``/``token_id`` columns or ``entropy``/``entred``
            columns.  The frame is not modified.

    Returns:
        The surviving token rows inner-joined with ``rt_data`` on ``code``.

    Raises:
        KeyError: If one of the columns named above is missing.
    """
    # One row per distinct word entry.  The trailing .copy() detaches the frame
    # so the column assignments below never write into a view.
    crt_data = (
        rt_data
        .drop(columns=['subject', 'word_in_exp', 'time'])
        .drop_duplicates()
        .copy()
    )

    # End-of-sentence markers are dropped before pairing.  Copying the filtered
    # slice avoids pandas' SettingWithCopyWarning when 'token' is rewritten.
    csurprs_data = surprs_data[surprs_data['token'] != '</s>'].copy()

    # Remember which entries held several words, then split them so that every
    # row carries exactly one token.
    crt_data['remove_flag'] = crt_data['word'].apply(lambda word: ' ' in word)
    crt_data['word'] = crt_data['word'].apply(lambda word: word.split())
    crt_data = crt_data.explode('word')

    # Strip punctuation on both sides.
    crt_data['word'] = crt_data['word'].apply(remove_punctuations)
    csurprs_data['token'] = csurprs_data['token'].apply(remove_punctuations)

    # Positional pairing: after the reset, row i of one table meets row i of the other.
    # NOTE(review): nothing verifies that the two token sequences really line up —
    # a length or tokenisation mismatch would silently pair the wrong rows.
    crt_data = crt_data.reset_index(drop=True)
    csurprs_data = csurprs_data.reset_index(drop=True)

    if all(col in csurprs_data.columns for col in ['sentence_id', 'token_id']):
        unused_columns = ['token_id']
    else:
        unused_columns = ['entropy', 'entred']
    keys_harmony = (
        crt_data
        .merge(csurprs_data, left_index=True, right_index=True)
        .drop(columns=unused_columns)
    )

    # Keep single-word, in-vocabulary rows only.
    keep = ~keys_harmony['remove_flag'].astype(bool) & (keys_harmony['token'] != '<unk>')
    keys_harmony = keys_harmony[keep].drop(columns=['text_id', 'text_pos', 'remove_flag', 'word'])

    # Fan the per-word rows back out to one row per reading-time record.
    return keys_harmony.merge(rt_data, on='code')

In [None]:
# Harmonize the datasets
ngram_harm = harmonize(readingTimes, ngram)
rnn_harm = harmonize(readingTimes, rnn)

## closed analysis:

### 1

In [None]:
def compare_model_correlations(ngram, rnn, df1_key, df2_key, rt_column):
    """Compare how strongly two models' surprisals correlate with reading time.

    Args:
        ngram: DataFrame holding the n-gram surprisals and the reading times.
        rnn: DataFrame holding the RNN surprisals and the reading times.
        df1_key: Name of the surprisal column in ``ngram``.
        df2_key: Name of the surprisal column in ``rnn``.
        rt_column: Name of the reading-time column present in both frames.

    Returns:
        ``(result, ngram_corr, rnn_corr)`` where ``result`` is a sentence naming
        the model with the larger Pearson correlation and the relative gap in
        percent, and the other two values are the Pearson coefficients.
    """
    # Only the coefficient is used; the p-value is discarded.
    ngram_corr, _ = pearsonr(ngram[df1_key], ngram[rt_column])
    rnn_corr, _ = pearsonr(rnn[df2_key], rnn[rt_column])

    if ngram_corr > rnn_corr:
        winner, higher, lower = "n_gram", ngram_corr, rnn_corr
    elif ngram_corr < rnn_corr:
        winner, higher, lower = "RNN", rnn_corr, ngram_corr
    else:
        return "Both models have the same correlation with human reading times.", ngram_corr, rnn_corr

    # Relative gap measured against the magnitude of the lower coefficient.
    # Dividing by the signed value reported a negative "% higher" whenever the
    # lower correlation was negative.
    percnt = float('inf') if lower == 0 else (higher - lower) / abs(lower) * 100
    result = f"{winner} has a higher correlation with human reading times ( specifically {percnt:.4}% higher )."

    return result, ngram_corr, rnn_corr

In [None]:
result, ngram_correlation, rnn_correlation = compare_model_correlations(ngram_harm, rnn_harm, "surprisal", "surprisal",'time')
print(result)
print(f"\nngram Correlation:\t{ngram_correlation}\nRNN Correlation:\t{rnn_correlation}")

### 2

In [None]:
# Scatter of n-gram surprisal (x) against RNN surprisal (y) with a least-squares line.
# NOTE(review): the two series come from separately harmonized frames and are paired here
# purely by position — this assumes both frames have the same rows in the same order; TODO confirm.
ngram_surprisals = ngram_harm['surprisal']
rnn_surprisals = rnn_harm['surprisal']

plt.scatter(ngram_surprisals, rnn_surprisals)
plt.xlabel('n-gram Model Surprisal')
plt.ylabel('RNN Surprisal')
plt.title('Relationship between n-gram Model Surprisal and RNN Surprisal')

# Add linear line
slope, intercept = np.polyfit(ngram_surprisals, rnn_surprisals, 1)
x = np.linspace(min(ngram_surprisals), max(ngram_surprisals), 100)
y = slope * x + intercept
# The label is only visible if plt.legend() is called, which this cell does not do.
plt.plot(x, y, color='red', label='Linear')
# plt.savefig('surprisal_relationship.png')  # Save the plot as an image file
# Add the equation of the line; (10, 0.5) is a fixed position in data coordinates.
plt.text(10, 0.5, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')
plt.show()

### 3

In [None]:
# NOTE(review): this cell is a verbatim repeat of the scatter cell under "### 2".
# Scatter of n-gram surprisal (x) against RNN surprisal (y) with a least-squares line;
# the two series are paired by position, which assumes both harmonized frames have the
# same rows in the same order — TODO confirm.
ngram_surprisals = ngram_harm['surprisal']
rnn_surprisals = rnn_harm['surprisal']

plt.scatter(ngram_surprisals, rnn_surprisals)
plt.xlabel('n-gram Model Surprisal')
plt.ylabel('RNN Surprisal')
plt.title('Relationship between n-gram Model Surprisal and RNN Surprisal')

# Add linear line
slope, intercept = np.polyfit(ngram_surprisals, rnn_surprisals, 1)
x = np.linspace(min(ngram_surprisals), max(ngram_surprisals), 100)
y = slope * x + intercept
# The label is only visible if plt.legend() is called, which this cell does not do.
plt.plot(x, y, color='red', label='Linear')
# plt.savefig('surprisal_relationship.png')  # Save the plot as an image file
# Add the equation of the line; (10, 0.5) is a fixed position in data coordinates.
plt.text(10, 0.5, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')
plt.show()

The most interesting points for us are those where the difference between the n-gram surprisal and the RNN surprisal is largest. Let's find the top 5 points:

In [None]:
# keep only word and surprisal for RNN, and keep only unique rows
# (these two frames are consumed by the next cell, so they are left as they were)
rnn_intrest = rnn_harm[['token', 'surprisal', 'sentid']].drop_duplicates()
ngram_intrest = ngram_harm[['token', 'surprisal','sentence_id']].drop_duplicates()

k = 5

# Pair the two models on the word *occurrence* (`code`), not on the word form.
# Joining on `token` matched every occurrence of a word in one model with every
# occurrence in the other, so `diff` compared surprisals from unrelated contexts.
_rnn_by_code = rnn_harm[['code', 'token', 'surprisal', 'sentid']].drop_duplicates(subset='code')
_ngram_by_code = ngram_harm[['code', 'surprisal', 'sentence_id']].drop_duplicates(subset='code')

# get the k distinct words with the highest surprisal difference
# (`code` is dropped again so the columns match the layout the later cells expect)
top_k_uniqeue = pd.merge(_rnn_by_code, _ngram_by_code, on='code', how='inner', suffixes=('_rnn', '_ngram')).drop(columns='code')
top_k_uniqeue['diff'] = abs(top_k_uniqeue['surprisal_rnn'] - top_k_uniqeue['surprisal_ngram'])
top_k_uniqeue = top_k_uniqeue.sort_values(by='diff', ascending=False).drop_duplicates(subset='token').head(k)
top_k_uniqeue

In [None]:
# The k word occurrences with the largest gap between the two models' surprisals.
# The models are paired on the word occurrence (`code`): joining on `token` matched
# every occurrence of a word with every other occurrence of it, so `diff` compared
# surprisals from unrelated contexts.
_rnn_words = rnn_harm[['code', 'token', 'surprisal', 'sentid']].drop_duplicates(subset='code')
_ngram_words = ngram_harm[['code', 'surprisal', 'sentence_id']].drop_duplicates(subset='code')

# `code` is dropped after the join so the columns keep the layout later cells expect.
top_k = pd.merge(_rnn_words, _ngram_words, on='code', how='inner', suffixes=('_rnn', '_ngram')).drop(columns='code')
top_k['diff'] = abs(top_k['surprisal_rnn'] - top_k['surprisal_ngram'])
top_k = top_k.sort_values(by='diff', ascending=False).head(k)
top_k

In [None]:
# Rows that appear in both selections: stack the two tables, keep every row that occurs
# more than once (all columns equal), then collapse the repeats to a single copy.
conc = pd.concat([top_k_uniqeue, top_k], axis=0)
in_both = conc[conc.duplicated(keep=False)].drop_duplicates()
in_both

In [None]:
ngram_surprisals = ngram_harm['surprisal']
rnn_surprisals = rnn_harm['surprisal']

plt.figure(figsize=(12, 8))

# Every series is drawn as x = n-gram surprisal, y = RNN surprisal so that it matches
# the axis labels.  The highlighted groups used to be drawn with the two columns
# swapped, which put them at mirrored positions relative to the base scatter.
plt.scatter(ngram_surprisals, rnn_surprisals, color='blue', label='All other points')
plt.scatter(top_k['surprisal_ngram'], top_k['surprisal_rnn'], color='red', label=f'top {k} points with the highest difference')
plt.scatter(top_k_uniqeue['surprisal_ngram'], top_k_uniqeue['surprisal_rnn'], color='green', label = f'top {k} uniqeue points with the highest difference')
plt.scatter(in_both['surprisal_ngram'], in_both['surprisal_rnn'], color='yellow', label='Points in both top k and top k uniqeue')

plt.xlabel('n-gram Model Surprisal')
plt.ylabel('RNN Surprisal')
plt.title('Relationship between n-gram Model Surprisal and RNN Surprisal')

# Add linear line (least-squares fit of RNN surprisal on n-gram surprisal)
slope, intercept = np.polyfit(ngram_surprisals, rnn_surprisals, 1)
x = np.linspace(min(ngram_surprisals), max(ngram_surprisals), 100)
y = slope * x + intercept
plt.plot(x, y, color='red')

# add the equation of the line; (10, 0.5) is a fixed position in data coordinates
plt.text(10, 0.5, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')

plt.legend()
plt.show()

In [None]:
top_k_sent_id = top_k['sentid'].values
top_k_uniqeue_sent_id = top_k_uniqeue['sentid'].values
in_both_sent_id = in_both['sentid'].values

In [None]:
def _sentence_text(sent_id):
    """Rebuild the text of one sentence from the harmonized RNN rows."""
    # rnn_harm holds one row per (word, subject).  Collapsing on `code` keeps every
    # word position exactly once; collapsing on `token` also removed legitimate
    # repeats inside the sentence (for example a second "the").
    rows = rnn_harm[rnn_harm['sentid'] == sent_id].drop_duplicates(subset='code')
    return ' '.join(rows['token'])


# One reconstructed sentence per selected row, in the same order as the id arrays.
sentances_top_k = [_sentence_text(sent_id) for sent_id in top_k_sent_id]
sentances_top_k_uniqeue = [_sentence_text(sent_id) for sent_id in top_k_uniqeue_sent_id]
sentances_in_both = [_sentence_text(sent_id) for sent_id in in_both_sent_id]

In [None]:
sentances_in_both

In [None]:
sentances_top_k_uniqeue

In [None]:
sentances_top_k

- The sentences contain words (or word sequences) that are grammatically odd, lack coherence, or are simply rare (e.g. "York State Guard proved unable to keep any kind of mail"). The n-gram model, which relies on "local" patterns, struggles more with these combinations, while the RNN model, which can capture "global" patterns, might better recognize that these sentences do not make sense in a broader context; that is why it usually assigns a higher surprisal score than the n-gram model.

### 4

In [None]:
# Reading-time records with a positive time, and their current-word surprisal.
positive_mean_rt  = rnn_harm[rnn_harm['time'] > 0].reset_index()
surprisal_data = positive_mean_rt['surprisal'].reset_index()

# Surprisal of the *previous word*.  rnn_harm has one row per (word, subject), so
# shifting row by row mostly returned the same word read by another subject.  The shift
# is therefore done on a one-row-per-word table and mapped back through `code`.
# NOTE(review): this assumes rnn_harm is ordered by text position, and "previous" skips
# words that harmonize() removed (multi-word entries, <unk>) — confirm this is acceptable.
word_surprisal = rnn_harm.drop_duplicates(subset='code').set_index('code')['surprisal']
# The first word has no predecessor; 0 is used as before.
prev_word_surprisal = word_surprisal.shift(1, fill_value=0)
surprisal_data['prev_surprisal'] = positive_mean_rt['code'].map(prev_word_surprisal).to_numpy()
previous_surprisal = surprisal_data['prev_surprisal'].tolist()

# Calculate RNN probabilities for previous and current surprisal values
# (p = 2 ** -surprisal, i.e. the surprisal is treated as being in bits)
rnn_prob_prev = 1 / (2 ** surprisal_data['prev_surprisal'])
rnn_prob_current = 1 / (2 ** surprisal_data['surprisal'])

# Extract reading times (the frame is already restricted to time > 0)
reading_times = list(positive_mean_rt['time'].values)

In [None]:
regression_coef_prev, intercept_prev = np.polyfit(rnn_prob_prev, reading_times, 1)
regression_coef_current, intercept_current = np.polyfit(rnn_prob_current, reading_times, 1)

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# Plot RNN probability (previous word) vs next word's reading time with regression line and equation
sns.regplot(x=rnn_prob_prev, y=reading_times, ax=ax1, scatter_kws={'color': 'orange'}, line_kws={'color': 'red'})
equation_prev = f'y = {regression_coef_prev:.2f}x + {intercept_prev:.2f}'
ax1.text(0.4, 0.8, equation_prev, transform=ax1.transAxes, fontsize=12, verticalalignment='top')
ax1.set_xlabel('RNN Predicted Probability of Previous Word')
ax1.set_ylabel('Reading Time of Next Word (ms)')
ax1.set_title('RNN Probability of Previous Word vs Next Word Reading Time')

# Plot RNN probability (current word) vs current word's reading time with regression line and equation
sns.regplot(x=rnn_prob_current, y=reading_times, ax=ax2, scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
equation_current = f'y = {regression_coef_current:.2f}x + {intercept_current:.2f}'
ax2.text(0.4, 0.8, equation_current, transform=ax2.transAxes, fontsize=12, verticalalignment='top')
ax2.set_xlabel('RNN Predicted Probability of Current Word')
ax2.set_ylabel('Reading Time of Current Word (ms)')
ax2.set_title('RNN Probability of Current Word vs Current Word Reading Time')

# Set x-axis ticks
ax1.set_xticks(np.arange(0.1, 1.1, 0.1))
ax2.set_xticks(np.arange(0.1, 1.1, 0.1))

# Show plots
plt.tight_layout()
plt.show()


Both graphs show similar trends: as the RNN predicted probability increases, reading time generally decreases. There is a high concentration of data points with lower reading times (below 2000 ms) across all probabilities. The slopes of the regression lines are comparable, suggesting a similar effect of probability on reading time for both previous and current words. However, there are some outliers with very high reading times (up to about 11000 ms), particularly at lower probabilities. The spread of reading times decreases with increasing probability, suggesting more consistent reading times for high-probability words.

Overall, words with higher predicted probabilities are read faster.

# SEMI

## 1

In [None]:
df = rnn_harm.copy()
words = rnn_harm['word']

# Natural log of how many rows each word form has in rnn_harm.
# np.log is used because `math` is only imported in a later cell, so `math.log`
# raised NameError on a fresh kernel.
log_freq_dict = np.log(words.value_counts()).to_dict()
df['log_frq'] = words.map(log_freq_dict)

# p = 2 ** -surprisal, i.e. the surprisal is treated as being in bits.
current_prob = 1 / (2 ** df['surprisal'])

# Probability of the *previous word*.  rnn_harm has one row per (word, subject), so a
# row-wise shift mostly returned the same word read by another subject; the shift is
# done on a one-row-per-word table and mapped back through `code`.
# NOTE(review): assumes rnn_harm is ordered by text position — confirm.
word_prob = current_prob.groupby(df['code'], sort=False).first()
# The first word has no predecessor; 0 is used as before.
df['prev_prob'] = df['code'].map(word_prob.shift(1, fill_value=0))
df['prob'] = current_prob
df

In [None]:
# NOTE(review): `f` is imported but not used in this cell.
from pygam import LinearGAM, s, f


# Feature matrix: column order must match the term indices 0..4 used below.
X = df[['log_frq', 'wlen', 'surprisal', 'prob', 'prev_prob']].to_numpy()
y = df.time.values

## model
# One spline term (5 splines each) per feature column.
# NOTE(review): 'prob' is computed directly from 'surprisal' earlier in the notebook, so
# terms 2 and 3 carry the same information — confirm both are intended.
gam = LinearGAM(s(0, n_splines=5) + s(1, n_splines=5) + s(2, n_splines=5) + s(3, n_splines=5)+ s(4, n_splines=5)).fit(X, y)
titles = ['log-frequency', 'word_length', 'surprisal', 'probability', 'prev probability']
# One figure per term: the partial dependence curve plus the dashed 95% interval.
for i in range(5):
    fig, ax = plt.subplots(figsize=(8, 6))
    XX = gam.generate_X_grid(term=i)
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX))
    ax.plot(XX[:, i], gam.partial_dependence(term=i, X=XX, width=0.95)[1], c='r', ls='--')
    ax.set_title(titles[i])
    plt.savefig(f'pygam_{i}.png')  # Save the plot as an image file
    plt.show()

## 3

In [None]:
%%bash
mkdir -p wikitext2

In [None]:
checkpoint_to_use_wikitext2 = f"../wikitext2/wikitext2_model.pt"
model_file_wikitext2 = f"../wikitext2/wikitext2_model.pt"
vocab_file_wikitext2 = f"../wikitext2/wikitext2_vocab.txt"
data_dir_wikitext2= f"../rnn/data/wikitext-2"
trainfname_wikitext2 = f"train.txt"
validfname_wikitext2 = f"valid.txt"
surprisals_wikitext2 = f"../wikitext2/rnn_surprisals_wikitext2.tsv"

In [None]:
!cd rnn && python main.py --cuda --model_file "$model_file_wikitext2" \
--epochs "$NUM_OF_EPOCHS" \
--vocab_file "$vocab_file_wikitext2" \
--tied --data_dir "$data_dir_wikitext2" --trainfname "$trainfname_wikitext2" --validfname "$validfname_wikitext2"

In [None]:
!cd rnn && python main.py --cuda --model_file "$checkpoint_to_use_wikitext2" \
--vocab_file "$vocab_file_wikitext2" --data_dir './data' \
--testfname 'brown.txt' --test --words --nopp > "$surprisals_wikitext2"

In [None]:
rnn_surprisals_wikitext2 = pd.read_csv(f'./wikitext2/rnn_surprisals_wikitext2.tsv',sep=' ')

In [None]:
import nltk
import re
from nltk import ngrams
from collections import Counter
import math
import pandas as pd

# Regular expression pattern to preserve contractions
CONTRACTIONS_PATTERN = re.compile(r"\b(?:[A-Za-z']+)\b(?:[A-Za-z']*)")

def custom_tokenize(text):
    """Split ``text`` into runs of letters and apostrophes.

    Digits, whitespace and other punctuation separate tokens and are not
    returned; apostrophes inside a word are kept.
    """
    return [match.group(0) for match in CONTRACTIONS_PATTERN.finditer(text)]

# Step 1: Train n-gram model on Wikitext-2

# Load and preprocess Wikitext-2 data
# NOTE(review): no encoding is passed to open(), so the platform default is used — confirm.
with open('rnn/data/wikitext-2/train.txt', 'r') as f:
    wikitext = f.read()

# Use custom tokenizer to preserve contractions
# The text is lower-cased first, so the n-gram model is case-insensitive.
wikitext_sentences = nltk.sent_tokenize(wikitext.lower())
wikitext_tokens = [custom_tokenize(sentence) for sentence in wikitext_sentences]

# Create 5-grams for Wikitext-2 with sentence-level padding
# Four '<s>' markers in front give the first real token a full 4-token context.
wikitext_five_grams = []
for sentence in wikitext_tokens:
    padded_tokens = ['<s>'] * 4 + sentence + ['</s>']
    wikitext_five_grams.extend(list(ngrams(padded_tokens, 5)))

# Same padding, 4-grams: these serve as the context counts (denominators).
wikitext_four_grams = []
for sentence in wikitext_tokens:
    padded_tokens = ['<s>'] * 4 + sentence + ['</s>']
    wikitext_four_grams.extend(list(ngrams(padded_tokens, 4)))

# Count occurrences of 5-grams and 4-grams in Wikitext-2
five_gram_counts = Counter(wikitext_five_grams)
four_gram_counts = Counter(wikitext_four_grams)

# Step 2: Calculate surprisals on Brown corpus

# Load and preprocess Brown corpus data
with open('rnn/data/brown.txt', 'r') as f:
    brown_text = f.read()

# Use custom tokenizer to preserve contractions
# Same preprocessing as the training text: lower-case, sentence split, regex tokens.
brown_sentences = nltk.sent_tokenize(brown_text.lower())
brown_tokens = [custom_tokenize(sentence) for sentence in brown_sentences]

# Function to calculate surprisals based on Wikitext-2 model
def calculate_surprisals(tokens, five_gram_counts, four_gram_counts):
    """Score every token of ``tokens`` from the fifth one onwards.

    Args:
        tokens: Token sequence; the first four entries only serve as context.
        five_gram_counts: Counter of training 5-grams.
        four_gram_counts: Counter of training 4-grams (the contexts).

    Returns:
        A list with one surprisal in bits per 5-gram of ``tokens``:
        ``-log2(count(5-gram) / count(context))``, or ``inf`` when the context
        or the 5-gram was never seen in training.
    """
    surprisals = []
    for ngram in ngrams(tokens, 5):
        four_gram = ngram[:-1]
        context_count = four_gram_counts[four_gram]
        prob = five_gram_counts[ngram] / context_count if context_count > 0 else 0
        # Unseen contexts and unseen continuations both get infinite surprisal.
        surprisals.append(-math.log2(prob) if prob > 0 else float('inf'))
    return surprisals

# Score Brown sentence by sentence.  Each sentence is padded with four '<s>' markers
# so its first token has a full context, but only the real tokens become rows.
# Scoring the flattened, padded corpus in one pass turned the padding of every sentence
# after the first into data rows, which shifted the positional pairing in harmonize().
brown_padded_tokens = []   # padded corpus, kept for inspection
brown_scored_tokens = []   # real tokens, aligned with brown_surprisals
brown_surprisals = []
sentence_ids = []
token_ids = []

for sentence_id, sentence in enumerate(brown_tokens, start=1):
    padded_sentence = ['<s>'] * 4 + sentence
    brown_padded_tokens.extend(padded_sentence)
    # One surprisal per real token: the four pads are consumed as context.
    brown_surprisals.extend(calculate_surprisals(padded_sentence, five_gram_counts, four_gram_counts))
    brown_scored_tokens.extend(sentence)
    sentence_ids.extend([sentence_id] * len(sentence))
    token_ids.extend(range(1, len(sentence) + 1))

# Create DataFrame for Brown corpus surprisals (1-based sentence and token ids)
df_brown_surprisals = pd.DataFrame({
    'sentence_id': sentence_ids,
    'token_id': token_ids,
    'token': brown_scored_tokens,
    'surprisal': brown_surprisals
})

In [None]:
# Tokens the n-gram model could not score (infinite surprisal) are relabelled '<unk>' so
# that harmonize() filters them out.  Vectorised replacement for the row-wise apply.
df_brown_surprisals['token'] = df_brown_surprisals['token'].mask(df_brown_surprisals['surprisal'] == float('inf'), '<unk>')

In [None]:
readingTimes = pd.read_csv('https://gist.githubusercontent.com/omershubi/01b55eab89b81dc882055e0d27d61016/raw/046dbb7f0586b5dc1a368ee882f2cb923caad3df/brown-spr-data-for-pset.csv', index_col=0).sort_values(by='code')
rnn_wiki = pd.read_csv(f'./wikitext2/rnn_surprisals_wikitext2.tsv',sep=' ')
rnn_wiki.rename(columns={'word': 'token'}, inplace=True)
rnn_wiki.rename(columns={'surp': 'surprisal'}, inplace=True)

In [None]:
ngram = df_brown_surprisals

In [None]:
def remove_punctuations(text: str) -> str:
    """Return ``text`` with every ASCII punctuation character removed.

    The out-of-vocabulary marker ``'<unk>'`` is passed through unchanged so it
    can still be recognised after cleaning.
    """
    is_oov_marker = text == '<unk>'
    if not is_oov_marker:
        punctuation_table = str.maketrans('', '', string.punctuation)
        text = text.translate(punctuation_table)
    return text

def harmonize(rt_data: pd.DataFrame, surprs_data: pd.DataFrame) -> pd.DataFrame:
    """Attach model surprisals to the self-paced-reading records.

    The reading-time table is reduced to one row per distinct word entry,
    multi-word entries are split into single tokens, and the result is paired
    with the surprisal table *by row position*.  Rows that came from multi-word
    entries and rows whose token is ``'<unk>'`` are then discarded, and the
    surviving rows are joined back onto every reading-time record that shares
    the same ``code``.

    Args:
        rt_data: Reading-time records containing at least the columns
            ``subject``, ``word_in_exp``, ``time``, ``word``, ``code``,
            ``text_id`` and ``text_pos``.  The frame is not modified.
        surprs_data: Per-token model output with a ``token`` column and either
            ``sentence_id``/``token_id`` columns or ``entropy``/``entred``
            columns.  The frame is not modified.

    Returns:
        The surviving token rows inner-joined with ``rt_data`` on ``code``.

    Raises:
        KeyError: If one of the columns named above is missing.
    """
    # One row per distinct word entry.  The trailing .copy() detaches the frame
    # so the column assignments below never write into a view.
    crt_data = (
        rt_data
        .drop(columns=['subject', 'word_in_exp', 'time'])
        .drop_duplicates()
        .copy()
    )

    # End-of-sentence markers are dropped before pairing.  Copying the filtered
    # slice avoids pandas' SettingWithCopyWarning when 'token' is rewritten.
    csurprs_data = surprs_data[surprs_data['token'] != '</s>'].copy()

    # Remember which entries held several words, then split them so that every
    # row carries exactly one token.
    crt_data['remove_flag'] = crt_data['word'].apply(lambda word: ' ' in word)
    crt_data['word'] = crt_data['word'].apply(lambda word: word.split())
    crt_data = crt_data.explode('word')

    # Strip punctuation on both sides.
    crt_data['word'] = crt_data['word'].apply(remove_punctuations)
    csurprs_data['token'] = csurprs_data['token'].apply(remove_punctuations)

    # Positional pairing: after the reset, row i of one table meets row i of the other.
    # NOTE(review): nothing verifies that the two token sequences really line up —
    # a length or tokenisation mismatch would silently pair the wrong rows.
    crt_data = crt_data.reset_index(drop=True)
    csurprs_data = csurprs_data.reset_index(drop=True)

    if all(col in csurprs_data.columns for col in ['sentence_id', 'token_id']):
        unused_columns = ['token_id']
    else:
        unused_columns = ['entropy', 'entred']
    keys_harmony = (
        crt_data
        .merge(csurprs_data, left_index=True, right_index=True)
        .drop(columns=unused_columns)
    )

    # Keep single-word, in-vocabulary rows only.
    keep = ~keys_harmony['remove_flag'].astype(bool) & (keys_harmony['token'] != '<unk>')
    keys_harmony = keys_harmony[keep].drop(columns=['text_id', 'text_pos', 'remove_flag', 'word'])

    # Fan the per-word rows back out to one row per reading-time record.
    return keys_harmony.merge(rt_data, on='code')

In [None]:
# Harmonize the datasets
rnn_wiki_harm = harmonize(readingTimes, rnn_wiki)
ngram_wiki_harm = harmonize(readingTimes, ngram)

In [None]:
ngram_wiki_harm.rename(columns={'sentence_id': 'sentid'}, inplace=True)
# Convert the 'token' column to lowercase
rnn_wiki_harm['token'] = rnn_wiki_harm['token'].str.lower()

In [None]:
# Join the two harmonized tables on the reading-time record and carry each model's
# surprisal through the join under its own name.  Dropping the surprisal before the merge
# and re-attaching it by index label paired values with the wrong rows whenever the inner
# join did not keep exactly the same rows, in the same order, as both inputs.
# NOTE(review): `sentid` comes from two different sentence segmentations (RNN output vs.
# nltk.sent_tokenize) — confirm the numbering agrees, otherwise the join drops rows.
both_ngran_rnn = pd.merge(rnn_wiki_harm.rename(columns={'surprisal': 'surprisal_1'}),
                  ngram_wiki_harm.rename(columns={'surprisal': 'surprisal_2'}),
                  on=['code', 'sentid','subject'],
                  how='inner')

# Keep the original column layout: the two surprisal columns come last.
_surprisal_cols = ['surprisal_1', 'surprisal_2']
both_ngran_rnn = both_ngran_rnn[[col for col in both_ngran_rnn.columns if col not in _surprisal_cols] + _surprisal_cols]

In [None]:
rnn_surprisals = both_ngran_rnn['surprisal_1']
ngram_surprisals = both_ngran_rnn['surprisal_2']


plt.scatter(ngram_surprisals, rnn_surprisals)
plt.xlabel('n-gram Model Surprisal')
plt.ylabel('RNN Surprisal')
plt.title('Relationship between n-gram Model Surprisal and RNN Surprisal')

# Add linear line
slope, intercept = np.polyfit(ngram_surprisals, rnn_surprisals, 1)
x = np.linspace(min(ngram_surprisals), max(ngram_surprisals), 100)
y = slope * x + intercept
plt.plot(x, y, color='red', label='Linear')
# plt.savefig('surprisal_relationship.png')  # Save the plot as an image file
# add the eqaution of the line
plt.text(10, 0.5, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')
plt.show()

# OPEN

## The movie corpus RNN

## Pre-processing

In [None]:
corpus = Corpus(filename=download("movie-corpus"))

In [None]:
utterances = corpus.get_utterances_dataframe()
conversations = corpus.get_conversations_dataframe()

## Genre Insights

In [None]:
unique_genres_set = set(genre for sublist in conversations["meta.genre"].apply(ast.literal_eval) for genre in sublist)
unique_genre_lists = pd.DataFrame(conversations["meta.genre"].apply(ast.literal_eval)).reset_index()['meta.genre'].drop_duplicates().tolist()

In [None]:
print(f"Unique genres: {len(unique_genres_set)}\nUnique genres lists: {len(unique_genre_lists)}")

In [None]:
genre_counts_single = conversations['meta.genre'].apply(ast.literal_eval).explode().value_counts().reset_index()
genre_counts_single.columns = ['genre', 'count']
genre_counts_single.head()

In [None]:
# Counts per value of 'meta.genre' as stored.
# NOTE(review): unlike the cell above, the column is not parsed with ast.literal_eval here.
# If the values are strings (as the literal_eval calls elsewhere suggest), explode() leaves
# them unchanged and this counts whole genre *lists*, not single genres — confirm intent.
genre_counts = conversations['meta.genre'].explode().value_counts().reset_index()
genre_counts.columns = ['genre', 'count']
genre_counts.head()

## Convert to txt

In [None]:
# Copy each conversation's 'meta.genre' value onto its utterances ([] when the
# conversation id is not in the conversations index).
utterances['genre'] = utterances['conversation_id'].apply(lambda cid: conversations.loc[cid]['meta.genre'] if cid in conversations.index else [])
# The five most frequent entries of genre_counts (value_counts is sorted descending).
top_5_genres = genre_counts.head(5)['genre'].tolist()

### Convert the top 5 genres to txt files

In [None]:
# Export the utterances of each of the top genres to movie_data/<i>/ and split them
# into a dev file (first 1000 lines) and a train file (the rest).
# delete the folder "movie_data" and all its contents
!rm -rf movie_data
!mkdir -p movie_data
# Maps the numeric folder name back to the genre value it holds.
genre_dict = {}

for i,genre in enumerate(top_5_genres):
    # NOTE(review): `genre in genres` is a substring test if both are strings and a
    # membership test if `genres` is a list — confirm which one applies here.
    genre_utterances = utterances[utterances['genre'].apply(lambda genres: genre in genres)]
    
    # create a folder for each genre
    !mkdir -p movie_data/{i}
    path = f'movie_data/{i}'

    # save the text to a file
    genre_utterances['text'].to_csv(f'{path}/{i}_all.txt', index=False, header=False)
    # remove all " from the text file
    # (this deletes every double quote in the file, including any that were in the text)
    !sed -i 's/\"//g' {path}/{i}_all.txt
    
    genre_dict[i] = genre
    
    # split into train and dev
    !head -n 1000 {path}/{i}_all.txt > {path}/{i}_dev.txt
    !tail -n +1001 {path}/{i}_all.txt > {path}/{i}_train.txt
    

## The RNN

In [None]:
# For every exported genre: train an RNN on its train/dev split, then score brown.txt
# with the trained model and store the per-word output next to the data.
for key in genre_dict:
    # All paths are relative to the rnn/ directory, because the commands below `cd rnn` first.
    checkpoint_to_use_movies = f"../movie_data/{key}/ptb_model_{key}.pt"
    
    model_file = f"../movie_data/{key}/ptb_model_{key}.pt"
    vocab_file = f"../movie_data/{key}/ptb_vocab.txt"
    data_dir = f"../movie_data/{key}"
    trainfname = f"{key}_train.txt"
    validfname = f"{key}_dev.txt"
    surprisals = f"../movie_data/{key}/rnn_surprisals_{key}.tsv"
    
    # Training run; the model and vocabulary are written to the paths set above.
    !cd rnn && python main.py --cuda --model_file "$model_file" \
    --epochs "$NUM_OF_EPOCHS" \
    --vocab_file "$vocab_file" \
    --tied --data_dir "$data_dir" --trainfname "$trainfname" --validfname "$validfname"
    
    
    # Test run on rnn/data/brown.txt; stdout is redirected into the surprisal file.
    !cd rnn && python main.py --cuda --model_file "$checkpoint_to_use_movies" \
    --vocab_file "$vocab_file" --data_dir './data' \
    --testfname 'brown.txt' --test --words --nopp > "$surprisals"

## Harmonize

In [None]:
def remove_punctuations(text: str) -> str:
    """Return ``text`` with every ASCII punctuation character removed.

    The out-of-vocabulary marker ``'<unk>'`` is passed through unchanged so it
    can still be recognised after cleaning.
    """
    is_oov_marker = text == '<unk>'
    if not is_oov_marker:
        punctuation_table = str.maketrans('', '', string.punctuation)
        text = text.translate(punctuation_table)
    return text

def harmonize(rt_data: pd.DataFrame, surprs_data: pd.DataFrame) -> pd.DataFrame:
    """Attach model surprisals to the self-paced-reading records.

    The reading-time table is reduced to one row per distinct word entry,
    multi-word entries are split into single tokens, and the result is paired
    with the surprisal table *by row position*.  Rows that came from multi-word
    entries and rows whose token is ``'<unk>'`` are then discarded, and the
    surviving rows are joined back onto every reading-time record that shares
    the same ``code``.

    Args:
        rt_data: Reading-time records containing at least the columns
            ``subject``, ``word_in_exp``, ``time``, ``word``, ``code``,
            ``text_id`` and ``text_pos``.  The frame is not modified.
        surprs_data: Per-token model output with a ``token`` column and either
            ``sentence_id``/``token_id`` columns or ``entropy``/``entred``
            columns.  The frame is not modified.

    Returns:
        The surviving token rows inner-joined with ``rt_data`` on ``code``.

    Raises:
        KeyError: If one of the columns named above is missing.
    """
    # One row per distinct word entry.  The trailing .copy() detaches the frame
    # so the column assignments below never write into a view.
    crt_data = (
        rt_data
        .drop(columns=['subject', 'word_in_exp', 'time'])
        .drop_duplicates()
        .copy()
    )

    # End-of-sentence markers are dropped before pairing.  Copying the filtered
    # slice avoids pandas' SettingWithCopyWarning when 'token' is rewritten.
    csurprs_data = surprs_data[surprs_data['token'] != '</s>'].copy()

    # Remember which entries held several words, then split them so that every
    # row carries exactly one token.
    crt_data['remove_flag'] = crt_data['word'].apply(lambda word: ' ' in word)
    crt_data['word'] = crt_data['word'].apply(lambda word: word.split())
    crt_data = crt_data.explode('word')

    # Strip punctuation on both sides.
    crt_data['word'] = crt_data['word'].apply(remove_punctuations)
    csurprs_data['token'] = csurprs_data['token'].apply(remove_punctuations)

    # Positional pairing: after the reset, row i of one table meets row i of the other.
    # NOTE(review): nothing verifies that the two token sequences really line up —
    # a length or tokenisation mismatch would silently pair the wrong rows.
    crt_data = crt_data.reset_index(drop=True)
    csurprs_data = csurprs_data.reset_index(drop=True)

    if all(col in csurprs_data.columns for col in ['sentence_id', 'token_id']):
        unused_columns = ['token_id']
    else:
        unused_columns = ['entropy', 'entred']
    keys_harmony = (
        crt_data
        .merge(csurprs_data, left_index=True, right_index=True)
        .drop(columns=unused_columns)
    )

    # Keep single-word, in-vocabulary rows only.
    keep = ~keys_harmony['remove_flag'].astype(bool) & (keys_harmony['token'] != '<unk>')
    keys_harmony = keys_harmony[keep].drop(columns=['text_id', 'text_pos', 'remove_flag', 'word'])

    # Fan the per-word rows back out to one row per reading-time record.
    return keys_harmony.merge(rt_data, on='code')

In [None]:
# The reading-time table is the same for every genre, so it is downloaded once instead
# of once per loop iteration (harmonize() works on a copy and leaves it untouched).
readingTimes = pd.read_csv('https://gist.githubusercontent.com/omershubi/01b55eab89b81dc882055e0d27d61016/raw/046dbb7f0586b5dc1a368ee882f2cb923caad3df/brown-spr-data-for-pset.csv', index_col=0).sort_values(by='code')

# genre key -> harmonized frame for the RNN trained on that genre
harmonized = {}
for key in genre_dict:
    rnn_movie = pd.read_csv(f'./movie_data/{key}/rnn_surprisals_{key}.tsv',sep=' ')
    # Use the column names harmonize() and the analysis cells expect.
    rnn_movie.rename(columns={'word': 'token'}, inplace=True)
    rnn_movie.rename(columns={'surp': 'surprisal'}, inplace=True)

    # Harmonize the datasets
    rnn_movie_harm = harmonize(readingTimes, rnn_movie)
    harmonized[key] = rnn_movie_harm

In [None]:
for key in harmonized.keys():
    print(f"Genre {genre_dict[key]}:")
    print(harmonized[key].head())

In [None]:
rnn_movie_harm

## The human dataset

In [None]:
dataset = load_dataset("li2017dailydialog/daily_dialog", trust_remote_code=True)

In [None]:
train_data_dialogs = dataset['train']['dialog']
validation_data_dialogs = dataset['validation']['dialog']
test_data_dialogs = dataset['test']['dialog']

In [None]:
def writeDialogToFile(df, filename):
    """Append every utterance of every dialog to ``filename``, one per line.

    Args:
        df: Iterable of dialogs, each dialog being an iterable of strings.
        filename: Path of the text file.  It is opened in append mode, so
            existing content is kept and the file is created when missing
            (also when ``df`` is empty).

    Each utterance is stripped of surrounding whitespace before it is written.
    """
    # Open the file once for the whole export instead of once per utterance.
    with open(filename, 'a') as f:
        for dialog in df:
            f.writelines(line.strip() + '\n' for line in dialog)

In [None]:
# same for movies, but with the daily dialog dataset
!rm -rf human_data
!mkdir -p human_data

writeDialogToFile(train_data_dialogs, 'human_data/train.txt')
writeDialogToFile(validation_data_dialogs, 'human_data/dev.txt')
writeDialogToFile(test_data_dialogs, 'human_data/test.txt')

In [None]:
checkpoint_to_use_human = f"../human_data/ptb_model.pt"

model_file_human = f"../human_data/ptb_model.pt"
vocab_file_human = f"../human_data/ptb_vocab.txt"
data_dir_human= f"../human_data"
trainfname_human = f"train.txt"
validfname_human = f"dev.txt"
surprisals_human = f"../human_data/rnn_surprisals_human.tsv"

In [None]:
!cd rnn && python main.py --cuda --model_file "$model_file_human" \
--epochs "$NUM_OF_EPOCHS" \
--vocab_file "$vocab_file_human" \
--tied --data_dir "$data_dir_human" --trainfname "$trainfname_human" --validfname "$validfname_human"

In [None]:
!cd rnn && python main.py --cuda --model_file "$checkpoint_to_use_human" \
--vocab_file "$vocab_file_human" --data_dir './data' \
--testfname 'brown.txt' --test --words --nopp > "$surprisals_human"

In [None]:
# Reading-time records, sorted by `code` because harmonize() pairs rows by position.
readingTimes = pd.read_csv('https://gist.githubusercontent.com/omershubi/01b55eab89b81dc882055e0d27d61016/raw/046dbb7f0586b5dc1a368ee882f2cb923caad3df/brown-spr-data-for-pset.csv', index_col=0).sort_values(by='code')
# Output of the DailyDialog-trained RNN on brown.txt (space-separated).
rnn_human = pd.read_csv(f'./human_data/rnn_surprisals_human.tsv',sep=' ')
rnn_human.rename(columns={'word': 'token'}, inplace=True)
# Unlike the movie models, the surprisal column is named 'human_surprisal' here;
# the analysis cells below look it up under that name.
rnn_human.rename(columns={'surp': 'human_surprisal'}, inplace=True)
    
# Harmonize the datasets
rnn_human_harm = harmonize(readingTimes, rnn_human)
harmonized_human= rnn_human_harm

## ANALYSIS

## Part 1

In [None]:
def compare_rnn_model_correlations(rnn_movie, rnn_human, rnn_movie_key, rnn_human_key, rt_column):
    """Compare how strongly two surprisal columns correlate with reading times.

    Args:
        rnn_movie: Table holding the movie-model surprisals and reading times.
        rnn_human: Table holding the human-model surprisals and reading times.
        rnn_movie_key: Column of ``rnn_movie`` with the surprisal values.
        rnn_human_key: Column of ``rnn_human`` with the surprisal values.
        rt_column: Reading-time column, looked up in both tables.

    Returns:
        ``(result, rnn_movie_corr, rnn_human_corr)``: a human-readable summary
        sentence followed by the two Pearson correlation coefficients.
    """
    def _percent_higher(higher, lower):
        # Express the gap relative to the magnitude of the lower correlation,
        # so a negative baseline cannot flip the sign of a "higher" percentage.
        if lower == 0:
            return float('inf')
        return (higher - lower) / abs(lower) * 100

    # Calculate the correlation coefficients (p-values are not used)
    rnn_movie_corr, _ = pearsonr(rnn_movie[rnn_movie_key], rnn_movie[rt_column])
    rnn_human_corr, _ = pearsonr(rnn_human[rnn_human_key], rnn_human[rt_column])

    # Compare the correlation coefficients
    if np.isnan(rnn_movie_corr) or np.isnan(rnn_human_corr):
        # Every comparison with NaN is False, which would otherwise be
        # reported as "the same correlation".
        result = "At least one correlation is undefined (NaN), so the RNN models cannot be compared."
    elif rnn_movie_corr > rnn_human_corr:
        percnt = _percent_higher(rnn_movie_corr, rnn_human_corr)
        result = f"RNN Movie has a higher correlation with human reading times (specifically {percnt:.4f}% higher)."
    elif rnn_movie_corr < rnn_human_corr:
        percnt = _percent_higher(rnn_human_corr, rnn_movie_corr)
        result = f"RNN Human has a higher correlation with human reading times (specifically {percnt:.4f}% higher)."
    else:
        result = "Both RNN models have the same correlation with human reading times."

    return result, rnn_movie_corr, rnn_human_corr

In [None]:
# Summarise every harmonized genre table: key, genre label and dimensions,
# followed by a blank line.
for genre_key, genre_frame in harmonized.items():
    print(
        f"Key:\t{genre_key} - {genre_dict[genre_key]}",
        f"Shape:\t{genre_frame.shape}",
        "",
        sep="\n",
    )

In [None]:
# all harmonized correlations with human reading times
correlations = {}
for key, value in harmonized.items():
    result, rnn_movie_corr, rnn_human_corr = compare_rnn_model_correlations(value, harmonized_human, 'surprisal', 'human_surprisal', 'time')
    correlations[key] = (result, rnn_movie_corr, rnn_human_corr)
    
for key, value in correlations.items():
    print(f"Genre {genre_dict[key]}:")
    print(f"\t{value[0]}")
    print(f"\tRNN Movie correlation: {value[1]:.4f}")
    print(f"\tRNN Human correlation: {value[2]:.4f}") 

## Part 2

In [None]:
# Genres whose fitted slope is not (almost) flat are collected here.
# NOTE: this is a plain magnitude cut-off on the slope, not a statistical test.
SLOPE_THRESHOLD = 0.01
good_genres_keys = []

for key, value in harmonized.items():
    genre = genre_dict[key]

    # Pair the two surprisal columns by row position, truncated to the shorter one.
    # NOTE(review): positional pairing presumably relies on both harmonized
    # tables listing the same tokens in the same order — confirm.
    movie_values = value['surprisal'].to_numpy(dtype=float)
    human_values = harmonized_human['human_surprisal'].to_numpy(dtype=float)
    min_length = min(len(movie_values), len(human_values))
    movie_values = movie_values[:min_length]
    human_values = human_values[:min_length]

    # Drop pairs with a missing value on either side: np.polyfit cannot fit NaNs.
    complete = ~(np.isnan(movie_values) | np.isnan(human_values))
    rnn_movie_surprisals = movie_values[complete]
    rnn_human_surprisals = human_values[complete]
    if rnn_movie_surprisals.size < 2:
        print(f"Skipping {genre}: fewer than two complete surprisal pairs")
        continue

    # Plot the relationship
    fig, ax = plt.subplots()
    ax.scatter(rnn_movie_surprisals, rnn_human_surprisals)
    ax.set_xlabel('RNN Movie Surprisal')
    ax.set_ylabel('RNN Human Surprisal')
    ax.set_title(f'Relationship between RNN Movie Surprisal and RNN Human Surprisal for {genre}')

    # Add linear line (human surprisal as a linear function of movie surprisal)
    slope, intercept = np.polyfit(rnn_movie_surprisals, rnn_human_surprisals, 1)
    x = np.linspace(rnn_movie_surprisals.min(), rnn_movie_surprisals.max(), 100)
    y = slope * x + intercept
    ax.plot(x, y, color='red', label='Linear')
    ax.legend()

    # Add the equation of the line
    ax.text(0.05, rnn_human_surprisals.max() * 0.95, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')

    if abs(slope) > SLOPE_THRESHOLD:
        good_genres_keys.append(key)
    plt.show()


In [None]:
# List the genres that passed the slope cut-off, with their keys.
print("the genres with a significant slope are:")
for genre_key in good_genres_keys:
    print(f"\t{genre_dict[genre_key]}, with key {genre_key}")

## Part 3

We will use the genres whose fitted slope shows at least some significance.
Those are:

	['drama'], with key 0

	['comedy', 'romance'], with key 1

	['comedy', 'drama'], with key 3
	

In [None]:
# plot only the genres with a significant slope
for key in good_genres_keys:
    genre = genre_dict[key]
    rnn_movie_surprisals = harmonized[key]['surprisal']
    rnn_human_surprisals = harmonized_human['human_surprisal']

    # Ensure both series are the same length after dropping NaNs
    min_length = min(len(rnn_movie_surprisals), len(rnn_human_surprisals))
    rnn_movie_surprisals = rnn_movie_surprisals[:min_length]
    rnn_human_surprisals = rnn_human_surprisals[:min_length]

    # Plot the relationship
    plt.scatter(rnn_movie_surprisals, rnn_human_surprisals)
    plt.xlabel('RNN Movie Surprisal')
    plt.ylabel('RNN Human Surprisal')
    plt.title(f'Relationship between RNN Movie Surprisal and RNN Human Surprisal for {genre}')

    # Add linear line
    slope, intercept = np.polyfit(rnn_movie_surprisals, rnn_human_surprisals, 1)
    x = np.linspace(min(rnn_movie_surprisals), max(rnn_movie_surprisals), 100)
    y = slope * x + intercept
    plt.plot(x, y, color='red', label='Linear')
    plt.legend()

    # Add the equation of the line
    plt.text(0.05, max(rnn_human_surprisals)*0.95, f"y = {slope:.4f}x + {intercept:.4f}", fontsize=12, color='red')

    plt.show()

The most interesting points for us are those where the difference between the two surprisal values is largest. Let's find the top 5 such points for each genre:

In [None]:
# Number of highest-difference rows to keep per genre (see the `.head(k)` calls below).
k = 5

In [None]:
# `k` (how many points to highlight) is defined in the cell above.
for key in good_genres_keys:
    genre = genre_dict[key]

    # keep only word and surprisal for RNN, and keep only unique rows
    rnn_intrest = harmonized[key][['token', 'surprisal', 'sentid']].drop_duplicates()
    human_intrest = rnn_human_harm[['token', 'human_surprisal', 'sentid']].drop_duplicates()

    # Top k rows by absolute surprisal difference, at most one row per token.
    top_k_uniqeue = pd.merge(rnn_intrest, human_intrest, on='token', how='inner', suffixes=('_rnn', '_human'))
    top_k_uniqeue['diff'] = (top_k_uniqeue['surprisal'] - top_k_uniqeue['human_surprisal']).abs()
    top_k_uniqeue = top_k_uniqeue.sort_values(by='diff', ascending=False).drop_duplicates(subset='token').head(k)

    # Same ranking against the full human table; a token may appear several times.
    top_k = pd.merge(rnn_intrest, rnn_human_harm, on='token', how='inner', suffixes=('_rnn', '_human'))
    top_k['diff'] = (top_k['surprisal'] - top_k['human_surprisal']).abs()
    top_k = top_k.sort_values(by='diff', ascending=False).head(k)

    # Pair the two surprisal columns by row position, truncated to the shorter one.
    # NOTE(review): positional pairing presumably relies on both harmonized
    # tables listing the same tokens in the same order — confirm.
    movie_values = harmonized[key]['surprisal'].to_numpy(dtype=float)
    human_values = harmonized_human['human_surprisal'].to_numpy(dtype=float)
    min_length = min(len(movie_values), len(human_values))
    movie_values = movie_values[:min_length]
    human_values = human_values[:min_length]

    # Drop pairs with a missing value on either side: np.polyfit cannot fit NaNs.
    complete = ~(np.isnan(movie_values) | np.isnan(human_values))
    rnn_movie_surprisals = movie_values[complete]
    rnn_human_surprisals = human_values[complete]
    if rnn_movie_surprisals.size < 2:
        print(f"Skipping {genre}: fewer than two complete surprisal pairs")
        continue

    fig, ax = plt.subplots(figsize=(12, 8))

    ax.scatter(rnn_movie_surprisals, rnn_human_surprisals, color='blue', label='All other points')
    ax.scatter(top_k['surprisal'], top_k['human_surprisal'], color='red', label=f'top {k} points with the highest difference')
    ax.scatter(top_k_uniqeue['surprisal'], top_k_uniqeue['human_surprisal'], color='green', label=f'top {k} uniqeue points with the highest difference')

    ax.set_xlabel('RNN movie Surprisal')
    ax.set_ylabel("RNN human Surprisal")
    ax.set_title(f'Relationship between RNN movie and human Surprisal ({genre})')

    # Add linear line. The fit must follow the axes (x = movie, y = human);
    # fitting movie against human would draw a line that does not belong to this plot.
    slope, intercept = np.polyfit(rnn_movie_surprisals, rnn_human_surprisals, 1)
    x = np.linspace(rnn_movie_surprisals.min(), rnn_movie_surprisals.max(), 100)
    y = slope * x + intercept
    ax.plot(x, y, color='red')

    # add the equation of the line (axes coordinates keep it inside the frame
    # whatever the data range is)
    ax.text(0.05, 0.95, f"y = {slope:.4f}x + {intercept:.4f}", transform=ax.transAxes, fontsize=12, color='red', verticalalignment='top')

    ax.legend()
    plt.show()

In [None]:
def _sentence_text(frame, sent_id):
    """Join the distinct tokens of sentence ``sent_id`` in ``frame`` with spaces."""
    # NOTE(review): drop_duplicates(subset='token') also removes a word that
    # occurs more than once in the same sentence — confirm this is acceptable.
    rows = frame[frame['sentid'] == sent_id].drop_duplicates(subset='token')
    return ' '.join(list(rows['token']))


# `k` (how many rows to keep) is defined in an earlier cell.
for key in good_genres_keys:
    # keep only word and surprisal for RNN, and keep only unique rows
    rnn_intrest = harmonized[key][['token', 'surprisal', 'sentid']].drop_duplicates()
    human_intrest = rnn_human_harm[['token', 'human_surprisal', 'sentid']].drop_duplicates()

    # Top k rows by absolute surprisal difference, at most one row per token.
    top_k_uniqeue = pd.merge(rnn_intrest, human_intrest, on='token', how='inner', suffixes=('_rnn', '_human'))
    top_k_uniqeue['diff'] = (top_k_uniqeue['surprisal'] - top_k_uniqeue['human_surprisal']).abs()
    top_k_uniqeue = top_k_uniqeue.sort_values(by='diff', ascending=False).drop_duplicates(subset='token').head(k)

    # Same ranking against the full human table; a token may appear several times.
    top_k = pd.merge(rnn_intrest, rnn_human_harm, on='token', how='inner', suffixes=('_rnn', '_human'))
    top_k['diff'] = (top_k['surprisal'] - top_k['human_surprisal']).abs()
    top_k = top_k.sort_values(by='diff', ascending=False).head(k)

    # Sentence ids on the movie-model side of each selected row.
    top_k_sent_id = top_k['sentid_rnn'].values
    top_k_uniqeue_sent_id = top_k_uniqeue['sentid_rnn'].values

    sentances_top_k = [_sentence_text(harmonized[key], sent_id) for sent_id in top_k_sent_id]
    sentances_top_k_uniqeue = [_sentence_text(harmonized[key], sent_id) for sent_id in top_k_uniqeue_sent_id]

    # print the sentances nicely
    print(f"Genre {genre_dict[key]}:")
    print(f"Top {k} sentances with the highest difference:")
    for i, sent in enumerate(sentances_top_k):
        print(f"\t{i+1}. {sent}")
    print()

    print(f"Top {k} uniqeue sentances with the highest difference:")
    for i, sent in enumerate(sentances_top_k_uniqeue):
        print(f"\t{i+1}. {sent}")
    print()


## Part 4

Spillover for movie data

In [None]:
# Define a variable for font size
font_size = 16
title_font_size = 20
suptitle_font_size = 24

for key in good_genres_keys:
    genre = genre_dict[key]
    rnn_movie_harm = harmonized[key]
    positive_mean_rt = rnn_movie_harm[rnn_movie_harm['time'] > 0].reset_index()
    surprisal_data = positive_mean_rt[['surprisal', 'time']].reset_index(drop=True)

    # Calculate previous surprisal using pandas shift
    surprisal_data['prev_surprisal'] = surprisal_data['surprisal'].shift(1).fillna(0)

    # Calculate RNN probabilities
    surprisal_data['rnn_prob_prev'] = 1 / (2 ** surprisal_data['prev_surprisal'])
    surprisal_data['rnn_prob_current'] = 1 / (2 ** surprisal_data['surprisal'])

    # Extract reading times
    reading_times = surprisal_data['time']

    # Perform linear regression
    regression_coef_prev, intercept_prev = np.polyfit(surprisal_data['rnn_prob_prev'], reading_times, 1)
    regression_coef_current, intercept_current = np.polyfit(surprisal_data['rnn_prob_current'], reading_times, 1)

    # Create subplots
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 12))

    # Plot RNN probability (previous word) vs next word's reading time
    sns.regplot(x='rnn_prob_prev', y='time', data=surprisal_data, ax=ax1, scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
    equation_prev = f'y = {regression_coef_prev:.2f}x + {intercept_prev:.2f}'
    ax1.text(0.4, 0.8, equation_prev, transform=ax1.transAxes, fontsize=font_size, verticalalignment='top')
    ax1.set_xlabel('RNN movie Predicted Probability of Previous Word', fontsize=font_size)
    ax1.set_ylabel('Reading Time of Next Word (ms)', fontsize=font_size)
    ax1.set_title('RNN movie Probability of Previous Word vs Next Word Reading Time', fontsize=title_font_size)

    # Plot RNN probability (current word) vs next word's reading time
    sns.regplot(x='rnn_prob_current', y='time', data=surprisal_data, ax=ax2, scatter_kws={'color': 'orange'}, line_kws={'color': 'red'})
    equation_current = f'y = {regression_coef_current:.2f}x + {intercept_current:.2f}'
    ax2.text(0.4, 0.8, equation_current, transform=ax2.transAxes, fontsize=font_size, verticalalignment='top')
    ax2.set_xlabel('RNN movie Predicted Probability of Current Word', fontsize=font_size)
    ax2.set_ylabel('Reading Time of Next Word (ms)', fontsize=font_size)
    ax2.set_title('RNN movie Probability of Current Word vs Next Word Reading Time', fontsize=title_font_size)
    
    plt.suptitle(f'{genre}', fontsize=suptitle_font_size)
    plt.tight_layout()
    plt.show()


Spillover for the human data

In [None]:
# Keep only rows with a positive reading time.
positive_mean_rt = rnn_human_harm[rnn_human_harm['time'] > 0].reset_index()
surprisal_data = positive_mean_rt[['human_surprisal', 'time']].reset_index(drop=True)

# Surprisal of the preceding row. The first row has no predecessor and is left
# as NaN: filling it with 0 would invent a "previous word" with probability 1
# and feed that fabricated point into the regression.
# NOTE(review): the shift runs over the whole table, so it presumably also
# crosses sentence boundaries — confirm that this is intended.
surprisal_data['prev_surprisal'] = surprisal_data['human_surprisal'].shift(1)

# Calculate RNN probabilities
# NOTE(review): assumes the surprisals are in bits (base 2) — confirm.
surprisal_data['rnn_prob_prev'] = 1 / (2 ** surprisal_data['prev_surprisal'])
surprisal_data['rnn_prob_current'] = 1 / (2 ** surprisal_data['human_surprisal'])

# Rows that actually have a previous word
with_prev = surprisal_data.dropna(subset=['rnn_prob_prev'])

# Extract reading times
reading_times = surprisal_data['time']

# Perform linear regression
regression_coef_prev, intercept_prev = np.polyfit(with_prev['rnn_prob_prev'], with_prev['time'], 1)
regression_coef_current, intercept_current = np.polyfit(surprisal_data['rnn_prob_current'], reading_times, 1)

# Create subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 12))

# Previous-word probability vs reading time of the word that follows it
sns.regplot(x='rnn_prob_prev', y='time', data=with_prev, ax=ax1, scatter_kws={'color': 'blue'}, line_kws={'color': 'red'})
equation_prev = f'y = {regression_coef_prev:.2f}x + {intercept_prev:.2f}'
ax1.text(0.4, 0.8, equation_prev, transform=ax1.transAxes, fontsize=12, verticalalignment='top')
ax1.set_xlabel('RNN human Predicted Probability of Previous Word')
ax1.set_ylabel('Reading Time of Next Word (ms)')
ax1.set_title('RNN human Probability of Previous Word vs Next Word Reading Time')

# Current-word probability vs the same word's reading time.
# Both columns come from the same row, so the labels say "Current Word".
sns.regplot(x='rnn_prob_current', y='time', data=surprisal_data, ax=ax2, scatter_kws={'color': 'orange'}, line_kws={'color': 'red'})
equation_current = f'y = {regression_coef_current:.2f}x + {intercept_current:.2f}'
ax2.text(0.4, 0.8, equation_current, transform=ax2.transAxes, fontsize=12, verticalalignment='top')
ax2.set_xlabel('RNN human Predicted Probability of Current Word')
ax2.set_ylabel('Reading Time of Current Word (ms)')
ax2.set_title('RNN human Probability of Current Word vs Current Word Reading Time')

plt.tight_layout()
plt.show()

More stats:

In [None]:
def word_overlap(data1: pd.DataFrame, data2: pd.DataFrame, column: str) -> float:
    """Percentage of the distinct values in ``data1[column]`` that also occur in ``data2[column]``.

    Args:
        data1: Table whose distinct values form the denominator.
        data2: Table the values are looked up in.
        column: Name of the column to compare; must exist in both tables.

    Returns:
        A value between 0 and 100. Missing values are ignored on both sides,
        and 0.0 is returned when ``data1[column]`` has no non-missing values.
    """
    # Drop missing values on both sides so the numerator and the denominator
    # are computed over the same kind of set.
    tokens1 = set(data1[column].dropna().unique())
    tokens2 = set(data2[column].dropna().unique())

    # Nothing to compare: avoid dividing by zero.
    if not tokens1:
        return 0.0

    return len(tokens1 & tokens2) / len(tokens1) * 100

# For every selected genre, report which share of its distinct tokens also
# appears in the human-model table.
for key in good_genres_keys:
    genre, rnn_movie_harm, rnn_human_harm = genre_dict[key], harmonized[key], harmonized_human

    # word overlap
    overlap_percentage = word_overlap(rnn_movie_harm, rnn_human_harm, 'token')
    print(f"Word Overlap between RNN Movie and RNN Human ({genre}):{overlap_percentage:.4f}%")

# TIME

In [None]:
# Wall-clock time since `start` was recorded near the top of the notebook.
elapsed_seconds = time.time() - start
print("Done in", elapsed_seconds, "seconds")