In [16]:
import random

import plotly.express as px
import pandas as pd
from pathlib import Path
import numpy as np

from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import defaultdict


In [30]:
# Load the per-text word feature tables and flatten them into parallel lists
# (one entry per word) that the box plots further down consume directly.
word_features_folder = Path('stimuli/word_features/')

# features to include in the boxplots; the order here is the order of the
# keys in `box_features`
box_feature_keys = (
	'word_length',
	'type_length_syllables',
	'lemma_frequency_normalized',
	'type_frequency_normalized',
	'sent_surprisal_gpt2-large',
)

# features that are natural-log-transformed before plotting
log_feature_keys = (
	'lemma_frequency_normalized',
	'type_frequency_normalized',
)


def collect_word_features(folder, feature_keys, log_keys=log_feature_keys):
	"""Read every ``*.tsv`` file in ``folder`` and gather the requested features.

	Files are processed in sorted path order. For each file the text id is the
	third ``_``-separated part of the file stem, and the topic is the value of
	the ``text_domain`` column in the first row. Tables without any rows
	(header-only files) are skipped, because they contribute no values and
	have no first row to read the topic from.

	Args:
		folder: directory containing the word feature ``.tsv`` files.
		feature_keys: column names to collect, in the desired key order.
		log_keys: column names whose values are passed through ``np.log``.
			Non-positive values therefore come out as ``-inf``/``nan``.

	Returns:
		A tuple ``(values, texts, topics, text_ids, feature_names)``:
		``values`` maps each feature to its flat list of values; ``texts`` and
		``topics`` are ``defaultdict(list)`` objects mapping each feature to the
		text id / topic of every value (aligned with ``values``); ``text_ids``
		and ``feature_names`` are flat lists aligned with each other, covering
		all features in the order they were read.

	Raises:
		IndexError: if a file stem has fewer than three ``_``-separated parts.
		KeyError: if a table lacks ``text_domain`` or a requested column.
	"""
	values = {key: [] for key in feature_keys}
	texts_by_feature = defaultdict(list)
	topics_by_feature = defaultdict(list)
	flat_text_ids = []
	flat_feature_names = []

	for wf_file in sorted(Path(folder).glob('*.tsv')):
		text_id = wf_file.stem.split('_')[2]
		tsv = pd.read_csv(wf_file, sep='\t')
		if tsv.empty:
			# header-only file: `.iloc[0]` below would raise IndexError
			continue
		topic = tsv['text_domain'].iloc[0]

		for feat_key in values:
			column = tsv[feat_key].tolist()
			# some features will be log-transformed
			features = np.log(column) if feat_key in log_keys else column
			n_values = len(features)

			values[feat_key].extend(features)
			flat_text_ids.extend([text_id] * n_values)
			flat_feature_names.extend([feat_key] * n_values)
			texts_by_feature[feat_key].extend([text_id] * n_values)
			topics_by_feature[feat_key].extend([topic] * n_values)

	return values, texts_by_feature, topics_by_feature, flat_text_ids, flat_feature_names


box_features, texts, topics, text_ids, feature_names = collect_word_features(
	word_features_folder,
	box_feature_keys,
)

In [34]:
# Box plots of every word feature, one subplot row per feature:
#   - grid column 1 (spanning 6 of the 7 grid columns): values grouped by text
#   - grid column 7: the same values grouped by topic
# This is the long version (5 rows, height 1200). The previously commented-out
# alternatives rows=3 / height=800 appear to be the short version.
n_rows = 5
n_cols = 7

# every row has the same layout: a 6-column-wide panel followed by a 1-column panel
row_specs = [
	[{"colspan": 6}] + [None] * 5 + [{'colspan': 1}]
	for _ in range(n_rows)
]

# only the wide (left) panel of each row is titled; the narrow one gets None
panel_titles = (
	'Word length (characters)',
	'Type length (syllables)',
	'Log-Frequency (lemma)',
	'Log-Frequency (type)',
	'Surprisal (GPT-2 large)',
)
interleaved_titles = tuple(
	entry
	for title in panel_titles
	for entry in (title, None)
)

fig = make_subplots(
	rows=n_rows,
	cols=n_cols,
	specs=row_specs,
	subplot_titles=interleaved_titles,
	shared_yaxes=True,
	horizontal_spacing=0.01,
	vertical_spacing=0.08,
)

# colorblind palette
colors = [
	'#332288',
	'#88CCEE',
	'#FFC107',
	'#117733',
	'#882255',
	'#CC6677',
	'#AA4499',
	'#44AA99',
]

for idx, feat in enumerate(box_features):
	feature_color = colors[idx]
	# first the per-text panel (column 1), then the per-topic panel (column 7)
	for grouping, grid_col in ((texts, 1), (topics, 7)):
		box = go.Box(
			y=box_features[feat],
			x=grouping[feat],
			marker_color=feature_color,
			showlegend=False,
		)
		fig.add_trace(box, row=idx + 1, col=grid_col)

# shrink the outlier markers on all traces
fig.update_traces(marker=dict(size=3))

fig.update_layout(
	autosize=False,
	width=900,
	height=1200,
	template='plotly',
)

# save a static copy next to the notebook, then render inline
fig.write_image('text_boxplots_long.png')
fig.show()

In [None]:
# Location of the tab-separated participants table.
participants_file = Path('participants') / 'participants.tsv'

# Bare last expression so the notebook renders the loaded table.
pd.read_table(participants_file)