# Annotated dataset statistics and tables

In [1]:
import pandas as pd
from transformers import AutoTokenizer
import break_utils

In [2]:
# Name of the pretrained checkpoint; it is passed to AutoTokenizer.from_pretrained
# in the next cell to select which tokenizer is loaded.
weights_name = 'roberta-large'

In [3]:
# Load the tokenizer that belongs to the checkpoint named by `weights_name`.
# It is handed to the dataset loader in the following cell.
tokenizer = AutoTokenizer.from_pretrained(weights_name)

In [4]:
# Load the annotated dataset through the project helper. The tokenizer is passed
# straight through; what the helper does with it lives in break_utils and is not
# visible from this notebook.
annotated_csv = "annotated_break_data.csv"
df = break_utils.load_annotated_dataset(annotated_csv, tokenizer)

In [5]:
# Frequency of every label in the `meaning` column, most frequent first.
meaning_column = df['meaning']
full_meaning_dist = meaning_column.value_counts()

In [6]:
# `full_meaning_dist` has one entry per distinct `meaning` label, so its length
# is the number of different labels in the dataset.
full_meaning_dist.shape

(72,)

In [7]:
# Overall size of the loaded annotated dataset as (rows, columns).
df.shape

(1042, 11)

In [8]:
# Render the full meaning distribution as a LaTeX tabular and print the raw
# source (print keeps the backslashes and newlines literal instead of showing
# the escaped string repr).
full_meaning_tex = full_meaning_dist.to_latex()
print(full_meaning_tex)

\begin{tabular}{lr}
\toprule
{} &  meaning \\
\midrule
separate\_into\_parts               &      150 \\
end                               &      126 \\
decipher                          &       62 \\
break\_down\_separate\_into\_parts    &       61 \\
violate                           &       59 \\
break\_up\_separate\_into\_parts      &       35 \\
surpass                           &       34 \\
break\_down\_destroy                &       31 \\
break\_into\_intrude                &       28 \\
reveal                            &       26 \\
appear                            &       25 \\
break\_through\_pass\_through        &       24 \\
render\_inoperable                 &       23 \\
unclassified                      &       21 \\
break\_down\_render\_inoperable      &       21 \\
break\_free\_escape                 &       19 \\
break\_down\_succumb                &       18 \\
cause\_to\_fail                     &       17 \\
break\_up\_end\_relationship         &       17 \\
bre

In [9]:
# Count the rows per `construction` label and print the counts as LaTeX source.
construction_dist = df['construction'].value_counts()
print(construction_dist.to_latex())

\begin{tabular}{lr}
\toprule
{} &  construction \\
\midrule
causative    &           673 \\
unaccusative &           197 \\
unergative   &           172 \\
\bottomrule
\end{tabular}



In [10]:
# Keep only the meaning labels that occur at least `min_examples` times.
min_examples = 10
meaning_counts = df['meaning'].value_counts()
probe_subset = meaning_counts[meaning_counts >= min_examples]

In [11]:
# Print the LaTeX source for the frequency-filtered meaning counts.
probe_subset_tex = probe_subset.to_latex()
print(probe_subset_tex)

\begin{tabular}{lr}
\toprule
{} &  meaning \\
\midrule
separate\_into\_parts            &      150 \\
end                            &      126 \\
decipher                       &       62 \\
break\_down\_separate\_into\_parts &       61 \\
violate                        &       59 \\
break\_up\_separate\_into\_parts   &       35 \\
surpass                        &       34 \\
break\_down\_destroy             &       31 \\
break\_into\_intrude             &       28 \\
reveal                         &       26 \\
appear                         &       25 \\
break\_through\_pass\_through     &       24 \\
render\_inoperable              &       23 \\
unclassified                   &       21 \\
break\_down\_render\_inoperable   &       21 \\
break\_free\_escape              &       19 \\
break\_down\_succumb             &       18 \\
cause\_to\_fail                  &       17 \\
break\_up\_end\_relationship      &       17 \\
break\_up\_end                   &       16 \\
break\_out\_e

In [12]:
# Number of meaning labels that remain after the frequency cut-off applied when
# `probe_subset` was built.
probe_subset.shape

(27,)