# Sampling Procedures

Given full_dataset.csv, we sample the essays for different annotation tasks.

In [3]:
import os
import pandas as pd
import math
import numpy as np

In [13]:
# utility function for stratified random sampling
def strat_sample(df, strat_col, num_strata, labels, n):
    """Draw a stratified random sample from ``df``.

    The values of ``strat_col`` are split into ``num_strata`` quantile bins
    with ``pd.qcut`` and ``n`` rows are drawn from every non-empty bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to sample from. The frame is NOT modified; binning is done on a
        copy, so it is safe to pass a filtered slice of a larger frame.
    strat_col : str
        Name of the column to stratify on.
    num_strata : int
        Number of quantile bins to split the ``strat_col`` values into.
    labels : list
        Name for each bin, lowest bin first.
    n : int
        Number of rows to draw from each bin.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, ordered bin by bin following ``labels``, with a
        fresh 0..k-1 index and an extra ``strat`` column holding the bin label.

    Raises
    ------
    ValueError
        Propagated from ``pd.qcut`` (for example when the quantile edges are
        not unique) or from ``DataFrame.sample`` when a non-empty bin holds
        fewer than ``n`` rows.
    """
    # assign() returns a new frame, so the caller's df never gains a 'strat' column.
    binned = df.assign(strat=pd.qcut(df[strat_col], q=num_strata, labels=labels))

    samples = []
    for bucket in labels:
        bucket_rows = binned[binned['strat'] == bucket]
        if not bucket_rows.empty:
            # Fixed random_state keeps the selection reproducible across runs.
            samples.append(bucket_rows.sample(n=n, random_state=1))

    if not samples:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(samples).reset_index(drop=True)

## Extended Think-Aloud CTA Interviews

Prior CTA interviews focused on literary analysis and narrative essays in middle and high school. To additionally capture persuasive writing, we select 3 middle school and 3 high school persuasive essays: one low-, one mid-, and one high-scoring essay per school level.

In [5]:
fullset = pd.read_csv('full_dataset.csv')
# essay_set 1 is 8th grade (middle school), essay_set 2 is 10th grade (high school)
# Take explicit copies so each subset is an independent frame rather than a
# slice of fullset; this avoids SettingWithCopyWarning when a helper adds columns.
ms_persuasive = fullset[fullset['essay_set'] == 1].copy()
hs_persuasive = fullset[fullset['essay_set'] == 2].copy()

# One essay from each score tercile (low / mid / high) per school level.
ms_sample = strat_sample(ms_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=1)
hs_sample = strat_sample(hs_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=1)

ms_sample.to_csv('ms_persuasive_cta_sample.csv', index=False)
hs_sample.to_csv('hs_persuasive_cta_sample.csv', index=False)

## Generated Feedback Editing Task

We will recruit ~30 *expert* teachers to revise the feedback generated by an LLM with only a base prompt. We expect 15 middle school and 15 high school teachers. Each teacher should be able to annotate 12 essays, and each selected essay should be reviewed by 2 teachers, so each school level needs 15 × 12 / 2 = 90 essays. We therefore select 90 essays from middle school and 90 essays from high school. Each set is 30 literary analysis, 30 persuasive, and 30 narrative.

In [27]:
# Subsets of fullset by essay_set id. Set 1 is middle school and set 2 is high
# school persuasive writing; the genre of the remaining sets is taken from the
# variable names below (NOTE(review): confirm ids 5, 7, 3, 8 against the dataset docs).
# .copy() makes each subset an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
ms_persuasive = fullset[fullset['essay_set'] == 1].copy()
ms_litanalysis = fullset[fullset['essay_set'] == 5].copy()
ms_narrative = fullset[fullset['essay_set'] == 7].copy()
ms_essays = [ms_persuasive, ms_litanalysis, ms_narrative]

hs_persuasive = fullset[fullset['essay_set'] == 2].copy()
hs_litanalysis = fullset[fullset['essay_set'] == 3].copy()
hs_narrative = fullset[fullset['essay_set'] == 8].copy()
hs_essays = [hs_persuasive, hs_litanalysis, hs_narrative]

In [39]:
# each essay gets reviewed by 2 teachers
def _draw_second_reviewers(round1, all_teachers, max_tries=200):
    """Pick a second reviewer for every essay, or return None if a batch fails.

    Essays are handled in consecutive batches of ``len(all_teachers)``. Within
    a batch every teacher is used at most once and never on an essay they
    already hold in ``round1``.
    """
    n_teachers = len(all_teachers)
    batches = []
    for start in range(0, len(round1), n_teachers):
        first = round1[start:start + n_teachers]
        # A full batch whose essays all share one first reviewer cannot be
        # covered by distinct other teachers; let the caller reshuffle round 1.
        if len(first) == n_teachers and np.all(first == first[0]):
            return None
        for _ in range(max_tries):
            candidate = np.random.permutation(all_teachers)[:len(first)]
            # Accept only if no essay would get the same teacher twice.
            if not np.any(candidate == first):
                batches.append(candidate)
                break
        else:
            return None
    if not batches:
        return np.empty(0, dtype=all_teachers.dtype)
    return np.concatenate(batches)


def get_teacher_assignments(n_teachers, n_essays):
    """Randomly assign two distinct teachers to each of ``n_essays`` essays.

    Round 1 shuffles a pool holding every teacher id
    ``ceil(n_essays / n_teachers)`` times and takes the first ``n_essays``
    entries. Round 2 is drawn in consecutive batches of ``n_teachers`` essays;
    in each batch a teacher appears at most once and never on an essay they
    already review in round 1. No teacher therefore receives more than
    ``ceil(n_essays / n_teachers)`` essays in either round.

    Randomness comes from NumPy's global RNG; call ``np.random.seed`` first if
    the assignment has to be reproducible.

    Parameters
    ----------
    n_teachers : int
        Number of teachers; ids run from 1 to ``n_teachers``. Must be >= 2.
    n_essays : int
        Number of essays to assign.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``round1`` and ``round2``, both of length ``n_essays``, holding the
        first and second teacher id for each essay position.

    Raises
    ------
    ValueError
        If ``n_teachers`` is smaller than 2.
    RuntimeError
        If no valid second round is found after repeated reshuffles.
    """
    if n_teachers < 2:
        raise ValueError("n_teachers must be at least 2 to give each essay two distinct reviewers")

    all_teachers = np.arange(1, n_teachers + 1)
    pool = np.repeat(all_teachers, math.ceil(n_essays / n_teachers))

    max_reshuffles = 100
    for _ in range(max_reshuffles):
        np.random.shuffle(pool)
        # Truncate so both rounds line up with the essays even when n_essays
        # is not a multiple of n_teachers.
        round1 = pool[:n_essays].copy()
        round2 = _draw_second_reviewers(round1, all_teachers)
        if round2 is not None:
            return round1, round2
    raise RuntimeError(
        "could not find a valid second-round assignment for "
        f"{n_teachers} teachers and {n_essays} essays"
    )

In [None]:
def sample_with_teachers(essay_groups, n_teachers=15, n_per_stratum=10):
    """Sample essays from each group and attach two teacher ids per essay.

    For every frame in ``essay_groups`` this draws ``n_per_stratum`` essays
    from each of three ``domain1_score`` strata (low / mid / high), then adds
    ``tid1`` and ``tid2`` columns from ``get_teacher_assignments``. The
    per-group samples are concatenated in the order given.

    Teacher assignment is random; seed NumPy's global RNG beforehand if the
    assignment must be reproducible.
    """
    samples = []
    # Do not name the loop variable `set`: that would shadow the builtin for
    # every later cell in the kernel.
    for essays in essay_groups:
        sample = strat_sample(essays, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=n_per_stratum)
        round1, round2 = get_teacher_assignments(n_teachers, len(sample))
        sample['tid1'] = round1
        sample['tid2'] = round2
        samples.append(sample)
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(samples) if samples else pd.DataFrame()


# 3 genres x 30 essays = 90 essays per school level, shared among 15 teachers.
ms_data = sample_with_teachers(ms_essays)
hs_data = sample_with_teachers(hs_essays)

ms_data.to_csv('ms_gen_edit_sample.csv', index=False)
hs_data.to_csv('hs_gen_edit_sample.csv', index=False)