# Sampling Procedures

Given full_dataset.csv, we sample the essays for different annotation tasks.

In [3]:
import os
import pandas as pd
import math
import numpy as np

In [13]:
# utility method for stratified random sampling

# strat_col: name of the column to stratify
# num_strata: number of bins to split strat_col values
# labels: list of names for each bin
# n: number to sample from each bin
def strat_sample(df, strat_col, num_strata, labels, n):
    df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)
    sampled_df = pd.DataFrame()
    for bucket in labels:
        bucket_df = df[df['strat'] == bucket]
        if not bucket_df.empty:
            sampled_rows = bucket_df.sample(n=n, random_state=1)
            sampled_df = pd.concat([sampled_df, sampled_rows])
    sampled_df.reset_index(drop=True, inplace=True)
    return sampled_df

## Extended Think-Aloud CTA Interviews

Prior CTA interviews focused on literary analysis and narrative essays in middle and high school. To additionally capture persuasive writing, we select 3 middle school and 3 high school persuasive essays.

In [5]:
fullset = pd.read_csv('full_dataset.csv')
# essay_set 1 is 8th grade (middle school), essay_set 2 is 10th grade (high school)
ms_persuasive = fullset[fullset['essay_set'] == 1]
hs_persuasive = fullset[fullset['essay_set'] == 2]
ms_sample = strat_sample(ms_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=1)
hs_sample = strat_sample(hs_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=1)
ms_sample.to_csv('ms_persuasive_cta_sample.csv', index=False)
hs_sample.to_csv('hs_persuasive_cta_sample.csv', index=False)

## Generated Feedback Editing Task

We will recruit ~30 *expert teachers to revise the feedback generated by an LLM with only a base prompt. We expect 15 middle school and 15 high school teachers. Each teacher should be able to annotate 12 essays, but each selected essay should be reviewed by 2 teachers. We select 90 essays from middle school, and 90 essays from high school. Each set is 30 literary analysis, 30 persuasive, and 30 narrative.

In [27]:
ms_persuasive = fullset[fullset['essay_set'] == 1]
ms_litanalysis = fullset[fullset['essay_set'] == 5]
ms_narrative = fullset[fullset['essay_set'] == 7]
ms_essays = [ms_persuasive, ms_litanalysis, ms_narrative]

hs_persuasive = fullset[fullset['essay_set'] == 2]
hs_litanalysis = fullset[fullset['essay_set'] == 3]
hs_narrative = fullset[fullset['essay_set'] == 8]
hs_essays = [hs_persuasive, hs_litanalysis, hs_narrative]

In [56]:
# each essay gets reviewed by 2 teachers
# assign first teacher randomly
def get_teacher_assignments(n_teachers, n_essays):
    teacher_ids = np.repeat(np.arange(1, n_teachers + 1), math.ceil(n_essays/n_teachers))
    np.random.shuffle(teacher_ids)
    round1 = teacher_ids

    # assigned second teacher in two batches, avoid overassigning and duplicates
    round2 = []
    for k in range(math.ceil(n_essays/n_teachers)):
        selected = []
        for i in range(n_teachers):
            valid_teachers = np.setdiff1d(teacher_ids, np.concatenate(([round1[i+(k*n_teachers -1)]], selected)))
            print(valid_teachers)
            np.random.shuffle(valid_teachers)
            selected.append(int(valid_teachers[0]))
        round2 = np.concatenate((round2, selected))
    return round1, round2

In [52]:
ms_data = pd.DataFrame()
for set in ms_essays:
    sample = strat_sample(set, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=10)
    round1, round2 = get_teacher_assignments(15, 30)
    sample['tid1'] = round1
    sample['tid2'] = round2
    ms_data = pd.concat([ms_data, sample])

hs_data = pd.DataFrame()
for set in hs_essays:
    sample = strat_sample(set, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=10)
    round1, round2 = get_teacher_assignments(15, 30)
    sample['tid1'] = round1
    sample['tid2'] = round2
    hs_data = pd.concat([hs_data, sample])

ms_data.to_csv('ms_gen_edit_sample.csv', index=False)
hs_data.to_csv('hs_gen_edit_sample.csv', index=False)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14]
[ 1  2  3  4  6  7  8  9 10 11 12 14 15]
[ 1  3  4  5  7  8  9 10 11 12 14 15]
[ 1  3  4  5  6  7  8 10 11 12 14 15]
[ 1  3  4  5  6  7  8 10 11 12 15]
[ 1  3  4  5  7 10 11 12 15]
[ 1  3  4  5  6  7 10 11]
[ 1  3  4  6  7 10 11 15]
[ 1  4  6  7 10 11 15]
[ 1  6  7 10 11 15]
[ 1  6 11 15]
[ 6  7 11 15]
[ 6  7 11]
[ 7 11]
[11]
[ 1  2  3  4  5  6  7  8 10 11 12 13 14 15]
[ 3  4  5  6  7  8  9 10 11 12 13 14 15]
[ 2  3  4  5  7  8  9 10 11 13 14 15]
[ 2  5  7  8  9 10 11 12 13 14 15]
[ 2  5  7  8  9 11 12 13 14 15]
[ 2  4  5  7  9 11 12 13 14 15]
[ 2  4  7  9 11 12 13 14 15]
[ 2  4  7  9 11 14 15]
[ 2  9 11 12 14 15]
[ 2  7  9 11 12 14]
[ 2  9 11 12 14]
[ 2  9 12]
[ 9 12 14]
[ 9 14]
[14]
[ 1  2  3  4  5  6  7  8  9 10 11 12 14 15]
[ 1  2  3  4  5  6  7  8  9 11 12 13 14]
[ 1  2  3  5  6  8  9 11 12 13 14 15]
[ 1  2  3  4  5  9 11 12 13 14 15]
[ 1  2  3  4  5  6  9 12 13 14]
[ 1  2  3  4  5 12 13 14 15]
[ 1  3  4  5  9 12 13 14 15]
[ 3  4  5  9 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)
A value

We might only want to focus this task on persuasive essays. In that case, we would select all 90 essays (for middle and high school respectively) from the persuasive essay sets.

In [57]:
ms_persuasive = fullset[fullset['essay_set'] == 1]
ms_sample = strat_sample(ms_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=30)
round1, round2 = get_teacher_assignments(15, 90)
ms_sample['tid1'] = round1
ms_sample['tid2'] = round2

hs_persuasive = fullset[fullset['essay_set'] == 2]
hs_sample = strat_sample(hs_persuasive, 'domain1_score', 3, labels=['low', 'mid', 'high'], n=30)
round1, round2 = get_teacher_assignments(15, 90)
hs_sample['tid1'] = round1
hs_sample['tid2'] = round2

[ 1  3  4  5  6  7  8  9 10 11 12 13 14 15]
[ 1  2  3  4  5  6  7  8  9 10 12 13 14 15]
[ 2  3  4  5  6  8  9 10 12 13 14 15]
[ 2  4  5  6  7  8  9 10 12 13 14]
[ 2  3  4  5  6  7  8  9 13 14]
[ 2  3  4  5  6  8  9 12 13 14]
[ 2  3  4  6  8  9 12 13 14]
[ 2  3  6  8 12 13 14]
[ 2  3  8  9 12 13]
[ 2  3  8  9 13 14]
[ 3  8  9 13 14]
[ 8  9 13 14]
[ 8 13 14]
[ 8 13]
[13]
[ 1  2  3  4  5  7  8  9 10 11 12 13 14 15]
[ 1  2  3  4  6  7  8  9 10 11 13 14 15]
[ 1  2  3  4  7  8  9 10 11 12 13 14 15]
[ 1  2  3  4  7  9 10 11 12 14 15]
[ 1  2  3  7  9 11 12 13 14 15]
[ 1  2  3  7 10 11 12 13 15]
[ 1  2  3  7 10 12 14 15]
[ 2  3  7 10 12 13 14 15]
[ 2  3  7 10 12 13 14]
[ 2  3 10 12 13 14]
[ 2  3 10 12 14]
[ 2 10 14]
[ 2 10 12]
[10 12]
[10]
[ 1  2  3  4  5  6  7  9 10 11 12 13 14 15]
[ 2  3  4  5  6  8  9 10 11 12 13 14 15]
[ 2  3  4  5  6  7  8  9 11 13 14 15]
[ 2  3  4  6  7  8  9 11 12 13 14]
[ 3  4  6  7  8  9 11 12 13 14 15]
[ 3  6  7  8  9 12 13 14 15]
[ 3  4  6  7 12 13 14 15]
[ 3  4  6  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['strat'] = pd.qcut(df[strat_col], q=num_strata, labels=labels)


In [60]:
ms_sample.to_csv('ms_gen_edit_persuasive_sample.csv', index=False)
hs_sample.to_csv('hs_gen_edit_persuasive_sample.csv', index=False)