In [1]:
import pandas as pd
import os

# --- CONFIGURATION ---
INPUT_CSV = "reddit_burnout_anonymized_for_annotation.csv"  # posts waiting for a label
OUTPUT_CSV = "gold_standard_dataset_v1.csv"                 # labelled copy, rewritten after every label
PROGRESS_FILE = "annotation_progress.txt"                   # index of the next post to label

# --- LOAD THE DATA ---
# Stop right here when the input is missing: everything below needs `df`, and
# carrying on would only resurface as a confusing NameError later.
if not os.path.exists(INPUT_CSV):
    raise FileNotFoundError(
        f"🔥 FATAL ERROR: Input file '{INPUT_CSV}' not found. Did you run the Day 1 script?"
    )

df = pd.read_csv(INPUT_CSV)
print(f"✅ Loaded {len(df)} posts to be annotated.")

# --- LOAD PROGRESS ---
start_index = 0
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, 'r') as f:
        saved_progress = f.read().strip()
    try:
        start_index = int(saved_progress)
        if start_index < 0:
            # A negative offset would make the loop index rows from the end.
            raise ValueError(saved_progress)
    except ValueError:
        # Unreadable progress file: start over, but tell the annotator why.
        start_index = 0
        print(f"⚠️ Could not read a valid index from '{PROGRESS_FILE}'; starting from post #1.")

if os.path.exists(OUTPUT_CSV):
    df_annotated = pd.read_csv(OUTPUT_CSV)
else:
    # No saved labels yet: start from a copy of the input with a placeholder label.
    df_annotated = df.copy()
    df_annotated['burnout_label'] = -1  # -1 marks "not annotated yet"
    if start_index != 0:
        # Progress without saved labels is stale; resuming would leave the
        # first rows permanently at the placeholder value.
        print(f"⚠️ '{PROGRESS_FILE}' exists but '{OUTPUT_CSV}' does not; starting from post #1.")
        start_index = 0

if start_index > 0:
    print(f"Resuming from post #{start_index + 1}")

✅ Loaded 1390 posts to be annotated.


In [2]:
# Down-sample the loaded posts to a manageable annotation workload.
# The seed is fixed so the same posts (in the same order) are drawn on every
# run, which is what makes resuming by row position possible.
TARGET_SIZE = 600
if len(df) > TARGET_SIZE:
    print(f"Dataset is too large ({len(df)}). Taking a random, strategic sample of {TARGET_SIZE}.")
    df = df.sample(n=TARGET_SIZE, random_state=42).reset_index(drop=True)
    print(f"New dataset size: {len(df)}")

# `df_annotated` was created in the first cell from the frame as it was *before*
# sampling. Labels are written by row position, so the two frames must describe
# the same rows; otherwise label i would be stored against a different post.
if len(df_annotated) != len(df):
    if os.path.exists(OUTPUT_CSV):
        # Never silently overwrite saved labels that belong to a different row set.
        raise ValueError(
            f"'{OUTPUT_CSV}' has {len(df_annotated)} rows but the dataset to annotate has "
            f"{len(df)}. Move or delete the old output (and '{PROGRESS_FILE}') before continuing."
        )
    df_annotated = df.copy()
    df_annotated['burnout_label'] = -1  # -1 marks "not annotated yet"

Dataset is too large (1390). Taking a random, strategic sample of 600.
New dataset size: 600


In [3]:
# --- THE ANNOTATION LOOP ---
# Shows each remaining post, asks for a label, and checkpoints after every
# accepted answer so the session can be stopped and resumed at any point.

VALID_LABELS = (0, 1, 2)

print("--- Starting Annotation Grind ---")
print("Enter label (0=Noise, 1=Stressed, 2=Burned Out) or 'quit' to exit.")
print("---------------------------------")

# Tracked explicitly: inferring it from `label` breaks when the loop body never
# runs (nothing left to annotate) or when `label` is left over from an earlier run.
quit_requested = False

for i in range(start_index, len(df)):
    row = df.iloc[i]
    print(f"\n--- Post {i+1}/{len(df)} | Subreddit: r/{row['subreddit']} ---")
    print(f"TITLE: {row['clean_title']}")
    print("---------------------------------")
    print(f"BODY:\n{row['clean_body']}")
    print("---------------------------------")

    # Keep asking until we get a valid label or a request to stop.
    while True:
        label = input("Label (0, 1, 2) | 'quit': ").strip()
        if label.lower() == 'quit':
            quit_requested = True
            print(f"\n🛑 Annotation paused at post #{i+1}. Progress saved.")
            break

        # Only the parse is guarded, so a failure while saving is not
        # misreported as bad input.
        try:
            label_int = int(label)
        except ValueError:
            print("Invalid input. Please enter a number or 'quit'.")
            continue

        if label_int not in VALID_LABELS:
            print("Invalid label. Please enter 0, 1, or 2.")
            continue

        # .loc writes into df_annotated itself rather than a temporary copy.
        df_annotated.loc[i, 'burnout_label'] = label_int
        # Save progress immediately so an interrupted session loses nothing.
        df_annotated.to_csv(OUTPUT_CSV, index=False)
        with open(PROGRESS_FILE, 'w') as f:
            f.write(str(i + 1))  # Save the next index to start from
        break

    if quit_requested:
        break

if not quit_requested:
    print("\n🏆🏆🏆 ANNOTATION COMPLETE! 🏆🏆🏆")
    print(f"Gold standard dataset saved to '{OUTPUT_CSV}'")

--- Starting Annotation Grind ---
Enter label (0=Noise, 1=Stressed, 2=Burned Out) or 'quit' to exit.
---------------------------------
\n--- Post 1/600 | Subreddit: r/Professors ---
TITLE: End of the semester: How much empathy can one give?
---------------------------------
BODY:\nAlthough this is my 4th semester, I am still new to all this. Over the past couple years, my class size has increased. This semester I have about 165 students in my largest class. One thing that I have learned is that students love to wait til the end of the semester to share life’s circumstances. I understand trying to push through it, but I have asked them to let me know when things happen instead waiting til the end.

Their semester paper was posted in the syllabus with the due date. I told them over a month ago that they should be working on it. This is a huge assignment and requires me and my teaching assistants to devote a lot of time to it on top of catching up on grading other things. The long emails 