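This notebook downsamples ELL essays with stratified sampling: each essay is assigned to one of four
groups based on its aggregated writing quality score, and the same number of essays is drawn from each group.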

The goal is to obtain a smaller but balanced subset for downstream analysis.


In [1]:
!pip install pandas -q

import pandas as pd
import json
import os
from google.colab import drive

# 1. GOOGLE DRIVE CONNECTION
# Mount Google Drive only when nothing is mounted at the mount point yet.
# os.path.ismount() is used instead of os.path.exists() because a plain
# directory at that path would otherwise be mistaken for a mounted Drive.
DRIVE_MOUNT_POINT = '/content/drive'

print("Checking Google Drive connection...")
if not os.path.ismount(DRIVE_MOUNT_POINT):
    drive.mount(DRIVE_MOUNT_POINT)
    print("Drive mounted successfully.")
else:
    print("Drive is already mounted.")

Checking Google Drive connection...
Mounted at /content/drive
Drive mounted successfully.


In [2]:
# Project folder on Google Drive that holds both the input and the output file.
# It is defined once so the two paths below cannot drift apart.
PROJECT_DIR = '/content/drive/MyDrive/Fırat Projeler/Detecting AI Influence in Student Writing: Toward Reliable and Interpretable Classifiers/feedback-prize-english-language-learning'
INPUT_FILE_PATH = f'{PROJECT_DIR}/ell_essay_families_structure_V2.jsonl'
OUTPUT_SAMPLED_PATH = f'{PROJECT_DIR}/ell_balanced_1K.jsonl'

# Total number of texts to keep and how many of them are taken from each
# quality group. The per-group quota is derived from the total so that the
# two values always stay consistent.
NUM_QUALITY_GROUPS = 4
TARGET_TOTAL = 1000
TARGET_PER_GROUP = TARGET_TOTAL // NUM_QUALITY_GROUPS  # 250


# Load the JSONL input (one JSON object per line). `data` stays empty when the
# file is missing, which the processing step uses to detect the failure.
data = []
if os.path.exists(INPUT_FILE_PATH):
    print(f"Reading file: {INPUT_FILE_PATH}")
    with open(INPUT_FILE_PATH, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip blank lines (e.g. a trailing empty line at the end of the
            # file); json.loads() raises JSONDecodeError on them.
            if line.strip():
                data.append(json.loads(line))
    print(f"Total Original Dataset Size: {len(data)}")
else:
    print(f"ERROR: File not found! Please check the path:\n{INPUT_FILE_PATH}")

def _quality_score(scores):
    """Return the mean of the grammar, vocabulary, cohesion and syntax scores.

    Phraseology and conventions are excluded from the average.
    """
    return (scores['grammar'] + scores['vocabulary'] + scores['cohesion'] + scores['syntax']) / 4


def _quality_group(quality_score):
    """Map an averaged quality score to the label of one of the four groups."""
    if quality_score < 2.5:
        return '1_Low (1.0-2.5)'
    if quality_score < 3.0:
        return '2_MidLow (2.5-3.0)'
    if quality_score < 3.5:
        return '3_MidHigh (3.0-3.5)'
    return '4_High (3.5-5.0)'


if data:
    # Fixed seed so the same subset is drawn on every run.
    RANDOM_SEED = 42

    # Scoring: one row per essay, keeping the original JSON record alongside
    # its averaged score and group label.
    df_rows = []
    for item in data:
        quality_score = _quality_score(item['source']['original_scores'])
        df_rows.append({
            'json_data': item,
            'score': quality_score,
            'group': _quality_group(quality_score)
        })

    df = pd.DataFrame(df_rows)

    print("\n--- Original Data Distribution ---")
    print(df['group'].value_counts().sort_index())

    # Sampling step: draw up to TARGET_PER_GROUP essays from every group.
    sampled_dfs = []
    groups = df['group'].unique()

    for g in groups:
        subset = df[df['group'] == g]

        # If there is not enough data in the group, take all of them
        if len(subset) < TARGET_PER_GROUP:
            print(f"Warning: Group '{g}' contains only {len(subset)} samples. Taking all of them.")
            sampled = subset
        else:
            # Otherwise select the quota randomly
            sampled = subset.sample(n=TARGET_PER_GROUP, random_state=RANDOM_SEED)

        sampled_dfs.append(sampled)

    final_df = pd.concat(sampled_dfs)

    # If some groups were too small and the total is still insufficient, top up
    # with random samples from the essays that have not been selected yet.
    if len(final_df) < TARGET_TOTAL:
        needed = TARGET_TOTAL - len(final_df)
        remaining = df.drop(final_df.index)  # Remove already selected samples from the pool

        # DataFrame.sample() raises ValueError when asked for more rows than
        # the pool holds, so never request more than what is left.
        n_extra = min(needed, len(remaining))
        if n_extra < needed:
            print(f"\nWarning: only {n_extra} unselected samples remain; cannot reach {TARGET_TOTAL}.")
        else:
            print(f"\nAdding {needed} random samples from the remaining pool to reach {TARGET_TOTAL}.")

        if n_extra > 0:
            extra = remaining.sample(n=n_extra, random_state=RANDOM_SEED)
            final_df = pd.concat([final_df, extra])

    # Display and save results
    print("\n--- Selected (Balanced) Distribution ---")
    print(final_df['group'].value_counts().sort_index())
    print(f"Total Selected Samples: {len(final_df)}")

    # Write to file: one original JSON record per line (JSONL), keeping
    # non-ASCII characters readable.
    with open(OUTPUT_SAMPLED_PATH, 'w', encoding='utf-8') as f:
        for item in final_df['json_data']:
            f.write(json.dumps(item, ensure_ascii=False) + '\n')

    # Report the real number of written samples instead of a hard-coded count.
    print(f"\n✅ SUCCESS: {len(final_df)} text samples have been saved to:\n{OUTPUT_SAMPLED_PATH}")

else:
    print("Processing could not be performed because the data could not be loaded.")


Reading file: /content/drive/MyDrive/Fırat Projeler/Detecting AI Influence in Student Writing: Toward Reliable and Interpretable Classifiers/feedback-prize-english-language-learning/ell_essay_families_structure_V2.jsonl
Total Original Dataset Size: 3911

--- Original Data Distribution ---
group
1_Low (1.0-2.5)         441
2_MidLow (2.5-3.0)     1137
3_MidHigh (3.0-3.5)    1240
4_High (3.5-5.0)       1093
Name: count, dtype: int64

--- Selected (Balanced) Distribution ---
group
1_Low (1.0-2.5)        250
2_MidLow (2.5-3.0)     250
3_MidHigh (3.0-3.5)    250
4_High (3.5-5.0)       250
Name: count, dtype: int64
Total Selected Samples: 1000

✅ SUCCESS: '1000 text samples have been saved to:
/content/drive/MyDrive/Fırat Projeler/Detecting AI Influence in Student Writing: Toward Reliable and Interpretable Classifiers/feedback-prize-english-language-learning/ell_balanced_1K.jsonl
