<a href="https://colab.research.google.com/github/Ayushee-Seeburrun/ASAG-with-Data-Augmentation/blob/main/trainset_balancing_score3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; every CSV path used below lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import os, time, json, requests, random
from google.colab import userdata

In [None]:
# Load the train split whose score-2 answers were already augmented, and
# report its row count before any text cleaning is applied.
cleanfile = pd.read_csv("/content/drive/MyDrive/Data Augmentation/Dataset Splits/trainset_score2_1000_augmented.csv")
print("rows before cleaning the file: ", len(cleanfile))

rows before cleaning the file:  14765


In [None]:
# Clean the train set (score 2 already augmented): replace every literal "^p"
# marker with a space, collapse whitespace runs to one space, then trim the ends.
essay_series = cleanfile["EssayText"].astype(str)
for marker in (r"\^p", r"\s+"):
    essay_series = essay_series.str.replace(marker, " ", regex=True)
cleanfile["EssayText"] = essay_series.str.strip()

In [None]:
# Sanity check: count answers that still contain a literal "^p" after cleaning.
print(cleanfile["EssayText"].str.contains(r"\^p").sum())

0


In [None]:
# Persist the cleaned train set; index=False keeps the row index out of the CSV.
cleanfile.to_csv("/content/drive/MyDrive/Data Augmentation/Dataset Splits/train_aug1000_s2_cleaned.csv", index=False)

In [None]:
# Static system prompt: the fixed paraphrasing rules sent with every request.
# It is kept separate from the per-batch user prompt so that it can be marked
# with cache_control (prompt caching) in the API call.
# NOTE(review): the text is sent to the model verbatim, so any edit here
# (including bullet spacing) changes generation behaviour.
static_sys = """
You are helping with ASAG data augmentation.
Task: Paraphrase student answers.
Rules:
-Keep the SAME meaning and the SAME level of correctness.
-Do not add or remove information.
-Do not output labels like "Student answer: " or "Paraphrased answer:" or any commentary.
-Return only valid JSON(no markdown).
-Output must be a JSON array of strings (each string is only the paraphrased answer), same order as input.
-Escape any quotes inside the strings with backslash.
-Avoid repetitive sentence starters. Vary the opening sentences each time.
- Do NOT begin answers with phrases such as:
  "To replicate the experiment", "Based on the data", "The student's data indicates".
-Do NOT use lettered formatting like "a)" "b)" or symbols like "^p".
-Do NOT use numbered lists (e.g., "1.", "2.", "3."). Write as one natural paragraph (no list formatting), unless the original answer is a list.
-Write in a more natural tone, like a student answering an exam question.
"""

In [None]:
def build_prompt(textlist):
  """Build the per-batch user message.

  The message is a fixed instruction header followed by `textlist`
  serialised as a JSON array (non-ASCII characters are kept as-is).
  """
  header = "Paraphrase the following student answers.\n Return only a valid JSON array of strings,same order.\n Do not add labels like 'student answer:' or 'paraphrased answer:'.\n"
  answers_json = json.dumps(textlist, ensure_ascii=False)
  return header + answers_json


In [None]:
# Fetch the Anthropic API key from Colab's userdata store (entry 'Claude_key')
# instead of hardcoding it in the notebook.
claude_api_key = userdata.get('Claude_key')

In [None]:
def claude_call(prompt, timeout=120):
  """Send one paraphrasing request to the Anthropic Messages API.

  Args:
    prompt: user-message text for this batch (see build_prompt).
    timeout: seconds to wait for the HTTP call before giving up. Without a
      timeout a stalled connection would block the generation loop forever.

  Returns:
    The text of the first content block of the reply, or None when the call
    fails for any reason (network error, timeout, non-200 status, or a reply
    body without the expected shape). Callers treat None as "retry".
  """
  try:
    resp = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": claude_api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json",
        },
        json={
            "model": "claude-3-haiku-20240307",
            "max_tokens": 2000,
            # The static rules are flagged for prompt caching.
            # NOTE(review): Anthropic documents a minimum cacheable prompt
            # length per model; this short system prompt may be below it, in
            # which case cache_control has no effect - confirm.
            "system": [{
                "type": "text",
                "text": static_sys,
                "cache_control": {"type": "ephemeral"},
            }],
            "messages": [{
                "role": "user",
                "content": prompt,
            }],
        },
        timeout=timeout,
    )
  except requests.RequestException as err:
    # Connection errors and timeouts are reported like any other API failure
    # so the caller's retry logic handles them instead of crashing the run.
    print("Claude API error: ", err)
    return None

  if resp.status_code != 200:
    print("Claude API error: ", resp.status_code)
    print(resp.text[:300])
    return None

  try:
    return resp.json()["content"][0]["text"]
  except (ValueError, KeyError, IndexError, TypeError):
    # 200 status but the body is not the expected {"content": [{"text": ...}]}.
    print("Claude API error: unexpected response body")
    print(resp.text[:300])
    return None

In [None]:
#converting the JSON text into a Python list of strings
def parse_json_array(text):
  """Parse a model reply into a list of strings.

  Args:
    text: raw reply text, expected to be a JSON array of strings. A reply
      wrapped in a markdown code fence (``` or ```json) is unwrapped first.

  Returns:
    The parsed list when `text` is a JSON array whose items are all strings;
    None for anything else (non-string input, malformed JSON, a non-list
    value, or a list containing non-string items).
  """
  if not isinstance(text, str):
    return None

  cleaned = text.strip()
  # Unwrap a markdown code fence if the model added one around the array.
  if cleaned.startswith("```"):
    cleaned = cleaned.strip("`").strip()
    if cleaned[:4].lower() == "json":
      cleaned = cleaned[4:]

  try:
    arr = json.loads(cleaned)
  except ValueError:
    # json.JSONDecodeError is a ValueError subclass: malformed JSON -> None.
    return None

  #checks if claude returned a list
  if not isinstance(arr, list):
    return None
  #to check if every item in the list is a string
  if not all(isinstance(x, str) for x in arr):
    return None
  return arr

In [None]:
# Load the current train set: the file to which this notebook's new score-3
# paraphrases will be appended (score 2 was already augmented in it).
# NOTE(review): the file name suggests an earlier score-3 increment is already
# included - confirm this is the intended starting point.
tr = pd.read_csv('/content/drive/MyDrive/Data Augmentation/Dataset Splits/train_s2_append_s3_2000.csv')
print("Loaded data from file:\n", len(tr))
# Class distribution per score, in score order.
print(tr["Score2"].value_counts().sort_index())

Loaded data from file:
 16747
Score2
0    5424
1    4518
2    4225
3    2580
Name: count, dtype: int64


In [None]:
# Score-3 rows only (copied, with a fresh 0..n-1 index); their answers are the
# anchors that get paraphrased.
df3 = tr[tr["Score2"] == 3].copy().reset_index(drop=True)
# Every answer text currently in the train set, used to reject paraphrases
# that exactly duplicate an existing row.
existingdata = set(tr["EssayText"].astype(str).tolist())

In [None]:
# Number of new score-3 paraphrases to generate in this run.
add_s3 = 1500
#batch size = how many anchor answers are sent to Claude per request.
batch_size = 25
# Ceiling division: the minimum number of requests, reached only if every
# returned paraphrase is accepted (the loop below may need more).
batch_needed = (add_s3 + batch_size - 1) // batch_size
print("batches needed: ", batch_needed)

batches needed:  60


In [None]:
# Accepted paraphrases accumulate here as dicts (EssayText / Score2 / type).
paraphrased_rows_score3 = []
# Anchor pool: all score-3 answers as plain strings.
s3answers = df3["EssayText"].astype(str).tolist()

target = add_s3
startTime = time.time()

# Consecutive unparseable replies; the loop stops once this reaches its limit.
bad_json_streak = 0

# Openers that are always rejected. All lowercase because they are compared
# against the lowercased paraphrase; kept as a tuple for str.startswith.
bad_starts = (
    "to replicate the experiment",
    "based on the data",
    "in conclusion",
    "the data indicates",
    "the data suggests",
    "the student's data indicates",
    "to duplicate",
    "the information needed to replicate",
    "the key additional information needed to replicate",
    "additional information needed to replicate",
)

# Substrings that mark a "replicate the experiment"-style answer. These are
# not rejected outright; they are only capped by replicate_limit.
replicate_phrases = (
    "replicate", "replicab", "reproduc",
    "repeat this experiment", "to repeat this experiment",
    "repeat the experiment", "to repeat the experiment",
    "duplicate the experiment", "to duplicate the experiment",
    "procedure lacks", "procedure is missing", "procedure needs",
    "missing key details", "key missing details", "needed to replicate",
    "to replicate this experiment",
    "in order to replicate",
    "replicating this experiment",
    "so the experiment can be replicated",
    "the additional information needed",
    "the experiment would need more details",
    "would need to know",
    "to run this experiment",
    "the student would need",
    "the procedure needs",
)

# Cap on accepted replicate-style paraphrases: 8% of the target.
replicate_count = 0
replicate_limit = int(0.08 * target)

# Cap on accepted paraphrases using (a)/(b) lettered formatting: 12% of the target.
ab_count = 0
ab_limit = int(0.12 * target)

# Guard: with no anchors the loop would send empty batches forever.
if not s3answers:
    raise ValueError("No score-3 answers available to paraphrase.")

# Consecutive failed API calls; bounded so that a permanent failure
# (bad key, exhausted quota, outage) cannot retry forever.
api_fail_streak = 0
max_api_failures = 10

# Request batches of paraphrases until `target` accepted rows are collected.
while len(paraphrased_rows_score3) < target:

    # Sample a fresh random batch of anchor answers (without replacement).
    k = min(batch_size, len(s3answers))
    anchors = random.sample(s3answers, k)

    prompt = build_prompt(anchors)
    result = claude_call(prompt)

    if result is None:
        api_fail_streak += 1
        if api_fail_streak >= max_api_failures:
            print("Too many API failures in a row — stopping for safety.")
            break
        print("API failed. Retrying")
        time.sleep(2)
        continue
    api_fail_streak = 0

    try:
        paraphrases = parse_json_array(result)
        if paraphrases is None:
            raise ValueError("Invalid JSON")
        bad_json_streak = 0
    except (ValueError, TypeError):
        # Only parsing problems are handled here (json.JSONDecodeError is a
        # ValueError); anything else, e.g. KeyboardInterrupt, must propagate.
        bad_json_streak += 1
        print("Bad JSON from Claude. Retrying")
        print(result[:300])
        time.sleep(2)
        if bad_json_streak >= 10:
            print("Too many bad JSON responses in a row — stopping for safety.")
            break
        continue

    added = 0
    for para in paraphrases:
        para = str(para).strip()
        p = para.lower()

        # too short to be a full answer
        if len(para) < 60:
            continue
        # exact duplicate of an existing or already accepted answer
        if para in existingdata:
            continue
        if "^p" in p:
            continue

        # reject obvious truncation
        if para.endswith((" to", " and", " or", " because", " but")):
            continue

        # reject numbered-list formatting
        if p.startswith(("1.", "2.", "3.")):
            continue

        # avoid too many identical starters (format cues)
        if p.startswith(bad_starts):
            continue

        # replicate-style answers are allowed only up to replicate_limit
        is_replicate = any(phrase in p for phrase in replicate_phrases)
        if is_replicate and replicate_count >= replicate_limit:
            continue

        # (a)/(b) lettered answers are allowed only up to ab_limit
        is_ab = ("(a)" in p) or ("(b)" in p) or p.startswith(("a)", "a.", "b)", "b."))
        if is_ab and ab_count >= ab_limit:
            continue

        paraphrased_rows_score3.append({
            "EssayText": para,
            "Score2": 3,
            "type": "paraphrased"
        })

        existingdata.add(para)
        if is_replicate:
            replicate_count += 1
        if is_ab:
            ab_count += 1

        added += 1
        if len(paraphrased_rows_score3) >= target:
            break

    elapsed = time.time() - startTime
    print(f"Added {added} in this batch | Total: {len(paraphrased_rows_score3)}/{target} | {elapsed/60:.2f} mins")

    # brief pause between requests
    time.sleep(1.0)


Added 19 in this batch | Total: 19/1500 | 0.14 mins
Added 21 in this batch | Total: 40/1500 | 0.30 mins
Added 13 in this batch | Total: 53/1500 | 0.47 mins
Added 18 in this batch | Total: 71/1500 | 0.60 mins
Added 16 in this batch | Total: 87/1500 | 0.76 mins
Added 5 in this batch | Total: 92/1500 | 0.94 mins
Added 19 in this batch | Total: 111/1500 | 1.09 mins
Added 17 in this batch | Total: 128/1500 | 1.29 mins
Added 7 in this batch | Total: 135/1500 | 1.45 mins
Added 21 in this batch | Total: 156/1500 | 1.55 mins
Added 12 in this batch | Total: 168/1500 | 1.72 mins
Added 19 in this batch | Total: 187/1500 | 1.87 mins
Added 20 in this batch | Total: 207/1500 | 2.01 mins
Added 10 in this batch | Total: 217/1500 | 2.15 mins
Added 23 in this batch | Total: 240/1500 | 2.29 mins
Added 19 in this batch | Total: 259/1500 | 2.45 mins
Added 13 in this batch | Total: 272/1500 | 2.63 mins
Added 21 in this batch | Total: 293/1500 | 2.75 mins
Added 17 in this batch | Total: 310/1500 | 2.93 mins
A

In [None]:
#score 3 +1500 a few cases that are unacceptable, this will clean them
import re

def is_valid_s3(text):
    """Return False for generated score-3 answers with known bad content.

    Matching is case-insensitive. An answer is rejected when it names a
    plastic type labelled with a single letter from e to z, mentions
    "acid rain", or contains "conducted two trials"; otherwise it is kept.
    """
    lowered = text.lower()

    # wrong plastic label: "plastic type" followed by a lone letter e-z
    wrong_plastic_label = re.search(r"\bplastic\s+type\s+[e-z]\b", lowered) is not None

    # acid-rain hallucination / incorrect trial suggestion
    banned_fragments = ("acid rain", "conducted two trials")
    has_banned_fragment = any(fragment in lowered for fragment in banned_fragments)

    return not (wrong_plastic_label or has_banned_fragment)

# Turn the accepted paraphrases into a frame and keep only rows that pass is_valid_s3.
generated_s3 = pd.DataFrame(paraphrased_rows_score3)
valid_mask = generated_s3["EssayText"].apply(is_valid_s3)
generated_s3_clean = generated_s3.loc[valid_mask].reset_index(drop=True)

removed_count = len(generated_s3) - len(generated_s3_clean)
print("kept:", len(generated_s3_clean), "removed:", removed_count)


kept: 1493 removed: 7


In [None]:
# Phrases that make a generated score-3 answer unacceptable (matched case-insensitively).
bad_phrases = [
    "likely did not make any mistakes",
    "conducted a good experiment",
    "least overall stretch"
]

# Filter the rows that already passed is_valid_s3. Rebuilding the frame from
# paraphrased_rows_score3 here would silently restore the rows that check removed.
has_bad_phrase = generated_s3_clean["EssayText"].str.lower().str.contains("|".join(bad_phrases))
generated_s3 = generated_s3_clean[~has_bad_phrase].reset_index(drop=True)


In [None]:
# Report the generated score-3 rows, then append them to the train set.
new_s3_count = len(generated_s3)
print("New score-3 rows generated: ", new_s3_count)
print(generated_s3["Score2"].value_counts())

frames_to_join = [tr, generated_s3]
tr_updated = pd.concat(frames_to_join, ignore_index=True)
score_distribution = tr_updated["Score2"].value_counts().sort_index()
print("after append: \n", score_distribution)

New score-3 rows generated:  1500
Score2
3    1500
Name: count, dtype: int64
after append:  Score2
0    5424
1    4518
2    4225
3    4080
Name: count, dtype: int64


In [None]:
# Write the combined train set (loaded rows plus the new score-3 paraphrases)
# to its own CSV; the source file that was loaded is not modified.
save2folder = "/content/drive/MyDrive/Data Augmentation/Dataset Splits/train_balanced_s2_s3_final.csv"
tr_updated.to_csv(save2folder,index=False)


In [None]:
# Load train_set.csv from the same folder to show its size and per-score
# distribution for comparison.
folderpath = "/content/drive/MyDrive/Data Augmentation/Dataset Splits"
train = pd.read_csv(f"{folderpath}/train_set.csv")

print("Total number of rows in training set: ", len(train))
print(train["Score2"].value_counts().sort_index())

Total number of rows in training set:  13765
Score2
0    5424
1    4518
2    3225
3     598
Name: count, dtype: int64


In [None]:
# Read the saved balanced file back from disk to verify its row count and
# per-score distribution.
readfile = pd.read_csv("/content/drive/MyDrive/Data Augmentation/Dataset Splits/train_balanced_s2_s3_final.csv")
print("Total rows in balanced dataset: " , len(readfile), "\n", readfile["Score2"].value_counts().sort_index())


Total rows in balanced dataset:  18247 
 Score2
0    5424
1    4518
2    4225
3    4080
Name: count, dtype: int64


In [None]:
# Manual inspection: first 20 generated rows.
# NOTE(review): built from the raw paraphrased_rows_score3 list, so rows
# removed by the cleaning cells still appear here.
gen_s3 = pd.DataFrame(paraphrased_rows_score3)
gen_s3.head(20)

In [None]:
# Manual inspection: a fixed-seed random sample of 25 generated answers.
# NOTE(review): sampled from the raw paraphrased_rows_score3 list, not from
# the cleaned frame.
genrandom = pd.DataFrame(paraphrased_rows_score3)
genrandom.sample(25, random_state=42)[["EssayText"]]


Unnamed: 0,EssayText
1116,The key missing information includes the amoun...
1368,Active Transport is the movement of something ...
422,The student notes the procedure did not specif...
413,"To replicate the acid rain experiment, the stu..."
451,The student concluded plastic type B stretched...
861,(a) Plastic type B had the greatest stretchabi...
1063,Active transport requires energy to move subst...
741,"The steps are: 1) mRNA binds to the ribosome, ..."
1272,"According to the student's analysis, plastic t..."
259,Plastic type B demonstrated the highest stretc...


In [None]:
# Row count of the loaded train set.
# NOTE(review): the stored output differs from the count printed when `tr` was
# loaded, which suggests cells were run out of order - re-run top to bottom to confirm.
print(len(tr))


14765


In [None]:
# Number of distinct answer texts tracked for duplicate rejection; this grows
# as paraphrases are accepted by the generation loop.
print(len(existingdata))

14733
