<a href="https://colab.research.google.com/github/Ayushee-Seeburrun/ASAG-with-Data-Augmentation/blob/main/claude_22000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os,json,time, random
import pandas as pd
import requests
from google.colab import userdata

In [None]:
# Drive folder holding the dataset split CSVs.
path = "/content/drive/MyDrive/Data Augmentation/Dataset Splits"

# Training split that the augmentation starts from.
file = pd.read_csv(
    f"{path}/train_balanced_clean.csv"
)


In [None]:
import re, unicodedata
def clean_rows(s):
    """Normalise one text value for storage and duplicate detection.

    The value is converted with ``str()``, NFKC-normalised, has any
    non-breaking space replaced by a plain space, and then has every run of
    whitespace collapsed to a single space with the ends trimmed.

    Args:
        s: Any value; non-strings are stringified first.

    Returns:
        The normalised string.
    """
    normalised = unicodedata.normalize("NFKC", str(s))
    without_nbsp = normalised.replace("\u00A0", " ")
    collapsed = re.sub(r"\s+", " ", without_nbsp)
    return collapsed.strip()

# Normalise the essay text of every row.
essay_text_clean = file["EssayText"].map(clean_rows)
file["EssayText"] = essay_text_clean

# Row-type labels: missing values become "", everything is cast to str,
# trimmed and lower-cased so comparisons against "paraphrased" are reliable.
type_labels = file["type"].fillna("").astype(str)
file["type"] = type_labels.str.strip().str.lower()

In [None]:
# Give each of the four scores an equal share of the target size, so that
# no score is favoured.
total = 22000
genperscore = total // 4

# Normalised text column; together with Score2 it is the uniqueness key.
file["EssayText_norm"] = file["EssayText"].map(clean_rows)

# Number of distinct (Score2, normalised text) pairs per score.
# Scores with no rows are reported as 0.
unique_pairs = file.drop_duplicates(subset=["Score2", "EssayText_norm"])
per_score_sizes = unique_pairs.groupby("Score2").size()
uniqueScoreCount = per_score_sizes.reindex([0, 1, 2, 3], fill_value=0)

# How many new paraphrases each score still needs (never negative).
shortfall = genperscore - uniqueScoreCount
generateScore = shortfall.clip(lower=0).astype(int)

row_counts = file["Score2"].value_counts().sort_index().to_dict()
print("Current original rows counts per score:", row_counts)
print("Current unique counts per score:", uniqueScoreCount.to_dict())
print("Number of unique new paraphrases to be added per score:", generateScore.to_dict())



Current original rows counts per score: {0: 5424, 1: 4518, 2: 4194, 3: 3089}
Current unique counts per score: {0: 5376, 1: 4465, 2: 4168, 3: 3089}
Number of unique new paraphrases to be added per score: {0: 124, 1: 1035, 2: 1332, 3: 2411}


In [None]:
# For each score, show unique count + generation quota and their sum.
# The sum equals genperscore unless the quota was clipped to 0.
print("Sanity check (unique + to_generate):")
for s in [0, 1, 2, 3]:
    have = uniqueScoreCount[s]
    need = generateScore[s]
    print(s, have, "+", need, "=", have + need)


Sanity check (unique + to_generate):
0 5376 + 124 = 5500
1 4465 + 1035 = 5500
2 4168 + 1332 = 5500
3 3089 + 2411 = 5500


In [None]:
# Keep only rows that are not labelled "paraphrased", so that Claude is
# never asked to paraphrase its own earlier output.
is_paraphrase = file["type"] == "paraphrased"
original = file[~is_paraphrase].copy()

def strip_caret_p(text: str) -> str:
    """Replace every ``^p`` / ``^P`` marker in ``text`` with a single space.

    The input is stringified first, so non-string values are accepted.
    """
    result = str(text)
    for marker in ("^p", "^P"):
        result = result.replace(marker, " ")
    return result

# Anchor texts per score: the essay text of every original row with that
# score, with ^p / ^P markers replaced by spaces.
score_anchors = {}
for s in [0, 1, 2, 3]:
    texts = original.loc[original["Score2"] == s, "EssayText"].tolist()
    score_anchors[s] = [strip_caret_p(t) for t in texts]

# Report how many anchors are available for each score.
for s, anchor_list in score_anchors.items():
    print(f"Score {s} original anchors: ", len(anchor_list))

Score 0 original anchors:  5424
Score 1 original anchors:  4518
Score 2 original anchors:  3225
Score 3 original anchors:  598


In [None]:
# Every (score, lower-cased normalised text) pair already in the dataset.
# A candidate paraphrase whose key is in this set is a duplicate.
lowered_norm = file["EssayText_norm"].str.lower()
existing_data = set(zip(file["Score2"], lowered_norm))

def new_phrase(score, text, seen_batch):
    """Decide whether a paraphrase is new for the given score.

    The text is normalised with ``clean_rows`` and its lookup key is
    ``(score, lower-cased normalised text)``. The paraphrase counts as new
    only if that key is neither in the module-level ``existing_data`` set nor
    in ``seen_batch``.

    Args:
        score: Score label the paraphrase belongs to.
        text: Raw paraphrase as returned by Claude.
        seen_batch: Keys already accepted from the current Claude response.

    Returns:
        A ``(is_new, cleaned_text)`` tuple.
    """
    # Clean the Claude output before comparing.
    text = clean_rows(text)
    key = (score, text.lower())
    if key in existing_data or key in seen_batch:
        return False, text
    return True, text

In [None]:
# Static system prompt with the paraphrasing rules. It is kept identical
# across requests so that it can be sent as a cacheable system block
# (prompt caching); only the user message changes per batch.
static_sys = """
You are helping with ASAG data augmentation.
Task: Paraphrase student answers.
Rules:
-Keep the SAME meaning and the SAME level of correctness.
-Do not add or remove information.
-Do not reuse any 4 consecutive words from the original answer.
-Do not output labels like "Student answer: " or "Paraphrased answer:" or any commentary.
-Return only valid JSON(no markdown).
-Output must be a JSON array of strings (each string is only the paraphrased answer), same order as input.
-Escape any quotes inside the strings with backslash.
-Avoid repetitive sentence starters. Vary the opening sentences each time.
- DO NOT begin answers with phrases such as:
  "To replicate the experiment", "Based on the data", "The student's data indicates".
-DO NOT use lettered formatting like "a)" "b)" or symbols like "^p".
-DO NOT use numbered lists (e.g., "1.", "2.", "3."). Write as one natural paragraph (no list formatting), unless the original answer is a list.
-DO NOT start with: "The student", "The student's", "This answer", "The answer", "Based on", "To replicate".
-DO NOT refer to the response as a student's work (no "the student", no "the student's answer", no "The student responds").
-Write strictly in English.
-Write in a more natural tone, like a student answering an exam question.
"""

In [None]:
def dynamic_prompt(textlist):
    """Build the per-batch user message.

    Args:
        textlist: List of answer strings to paraphrase.

    Returns:
        The fixed instruction header followed by ``textlist`` serialised as a
        JSON array (non-ASCII characters are kept as-is, not escaped).
    """
    header = "Paraphrase the following student answers.\n Return only a valid JSON array of strings,same order.\n DO NOT add labels like 'student answer:' or 'paraphrased answer:'.\n"
    payload = json.dumps(textlist, ensure_ascii=False)
    return header + payload


In [None]:
# Anthropic API key, read from the Colab secret named 'Claude_key' so that it
# is not hard-coded in the notebook.
claude_api_key = userdata.get('Claude_key')

In [None]:
def claude_call(prompt, timeout=60):
    """Send one paraphrasing request to the Anthropic Messages API.

    The static rules in ``static_sys`` are sent as a cacheable system block;
    ``prompt`` is sent as the single user message.

    Args:
        prompt: Text of the user message (instructions plus JSON array).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on a stalled connection.

    Returns:
        The text of the first content block of the reply, or ``None`` when
        the request fails (network error, timeout, non-200 status, or a body
        that does not have the expected shape). Callers treat ``None`` as
        "retry later".
    """
    try:
        resp = requests.post(
            "https://api.anthropic.com/v1/messages",
            headers={
                "x-api-key": claude_api_key,
                "anthropic-version": "2023-06-01",
                "content-type": "application/json",
            },
            json={
                "model": "claude-3-haiku-20240307",
                "max_tokens": 2500,
                "system": [{
                    "type": "text",
                    "text": static_sys,
                    "cache_control": {"type": "ephemeral"},
                }],
                "messages": [{
                    "role": "user",
                    "content": [{"type": "text", "text": prompt}],
                }],
            },
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like any other failure
        # so the caller's retry logic can handle them.
        print("Claude API error: ", exc)
        return None

    if resp.status_code != 200:
        print("Claude API error: ", resp.status_code)
        print(resp.text[:300])
        return None

    try:
        body = resp.json()
        if body.get("stop_reason") == "max_tokens":
            # A reply cut off at max_tokens is usually an unterminated JSON
            # array; say so, since the parser will reject it.
            print("Claude API warning: reply hit max_tokens and is probably truncated.")
        return body["content"][0]["text"]
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as exc:
        print("Claude API error: unexpected response body:", exc)
        return None

In [None]:
def parse_json_array(text):
    """Parse a model reply that should be a JSON array of strings.

    A reply wrapped in a markdown code fence (three backticks, optionally
    followed by a language tag) is accepted; the fence lines are removed
    before parsing.

    Args:
        text: Raw reply text.

    Returns:
        The list of strings, or ``None`` if the reply is not valid JSON or
        is not a list made up solely of strings.
    """
    candidate = text
    if isinstance(text, str):
        candidate = text.strip()
        if candidate.startswith("```"):
            fence_lines = candidate.splitlines()[1:]  # drop the opening fence
            if fence_lines and fence_lines[-1].strip() == "```":
                fence_lines = fence_lines[:-1]  # drop the closing fence
            candidate = "\n".join(fence_lines)

    try:
        data = json.loads(candidate)
    except (ValueError, TypeError, RecursionError):
        # ValueError covers json.JSONDecodeError and TypeError covers
        # non-text input. KeyboardInterrupt and SystemExit are deliberately
        # not caught.
        return None

    if isinstance(data, list) and all(isinstance(x, str) for x in data):
        return data
    return None

In [None]:
# Per-score cursor: the position of the next anchor to hand out from that
# score's anchor list.
anchor_in = dict.fromkeys([0, 1, 2, 3], 0)

# Shuffle each score's anchors in place so Claude does not keep seeing them
# in the same order.
for s, anchor_pool in score_anchors.items():
    random.shuffle(anchor_pool)


def next_anchor(score, k):
    """Return the next ``k`` anchors for ``score``, cycling through its pool.

    Reading starts at the cursor stored in ``anchor_in[score]`` and wraps
    around to the start of the pool when the end is reached. The cursor is
    advanced by the number of anchors returned.

    Args:
        score: Score whose anchor pool is read.
        k: Number of anchors to return.

    Returns:
        A list of ``k`` anchor texts, or an empty list if the pool is empty.
    """
    pool = score_anchors[score]
    if not pool:
        return []

    start = anchor_in[score]
    size = len(pool)
    picked = [pool[(start + offset) % size] for offset in range(k)]
    anchor_in[score] = start + len(picked)
    return picked

In [None]:
# Maximum number of anchors sent to Claude in one request.
batch_size = 30

# Accepted paraphrases so far: a per-score tally and the rows themselves.
accepted_para = dict.fromkeys((0, 1, 2, 3), 0)
new_rows = list()

# Number of consecutive replies that could not be parsed as JSON.
bad_json_streak = 0

# Wall-clock start, used for the elapsed-time figure in progress lines.
startTime = time.time()
def print_progress(extra_note=""):
    """Print one progress line for the generation run.

    The line shows accepted paraphrases against the total quota, the
    per-score tally, minutes elapsed since ``startTime`` and an optional
    trailing note.

    Args:
        extra_note: Text appended to the end of the line.
    """
    minutes = (time.time() - startTime) / 60
    done = sum(accepted_para.values())
    goal = int(generateScore.sum())
    line = (
        f"Progress: {done}/{goal} | "
        f"accepted={accepted_para} | Time elapsed={minutes:.2f} mins {extra_note}"
    )
    print(line)

# Format filters applied to every candidate paraphrase. They mirror the rules
# stated in the system prompt.
_BANNED_OPENERS = (
    "the student", "the student's",
    "this answer", "the answer",
    "based on", "to replicate",
)
_NUMBERED_OPENERS = ("1.", "2.", "3.")
_LETTERED_OPENERS = ("a)", "b)", "a.", "b.")
_DANGLING_ENDINGS = (" to", " and", " or", " because", " but")
_MIN_PARAPHRASE_CHARS = 60

# Stop after this many consecutive failed API calls instead of retrying
# forever (e.g. with an invalid key or no network).
_MAX_API_FAILURES = 10


def _passes_format_rules(cleaned):
    """Return True if a normalised paraphrase passes every format filter."""
    lowered = cleaned.lower()
    if lowered.lstrip().startswith(_BANNED_OPENERS):
        return False
    if "^p" in lowered:
        return False
    if lowered.startswith(_NUMBERED_OPENERS):
        return False
    if "(a)" in lowered or "(b)" in lowered or lowered.startswith(_LETTERED_OPENERS):
        return False
    # A trailing conjunction or preposition suggests a cut-off sentence.
    if cleaned.endswith(_DANGLING_ENDINGS):
        return False
    return len(cleaned) >= _MIN_PARAPHRASE_CHARS


api_fail_streak = 0

# Keep generating until every score has received its quota of paraphrases.
while sum(accepted_para.values()) < int(generateScore.sum()):

    # Scores that still need paraphrases, with how many each one needs.
    remaining = {s: int(generateScore[s] - accepted_para[s]) for s in [0, 1, 2, 3]}
    remaining = {s: r for s, r in remaining.items() if r > 0}

    # Nothing left to do.
    if not remaining:
        break

    # Work on the score with the biggest gap to fill.
    score = max(remaining, key=remaining.get)

    # Take the next anchors for that score, cycling through its originals so
    # the same anchors are not repeated every time.
    k = min(batch_size, len(score_anchors[score]))
    if k == 0:
        # With no anchors this score can never progress; looping would only
        # send empty requests forever.
        print(f"No original anchors for score {score}; cannot generate more. Stopping.")
        break
    anchors = next_anchor(score, k)

    # Build the JSON-based prompt and ask Claude for a JSON array back.
    prompt = dynamic_prompt(anchors)
    result = claude_call(prompt)

    if result is None:
        api_fail_streak += 1
        print("API failed. Retrying...")
        if api_fail_streak >= _MAX_API_FAILURES:
            print("Too many consecutive API failures. Stopping for safety.")
            break
        time.sleep(6)
        continue

    api_fail_streak = 0

    # Convert Claude's reply into a list of strings.
    paraphrases = parse_json_array(result)

    if paraphrases is None:
        bad_json_streak += 1
        print("Bad JSON from Claude. Retrying. (streak:", bad_json_streak, ")")
        print(result[:250])
        time.sleep(4)

        if bad_json_streak >= 10:
            print("Too many bad JSON responses. Stopping for safety.")
            break
        continue

    bad_json_streak = 0

    # Keys accepted from this response; blocks duplicates within one reply.
    seen_batch = set()
    added = 0

    for para in paraphrases:
        if accepted_para[score] >= int(generateScore[score]):
            break
        if para is None:
            continue

        # Normalise the text and reject anything already known.
        ok, cleaned = new_phrase(score, para, seen_batch)
        if not ok:
            continue

        if not _passes_format_rules(cleaned):
            continue

        # Accept.
        new_rows.append({
            "EssayText": cleaned,
            "Score2": score,
            "type": "paraphrased"
        })

        # Record the key in the same form new_phrase() looks up, i.e.
        # (score, lower-cased normalised text). Without the lower-casing an
        # accepted paraphrase would never be recognised as a duplicate later.
        key = (score, cleaned.lower())
        existing_data.add(key)
        seen_batch.add(key)

        accepted_para[score] += 1
        added += 1

    print_progress(extra_note=f"| Score {score} +{added}")
    time.sleep(1.0)

print("Generation done. New rows created:", len(new_rows))


Progress: 10/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 10} | Time elapsed=0.15 mins | Score 3 +10
Progress: 21/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 21} | Time elapsed=0.30 mins | Score 3 +11
Progress: 36/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 36} | Time elapsed=0.47 mins | Score 3 +15
Bad JSON from Claude. Retrying. (streak: 1 )
[
"Polymer plastic B demonstrates the greatest stretchability, as it has the longest length in both trials. To improve the experiment, the student could have specified the amount of weight added to each plastic sample, and repeated the trials more th
Progress: 50/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 50} | Time elapsed=0.90 mins | Score 3 +14
Progress: 72/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 72} | Time elapsed=1.09 mins | Score 3 +22
Progress: 82/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 82} | Time elapsed=1.32 mins | Score 3 +10
Progress: 87/4902 | accepted={0: 0, 1: 0, 2: 0, 3: 87} | Time elapsed=1.45 mins | Score 3 +5
Progress: 101/4902 | accepted={0: 0, 1: 0, 2: 0

In [None]:
# Append the accepted paraphrases to the loaded dataset.
file_new = pd.DataFrame(new_rows)
combined = pd.concat([file, file_new], ignore_index=True)

# Apply the same text and type normalisation to the combined frame.
combined["EssayText"] = combined["EssayText"].map(clean_rows)
combined["type"] = combined["type"].fillna("").astype(str).str.strip().str.lower()

# Split into non-paraphrased rows and paraphrased rows.
is_para = combined["type"] == "paraphrased"
orig = combined[~is_para].copy()
para = combined[is_para].copy()

# Paraphrase-paraphrase duplicates: within a score, keep only the first row
# for each normalised text. "_norm" is a temporary helper column.
para["_norm"] = para["EssayText"].map(clean_rows)
para = para.drop_duplicates(subset=["Score2", "_norm"], keep="first").copy()

# Paraphrase-original duplicates: drop paraphrases whose (score, normalised
# text) matches a non-paraphrased row.
orig_keys = set(zip(orig["Score2"], orig["EssayText"].map(clean_rows)))
clashes_with_original = pd.Series(
    [key in orig_keys for key in zip(para["Score2"], para["_norm"])],
    index=para.index,
    dtype=bool,
)
para = para[~clashes_with_original].copy()
para = para.drop(columns=["_norm"])

# Non-paraphrased rows first, surviving paraphrases after.
final_22000 = pd.concat([orig, para], ignore_index=True)
counts = final_22000["Score2"].value_counts().sort_index()

print("Original rows: ", len(orig))
print("Total rows in dataset after augmenting to 22000: ", len(final_22000))
print(counts)

# Final check that no duplicates remain among the paraphrased rows
# (again using a temporary "_norm" column).
paraRows = final_22000[final_22000["type"] == "paraphrased"].copy()
paraRows["_norm"] = paraRows["EssayText"].map(clean_rows)

# Rows that belong to any duplicate group, in total and per score.
in_dupe_group = paraRows.duplicated(subset=["Score2", "_norm"], keep=False)
dupe_para = in_dupe_group.sum()
print("Paraphrased duplicate rows involved:", dupe_para)

dupes_per_score = in_dupe_group.groupby(paraRows["Score2"]).sum()
print("Paraphrased duplicate rows per score:")
print(dupes_per_score)

# Number of extra copies, i.e. duplicates beyond the first occurrence.
dupes = paraRows.duplicated(subset=["Score2", "_norm"]).sum()

print("Dupes per score (paraphrased only):", dupes)
print("Counts:", counts.to_dict(), "Total:", len(final_22000))

# Remove the temporary column from paraRows (final_22000 never had it).
paraRows = paraRows.drop(columns=["_norm"])


Original rows:  13765
Total rows in dataset after augmenting to 22000:  22127
Score2
0    5548
1    5553
2    5526
3    5500
Name: count, dtype: int64
Paraphrased duplicate rows involved: 0
Paraphrased duplicate rows per score:
Score2
0    0
1    0
2    0
3    0
dtype: int64
Dupes per score (paraphrased only): 0
Counts: {0: 5548, 1: 5553, 2: 5526, 3: 5500} Total: 22127


In [None]:
# Output location on Drive for the augmented dataset.
save = "/content/drive/MyDrive/Data Augmentation/Claude/claude_22000"
outfile = f"{save}/complete_augment_22000.csv"

# Export only the three dataset columns; helper columns such as
# EssayText_norm are dropped here if present.
export_columns = ["EssayText", "Score2", "type"]
final_22000 = final_22000[export_columns].copy()

final_22000.to_csv(outfile, index=False)
print("Saved:", outfile)

Saved: /content/drive/MyDrive/Data Augmentation/Claude/claude_22000/complete_augment_22000.csv


In [None]:
# OPTIONAL STEP - not required.
# The augmented dataset may be slightly larger than 22000 as long as the
# scores stay close to balanced. This cell trims it to exactly `target`
# rows per score.

target = 5500

# Non-paraphrased rows are kept exactly as they are; paraphrased rows are
# trimmed per score in their existing order.
is_para_row = final_22000["type"] == "paraphrased"
original = final_22000[~is_para_row].copy()
paraphrased = final_22000[is_para_row].copy()

kept_paras = []
for s in [0, 1, 2, 3]:
    orig_count = (original["Score2"] == s).sum()
    para_needed = target - orig_count

    if para_needed < 0:
        raise ValueError(f"Score {s} has more than {target} originals")

    # Take only the first `para_needed` paraphrases for this score.
    para_s = paraphrased[paraphrased["Score2"] == s]
    kept_paras.append(para_s.head(para_needed))

# Trimmed paraphrases, still in their order of appearance within each score.
paraphrased_trimmed = pd.concat(kept_paras)

# Final order: non-paraphrased rows first, paraphrases after.
final_exact_22000 = pd.concat([original, paraphrased_trimmed], ignore_index=True)

print("Final counts:")
print(final_exact_22000["Score2"].value_counts().sort_index())
print("Total rows:", len(final_exact_22000))




Final counts:
Score2
0    5500
1    5500
2    5500
3    5500
Name: count, dtype: int64
Total rows: 22000


In [None]:
# Output location on Drive for the trimmed dataset.
savepath = "/content/drive/MyDrive/Data Augmentation/Claude/claude_22000"
outpath = f"{savepath}/final_exact_22000.csv"

# Export only the three dataset columns; helper columns such as
# EssayText_norm are dropped here if present.
exact_columns = ["EssayText", "Score2", "type"]
final_exact_22000 = final_exact_22000[exact_columns].copy()

final_exact_22000.to_csv(outpath, index=False)

# Confirm what was written: path, row count and per-score counts.
exact_score_counts = final_exact_22000["Score2"].value_counts().sort_index()
print("Saved:", outpath)
print("Rows:", len(final_exact_22000))
print(exact_score_counts)


Saved: /content/drive/MyDrive/Data Augmentation/Claude/claude_22000/final_exact_22000.csv
Rows: 22000
Score2
0    5500
1    5500
2    5500
3    5500
Name: count, dtype: int64


In [None]:
# Reload a saved dataset and count exact-text duplicates for one score.
# NOTE(review): final_claude_22000.csv is not written by any cell visible in
# this notebook - confirm where it comes from.
file = pd.read_csv("/content/drive/MyDrive/Data Augmentation/Claude/claude_22000/final_claude_22000.csv")

# Rows with score 0. NOTE(review): not used by the rest of this cell, and the
# name suggests score 3.
fd_3 = file.loc[file["Score2"].isin([0])]

# Score under inspection.
s = 2
fd = file.loc[file["Score2"] == s].copy()

# Count every row whose EssayText occurs more than once within this score.
is_repeated = fd.duplicated(subset=["EssayText"], keep=False)
dupes = is_repeated.sum()
print(f"Exact duplicates for score {s}: {dupes}")

Exact duplicates for score 2: 52


In [None]:
# Reload the dataset variant whose file name is marked "(HASDUPES)" and
# report its size and per-score counts.
file = pd.read_csv("/content/drive/MyDrive/Data Augmentation/Claude/claude_22000/augment_22000_(HASDUPES).csv")

row_total = len(file)
score_counts = file["Score2"].value_counts().sort_index()

print("Saved: ", row_total)
print(score_counts)

Saved:  22000
Score2
0    5500
1    5500
2    5500
3    5500
Name: count, dtype: int64
