<a href="https://colab.research.google.com/github/Ayushee-Seeburrun/ASAG-with-Data-Augmentation/blob/main/claude_27000.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os,json,time, random
import pandas as pd
import requests
from google.colab import userdata

In [None]:
# Load the previously augmented dataset (the 22000-row stage) that this notebook extends.
readpath = "/content/drive/MyDrive/Data Augmentation/Claude/claude_22000"
file = pd.read_csv(readpath + "/complete_augment_22000.csv")


In [None]:
import re, unicodedata
def clean_rows(s):
    """Return a tidied, single-line version of an answer.

    The value is converted to ``str``, NFKC-normalised, stripped of the
    literal "^p"/"^P" markers (each replaced by a space), and has every run
    of whitespace collapsed to one space with the ends trimmed.
    """
    def _squash(value):
        # Collapse any whitespace run to a single space and trim both ends.
        return re.sub(r"\s+", " ", value).strip()

    text = unicodedata.normalize("NFKC", str(s)).replace("\u00A0", " ")
    text = _squash(text)

    # "^p" / "^P" markers appear in a few of the original records.
    for marker in ("^p", "^P"):
        text = text.replace(marker, " ")

    return _squash(text)

# Tidy the answer text in place.
file["EssayText"] = file["EssayText"].apply(clean_rows)

# Missing type labels become "", and every label is trimmed and lower-cased.
type_labels = file["type"].fillna("").astype(str)
file["type"] = type_labels.str.strip().str.lower()

# Lower-cased, cleaned copy of each answer; used as the key for duplicate detection so that
# the same sentence with different casing/spacing is not treated as two different answers.
file["EssayText_normalise"] = file["EssayText"].map(clean_rows).map(str.lower)

In [None]:
# Diagnostic only: work on a copy of `file` carrying a lower-cased, cleaned key column.
df = file.assign(_norm=file["EssayText"].map(clean_rows).map(str.lower))

# Flag every row whose (score, normalised text) pair occurs more than once.
df["is_dup"] = df.duplicated(subset=["Score2", "_norm"], keep=False)

# Count the flagged rows per `type` value to see where the duplicates come from.
dup_rows = df[df["is_dup"]]
dup_source_counts = dup_rows.groupby("type").size()

print(dup_source_counts)

type
               268
paraphrased     11
dtype: int64


In [None]:
# Equal target per score so that no score is favoured.
total = 27000
genperscore = total // 4

# Unique answers per score: rows sharing the same score and normalised text count once.
deduped = file.drop_duplicates(subset=["Score2", "EssayText_normalise"])
uniqueScoreCount = (
    deduped.groupby("Score2")
    .size()
    .reindex([0,1,2,3], fill_value=0)   # every score appears, with 0 when it has no rows
)

# Paraphrases still needed per score (never negative).
shortfall = genperscore - uniqueScoreCount
generateScore = shortfall.clip(lower=0).astype(int)

print("The current totalrows per score: ", file["Score2"].value_counts().sort_index().to_dict())
print("Current unique rows per score: ", uniqueScoreCount.to_dict())
print("The number of unique paraphrases to be added per score: ", generateScore.to_dict())

print("Sanity check: \n")
for s in (0, 1, 2, 3):
  have = uniqueScoreCount[s]
  need = generateScore[s]
  print(s, have, "+", need, "=", have + need)


The current totalrows per score:  {0: 5548, 1: 5553, 2: 5526, 3: 5500}
Current unique rows per score:  {0: 5491, 1: 5500, 2: 5497, 3: 5496}
The number of unique paraphrases to be added per score:  {0: 1259, 1: 1250, 2: 1253, 3: 1254}
Sanity check: 

0 5491 + 1259 = 6750
1 5500 + 1250 = 6750
2 5497 + 1253 = 6750
3 5496 + 1254 = 6750


In [None]:
# Anchor pool source: every row not labelled "paraphrased", so Claude is never asked to
# paraphrase its own earlier output.
is_original = file["type"] != "paraphrased"
original = file.loc[is_original].copy()
original["EssayText"] = original["EssayText"].apply(clean_rows)


#detecting anchors that are incomplete, weak and not good learning materials before sending them as anchors
weak_resp = re.compile(
    # "I don't know"-style phrases. The apostrophe is optional and may be straight or
    # curly (U+2019), so "dont", "don't" and "don’t" (likewise "cant"/"can't") all match.
    r"\b(i\s*don['\u2019]?t\s*know|i\s*do\s*not\s*know|idk|not\s*sure|no\s*idea|unsure|can['\u2019]?t\s*remember)\b",
    re.IGNORECASE
)

def is_incomplete_weak(text: str) -> bool:
  """Return True when an answer is unsuitable as a paraphrasing anchor.

  An answer is rejected when, after trimming, it
    * is shorter than 40 characters (this also covers marker-only answers
      such as "^p", "a)" or "1."),
    * ends on a dangling connective (" to", " and", " because", ...), or
    * contains one of the uncertainty phrases matched by ``weak_resp``.
  """
  t = text.strip()

  if len(t) < 40:
    return True

  bad_ends = (" to", " and", " or", " because"," but", " so", " when", " if", " which" )
  if t.lower().endswith(bad_ends):
    return True

  # Any "don't know" / "not sure" style phrase anywhere in the answer rejects it.
  return weak_resp.search(t) is not None


# Build one anchor list per score: drop weak/incomplete answers, clean the rest, and keep
# only the first occurrence of each answer (compared case-insensitively after cleaning).
score_anchors = {}
for s in [0, 1, 2, 3]:
  score_anchors[s] = original.loc[original["Score2"] == s, "EssayText"].tolist()

for s in [0, 1, 2, 3]:
  candidates = score_anchors[s]

  first_by_key = {}
  for answer in candidates:
    if is_incomplete_weak(answer):
      continue
    cleaned = clean_rows(answer)
    # setdefault keeps the first answer seen for a key; dicts preserve insertion order.
    first_by_key.setdefault(clean_rows(cleaned).lower(), cleaned)

  score_anchors[s] = list(first_by_key.values())
  print(f"Score {s} anchors filtered: {len(candidates)} ==> {len(score_anchors[s])}")

# Randomise the order in which anchors will be handed out.
for s in [0,1,2,3]:
   random.shuffle(score_anchors[s])


Score 0 anchors filtered: 5424 ==> 4667
Score 1 anchors filtered: 4518 ==> 4418
Score 2 anchors filtered: 3225 ==> 3184
Score 3 anchors filtered: 598 ==> 592


In [None]:
# Every (score, normalised text) pair already present in the dataset.
existing_data = set(zip(file["Score2"], file["EssayText_normalise"]))

def new_phrase(score, text, seen_batch) :
  """Clean a Claude paraphrase and report whether it is new for this score.

  Returns a ``(is_new, cleaned_text)`` tuple. ``is_new`` is False when the
  same score and lower-cased cleaned text is already in ``existing_data``
  or in ``seen_batch`` (the keys accepted from the current response).
  """
  cleaned = clean_rows(text)
  key = (score, cleaned.lower())
  already_known = key in existing_data or key in seen_batch
  return (not already_known), cleaned

In [None]:
# Openers that read as commentary about the answer rather than the answer itself,
# e.g. "To replicate", "Based on", "The student".
_bad_start_phrases = (
    r'the student\b',
    r'the student\'s\b',
    r'this answer\b',
    r'the answer\b',
    r'based on\b',
    r'to replicate\b',
    r'overall\b',
    r'in conclusion\b',
    r'according to\b',
)

# Leading whitespace and opening quotes/brackets are skipped before the phrase is matched.
bad_start = re.compile(
    r'^\s*["\'(\[]*\s*(' + "|".join(_bad_start_phrases) + r')',
    re.IGNORECASE
)

def has_badstart(t: str) -> bool:
  """Return True when ``t`` opens with one of the blocked commentary phrases."""
  return bool(bad_start.search(t))

def sentence_cutoff(t:str) -> bool:
  """Return True when ``t`` looks like a sentence that was cut off.

  A text counts as cut off when, after trimming, it
    * is empty,
    * ends on a dangling connective (" to", " and", " because", ...), or
    * is shorter than 40 characters and does not end in ".", "?" or "!".
  """
  t = t.strip()

  # Nothing left after trimming: treat as cut off. Indexing t[-1] here would raise IndexError.
  if not t:
    return True

  bad_endings = (" to", " and", " or", " because"," but", " so", " when", " if", " which" )
  if t.lower().endswith(bad_endings):
    return True

  return len(t) < 40 and not t.endswith((".", "?", "!"))
def valid_paraphrase(text:str) -> bool:
  """Return True when a paraphrase passes every formatting and quality check.

  The text is cleaned first. It is rejected when it contains "^p", starts
  with a blocked commentary phrase, looks cut off, is shorter than 60
  characters, starts with a numbered ("1.") or lettered ("a)", "B.") list
  marker, contains "(a)" or "(b)", or contains an uncertainty phrase.
  Checks run in the order listed and stop at the first failure.
  """
  candidate = clean_rows(text)
  lowered = candidate.lower()

  rejected = (
      "^p" in lowered
      or has_badstart(candidate)
      or sentence_cutoff(candidate)
      or len(candidate) < 60
      or re.match(r"^\s*\d+\.", candidate) is not None
      or re.match(r"^\s*[a-dA-D][\)\.]\s+", candidate) is not None
      or "(a)" in lowered
      or "(b)" in lowered
      or weak_resp.search(lowered) is not None
  )
  return not rejected

In [None]:
# Fixed system prompt shared by every request. It is kept separate from the per-batch user
# prompt so it can be sent as a cacheable system block (see the cache_control marker in claude_call).
# NOTE(review): Anthropic documents a minimum prompt length for caching (2048 tokens for
# Claude 3 Haiku at the time of writing); this prompt looks far shorter, so the cache marker
# may have no effect. Confirm against the current prompt-caching documentation.
static_sys = """
You are helping with ASAG data augmentation.
Task: Paraphrase student answers.
Rules:
-Keep the SAME meaning and the SAME level of correctness.
-Do not add or remove information.
-Do not reuse any 4 consecutive words from the original answer.
-Do not output labels like "Student answer: " or "Paraphrased answer:" or any commentary.
-Return only valid JSON(no markdown).
-Output must be a JSON array of strings (each string is only the paraphrased answer), same order as input.
-Escape any quotes inside the strings with backslash.
-Write in a more natural tone, like a student answering an exam question.
-Write strictly in English.
-Avoid repetitive sentence starters. Vary the opening sentences each time.
- Do NOT begin answers with phrases such as: "To replicate the experiment", "Based on the data", "The student's data indicates", "To replicate".
- DO NOT refer to the response as a student's work (no "the student", no "the student's answer").
-Do NOT use lettered formatting like "a)" "b)".
-DO NOT output symbols like "^p".
-Do NOT use numbered lists (e.g., "1.", "2.", "3.") unless the original answer is a list.
"""

In [None]:
def dynamic_prompt(textlist):
  """Build the per-batch user prompt: fixed instructions followed by the answers as a JSON array.

  ``ensure_ascii=False`` keeps non-ASCII characters readable instead of
  escaping them as \\uXXXX sequences.
  """
  instructions = (
      "Paraphrase the following student answers.\n"
      " Return only a valid JSON array of strings,same order.\n"
      " Do not add labels like 'student answer:' or 'paraphrased answer:'.\n"
  )
  payload = json.dumps(textlist, ensure_ascii=False)
  return instructions + payload


In [None]:
# Read the Anthropic API key from Colab's user secrets (entry named 'Claude_key') so it is never hardcoded in the notebook.
claude_api_key = userdata.get('Claude_key')

In [None]:
def claude_call(prompt, max_retries=10):
  """Send one user prompt to the Anthropic Messages API and return the reply text.

  Args:
    prompt: Text of the single user message.
    max_retries: Maximum number of attempts before giving up.

  Returns:
    The text of the first content block on success, or None when the request
    fails with a non-retryable status, the 200 response body does not have
    the expected shape, or all attempts are used up.

  Network-level failures (timeouts, connection errors) and the statuses
  429/500/502/503/529 are retried with exponential backoff plus jitter.
  This function catches those itself, so they never propagate to the caller.
  """
  retryable_statuses = (429, 500, 502, 503, 529)

  for attempt in range(max_retries):
    try:
      resp = requests.post(
          "https://api.anthropic.com/v1/messages",
          headers={
              "x-api-key": claude_api_key,
              "anthropic-version": "2023-06-01",
              "content-type": "application/json"
          },
          json={
              "model": "claude-3-haiku-20240307",
              "max_tokens": 2500,
              "system": [{
                  "type": "text",
                  "text": static_sys,
                  "cache_control": {"type": "ephemeral"}
              }],
              "messages": [{
                  "role": "user",
                  "content": [{"type": "text", "text": prompt}]
              }]
          },
          timeout=90
      )
    except requests.exceptions.RequestException as err:
      # Timeouts and connection drops are transient: back off and retry rather than
      # letting the exception abort the whole generation run.
      wait = min(90, (2 ** attempt)) + random.uniform(0.5, 3.0)
      print(f"Claude request failed ({type(err).__name__}). Backing off {wait:.1f}s (attempt {attempt+1}/{max_retries})")
      time.sleep(wait)
      continue

    if resp.status_code == 200:
      try:
        return resp.json()["content"][0]["text"]
      except (ValueError, KeyError, IndexError, TypeError):
        # Body was not JSON or did not contain a text content block.
        print("Claude API: unexpected response body")
        print(resp.text[:300])
        return None

    if resp.status_code in retryable_statuses:
      # Exponential backoff capped at 90s, with jitter so retries do not align.
      wait = min(90, (2 ** attempt)) + random.uniform(0.5, 3.0)
      print(f"Claude busy ({resp.status_code}). Backing off {wait:.1f}s (attempt {attempt+1}/{max_retries})")
      time.sleep(wait)
      continue

    # Any other status (e.g. authentication or request errors) will not fix itself on retry.
    print("Claude API error: ", resp.status_code)
    print(resp.text[:300])
    return None

  print("Claude API: exceeded retries; skipping this batch.")
  return None

In [None]:
def parse_json_array(text):
    """Parse a model reply that should be a JSON array of strings.

    Returns the list when ``text`` is a string that (after trimming) starts
    with "[", ends with "]", is valid JSON, and decodes to a list containing
    only strings. Returns None in every other case, including when ``text``
    is None or not a string.
    """
    if not isinstance(text, str):
        return None

    t = text.strip()
    # Quick sanity check: the reply must start with [ and end with ].
    if not (t.startswith("[") and t.endswith("]")):
        return None

    try:
        data = json.loads(t)
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError. Only parse failures are swallowed here,
        # so KeyboardInterrupt and unrelated bugs still surface.
        return None

    if isinstance(data, list) and all(isinstance(x, str) for x in data):
        return data
    return None

In [None]:
# Per-score cursor into the anchor list; it only ever grows and is wrapped with modulo on use.
anchor_in = dict.fromkeys([0,1,2,3], 0)

def next_anchor(score,k):
  """Return the next ``k`` anchors for ``score``, wrapping around the pool.

  Advances the score's cursor in ``anchor_in`` by the number of anchors
  returned. An empty pool yields an empty list and leaves the cursor alone.
  """
  pool = score_anchors[score]
  if not pool:
    return []

  start = anchor_in[score]
  picked = [pool[(start + offset) % len(pool)] for offset in range(k)]
  anchor_in[score] = start + len(picked)
  return picked


In [None]:
# Maximum number of anchor answers sent to Claude in one request.
batch_size = 30
# NOTE(review): `checkpt` is not referenced anywhere else in the visible cells, so no
# intermediate checkpoint is written while generating — confirm whether one was intended.
checkpt = 200

# Number of paraphrases accepted so far, keyed by score.
accepted_para = {0:0, 1:0, 2:0, 3:0}
# Accepted paraphrases as row dicts (EssayText, Score2, type), turned into a DataFrame later.
new_rows = []

# Count of consecutive responses that failed JSON parsing; the loop stops when it gets too high.
bad_json_streak = 0
# Wall-clock start, used for the elapsed-time figure in progress messages.
startTime = time.time()

def print_progress(extra_note=""):
    """Print one progress line: accepted/needed totals, per-score counts and minutes elapsed.

    ``extra_note`` is appended verbatim at the end of the line.
    """
    minutes = (time.time() - startTime) / 60
    done = sum(accepted_para.values())
    target = int(generateScore.sum())

    line = (
        f"Progress: {done}/{target} | "
        f"accepted={accepted_para} | Time elapsed={minutes:.2f} mins {extra_note}"
    )
    print(line)

# Keep generating paraphrases until every score has reached its target.
#
# claude_call returns None both for non-retryable errors (e.g. a bad API key) and after
# it has used up its own retries. Retrying such a call forever would hang the notebook,
# so the loop gives up after this many consecutive failed calls.
max_api_failures = 5
api_fail_streak = 0

while sum(accepted_para.values()) < int(generateScore.sum()):

    remaining = {s: int(generateScore[s] - accepted_para[s]) for s in [0,1,2,3]}
    # Drop the scores that are already complete; only those still needing rows are kept.
    remaining = {s:r for s,r in remaining.items() if r > 0}

    # Every score is complete: leave the loop.
    if not remaining:
        break

    # Work on the score that is furthest from its target.
    score = max(remaining, key=remaining.get)

    # Take the next anchors by cycling through the originals, so the same anchors are not
    # repeated more often than necessary.
    k = min(batch_size, len(score_anchors[score]))

    if k == 0:
        # No anchors at all for this score: pretend it is done so the loop can terminate.
        # Note that this makes the progress counter overstate what was really generated.
        print(f"No usable anchors left for score {score}. Marking score as complete to avoid infinite loop.")
        accepted_para[score] = int(generateScore[score])
        continue

    anchors = next_anchor(score, k)

    # Anchor list -> JSON-based prompt.
    prompt = dynamic_prompt(anchors)
    # Send the request to Claude; the reply should be a JSON array of paraphrases.
    result = claude_call(prompt)

    if result is None:
        api_fail_streak += 1
        if api_fail_streak >= max_api_failures:
            print("Too many consecutive API failures. Stopping for safety.")
            break
        print("API failed. Retrying...")
        time.sleep(30)
        continue

    api_fail_streak = 0

    # Convert Claude's reply into a list of strings.
    paraphrases = parse_json_array(result)

    if paraphrases is None:
        bad_json_streak += 1
        print("Bad JSON from Claude. Retrying. (streak:", bad_json_streak, ")")
        print(result[:250])
        time.sleep(6)

        if bad_json_streak >= 10:
            print("Too many bad JSON responses. Stopping for safety.")
            break
        continue

    bad_json_streak = 0

    # Blocks duplicates within the same Claude response.
    seen_batch = set()
    added = 0

    for para in paraphrases:
        # Stop as soon as this score has reached its target.
        if accepted_para[score] >= int(generateScore[score]):
            break
        if para is None:
            continue

        # Clean the Claude output and check it against known answers.
        ok, cleaned = new_phrase(score, para, seen_batch)
        if not ok:
            continue

        # Strict validation (starters, ^p, formatting, cutoff, too short, weak phrases).
        if not valid_paraphrase(cleaned):
            continue

        # Accept.
        new_rows.append({
            "EssayText": cleaned,
            "Score2": score,
            "type": "paraphrased"
        })

        # Remember the key so later batches cannot introduce the same paraphrase again.
        n = (score, clean_rows(cleaned).lower())
        existing_data.add(n)
        seen_batch.add(n)

        accepted_para[score] += 1
        added += 1

    print_progress(extra_note=f"| Score {score} +{added}")
    time.sleep(1.0)

print("Generation done. New rows created:", len(new_rows))


Progress: 26/5016 | accepted={0: 26, 1: 0, 2: 0, 3: 0} | Time elapsed=0.11 mins | Score 0 +26
Claude busy (529). Backing off 3.1s (attempt 1/10)
Progress: 31/5016 | accepted={0: 26, 1: 0, 2: 0, 3: 5} | Time elapsed=0.33 mins | Score 3 +5
Progress: 57/5016 | accepted={0: 26, 1: 0, 2: 26, 3: 5} | Time elapsed=0.54 mins | Score 2 +26
Progress: 83/5016 | accepted={0: 26, 1: 26, 2: 26, 3: 5} | Time elapsed=0.69 mins | Score 1 +26
Progress: 91/5016 | accepted={0: 26, 1: 26, 2: 26, 3: 13} | Time elapsed=0.90 mins | Score 3 +8
Bad JSON from Claude. Retrying. (streak: 1 )
[
"The initial step involves mRNA attaching to the ribosome to hold it in place. Next, tRNA brings the complementary anticodons and they match with the base pairs. In the third step, another tRNA brings another anticodon and the amino acids at the en
Progress: 100/5016 | accepted={0: 26, 1: 26, 2: 26, 3: 22} | Time elapsed=1.28 mins | Score 3 +9
Progress: 126/5016 | accepted={0: 52, 1: 26, 2: 26, 3: 22} | Time elapsed=1.45 min

In [None]:
# Combine the loaded dataset with the newly accepted paraphrases and re-normalise the
# text and type columns, since the new rows have not been through that step.
file_new = pd.DataFrame(new_rows)

combined = pd.concat([file, file_new], ignore_index=True)

combined["EssayText"] = combined["EssayText"].map(clean_rows)
combined["type"] = combined["type"].fillna("").astype(str).str.strip().str.lower()

orig = combined[combined["type"] != "paraphrased"].copy()
para = combined[combined["type"] == "paraphrased"].copy()

# Temporary comparison key: cleaned, lower-cased answer text.
orig["_normalise"] = orig["EssayText"].map(clean_rows).map(str.lower)
para["_normalise"] = para["EssayText"].map(clean_rows).map(str.lower)

# Keep one paraphrase per (score, text) ...
para = para.drop_duplicates(subset=["Score2", "_normalise"], keep="first")

# ... and drop any paraphrase that matches an original answer with the same score.
# A plain membership mask is used instead of a row-wise apply so that the result is a
# boolean Series even when there are no paraphrased rows.
orig_keys = set(zip(orig["Score2"], orig["_normalise"]))
clashes_with_orig = pd.Series(
    [key in orig_keys for key in zip(para["Score2"], para["_normalise"])],
    index=para.index,
    dtype=bool,
)
para = para[~clashes_with_orig]

orig = orig.drop(columns=["_normalise"])
para = para.drop(columns=["_normalise"])

# NOTE: non-paraphrased rows are not de-duplicated here, so the total can be larger than
# the 27000 target, which was computed from unique answers.
final_27000 = pd.concat([orig, para], ignore_index=True)

print("Total rows in dataset after augmenting to 27000: ", len(final_27000))
print(final_27000["Score2"].value_counts().sort_index())

# Final check that no duplicate paraphrases remain.
paraRows = final_27000[final_27000["type"] == "paraphrased"].copy()

paraRows["_normalise"] = paraRows["EssayText"].map(clean_rows).map(str.lower)
dupe_para = paraRows.duplicated(subset=["Score2", "_normalise"], keep=False).sum()
print("Paraphrased duplicate rows involved:", dupe_para)

Total rows in dataset after augmenting to 22000:  27136
Score2
0    6807
1    6803
2    6776
3    6750
Name: count, dtype: int64
Paraphrased duplicate rows involved: 0


In [None]:
# Create the output folder for this stage on Drive; exist_ok avoids an error if it is already there.
folder = "/content/drive/MyDrive/Data Augmentation/Claude/claude_27000"
os.makedirs(folder,exist_ok=True)

In [None]:
# Write the final dataset to Drive, keeping only the three columns used downstream.
savepath = "/content/drive/MyDrive/Data Augmentation/Claude/claude_27000"
outpath = savepath + "/augment_claude_27000_final.csv"
final_27000 = final_27000.loc[:, ["EssayText", "Score2", "type"]].copy()
final_27000.to_csv(outpath, index=False)
print("Saved:", outpath)

Saved: /content/drive/MyDrive/Data Augmentation/Claude/claude_27000/augment_claude_27000_final.csv
