<a href="https://colab.research.google.com/github/Ayushee-Seeburrun/ASAG-with-Data-Augmentation/blob/main/datasetSplit_and_balancing_score2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; later cells read and write the
# dataset split CSVs under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os, requests, json, time, random
from google.colab import userdata
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
from google.colab import files

# files.upload() opens a browser file picker and returns a dict mapping each
# uploaded file name to its raw bytes. Binding the result to a name stops the
# whole file content from being echoed as the cell's output; only the names
# are printed.
# NOTE(review): the next cell reads /content/train.tsv. If a file with that name
# already exists, Colab may save the upload under a different name - check the
# printed names before continuing.
uploaded = files.upload()
print("Uploaded files:", list(uploaded))

Saving train.tsv to train.tsv




In [None]:
#  --------- SPLITTING THE DATASET INTO TRAINING (80%) AND TESTING (20%) -------------

In [None]:
# Load the uploaded TSV and keep only the answer text and its Score2 label,
# discarding any row where either of the two fields is missing.
df = (
  pd.read_csv("/content/train.tsv", sep="\t")
  .loc[:, ["EssayText", "Score2"]]
  .dropna()
)

print("Total rows: ", df.shape[0])
print("Score2 classes:", sorted(df["Score2"].unique()))

Total rows:  17207
Score2 classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3)]


In [None]:
# Stratified 80/20 split: stratifying on Score2 keeps the proportion of each
# score the same in the training and evaluation parts, and random_state pins
# the shuffle so the split is repeatable. Each part gets a fresh 0..n-1 index.
train_set, eval_set = (
  part.reset_index(drop=True)
  for part in train_test_split(df, test_size=0.20, random_state=42, stratify=df["Score2"])
)

print("Trainig dataset rows: ", train_set.shape[0])
print("Evaluation dataset rows: ", eval_set.shape[0])

Trainig dataset rows:  13765
Evaluation dataset rows:  3442


In [None]:
# Persist both splits on Google Drive so later sessions can reuse the exact same
# partition. The evaluation split is stored under the name test_set.csv.
driveloc = "/content/drive/MyDrive"
datasetsplit = os.path.join(driveloc, "Data Augmentation/Dataset Splits")

# Create the target folder (and any missing parents); no error if it already exists
os.makedirs(datasetsplit, exist_ok=True)

train_set.to_csv(os.path.join(datasetsplit, "train_set.csv"), index=False)
eval_set.to_csv(os.path.join(datasetsplit, "test_set.csv"), index=False)

print("saving the split dataset to: ", datasetsplit)

saving the split dataset to:  /content/drive/MyDrive/Data Augmentation/Dataset Splits


In [None]:
# List the split folder on Drive to confirm which CSV files it contains.
!ls "/content/drive/MyDrive/Data Augmentation/Dataset Splits"

'Copy of train_set (1).csv'   test_set.csv
'Copy of train_set.csv'       train_set.csv


In [None]:
# Reload the training split from Drive so the augmentation below works from the
# saved file rather than from the in-memory split.
splitpath = "/content/drive/MyDrive/Data Augmentation/Dataset Splits"
train = pd.read_csv(os.path.join(splitpath, "train_set.csv"))

print("Total rows in training set: ", train.shape[0])

# Number of rows for each score (0, 1, 2, 3), ordered by score
print(train.loc[:, "Score2"].value_counts().sort_index())

Total rows in training set:  13765
Score2
0    5424
1    4518
2    3225
3     598
Name: count, dtype: int64


In [None]:
# Independent, re-indexed copy of the training rows whose Score2 is 2; these are
# the answers the augmentation step samples from.
df2 = train.loc[train["Score2"].eq(2)].copy().reset_index(drop=True)
print("Total rows: ", train.shape[0])
print("rows with score 2: ", df2.shape[0])

Total rows:  13765
rows with score 2:  3225


In [None]:
# Augmentation budget: number of score-2 paraphrases to generate, and how many
# answers are sent to the model per request.
batch_size = 25
add_s2 = 1000

# Ceiling division (negated floor division): the minimum number of full-yield
# batches required to reach add_s2.
batch_needed = -(-add_s2 // batch_size)
print("Batches needed : ", batch_needed)

Batches needed :  40


In [None]:
# Fixed system prompt sent with every request. It holds only the static rules
# (no per-batch content), which is what lets claude_call mark it with
# cache_control for prompt caching.
static_sys = """
You are helping with ASAG data augmentation.
Task: Paraphrase student answers.
Rules:
-Keep the SAME meaning and the SAME level of correctness.
-Do not add or remove information.
-Do not output labels like "Student answer: " or "Paraphrased answer:" or any commentary.
-Return only valid JSON(no markdown).
-Output must be a JSON array of strings (each string is only the paraphrased answer), same order as input.
-Escape any quotes inside the strings with backslash.
"""

In [None]:
def build_prompt(textlist):
  """Build the user message for one paraphrasing batch.

  Args:
    textlist: list of student answers to paraphrase.

  Returns:
    The fixed instruction text followed by `textlist` serialised as a JSON
    array. Non-ASCII characters are kept as-is (ensure_ascii=False).
  """
  instructions = "Paraphrase the following student answers.\n Return only a valid JSON array of strings,same order.\n Do not add labels like 'student answer:' or 'paraphrased answer:'.\n"
  answers_json = json.dumps(textlist, ensure_ascii=False)
  return instructions + answers_json


In [None]:
# Read the Anthropic API key from the Colab secret named 'Claude_key' so the key
# itself is never written into the notebook.
claude_api_key = userdata.get('Claude_key')

In [None]:
def claude_call(prompt):
  """Send one prompt to the Anthropic Messages API and return the reply text.

  The static system prompt (`static_sys`) is sent as a cacheable block and
  `prompt` as the single user message.

  Args:
    prompt: the user message text (see build_prompt).

  Returns:
    The text of the first content block of the reply, or None when the call
    fails for any reason (network error, timeout, non-200 status, or a 200
    response whose body does not have the expected shape). A message
    describing the failure is printed in every None case.
  """
  try:
    resp = requests.post(
        "https://api.anthropic.com/v1/messages",
        headers={
            "x-api-key": claude_api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json"
        },
        json={
            "model": "claude-3-haiku-20240307",
            "max_tokens": 3000,
            # NOTE(review): prompt caching only applies above a per-model minimum
            # prompt length; this short system prompt may be below it - confirm
            # against the Anthropic prompt caching documentation.
            "system": [{
                "type": "text",
                "text": static_sys,
                "cache_control": {"type": "ephemeral"}
            }],
            "messages": [{
                "role": "user",
                "content": [{"type": "text", "text": prompt}]
            }]
        },
        # requests waits indefinitely by default; bound the wait so a stalled
        # connection cannot freeze the generation loop.
        timeout=120,
    )
  except requests.RequestException as err:
    # Connection errors and timeouts follow the same "return None" contract as
    # HTTP errors so the caller's retry path handles them.
    print("Claude API error: ", err)
    return None

  if resp.status_code != 200:
    print("Claude API error: ", resp.status_code)
    print(resp.text[:300])
    return None

  try:
    return resp.json()["content"][0]["text"]
  except (ValueError, KeyError, IndexError, TypeError):
    # 200 response that is not JSON or lacks content[0].text
    print("Claude API error: unexpected response body")
    print(resp.text[:300])
    return None

In [None]:
#converting the JSON text into a Python list of strings
def parse_json_array(text):
  """Parse model output into a list of strings.

  Args:
    text: raw reply text, expected to be a JSON array of strings. A reply
      wrapped in a markdown code fence (``` or ```json) is also accepted.

  Returns:
    The parsed list when `text` is a JSON array whose items are all strings;
    None for anything else (non-string input, invalid JSON, a JSON value that
    is not a list, or a list containing a non-string item).
  """
  if not isinstance(text, str):
    return None

  cleaned = text.strip()
  # Models sometimes wrap JSON in a code fence despite instructions; unwrap it.
  if cleaned.startswith("```"):
    cleaned = cleaned.strip("`").strip()
    if cleaned.lower().startswith("json"):
      cleaned = cleaned[4:].strip()

  try:
    arr = json.loads(cleaned)
  except ValueError:
    # json.JSONDecodeError is a ValueError: report invalid JSON the same way as
    # every other invalid shape instead of raising.
    return None

  #checks if claude returned a list
  if not isinstance(arr, list):
    return None
  #to check if every item in the list is a string
  if not all(isinstance(x, str) for x in arr):
    return None
  return arr

In [None]:
# Generate `add_s2` new score-2 answers by asking Claude to paraphrase randomly
# sampled real score-2 answers, batch by batch, until the target is reached.
paraphrased_rows = []
#using a set to prevent duplicates (covers original answers and accepted paraphrases)
existingdata = set(train["EssayText"].astype(str).tolist())
#retrieve all answers with score 2 and putting them in a list
s2answers = df2["EssayText"].astype(str).tolist()

target = add_s2
startTime = time.time()

# Stop instead of retrying forever when the API keeps failing (e.g. invalid key,
# exhausted quota). The counter resets after every successful batch.
max_failures = 10
failures = 0

# With no score-2 answers every batch would be empty and the loop would never end.
if target > 0 and not s2answers:
  raise ValueError("No score-2 answers available to paraphrase")

while len(paraphrased_rows) < target:
  #Pick the smaller between how many answers I want in one batch and how many Score-2 answers I actually have in my trainset
  k = min(batch_size, len(s2answers))
  #picking real student answers with score 2 to give to claude
  anchors = random.sample(s2answers, k)

  prompt = build_prompt(anchors)
  result = claude_call(prompt)

  if result is None:
    failures += 1
    if failures >= max_failures:
      raise RuntimeError(f"Stopping after {failures} consecutive failed batches")
    print("API failed. Retrying")
    time.sleep(2)
    continue

  # `except Exception` (not a bare except) so that interrupting the cell
  # (KeyboardInterrupt) still stops the loop.
  try:
    paraphrases = parse_json_array(result)
  except Exception:
    paraphrases = None

  if paraphrases is None:
    failures += 1
    if failures >= max_failures:
      raise RuntimeError(f"Stopping after {failures} consecutive failed batches")
    print("Bad JSON from Claude. Retrying")
    print(result[:300])
    time.sleep(2)
    continue

  failures = 0

  added = 0
  for para in paraphrases:
    para = para.strip()

    #rejecting short answers
    if len(para) < 5:
      continue
    #rejecting dupes
    if para in existingdata:
      continue

    paraphrased_rows.append({
        "EssayText": para,
        "Score2": 2,
        "type": "paraphrased"
    })

    existingdata.add(para)
    added += 1

    # do not overshoot the target inside the last batch
    if len(paraphrased_rows) >= target:
      break

  elapsed = time.time() - startTime
  print(f"Added {added} in this batch | Total: {len(paraphrased_rows)}/{target} | Time elapsed: {elapsed/60:.2f}mins" )

  # short pause between successful requests
  time.sleep(0.5)

Added 22 in this batch | Total: 22/1000 | Time elapsed: 0.14mins
Added 25 in this batch | Total: 47/1000 | Time elapsed: 0.30mins
Added 25 in this batch | Total: 72/1000 | Time elapsed: 0.42mins
Added 24 in this batch | Total: 96/1000 | Time elapsed: 0.56mins
Added 24 in this batch | Total: 120/1000 | Time elapsed: 0.70mins
Added 24 in this batch | Total: 144/1000 | Time elapsed: 0.86mins
Added 25 in this batch | Total: 169/1000 | Time elapsed: 1.04mins
Added 16 in this batch | Total: 185/1000 | Time elapsed: 1.14mins
Added 16 in this batch | Total: 201/1000 | Time elapsed: 1.24mins
Added 24 in this batch | Total: 225/1000 | Time elapsed: 1.48mins
Added 20 in this batch | Total: 245/1000 | Time elapsed: 1.61mins
Added 25 in this batch | Total: 270/1000 | Time elapsed: 1.78mins
Added 24 in this batch | Total: 294/1000 | Time elapsed: 1.94mins
Added 20 in this batch | Total: 314/1000 | Time elapsed: 2.17mins
Added 22 in this batch | Total: 336/1000 | Time elapsed: 2.32mins
Added 25 in th

In [None]:
# Build a DataFrame from the accepted paraphrase records (one dict per row with
# the keys EssayText, Score2 and type), then preview the first 20 rows.
generated_s2 = pd.DataFrame(data=paraphrased_rows)
print("Generated rows are: ", generated_s2.shape[0])
generated_s2.iloc[:20]

In [None]:
# Append the generated paraphrases below the original training rows and
# renumber the index, then compare the per-score counts before and after.
# NOTE(review): only the generated rows carry a "type" value; rows coming from
# `train` will have it missing after the concat - confirm downstream code expects that.
train_aug_s2 = pd.concat(objs=[train, generated_s2], ignore_index=True)
print("Before: ", train.loc[:, "Score2"].value_counts().sort_index())
print("After: ", train_aug_s2.loc[:, "Score2"].value_counts().sort_index())

Before:  Score2
0    5424
1    4518
2    3225
3     598
Name: count, dtype: int64
After:  Score2
0    5424
1    4518
2    4225
3     598
Name: count, dtype: int64


In [None]:
# Name the output after the configured augmentation size (add_s2) instead of a
# hard-coded "1000", so the file name cannot drift from the number of rows that
# were requested. With add_s2 = 1000 the path is unchanged.
savedrive = f"{splitpath}/trainset_score2_{add_s2}_augmented.csv"
train_aug_s2.to_csv(savedrive, index=False)


In [None]:
# Report the training-set row count before and after adding the score-2 paraphrases.
print(f"Total rows in the dataset after augmenting score 2 moved from {len(train)} to {len(train_aug_s2)}")

Total rows in the dataset after augmenting score 2 moved from 13765 to 14765
