In [1]:
import pandas as pd
import requests
import time
from tqdm import tqdm
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

In [3]:
DATASET_PATH = '../dataset/split/not_guilty_split_1.csv'

AUGMENTED_PATH = './dataset/augmented/augmented_not_guilty_split_1.csv'

In [4]:
df = pd.read_csv(DATASET_PATH)

df

Unnamed: 0,Trial_ID,Date,Defendant_Gender,Num_Defendants,Victim_Gender,Num_Victims,Offence,Offence_Subcategory,Verdict,Text_Length,Year,Trial_Text
0,t18300218-8,1830-02-18,male,1,female,3,theft,burglary,notGuilty,11025,1830,james webber indict feloniously sacrialigiousl...
1,t18300218-19,1830-02-18,male,1,male,1,theft,pocketpicking,notGuilty,2552,1830,william jnes indict steal january handkerchief...
2,t18300218-27,1830-02-18,male,1,female,1,theft,simpleLarceny,notGuilty,4542,1830,william abercrmbie indict steal december handk...
3,t18300218-36,1830-02-18,female,1,male,1,theft,simpleLarceny,notGuilty,267,1830,agnes cnnell indict steal ebruary lb weight ha...
4,t18300218-53,1830-02-18,male,1,male,1,theft,simpleLarceny,notGuilty,2359,1830,jhn taylr indict steal january set gig harness...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,t17650116-22,1765-01-16,male,1,male,2,violentTheft,highwayRobbery,notGuilty,1361,1765,john baptist rosa indict king highway elizabet...
996,t17650116-23,1765-01-16,male,1,female,1,theft,burglary,notGuilty,2843,1765,joseph wiggins indict december hour one night ...
997,t17650116-27,1765-01-16,female,1,male,1,theft,grandLarceny,notGuilty,290,1765,judith barden spinster indict steal ten linen ...
998,t17650116-37,1765-01-16,male,3,female,3,theft,burglary,notGuilty,12817,1765,john robinson john rouson indict january hour ...


In [5]:
df.shape

(1000, 12)

In [6]:
def generate_paraphrases(text, num_variations=2):
    paraphrases = []
    
    for i in range(num_variations):
        try:
            prompt = f"""
            I need you to paraphrase the following trial text. 
            Keep all the facts but rewrite it, I'm using it for text dataset augmentation. 
            IMPORTANT: Your response must ONLY contain the rewritten text with no explanations, no markdown, no notes, and no meta-commentary.
            
            Original text: {text}
            
            Paraphrased version:
            """

            url = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.5-flash:generateContent?key=" + GEMINI_API_KEY
            headers = {"Content-Type": "application/json"}
            data = {
                "contents": [{"parts": [{"text": prompt}]}],
                "generationConfig": {
                    "temperature": 1,
                    "maxOutputTokens": 500
                }
            }
            
            response = requests.post(url, headers=headers, json=data)
            
            if response.status_code == 200:
                response_json = response.json()
                if "candidates" in response_json and response_json["candidates"]:
                    if "content" in response_json["candidates"][0] and "parts" in response_json["candidates"][0]["content"]:
                        if response_json["candidates"][0]["content"]["parts"]:
                            paraphrased_text = response_json["candidates"][0]["content"]["parts"][0]["text"]
                            paraphrases.append(paraphrased_text)
                        else:
                            print(f"Error: unexpected JSON response structure for variation {i+1}")
                            paraphrases.append(None)
                else:
                    print(f"Error: No candidates found in the response for variation {i+1}")
                    paraphrases.append(None)
            else:
                print(f"Error: Received status code {response.status_code} for variation {i+1}")
                paraphrases.append(None)
            
            time.sleep(3)
            
        except Exception as e:
            print(f"Error generating paraphrase {i+1}: {e}")
            paraphrases.append(None)
    
    return paraphrases 

In [7]:
import random

augmented_data = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    trial_id = row['Trial_ID']
    original_text = row['Trial_Text']
    
    num_variations = random.choice([1, 2])
    paraphrases = generate_paraphrases(original_text, num_variations=num_variations)
    
    augmented_data.append({
        'Trial_ID': trial_id,
        'Date': row['Date'],
        'Defendant_Gender': row['Defendant_Gender'],
        'Num_Defendants': row['Num_Defendants'],
        'Victim_Gender': row['Victim_Gender'],
        'Num_Victims': row['Num_Victims'],
        'Offence': row['Offence'],
        'Offence_Subcategory': row['Offence_Subcategory'],
        'Verdict': row['Verdict'],
        'Text_Length': row['Text_Length'],
        'Year': row['Year'],
        'Trial_Text': original_text,
    })
    
    for i, paraphrase in enumerate(paraphrases, 1):
        if paraphrase and "**" not in paraphrase:
            paraphrase = paraphrase.replace("\n", "")
            augmented_data.append({
                'Trial_ID': f"{trial_id}_p{i}",
                'Date': row['Date'],
                'Defendant_Gender': row['Defendant_Gender'],
                'Num_Defendants': row['Num_Defendants'],
                'Victim_Gender': row['Victim_Gender'],
                'Num_Victims': row['Num_Victims'],
                'Offence': row['Offence'],
                'Offence_Subcategory': row['Offence_Subcategory'],
                'Verdict': row['Verdict'],
                'Text_Length': len(paraphrase),
                'Year': row['Year'],
                'Trial_Text': paraphrase,
            })


  0%|          | 3/1000 [00:33<3:14:35, 11.71s/it]

Error: Received status code 429 for variation 2


  0%|          | 4/1000 [00:43<3:05:25, 11.17s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  0%|          | 5/1000 [00:50<2:40:10,  9.66s/it]

Error: Received status code 429 for variation 1


  1%|          | 6/1000 [00:53<2:04:51,  7.54s/it]

Error: Received status code 429 for variation 1


  1%|          | 7/1000 [00:57<1:42:53,  6.22s/it]

Error: Received status code 429 for variation 1


  1%|▏         | 13/1000 [01:42<1:59:06,  7.24s/it]

Error: Received status code 429 for variation 1


  1%|▏         | 14/1000 [01:45<1:39:57,  6.08s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  2%|▏         | 15/1000 [01:52<1:44:05,  6.34s/it]

Error: Received status code 429 for variation 1


  2%|▏         | 16/1000 [01:56<1:29:52,  5.48s/it]

Error: Received status code 429 for variation 1


  2%|▏         | 20/1000 [02:39<2:46:00, 10.16s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  2%|▏         | 21/1000 [02:46<2:30:10,  9.20s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  2%|▏         | 22/1000 [02:53<2:19:04,  8.53s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  3%|▎         | 26/1000 [03:21<1:56:39,  7.19s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  3%|▎         | 27/1000 [03:28<1:55:42,  7.14s/it]

Error: Received status code 429 for variation 1


  3%|▎         | 28/1000 [03:31<1:37:53,  6.04s/it]

Error: Received status code 429 for variation 1


  3%|▎         | 29/1000 [03:34<1:25:17,  5.27s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  3%|▎         | 30/1000 [03:42<1:33:54,  5.81s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  3%|▎         | 31/1000 [03:49<1:39:24,  6.16s/it]

Error: Received status code 429 for variation 1


  3%|▎         | 32/1000 [03:52<1:26:04,  5.33s/it]

Error: Received status code 429 for variation 1


  3%|▎         | 33/1000 [03:55<1:16:48,  4.77s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  3%|▎         | 34/1000 [04:02<1:27:17,  5.42s/it]

Error: Received status code 429 for variation 1


  4%|▎         | 35/1000 [04:06<1:17:54,  4.84s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  4%|▎         | 36/1000 [04:13<1:28:02,  5.48s/it]

Error: Received status code 429 for variation 1
Error: Received status code 429 for variation 2


  4%|▎         | 37/1000 [04:20<1:34:08,  5.87s/it]

Error: Received status code 429 for variation 1


  4%|▎         | 37/1000 [04:20<1:53:07,  7.05s/it]


KeyboardInterrupt: 

In [None]:
augmented_df = pd.DataFrame(augmented_data)

In [None]:
augmented_df

Unnamed: 0,Trial_ID,Date,Verdict,Offence,Year,Trial_Text
0,t18360509-1202,1836-05-09,notGuilty,theft,1836,1202 . JOHN MURPHY was indicted for stealing o...
1,t18360509-1202_p1,1836-05-09,notGuilty,theft,1836,John Murphy was accused of stealing John Cross...
2,t18360509-1202_p2,1836-05-09,notGuilty,theft,1836,John Murphy was accused of stealing John Cross...
3,t18360509-1209,1836-05-09,notGuilty,theft,1836,1209 . MARY ATTERWELL MARY DAVIS and MARGARET ...
4,t18360509-1209_p1,1836-05-09,notGuilty,theft,1836,"Mary Atterwell, Mary Davis, and Margaret Drisc..."
...,...,...,...,...,...,...
295,t18351214-100_p1,1835-12-14,notGuilty,theft,1835,LYDIA BURGESS was accused on October 18th of s...
296,t18351214-100_p2,1835-12-14,notGuilty,theft,1835,LYDIA BURGESS was accused of theft on October ...
297,t18351214-195,1835-12-14,notGuilty,theft,1835,195 . JOHN SEREY was indicted for stealing on ...
298,t18351214-195_p1,1835-12-14,notGuilty,theft,1835,John Serey faced charges for the December 3rd ...


In [None]:
augmented_df.tail(20)

Unnamed: 0,Trial_ID,Date,Verdict,Offence,Year,Trial_Text
280,t18360404-1118_p1,1836-04-04,notGuilty,violentTheft,1836,"On March 28th, James Clements was accused of a..."
281,t18360404-1118_p2,1836-04-04,notGuilty,violentTheft,1836,James Clements was accused of attacking Eliza ...
282,t18360404-1119,1836-04-04,notGuilty,theft,1836,1119 . ANN RICH was indicted for stealing on t...
283,t18360404-1119_p1,1836-04-04,notGuilty,theft,1836,"Ann Rich faced charges of theft on March 27th,..."
284,t18360404-1119_p2,1836-04-04,notGuilty,theft,1836,"On March 27th, 1119, ANN RICH was charged with..."
285,t18360404-1127,1836-04-04,notGuilty,theft,1836,1127 . EDWARD LOADER was indicted for embezzle...
286,t18360404-1127_p1,1836-04-04,notGuilty,theft,1836,Edward Loader was on trial for alleged embezzl...
287,t18360404-1127_p2,1836-04-04,notGuilty,theft,1836,Edward Loader faced charges for embezzlement. ...
288,t18360404-1128,1836-04-04,notGuilty,theft,1836,1128 . EDWARD LOADER was again indicted for st...
289,t18360404-1128_p1,1836-04-04,notGuilty,theft,1836,Edward Loader was charged with the theft of a ...


In [None]:
augmented_df.to_csv(AUGMENTED_PATH, index=False)

In [None]:
augmented_df.shape

(300, 6)