In [1]:
import pandas as pd
import time
from tqdm import tqdm
import os
from dotenv import load_dotenv
from openai import OpenAI

In [2]:
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
DATASET_PATH = '../dataset/split/not_guilty_split.csv'
AUGMENTED_PATH = './dataset/split/not_guilty_split_augmented.csv'

In [4]:
df = pd.read_csv(DATASET_PATH)

df

Unnamed: 0,Trial_ID,Date,Defendant_Gender,Num_Defendants,Victim_Gender,Num_Victims,Offence,Offence_Subcategory,Verdict,Text_Length,Year,Trial_Text
0,t18300218-8,1830-02-18,male,1,female,3,theft,burglary,notGuilty,11025,1830,james webber indict feloniously sacrialigiousl...
1,t18300218-19,1830-02-18,male,1,male,1,theft,pocketpicking,notGuilty,2552,1830,william jnes indict steal january handkerchief...
2,t18300218-27,1830-02-18,male,1,female,1,theft,simpleLarceny,notGuilty,4542,1830,william abercrmbie indict steal december handk...
3,t18300218-36,1830-02-18,female,1,male,1,theft,simpleLarceny,notGuilty,267,1830,agnes cnnell indict steal ebruary lb weight ha...
4,t18300218-53,1830-02-18,male,1,male,1,theft,simpleLarceny,notGuilty,2359,1830,jhn taylr indict steal january set gig harness...
...,...,...,...,...,...,...,...,...,...,...,...,...
995,t17650116-22,1765-01-16,male,1,male,2,violentTheft,highwayRobbery,notGuilty,1361,1765,john baptist rosa indict king highway elizabet...
996,t17650116-23,1765-01-16,male,1,female,1,theft,burglary,notGuilty,2843,1765,joseph wiggins indict december hour one night ...
997,t17650116-27,1765-01-16,female,1,male,1,theft,grandLarceny,notGuilty,290,1765,judith barden spinster indict steal ten linen ...
998,t17650116-37,1765-01-16,male,3,female,3,theft,burglary,notGuilty,12817,1765,john robinson john rouson indict january hour ...


In [5]:
df.shape

(1000, 12)

In [6]:
def generate_paraphrases(text, num_variations=2):
    paraphrases = []

    if text is None:
        return [None] * num_variations
    text = str(text)

    for i in range(num_variations):
        try:
            prompt = (
                "I need you to paraphrase the following trial text. "
                "Keep all the facts but rewrite it; I'm using it for text dataset augmentation. "
                "IMPORTANT: Your response must ONLY contain the rewritten text with no explanations, "
                "no markdown, no notes, and no meta-commentary.\n\n"
                f"Original text:\n{text}\n\n"
                "Paraphrased version:"
            )

            response = client.chat.completions.create(
                model="gpt-5-nano",
                messages=[{"role": "user", "content": prompt}],
                temperature=1,
                max_completion_tokens=500,
            )

            content = (response.choices[0].message.content or "").strip()
            paraphrases.append(content if content else None)

            time.sleep(1)

        except Exception as e:
            print(f"Error generating paraphrase {i+1}: {e}")
            paraphrases.append(None)

    return paraphrases

In [7]:
import random

augmented_data = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    trial_id = row['Trial_ID']
    original_text = row['Trial_Text']
    
    num_variations = random.choice([1, 2])
    paraphrases = generate_paraphrases(original_text, num_variations=num_variations)
    
    augmented_data.append({
        'Trial_ID': trial_id,
        'Date': row['Date'],
        'Defendant_Gender': row['Defendant_Gender'],
        'Num_Defendants': row['Num_Defendants'],
        'Victim_Gender': row['Victim_Gender'],
        'Num_Victims': row['Num_Victims'],
        'Offence': row['Offence'],
        'Offence_Subcategory': row['Offence_Subcategory'],
        'Verdict': row['Verdict'],
        'Text_Length': row['Text_Length'],
        'Year': row['Year'],
        'Trial_Text': original_text,
    })
    
    for i, paraphrase in enumerate(paraphrases, 1):
        if paraphrase and "**" not in paraphrase:
            paraphrase = paraphrase.replace("\n", "")
            augmented_data.append({
                'Trial_ID': f"{trial_id}_p{i}",
                'Date': row['Date'],
                'Defendant_Gender': row['Defendant_Gender'],
                'Num_Defendants': row['Num_Defendants'],
                'Victim_Gender': row['Victim_Gender'],
                'Num_Victims': row['Num_Victims'],
                'Offence': row['Offence'],
                'Offence_Subcategory': row['Offence_Subcategory'],
                'Verdict': row['Verdict'],
                'Text_Length': len(paraphrase),
                'Year': row['Year'],
                'Trial_Text': paraphrase,
            })

  3%|▎         | 32/1000 [06:18<3:10:40, 11.82s/it]


KeyboardInterrupt: 

In [8]:
augmented_df = pd.DataFrame(augmented_data)

In [11]:
augmented_df['Trial_Text'][0]

'james webber indict feloniously sacrialigiously break enter certain chapel mary abbotts kensington ebruary feloniously steal therein scarf value bottle value pint wine value handkerchief value drinking glass value linen cloth value corkscrew value good rances harrison others cunt call good william taylor cunt call good ann warner ann warner pew opener brompton chapel stand parish mary abbotts kensington episcopal chapel sunday january leave chapel twenty minute nine clock night last person make door safe window shut go monday three four clock discover wax candle take pulpit vestry door open chapel key take door lead vestry passage side chapel open unbolt unlocked think person must secrete chapel out vestry door unbolt unlocked appearance break outside chapel first miss candle pulpit go vestry miss glass thing clergyman robe box break open lock force two handkerchief go box two glass cloth nothing else lose box see sunday even scarf go wine cupboard wine cork screw take cupboard piece 

In [12]:
augmented_df['Trial_Text'][1]

'william jnes indict steal january handkerchief value good ralph charles price person ralph charles price live william street blackfriar son ralph price january three clock ludgate hill handkerchief safe half hour leave home felt pull pocket turn round saw handkerchief prisoner hand immediately collar secure appear another man follow compter return saw outside gate delivered handkerchief officer quite sure mine officer prisoner defence go look situation meet person gracechurch street ask see funeral sir thomas lawrence stand corner ludgate hill call william man pick gentleman pocket man throw handkerchief run away take ran man tap gentleman shoulder tell man run collared price tell collar another person take till collar tap shoulder time funeral eleanr itch live harcourt street skinner street some town ostrich feather maker go minories purchase feather saw prisoner know three four year alone saw handkerchief take gentleman pocket say except prisoner hold arm ask assist far go take ill 

In [None]:
augmented_df.tail(20)

Unnamed: 0,Trial_ID,Date,Defendant_Gender,Num_Defendants,Victim_Gender,Num_Victims,Offence,Offence_Subcategory,Verdict,Text_Length,Year,Trial_Text
980,t17430114-52,1743-01-14,female,3,male,1,theft,theftFromPlace,notGuilty,653,1743,william woodcock mary wife indict steal coat v...
981,t17600709-2,1760-07-09,female,1,female,1,theft,grandLarceny,notGuilty,1093,1760,bridget callahan spinster indict steal one yar...
982,t17600709-11,1760-07-09,female,1,male,1,theft,grandLarceny,notGuilty,2069,1760,elizabeth stevens spinster indict steal one li...
983,t17600709-14,1760-07-09,female,1,male,1,theft,pocketpicking,notGuilty,1562,1760,catherine messenger spinster indict steal one ...
984,t17600709-19,1760-07-09,male,1,male,1,deception,fraud,notGuilty,8812,1760,john carver indict take false oath ducarrel pr...
985,t17600709-20,1760-07-09,female,1,male,1,violentTheft,robbery,notGuilty,7998,1760,margaret edward widow indict make assault upon...
986,t17600709-22,1760-07-09,female,1,male,1,theft,grandLarceny,notGuilty,4565,1760,susannah hanby widow indict steal cloth coat v...
987,t17600709-24,1760-07-09,male,1,male,1,theft,theftFromPlace,notGuilty,1318,1760,jer wakefield harcourt indict steal one coat w...
988,t17600709-26,1760-07-09,male,1,male,1,theft,grandLarceny,notGuilty,485,1760,william wake indict steal one copper pot cover...
989,t17600709-28,1760-07-09,male,1,male,1,theft,grandLarceny,notGuilty,219,1760,john elliot indict steal one piece muslin valu...


In [None]:
augmented_df.to_csv(AUGMENTED_PATH, index=False)

OSError: Cannot save file into a non-existent directory: 'dataset/augmented'

In [None]:
augmented_df.shape

(300, 6)