In [35]:
import pandas as pd
import numpy as np
import json
import os
import re
import platform
import tiktoken
import itertools
import requests
import time
import sys

from datetime import datetime
from openai import OpenAI
# from parascore import ParaScorer

In [36]:
computing_system = platform.system()

# Pick the dataset root on the shared storage according to the operating
# system reported by platform.system().
if computing_system == "Darwin":
    print("Running on Mac")
    base_loc = "/Volumes/BCross/datasets/author_verification"
elif computing_system == "Windows":
    print("Running on Windows")
    base_loc = "//bc_nas_storage/BCross/datasets/author_verification"
elif computing_system == "Linux":
    print("Running on Linux")
    base_loc = "/mnt/BCross/datasets/author_verification"
else:
    print("Unknown System")
    # Without a base location every later path would fail with a NameError,
    # so stop here with a message that names the real problem.
    raise RuntimeError(
        f"Unsupported operating system {computing_system!r}: "
        "no dataset base location is configured for it."
    )

Running on Linux


In [37]:
# --- Run configuration -------------------------------------------------------
# Which split / corpus of the author-verification data to paraphrase.
data_type = "training"
corpus = "Wiki"

# Credentials file read in the client-initialisation cell.
credential_loc = "../../credentials.json"

# Tokeniser used later to count tokens per document.
encoding = tiktoken.encoding_for_model('gpt-4o-mini')

# All data paths hang off the corpus directory.
data_loc = f"{base_loc}/{data_type}/{corpus}"
raw_data_loc = f"{data_loc}/known_raw.jsonl"
save_loc = f"{data_loc}/full_doc_paraphrase/"

# Make sure the output directory exists before anything tries to write to it.
os.makedirs(save_loc, exist_ok=True)

## Initialise OpenAI Client

In [38]:
# Load the credentials file once, then expose each secret through the
# environment (the OpenAI client and the Pushover helper read them from there).
with open(credential_loc, 'r') as f:
    data = json.load(f)

for key in ("OPENAI_API_KEY", "PUSHOVER_USER_KEY", "PUSHOVER_API_TOKEN"):
    os.environ[key] = data[key]

# max_retries=3 is passed to the OpenAI client constructor.
client = OpenAI(max_retries=3)

# scorer = ParaScorer(lang="en", model_type='bert-large-uncased')

In [39]:
# System prompt sent with every paraphrase request. It instructs the model to
# return a JSON object with a single "new_document" key, which is the key
# paraphrase_dataframe reads from each completion.
system_prompt = """
Your role is to function as an advanced paraphrasing assistant. Your task is to generate a fully paraphrased version of a given document that preserves its original meaning, tone, genre, and style, while exhibiting significantly heightened lexical diversity and structural transformation. The aim is to produce a document that reflects a broad, globally influenced language profile for authorship verification research.

Guidelines:

1. **Preserve Core Meaning & Intent:**  
   - Ensure that the paraphrased text maintains the original document’s logical flow, factual accuracy, and overall message.  
   - Retain the tone, style, and genre to match the source content precisely.

2. **Maximize Lexical Diversity:**  
   - Use an extensive range of synonyms, idiomatic expressions, and alternative phrasings to replace common expressions.  
   - Avoid repetitive language; introduce varied vocabulary throughout the document to ensure a fresh linguistic perspective.

3. **Transform Structural Elements:**  
   - Reorganize sentences and paragraphs: invert sentence structures, vary sentence lengths, and use different clause orders.  
   - Experiment with alternative grammatical constructions and narrative flows without compromising clarity or meaning.

4. **Preserve Critical Terms & Proper Nouns:**  
   - Do not alter technical terms, names, or key references unless explicitly instructed.  
   - Ensure these elements remain intact to maintain the document's integrity.

5. **Ensure Naturalness & Cohesion:**  
   - Despite extensive lexical and structural changes, the paraphrased document must remain coherent, natural, and easily understandable.  
   - Strive for a balanced output that is both distinct in language and faithful to the original content.

6. **Output Format:**  
   - Provide only the paraphrased document without any extra commentary or explanations.  
   - The output must be structured in JSON format as follows:  

     {"new_document": <paraphrased_document>}

Instructions:
- Prioritize high lexical variation and significant syntactic reordering.
- Create a paraphrase that is distinct in wording and structure from the source while fully retaining its meaning, tone, and intent.
"""

In [40]:
def send_pushover_notification(message):
    """
    Send a push notification through the Pushover messages API (best effort).

    The API token and user key are read from the PUSHOVER_API_TOKEN and
    PUSHOVER_USER_KEY environment variables. Up to three attempts are made;
    the function returns as soon as one of them gets an HTTP 200 response.
    A notification failure never raises: each failed attempt is reported on
    stdout and the function simply returns None.

    Parameters:
      - message: Text of the notification.
    """
    data = {
        "token": os.getenv("PUSHOVER_API_TOKEN"),
        "user": os.getenv("PUSHOVER_USER_KEY"),
        "message": message
    }

    for attempt in range(1, 4):
        try:
            # A timeout is essential here: without one a stalled connection
            # would block the whole (long-running) processing loop forever.
            response = requests.post(
                "https://api.pushover.net/1/messages.json",
                data=data,
                timeout=10,
            )
        except requests.exceptions.RequestException as exc:
            print(f"Pushover notification attempt {attempt} failed: {exc}")
            continue

        if response.status_code == 200:
            return
        print(f"Pushover notification attempt {attempt} returned HTTP {response.status_code}")

### Data Prep Helper Functions

In [41]:
def read_jsonl(file_path):
    """
    Read a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed as JSON. A line holding a single-element
    list is unwrapped so that the element itself becomes the record; any
    other value is used as-is. Blank / whitespace-only lines (for example a
    trailing newline at the end of the file) are skipped instead of raising
    a JSONDecodeError.

    Parameters:
      - file_path: Path of the .jsonl file to read (decoded as UTF-8).

    Returns:
      - A DataFrame built from the parsed records, in file order.
    """
    records = []
    # JSON text is UTF-8; be explicit so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue
            # Parse the line as JSON
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    # Convert to a DataFrame
    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """
    Write a DataFrame to disk in JSON Lines form.

    Every row is converted to a dict and serialised as one JSON object
    followed by a newline. The target file is overwritten.
    """
    with open(output_file_path, 'w') as sink:
        for _index, series in data.iterrows():
            record = series.to_dict()
            json.dump(record, sink)
            sink.write('\n')
            
def create_temp_doc_id(input_text):
    """
    Derive a lowercase, underscore-separated id from the first [bracketed]
    part of input_text.

    Every non-word character inside the brackets becomes "_", runs of two or
    more underscores are collapsed to one, and the result is lower-cased.
    Returns None when input_text contains no bracketed section.
    """
    bracketed = re.search(r'\[(.*?)\]', input_text)
    if bracketed is None:
        return None

    inner = bracketed.group(1)
    # Punctuation and whitespace -> "_"
    underscored = re.sub(r'[^\w]', '_', inner)
    # "__", "___", ... -> "_"
    collapsed = re.sub(r'_{2,}', '_', underscored)
    return collapsed.lower()

### Paraphrase Helper Functions

In [42]:
def paraphrase_call(text, system_prompt, client, n=10, model="gpt-4o-mini", temperature=0.7, top_p=0.9, **kwargs):
    """
    Issue one chat-completion request asking for n paraphrases of text.

    The system prompt and the document are sent as the system and user
    messages, a JSON-object response format is requested, and any extra
    keyword arguments are forwarded untouched to the client.
    Returns the completion object produced by the client.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text},
    ]
    request = {
        "model": model,
        "messages": conversation,
        "response_format": {"type": "json_object"},
        "n": n,
        "temperature": temperature,
        "top_p": top_p,
    }
    # Unpacking both mappings keeps the duplicate-keyword TypeError if a
    # caller passes one of the fixed arguments again through kwargs.
    return client.chat.completions.create(**request, **kwargs)

def paraphrase_dataframe(df, system_prompt, client, n=10, m=1, **llm_params):
    """
    Expand every row of df into one row per generated paraphrase.

    For each row the 'text' value is sent to paraphrase_call m times, each
    call requesting n completions. Every completion's content is parsed as
    JSON and its 'new_document' value is stored in 'paraphrased_text' on a
    copy of the source row, together with each entry of llm_params so the
    settings used are recorded alongside the output.

    Returns a DataFrame built from all the expanded rows.
    """
    records = []
    for _index, source_row in df.iterrows():
        text = source_row['text']
        for _repeat in range(m):
            completion = paraphrase_call(text, system_prompt, client, n=n, **llm_params)
            for candidate in completion.choices:
                payload = json.loads(candidate.message.content)
                record = source_row.copy()
                record['paraphrased_text'] = payload['new_document']
                # Record the hyperparameters next to the text they produced.
                for name in llm_params:
                    record[name] = llm_params[name]
                records.append(record)
    return pd.DataFrame(records)

def grid_search_paraphrases(df, system_prompt, client, param_grid, sample_size=10, m=1, n_completions=10):
    """
    Run paraphrase_dataframe over every hyperparameter combination in
    param_grid, using one fixed random sample of the input rows.

    Parameters:
      - df: Original DataFrame (must contain a 'text' column).
      - system_prompt: The system prompt for the LLM.
      - client: The LLM client instance.
      - param_grid: Dict mapping hyperparameter names to lists of candidate values.
      - sample_size: How many rows to sample (sampling uses random_state=42).
      - m: Number of repeats per row.
      - n_completions: Number of completions per LLM call.

    Returns:
      - One DataFrame concatenating the results of all combinations, with a
        column for every hyperparameter name in param_grid.
    """
    # The same sample is reused for every combination.
    sample_df = df.sample(n=sample_size, random_state=42)
    keys = list(param_grid.keys())

    # One result frame per point of the Cartesian product of the grid.
    results = [
        paraphrase_dataframe(
            sample_df, system_prompt, client,
            n=n_completions, m=m, **dict(zip(keys, combo))
        )
        for combo in itertools.product(*param_grid.values())
    ]

    final_df = pd.concat(results, ignore_index=True)

    # Guarantee a column per hyperparameter even if none of the results had it.
    absent = [key for key in keys if key not in final_df.columns]
    for key in absent:
        final_df[key] = None
    return final_df

def compute_parascore(row):
    """
    Score one paraphrase against its source text.

    Passes the row's 'paraphrased_text' as the single candidate and its
    'text' as the single reference to the module-level `scorer`, then
    returns element 2 of the result converted with .item().

    NOTE(review): `scorer` is not created in the visible cells (its
    construction is commented out), so it must exist before this is called.
    """
    candidates = [row["paraphrased_text"]]
    references = [row["text"]]
    result = scorer.score(cands=candidates, refs=references, batch_size=16)
    third_component = result[2]
    return third_component.item()

### Data Prep

In [43]:
# Load the raw documents; the source identifier moves to 'orig_doc_id' so that
# 'doc_id' can hold the cleaned, file-name-friendly id.
df = read_jsonl(raw_data_loc).rename(columns={'doc_id': 'orig_doc_id'})

# Derived columns: cleaned id and token count of each text
df = df.assign(
    doc_id=df['orig_doc_id'].apply(create_temp_doc_id),
    tokens=df['text'].apply(lambda x: len(encoding.encode(x))),
)

# Column order: doc_id first, text last, everything else in between
middle_cols = [col for col in df.columns if col not in ['doc_id', 'text']]
cols = ['doc_id', *middle_cols, 'text']

# Shortest documents first
df = df[cols].sort_values(by='tokens', ascending=True)

In [44]:
docs = df['doc_id']

# A document counts as done when a "<doc_id>.jsonl" file exists in save_loc.
completed_files = []
for entry in os.listdir(save_loc):
    if entry.endswith('.jsonl') and os.path.isfile(os.path.join(save_loc, entry)):
        # Keep the name without its extension so it can be compared to doc_id
        completed_files.append(os.path.splitext(entry)[0])

# Ids present in the raw data that have no output file yet
files_to_be_processed = sorted(set(docs).difference(completed_files))

print(f"Number of documents to process in raw data: {len(docs)}")
print(f"Files Complete: {len(completed_files)}")
print(f"Files to be Processed: {len(files_to_be_processed)}")

Number of documents to process in raw data: 225
Files Complete: 204
Files to be Processed: 21


In [45]:
# Keep only the documents that do not have an output file yet
df = df.loc[~df['doc_id'].isin(completed_files)]

In [46]:
# Display the frame as it stands after completed documents were filtered out
df

Unnamed: 0,doc_id,orig_doc_id,corpus,author,texttype,tokens,text
118,dennis_brown_text_10,known [Dennis_Brown - Text-10].txt,Wiki,Dennis_Brown,known,685,There is no ownership by me or anyone else.\nI...
49,bfigura_text_3,known [Bfigura - Text-3].txt,Wiki,Bfigura,known,686,For the re-close on the BLP RfC. Hopefully peo...
154,essjay_text_2,known [Essjay - Text-2].txt,Wiki,Essjay,known,692,"If you have questions, feel free to leave a ta..."
219,headleydown_text_2,known [HeadleyDown - Text-2].txt,Wiki,HeadleyDown,known,699,I made these points Remove redundancy Meta mod...
58,bjenks_text_4,known [Bjenks - Text-4].txt,Wiki,Bjenks,known,701,ADM 167 142 Board of Admiralty 1955 Warship 20...
179,fragments_of_jade_text_4,known [Fragments_of_Jade - Text-4].txt,Wiki,Fragments_of_Jade,known,706,It was not an assumption-the editor actually s...
194,gtadoc_text_4,known [Gtadoc - Text-4].txt,Wiki,Gtadoc,known,708,"Eric may have directed you to read those, have..."
135,edjohnston_text_2,known [EdJohnston - Text-2].txt,Wiki,EdJohnston,known,710,"Thank you, If you have limited space and you c..."
192,gtadoc_text_1,known [Gtadoc - Text-1].txt,Wiki,Gtadoc,known,711,While its fine to have some criticism those ed...
50,bfigura_text_4,known [Bfigura - Text-4].txt,Wiki,Bfigura,known,711,BLP-PROD into a proposal that can put forward ...


### Gridsearch to find top parameters

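This grid search was originally run on the Enron dataset to choose the sampling parameters (temperature and top_p). The cells are kept commented out for reference, and the selected values are recorded below.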

In [47]:
# param_grid = {
#     "temperature": [0.7, 0.8, 0.9],
#     'top_p': [0.8, 0.9, 1.0],
#     # "frequency_penalty": [0.0, 0.5, 1.0],
# }

# gridsearch_results = grid_search_paraphrases(df, system_prompt, client, param_grid, sample_size=10, m=1, n_completions=10)
# gridsearch_results["parascore"] = gridsearch_results.apply(compute_parascore, axis=1)
# gridsearch_results.head(5)

In [48]:
# # Group by the hyperparameter columns and compute the mean parascore
# grouped = gridsearch_results.groupby(["temperature", "top_p"])["parascore"].mean()
# print("Grouped Mean Parascores:")
# print(grouped)

# # Determine the combination with the highest mean parascore
# best_params = grouped.idxmax()  # This returns a tuple, e.g., (0.8, 0.9)
# best_score = grouped.loc[best_params]

# print("\nBest hyperparameter combination (by average score):")
# print(f"Temperature = {best_params[0]}")
# print(f"Top p = {best_params[1]}")
# print(f"Average Parascore = {best_score}")

In [49]:
# Selected values are temperature = 0.7, top_p = 0.8 with average ParaScore = 0.7910261923074722

### Function to process dataframe by document

In [50]:
def process_by_doc_id(df, system_prompt, client, save_loc, n=10, m=1, max_fails=3, **llm_params):
    """
    Paraphrase the DataFrame one document at a time and save one JSONL file per doc_id.

    For each unique doc_id in the DataFrame:
      1. Print a header: "Document: <doc_id> - Doc <i> out of <total_docs>".
      2. Filter the DataFrame to the rows with that doc_id.
      3. Run up to m iterations, each calling paraphrase_dataframe with m=1.
         A failing iteration is logged and skipped; once max_fails failures
         occur in a row the remaining iterations of that document are abandoned.
      4. Concatenate the successful iterations and write them to
         "<save_loc><doc_id>.jsonl". Nothing is written when no iteration succeeded.
      5. Send a push notification summarising the outcome. The status is
         "Complete" only when all m iterations succeeded; otherwise it reports
         how many of them did.

    If the file cannot be written, the error is printed, a notification is sent
    and the run is aborted with SystemExit (the storage is assumed unavailable,
    so continuing would only waste API calls).

    NOTE: a file is written even for a partially successful document, and the
    resume logic in this notebook treats any existing "<doc_id>.jsonl" as done.

    Parameters:
      - df: Input DataFrame containing at least 'doc_id' and 'text' columns.
      - system_prompt: The system prompt for the LLM.
      - client: The LLM client instance.
      - save_loc: Base save location (filename prefix); the doc_id is appended.
      - n: Number of completions per LLM call.
      - m: Number of iterations per doc_id.
      - max_fails: Maximum allowed consecutive failures before skipping the document.
      - **llm_params: Additional hyperparameters for the LLM call.

    Raises:
      - SystemExit: when saving a document's results fails.
    """
    unique_doc_ids = df['doc_id'].unique()
    num_docs = len(unique_doc_ids)

    for idx, doc_id in enumerate(unique_doc_ids, start=1):
        doc_start_time = time.time()

        print(f"Document: {doc_id} - Doc {idx} out of {num_docs}")

        filtered_df = df[df['doc_id'] == doc_id]
        iter_dfs = []
        fail_count = 0  # consecutive failures only; reset on every success

        for i in range(m):
            if fail_count >= max_fails:
                print(f"Skipping document {doc_id} due to {fail_count} consecutive failures.")
                break

            print(f"    Iteration: {i+1} out of {m}")

            try:
                iter_df = paraphrase_dataframe(filtered_df, system_prompt, client, n=n, m=1, **llm_params)
                iter_dfs.append(iter_df)
                fail_count = 0  # Reset failure count on success
            except Exception as e:
                print(f"    Error in iteration {i+1} for document {doc_id}: {e}")
                fail_count += 1

        doc_time_taken = round(time.time() - doc_start_time, 2)

        if iter_dfs:
            result_df = pd.concat(iter_dfs, ignore_index=True)
            file_path = f"{save_loc}{doc_id}.jsonl"

            try:
                write_jsonl(result_df, file_path)
            except Exception as e:
                message = f"ERROR SAVING FILE: Doc {idx} out of {num_docs} - {doc_id} CHECK NAS STATUS"
                print(f"    ERROR SAVING FILE: {doc_id} to {file_path} CHECK NAS STATUS ({e})")
                # Notify before exiting, otherwise the alert is never delivered.
                send_pushover_notification(message)
                sys.exit(1)

            # Judge completeness by the number of successful iterations, not by
            # the last loop index: a document whose final iteration ran is not
            # complete if any earlier iteration failed.
            if len(iter_dfs) == m:
                status = "Complete"
            else:
                status = f"Incomplete - {len(iter_dfs)} of {m} iterations succeeded"

            message = f"Doc {idx} out of {num_docs} - {doc_id} - Status: {status} - Time taken: {doc_time_taken}s"
            print(f"    Saved results for document {doc_id} to {file_path}")

        else:
            message = f"Doc {idx} out of {num_docs} - {doc_id} - Status: Failed with no data - Time taken: {doc_time_taken}s"
            print(f"    No successful iterations for document {doc_id}, skipping save.")

        # Send push notification to phone
        send_pushover_notification(message)

In [51]:
# Full run: 100 iterations of 10 completions per document, using the sampling
# parameters selected by the grid search.
process_by_doc_id(
    df,
    system_prompt,
    client,
    save_loc,
    n=10,
    m=100,
    max_fails=5,
    temperature=0.7,
    top_p=0.8,
)

Document: dennis_brown_text_10 - Doc 1 out of 21
    Iteration: 1 out of 100
    Iteration: 2 out of 100
    Iteration: 3 out of 100
    Iteration: 4 out of 100
    Iteration: 5 out of 100
    Iteration: 6 out of 100
    Iteration: 7 out of 100
    Iteration: 8 out of 100
    Iteration: 9 out of 100
    Iteration: 10 out of 100
    Iteration: 11 out of 100
    Iteration: 12 out of 100
    Iteration: 13 out of 100
    Iteration: 14 out of 100
    Iteration: 15 out of 100
    Iteration: 16 out of 100
    Iteration: 17 out of 100
    Iteration: 18 out of 100
    Iteration: 19 out of 100
    Iteration: 20 out of 100
    Iteration: 21 out of 100
    Iteration: 22 out of 100
    Iteration: 23 out of 100
    Iteration: 24 out of 100
    Iteration: 25 out of 100
    Iteration: 26 out of 100
    Iteration: 27 out of 100
    Iteration: 28 out of 100
    Iteration: 29 out of 100
    Iteration: 30 out of 100
    Iteration: 31 out of 100
    Iteration: 32 out of 100
    Iteration: 33 out of 100
   