In [344]:
import json
import os
import re
import time

import pandas as pd

from openai import OpenAI

In [345]:
# JSON file read below to obtain the OpenAI API key
credential_loc = "../../credentials.json"

# Dataset split and corpus; both are interpolated into every data path below
data_type = "training"
corpus = "The Telegraph"

# NOTE(review): absolute, machine-specific paths - every location below depends
# on this volume being mounted; consider making the root configurable.
base_loc = f"/Volumes/BCross/datasets/author_verification/"
data_loc = f"{base_loc}{data_type}/{corpus}/"
batch_loc = f"{data_loc}batch_sentence_preprocessed/"
raw_data_loc = f"{data_loc}known_raw.jsonl"
processed_data_loc = f"{data_loc}known_processed.jsonl"

# Local directory that receives the generated paraphrase files in this notebook;
# it is also scanned by filter_known_for_fails to skip documents already done.
temp_local_location = f"/Users/user/Documents/temp_datasets/author_verification/{data_type}/{corpus}/"
os.makedirs(temp_local_location, exist_ok=True)

# Location for data when sent to batch
batch_sent_loc = f"{data_loc}batch_sentence_sent/"
os.makedirs(batch_sent_loc, exist_ok=True)

# Location once batch complete
batch_complete_loc = f"{data_loc}batch_sentence_complete/"
os.makedirs(batch_complete_loc, exist_ok=True)

# Location for batches that failed
batch_fail_loc = f"{data_loc}batch_sentence_fail/"
os.makedirs(batch_fail_loc, exist_ok=True)

# Location to save the reasons for failure
batch_fail_reason_loc = f"{data_loc}batch_fail_reasons/"
os.makedirs(batch_fail_reason_loc, exist_ok=True)

# Phone number for WhatsApp notifications
# NOTE(review): personal number hard-coded in the notebook and not used in the
# cells shown here - consider loading it from the credentials file instead.
phone_number = "+447756976114"

### Initialise OpenAI Client

In [346]:
# Load the credentials JSON; it must contain an 'OPENAI_API_KEY' entry.
with open(credential_loc, 'r') as f:
    data = json.load(f)

# Export the key through the environment rather than passing it to the client.
os.environ["OPENAI_API_KEY"] = data['OPENAI_API_KEY']

# Constructed without arguments - presumably the client picks the key up from
# the environment variable set above (confirm against the openai package docs).
client = OpenAI()

### Helper Functions

In [347]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Blank or whitespace-only lines are skipped instead of being handed to the
    JSON parser, and the file is always decoded as UTF-8 so the result does not
    depend on the platform's default encoding.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame containing the data from the JSONL file (empty if the
      file holds no records).

    Raises:
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Stray empty lines are not records; json.loads would reject them
                continue
            # Parse the line as JSON
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    # Convert to a DataFrame
    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """
    Writes a pandas DataFrame to a JSONL file, one JSON object per row.

    Parameters:
    - data: DataFrame whose rows are serialised.
    - output_file_path: Destination path; an existing file is overwritten.
    """
    with open(output_file_path, 'w') as sink:
        # Lazily turn each row into a plain dict keyed by column name
        row_dicts = (record.to_dict() for _index, record in data.iterrows())
        for row_dict in row_dicts:
            json.dump(row_dict, sink)
            sink.write('\n')

def create_temp_doc_id(input_text):
    """
    Builds a batch-style id from the first "[...]" section of a document name.

    Parameters:
    - input_text: String expected to contain a bracketed section,
      e.g. "known [CharlesSpencer - Text 1].txt".

    Returns:
    - "batch_" followed by the bracket contents, lower-cased, with every
      non-word character turned into "_" and underscore runs collapsed
      (e.g. "batch_charlesspencer_text_1"); None when there are no brackets.
    """
    bracketed = re.search(r'\[(.*?)\]', input_text)
    if bracketed is None:
        return None

    # Punctuation and spaces become "_", then runs of "_" shrink to a single one
    slug = re.sub(r'[^\w]', '_', bracketed.group(1))
    slug = re.sub(r'_{2,}', '_', slug)
    return ('batch_' + slug).lower()

In [348]:
def get_latest_fail_reasons(batch_fail_reason_loc):
    """
    Loads the most recent failure-reason file from a directory.

    The timestamp of a file is the integer found between the last "_" of its
    name and the following ".". Entries without such an integer (for example
    ".DS_Store") are ignored instead of aborting the lookup.

    Parameters:
    - batch_fail_reason_loc: Directory containing the failure-reason files.

    Returns:
    - A pandas DataFrame read from the file with the highest timestamp.

    Raises:
    - FileNotFoundError: If the directory holds no timestamped file.
    """
    def _timestamp(file_name):
        # None marks a name that carries no parseable timestamp
        try:
            return int(file_name.split('_')[-1].split('.')[0])
        except ValueError:
            return None

    candidates = []
    for file_name in os.listdir(batch_fail_reason_loc):
        stamp = _timestamp(file_name)
        if stamp is not None:
            candidates.append((stamp, file_name))

    if not candidates:
        raise FileNotFoundError(
            f"No timestamped failure-reason files found in {batch_fail_reason_loc}"
        )

    # Only the newest file is needed, so pick the maximum instead of sorting
    latest_file = max(candidates, key=lambda candidate: candidate[0])[1]

    # os.path.join works whether or not the directory has a trailing separator
    return read_jsonl(os.path.join(batch_fail_reason_loc, latest_file))

def get_files_list(loc):
    """
    Lists the '.jsonl' files directly inside a directory, without extension.

    Parameters:
    - loc: Directory to scan (sub-directories are not searched).

    Returns:
    - A list with, for each regular file ending in '.jsonl', the part of its
      name that precedes the first '.json'.
    """
    stems = []
    for entry in os.listdir(loc):
        if not entry.endswith('.jsonl'):
            continue
        if not os.path.isfile(os.path.join(loc, entry)):
            continue
        # Keep only what comes before the first ".json"
        stems.append(entry.split('.json')[0])

    return stems

In [349]:
def filter_known_for_fails(processed_data_loc, batch_fail_loc, batch_complete_loc, temp_local_location):
    """
    Returns the processed sentences whose document has no result file yet.

    Each row gets a 'temp_doc_id' built with create_temp_doc_id (falling back to
    "batch_<corpus>_<author>" when the doc_id has no bracketed part). Rows whose
    id matches a '.jsonl' file in the completed or local temp directory are
    dropped. Despite the name, the failed-batch listing is read but does not
    take part in the filtering.

    Parameters:
    - processed_data_loc: Path of the processed JSONL file to load.
    - batch_fail_loc: Directory of failed batches (listed only).
    - batch_complete_loc: Directory of completed batch files.
    - temp_local_location: Local directory of already generated files.

    Returns:
    - The filtered DataFrame, including the added 'temp_doc_id' column.
    """
    def _normalised_names(directory):
        # File stems get the same punctuation -> "_" treatment as the ids
        return [re.sub(r'[^\w\s]', '_', stem) for stem in get_files_list(directory)]

    # Load data and derive an id comparable with the batch file names
    known = read_jsonl(processed_data_loc)
    known['temp_doc_id'] = known['doc_id'].apply(create_temp_doc_id)
    fallback_ids = 'batch_' + known['corpus'] + '_' + known['author']
    known['temp_doc_id'] = known['temp_doc_id'].fillna(fallback_ids)

    # Listed for parity with the other directories; currently not used
    _normalised_names(batch_fail_loc)
    finished = _normalised_names(batch_complete_loc)
    staged_locally = _normalised_names(temp_local_location)

    already_done = (
        known['temp_doc_id'].isin(finished)
        | known['temp_doc_id'].isin(staged_locally)
    )
    return known[~already_done]

In [350]:
# Keep only the sentences of documents that have no result file yet
df=filter_known_for_fails(processed_data_loc, batch_fail_loc, batch_complete_loc, temp_local_location)

In [351]:
# Show the rows still to be processed (the rendered table is truncated for long frames)
df

Unnamed: 0,corpus,doc_id,chunk_id,author,texttype,sentence,temp_doc_id
2605,The Telegraph,known [CharlesSpencer - Text 1].txt,1,CharlesSpencer,known,Having now seen both works within a couple of ...,batch_charlesspencer_text_1
2606,The Telegraph,known [CharlesSpencer - Text 1].txt,2,CharlesSpencer,known,"In contrast, Coward had absolutely no intentio...",batch_charlesspencer_text_1
2607,The Telegraph,known [CharlesSpencer - Text 1].txt,3,CharlesSpencer,known,"Hay Fever is a comic masterpiece, with a fizzi...",batch_charlesspencer_text_1
2608,The Telegraph,known [CharlesSpencer - Text 1].txt,4,CharlesSpencer,known,This one goes at a terrific lick with a runnin...,batch_charlesspencer_text_1
2609,The Telegraph,known [CharlesSpencer - Text 1].txt,5,CharlesSpencer,known,As a result its frivolity never outstays its w...,batch_charlesspencer_text_1
...,...,...,...,...,...,...,...
6446,The Telegraph,known [HoratiaHarrod - Text 2].txt,37,HoratiaHarrod,known,So that was a relief.,batch_horatiaharrod_text_2
6447,The Telegraph,known [HoratiaHarrod - Text 2].txt,38,HoratiaHarrod,known,A relief on my lungs as well.,batch_horatiaharrod_text_2
6448,The Telegraph,known [HoratiaHarrod - Text 2].txt,39,HoratiaHarrod,known,Later this month Pearce will release his debut...,batch_horatiaharrod_text_2
6449,The Telegraph,known [HoratiaHarrod - Text 2].txt,40,HoratiaHarrod,known,He took a part in Iron Man 3 last year.,batch_horatiaharrod_text_2


In [352]:
# Number of distinct documents left to process
df['temp_doc_id'].nunique()

44

In [353]:
# System prompt sent with every paraphrase request. It asks for a JSON object
# with an "original" key and numbered "paraphrase_<k>" keys; the parsing code in
# openai_paraphrase_generation collects every key starting with "paraphrase_".
system_prompt = """You are a paraphrasing assistant. Your task is to generate paraphrased sentences that retain the original meaning, tone, and style but demonstrate maximum lexical and structural variety.
Each paraphrase should use distinct vocabulary and sentence structures, prioritizing as much lexical difference as possible.

Guidelines:
- Create AT LEAST TWENTY unique paraphrases.
- Avoid repeating words or phrases across paraphrases, unless they are critical to meaning (e.g., names or specific technical terms).
- Use varied synonyms, alter phrasing, and experiment with different sentence structures to ensure each paraphrase feels fresh and unique.
- Examples of strategies to achieve this include: using metaphors or idioms, reordering clauses, shifting perspectives, and exploring different grammatical constructions.
- Preserve the original intent and style without adding new information or altering names.

DO NOT INCLUDE ANY NOTES OR ADDITIONAL TEXT IN THE OUTPUT.

Example in JSON format:

input: "Although the skill appears easy at first, it can take a long time to master."

Output:
{
  "original": "Although the skill appears easy at first, it can take a long time to master.",
  "paraphrase_1": "Initially, the skill may seem effortless, yet true mastery demands a lengthy commitment.",
  "paraphrase_2": "What begins as a simple-looking skill often turns into a time-consuming mastery process.",
  "paraphrase_3": "While appearing simple at the outset, mastering this skill typically requires extended effort.",
  "paraphrase_4": "Despite an easy start, reaching mastery in this skill can be a prolonged journey.",
  "paraphrase_5": "This skill, while seemingly straightforward at first glance, requires considerable time to excel in.",
  "paraphrase_6": "Even if it looks easy at the beginning, achieving expertise in this skill may be time-intensive.",
  "paraphrase_7": "Though simple in appearance, the skill demands time and practice to truly master.",
  "paraphrase_8": "Achieving proficiency in this skill can take substantial time, even if it seems easy initially.",
  "paraphrase_9": "While the skill might look easy at the start, honing it to perfection can require considerable time.",
  "paraphrase_10": "It might seem straightforward to pick up, yet mastering this skill is often a slow process.",
  "paraphrase_11": "Perfecting this seemingly easy skill can actually be a long and demanding task.",
  "paraphrase_12": "Though it appears simple to learn, achieving mastery in this skill often takes a significant amount of time.",
  "paraphrase_13": "Initially, the skill may come across as effortless, but true proficiency is typically time-consuming.",
  "paraphrase_14": "Mastering this skill is a lengthy pursuit, despite its initial simplicity.",
  "paraphrase_15": "While it looks uncomplicated at first, gaining full mastery of this skill can be a long journey.",
  "paraphrase_16": "Even though this skill seems straightforward, becoming proficient usually takes an extended period.",
  "paraphrase_17": "Mastery of this seemingly simple skill often requires more time than one might expect.",
  "paraphrase_18": "Though it may appear easy at first glance, mastering this skill can be a drawn-out process.",
  "paraphrase_19": "Although appearing effortless at first, this skill demands time and patience for true mastery.",
  "paraphrase_20": "While this skill may look easy initially, true expertise often requires a great deal of time to develop."
}
"""

In [354]:
# Switch to the column names the processing functions read ('doc_id', 'text'):
# the batch-style id becomes 'doc_id' and the original file name is kept as
# 'orig_doc_id'.
# NOTE: rebinding `df` makes this cell non-idempotent - running it twice renames
# the already renamed 'doc_id' to 'orig_doc_id' as well.
df = df.rename(columns={'doc_id': 'orig_doc_id',
                        'temp_doc_id': 'doc_id',
                        'sentence': 'text'})

In [355]:
# Check the renamed columns on the first rows
df.head(5)

Unnamed: 0,corpus,orig_doc_id,chunk_id,author,texttype,text,doc_id
2605,The Telegraph,known [CharlesSpencer - Text 1].txt,1,CharlesSpencer,known,Having now seen both works within a couple of ...,batch_charlesspencer_text_1
2606,The Telegraph,known [CharlesSpencer - Text 1].txt,2,CharlesSpencer,known,"In contrast, Coward had absolutely no intentio...",batch_charlesspencer_text_1
2607,The Telegraph,known [CharlesSpencer - Text 1].txt,3,CharlesSpencer,known,"Hay Fever is a comic masterpiece, with a fizzi...",batch_charlesspencer_text_1
2608,The Telegraph,known [CharlesSpencer - Text 1].txt,4,CharlesSpencer,known,This one goes at a terrific lick with a runnin...,batch_charlesspencer_text_1
2609,The Telegraph,known [CharlesSpencer - Text 1].txt,5,CharlesSpencer,known,As a result its frivolity never outstays its w...,batch_charlesspencer_text_1


In [356]:
def paraphrase_call(text, system_prompt, client, n=10, model="gpt-4o-mini"):
    """
    Sends one chat-completion request asking for JSON-formatted paraphrases.

    Parameters:
    - text: Sentence to paraphrase; sent as the user message.
    - system_prompt: Instructions sent as the system message.
    - client: Client object exposing `chat.completions.create`.
    - n: Number of completions (choices) requested in this single call.
    - model: Name of the chat model to use; defaults to the previously
      hard-coded "gpt-4o-mini".

    Returns:
    - The completion object returned by the client, unmodified.
    """
    return client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": text},
        ],
        # Ask the API for a JSON object so every choice can be parsed with json.loads
        response_format={"type": "json_object"},
        n=n,
    )

In [357]:
def openai_paraphrase_generation(text, system_prompt, client, n=10):
    """
    Generates paraphrases of a sentence and returns them without duplicates.

    Makes one `paraphrase_call`, parses the JSON content of every returned
    choice and gathers the values stored under keys starting with
    "paraphrase_". A choice that cannot be used (missing content, invalid JSON,
    JSON that is not an object) is skipped on its own, so it no longer discards
    the paraphrases found in the other choices.

    Parameters:
    - text: Sentence to paraphrase.
    - system_prompt: System prompt forwarded to `paraphrase_call`.
    - client: Client forwarded to `paraphrase_call`.
    - n: Number of completions requested in the call.

    Returns:
    - A dict with 'original_sentence' (the input text, or the "original" field
      of the first usable choice when the input text is empty) and 'rephrased'
      (list of unique paraphrase strings in first-seen order). Any unexpected
      error is printed and yields an empty 'rephrased' list.
    """
    # Defaults returned whenever nothing usable comes back
    final_result = {
        'original_sentence': text,
        'rephrased': []
    }

    try:
        # Call the paraphrase generation function
        result = paraphrase_call(text, system_prompt, client, n=n)

        # Return the defaults if the response carries no choices
        if not result or not getattr(result, 'choices', None):
            return final_result

        # A dict keeps insertion order, so de-duplication stays deterministic
        unique_paraphrases = {}

        for choice in result.choices:
            # Parse the JSON content from each choice
            try:
                json_object = json.loads(choice.message.content)
            except (json.JSONDecodeError, AttributeError, TypeError):
                # TypeError covers a choice whose content is None
                continue

            if not isinstance(json_object, dict):
                continue  # Valid JSON, but not the expected object

            # Only relevant when the input text was empty
            if not final_result['original_sentence']:
                final_result['original_sentence'] = json_object.get('original', text)

            # Collect paraphrases; non-string values are ignored
            for key, value in json_object.items():
                if key.startswith('paraphrase_') and isinstance(value, str):
                    unique_paraphrases.setdefault(value, None)

        # Update the final result
        final_result['rephrased'] = list(unique_paraphrases)
        return final_result

    except Exception as e:
        # Deliberate best effort: report the error and return a default result
        print(f"Error during paraphrase generation: {e}")
        return {
            'original_sentence': text,
            'rephrased': []
        }

In [358]:
def process_dataframe(df, system_prompt, client, n=10):
    """
    Generates paraphrases for every row of a DataFrame.

    Calls `openai_paraphrase_generation` once per row, in row order. Results
    are collected explicitly rather than unpacked from `zip(*df.apply(...))`,
    which cannot be unpacked when the frame has no rows.

    Parameters:
    - df: DataFrame with 'doc_id', 'chunk_id' and 'text' columns. As before,
      the 'original_sentence' and 'rephrased' columns are added to it in place.
    - system_prompt: System prompt forwarded to the generation function.
    - client: Client forwarded to the generation function.
    - n: Number of completions requested per sentence.

    Returns:
    - A DataFrame with the columns 'doc_id', 'chunk_id', 'original_sentence'
      and 'rephrased' (a list of paraphrases per row), indexed like `df`.
    """
    originals = []
    rephrased_lists = []
    for text in df['text']:
        response = openai_paraphrase_generation(text, system_prompt, client, n=n)
        originals.append(response.get('original_sentence', ''))
        rephrased_lists.append(response.get('rephrased', []))

    # Object dtype keeps each list of paraphrases as a single cell value
    df['original_sentence'] = pd.Series(originals, index=df.index, dtype=object)
    df['rephrased'] = pd.Series(rephrased_lists, index=df.index, dtype=object)

    # Select relevant columns
    return df[['doc_id', 'chunk_id', 'original_sentence', 'rephrased']]


In [359]:
def process_and_save_by_doc_id(df, system_prompt, client, save_loc=None, n=10, delay=1):
    """
    Generates paraphrases document by document and saves one JSONL file each.

    This single definition replaces the two copies the notebook used to carry
    (the later one silently shadowed the earlier one).

    For every doc_id the sentences are sent one at a time to
    `openai_paraphrase_generation`; each returned paraphrase becomes one output
    row with the keys 'doc_id', 'chunk_id', 'original' and 'rephrased'. The rows
    are written with `write_jsonl` to "<save_loc>/<doc_id>.jsonl".

    A document for which no paraphrase at all was produced is NOT written: an
    empty result file would make `filter_known_for_fails` treat the document as
    finished, so it would never be retried.

    Parameters:
    - df: DataFrame with 'doc_id', 'chunk_id' and 'text' columns. As before, a
      'custom_id' column ("<doc_id>_<chunk_id>") is added to it in place.
    - system_prompt: System prompt forwarded to the generation function.
    - client: Client forwarded to the generation function.
    - save_loc: Output directory. None (the default) means `batch_complete_loc`,
      looked up when the function is called rather than when it is defined.
    - n: Number of completions requested per sentence.
    - delay: Seconds to sleep after each request.

    Returns:
    - None. Progress, warnings and save errors are printed.
    """
    if save_loc is None:
        # Resolved at call time so a re-run configuration cell is honoured
        save_loc = batch_complete_loc

    df['custom_id'] = df['doc_id'].astype(str) + '_' + df['chunk_id'].astype(str)

    # Group by doc_id to process each document individually
    for doc_id, group in df.groupby('doc_id'):
        print(f"Processing doc_id: {doc_id}")

        # Expanded rows for this document, plus the chunks that produced none
        expanded_rows = []
        empty_chunks = []

        # Get total chunks for progress tracking
        total_chunks = len(group)

        for i, (_, row) in enumerate(group.iterrows(), start=1):  # `i` is 1-based
            print(f"    Processing chunk {i} out of {total_chunks} for doc_id: {doc_id}")
            try:
                # Generate paraphrases for each row
                response = openai_paraphrase_generation(row['text'], system_prompt, client, n=n)
                original_sentence = response.get('original_sentence', row['text'])
                rephrased_sentences = response.get('rephrased', [])
            except Exception as e:
                print(f"Error processing row {row['custom_id']}: {e}")
                original_sentence = row['text']
                rephrased_sentences = []

            if not rephrased_sentences:
                empty_chunks.append(row['chunk_id'])

            # Add one row per rephrased sentence
            expanded_rows.extend(
                {
                    'doc_id': row['doc_id'],
                    'chunk_id': row['chunk_id'],
                    'original': original_sentence,
                    'rephrased': rephrased,
                }
                for rephrased in rephrased_sentences
            )

            # Optional delay between API calls
            time.sleep(delay)

        if not expanded_rows:
            # Leave no file behind so the document is picked up again next run
            print(f"No paraphrases generated for doc_id: {doc_id}; nothing saved")
            continue

        if empty_chunks:
            print(f"Warning: doc_id: {doc_id} has no paraphrases for chunk_id(s): {empty_chunks}")

        # Create a DataFrame for the current doc_id
        doc_df = pd.DataFrame(expanded_rows)

        # Save the results to a JSONL file
        save_path = os.path.join(save_loc, f"{doc_id}.jsonl")
        try:
            write_jsonl(doc_df, save_path)
            print(f"Saved results for doc_id: {doc_id} to {save_path}")
        except Exception as e:
            print(f"Error saving doc_id: {doc_id} to {save_path}: {e}")


In [361]:
# Run the generation for every remaining document. Results are written to the
# local temp directory instead of the function's default location, and the
# function sleeps 10 seconds after each request.
process_and_save_by_doc_id(df, system_prompt, client, save_loc=temp_local_location, n=10, delay=10)

Processing doc_id: batch_charlesspencer_text_1
    Processing chunk 1 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 2 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 3 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 4 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 5 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 6 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 7 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 8 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 9 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 10 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 11 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 12 out of 27 for doc_id: batch_charlesspencer_text_1
    Processing chunk 13 out of 27 for doc_id: batch_charlesspencer_text_1
