In [103]:
import pandas as pd
import numpy as np
import json
import os
import re

from datetime import datetime
from pathlib import Path

In [104]:
# Run the shared helper script so that the names it defines become available in
# this notebook. NOTE(review): read_jsonl, used further down, is not defined in
# this notebook and presumably comes from this script — confirm.
%run "../read_and_write_docs.py"

In [105]:
def list_files(location, exact_name=None):
    """
    Recursively lists files under a directory, optionally keeping only one file name.

    Parameters:
    - location (str): The directory to search in (all sub-directories are walked).
    - exact_name (str, optional): If given and non-empty, only files whose name
      equals this string exactly are returned (e.g., "known_processed.jsonl").
      If omitted, every file found is returned.

    Returns:
    - list: Full file paths (directory joined with file name) of the matching
      files, in the order os.walk yields them.
    """
    return [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(location)
        for file_name in files
        # A falsy exact_name (None or "") disables the filter entirely
        if not exact_name or file_name == exact_name
    ]

In [106]:
# System message placed in every request built by row_to_jsonl. It asks for at
# least twenty paraphrases of the user sentence and shows a worked example whose
# keys are "original" and "paraphrase_1" ... "paraphrase_20".
# NOTE(review): row_to_jsonl requests response_format "json_object"; the JSON
# example below is presumably what steers the model towards that shape — confirm
# against the API documentation before trimming it.
system_prompt = """You are a paraphrasing assistant. Your task is to generate paraphrased sentences that retain the original meaning, tone, and style but demonstrate maximum lexical and structural variety.
Each paraphrase should use distinct vocabulary and sentence structures, prioritizing as much lexical difference as possible.

Guidelines:
- Create AT LEAST TWENTY unique paraphrases.
- Avoid repeating words or phrases across paraphrases, unless they are critical to meaning (e.g., names or specific technical terms).
- Use varied synonyms, alter phrasing, and experiment with different sentence structures to ensure each paraphrase feels fresh and unique.
- Examples of strategies to achieve this include: using metaphors or idioms, reordering clauses, shifting perspectives, and exploring different grammatical constructions.
- Preserve the original intent and style without adding new information or altering names.

DO NOT INCLUDE ANY NOTES OR ADDITIONAL TEXT IN THE OUTPUT.

Example in JSON format:

input: "Although the skill appears easy at first, it can take a long time to master."

Output:
{
  "original": "Although the skill appears easy at first, it can take a long time to master.",
  "paraphrase_1": "Initially, the skill may seem effortless, yet true mastery demands a lengthy commitment.",
  "paraphrase_2": "What begins as a simple-looking skill often turns into a time-consuming mastery process.",
  "paraphrase_3": "While appearing simple at the outset, mastering this skill typically requires extended effort.",
  "paraphrase_4": "Despite an easy start, reaching mastery in this skill can be a prolonged journey.",
  "paraphrase_5": "This skill, while seemingly straightforward at first glance, requires considerable time to excel in.",
  "paraphrase_6": "Even if it looks easy at the beginning, achieving expertise in this skill may be time-intensive.",
  "paraphrase_7": "Though simple in appearance, the skill demands time and practice to truly master.",
  "paraphrase_8": "Achieving proficiency in this skill can take substantial time, even if it seems easy initially.",
  "paraphrase_9": "While the skill might look easy at the start, honing it to perfection can require considerable time.",
  "paraphrase_10": "It might seem straightforward to pick up, yet mastering this skill is often a slow process.",
  "paraphrase_11": "Perfecting this seemingly easy skill can actually be a long and demanding task.",
  "paraphrase_12": "Though it appears simple to learn, achieving mastery in this skill often takes a significant amount of time.",
  "paraphrase_13": "Initially, the skill may come across as effortless, but true proficiency is typically time-consuming.",
  "paraphrase_14": "Mastering this skill is a lengthy pursuit, despite its initial simplicity.",
  "paraphrase_15": "While it looks uncomplicated at first, gaining full mastery of this skill can be a long journey.",
  "paraphrase_16": "Even though this skill seems straightforward, becoming proficient usually takes an extended period.",
  "paraphrase_17": "Mastery of this seemingly simple skill often requires more time than one might expect.",
  "paraphrase_18": "Though it may appear easy at first glance, mastering this skill can be a drawn-out process.",
  "paraphrase_19": "Although appearing effortless at first, this skill demands time and patience for true mastery.",
  "paraphrase_20": "While this skill may look easy initially, true expertise often requires a great deal of time to develop."
}
"""

In [107]:
def create_temp_doc_id(input_text):
    """
    Builds a lowercase, underscore-separated identifier from a document id.

    If the text contains a bracketed section, only the text inside the brackets
    is used; otherwise the whole text is used. Every run of punctuation,
    whitespace and underscores is collapsed to a single "_", leading and
    trailing underscores are removed, and the result is lowercased.

    Parameters:
    - input_text (str): The raw document id.

    Returns:
    - str: The cleaned identifier (empty if no word characters remain).
    """
    # Greedy match: with several bracket pairs on one line this spans from the
    # first "[" to the last "]"
    match = re.search(r'\[(.*)\]', input_text)
    extracted_text = match.group(1) if match else input_text

    # One pass: any run of non-word characters and/or underscores becomes a single "_"
    cleaned_text = re.sub(r'[\W_]+', '_', extracted_text)

    # Remove any leading or trailing '_'
    return cleaned_text.strip('_').lower()

In [108]:
def row_to_jsonl(row):
    """
    Turns one row into a single chat-completion request record.

    Parameters:
    - row: A mapping-like row providing 'custom_id' and 'sentence'.

    Returns:
    - dict: A record with the keys "custom_id", "method", "url" and "body". The
      body carries the module-level system_prompt as the system message and the
      row's sentence as the user message.
    """
    request_id = row['custom_id']
    sentence = row['sentence']

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": sentence},
    ]

    # Fixed generation settings; the reply is requested as a JSON object
    body = {
        "model": "gpt-4o-mini",
        "messages": messages,
        "max_tokens": 5000,
        "temperature": 1,
        "top_p": 1,
        "response_format": {"type": 'json_object'},
    }

    return {
        "custom_id": request_id,
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }

In [109]:
def save_batch_jsonl(df, batch_loc):
    """
    Writes one JSONL file per distinct temp_doc_id.

    Each file is named "batch_<temp_doc_id>.jsonl", is created inside batch_loc
    (overwriting any existing file of that name) and holds one JSON record per
    row, produced by row_to_jsonl.

    Parameters:
    - df (pandas.DataFrame): Must contain a 'temp_doc_id' column plus whatever
      columns row_to_jsonl reads.
    - batch_loc (str): Existing directory that receives the files.

    Returns:
    - None

    Note: rows whose temp_doc_id is missing (NaN) are skipped by the grouping
    and produce no file.
    """
    # A single groupby pass replaces re-filtering the whole frame once per id;
    # sort=False keeps the documents in order of first appearance.
    for doc_id, doc_rows in df.groupby('temp_doc_id', sort=False):
        # Build every record before opening the file so that a failure in
        # row_to_jsonl never leaves a half-written batch behind
        records = [row_to_jsonl(row) for _, row in doc_rows.iterrows()]
        file_path = os.path.join(batch_loc, f"batch_{doc_id}.jsonl")

        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(record) + '\n' for record in records)

In [110]:
# Name of the dataset split to process; it is interpolated into the path below.
data_type = "test"
# Absolute root directory of that split. The trailing "/" matters: the batch
# output path is later built from this value by plain string formatting.
data_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/"

In [111]:
# Collect every file named exactly "known_processed.jsonl" anywhere under data_loc.
file_list = list_files(data_loc, "known_processed.jsonl")

In [112]:
# Display the discovered paths as a sanity check before generating batches.
file_list

['/Volumes/BCross/datasets/author_verification/test/StackExchange/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Amazon/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/The Telegraph/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Yelp/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Wiki/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/All-the-news/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/IMDB/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/Reddit/known_processed.jsonl',
 "/Volumes/BCross/datasets/author_verification/test/Koppel's Blogs/known_processed.jsonl",
 '/Volumes/BCross/datasets/author_verification/test/Perverted Justice/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/test/TripAdvisor/known_processed.jsonl',
 '/Volumes/BCross/datasets/author_verification/te

In [113]:
# For every discovered file: derive a cleaned document id, replicate each row ten
# times, give every copy its own custom_id and write one batch file per document.
for file in file_list:

    # The corpus name is the directory that directly contains the file
    folder_name = Path(file).parent.name

    df = read_jsonl(file)

    # Cleaned, lowercase, underscore-separated version of doc_id
    df['temp_doc_id'] = df['doc_id'].apply(create_temp_doc_id)

    # Timestamp prefix shared by every custom_id generated for this file
    current_date = datetime.now().strftime("%Y%m%d%H%M%S")

    # Ten copies of each row; 'repetition' numbers the copies 1..10 within each
    # original row, after which the helper 'index' column is discarded
    df_repeated = (
        df.loc[df.index.repeat(10)]
        .reset_index()
        .assign(repetition=lambda frame: frame.groupby('index').cumcount() + 1)
        .drop(columns='index')
    )

    # custom_id layout: <timestamp>_doc_<temp_doc_id>_chunk_<chunk_id>_<repetition>
    df_repeated['custom_id'] = (
        current_date + '_doc_'
        + df_repeated['temp_doc_id'].astype(str)
        + '_chunk_'
        + df_repeated['chunk_id'].astype(str)
        + '_'
        + df_repeated['repetition'].astype(str)
    )

    # Output directory for this corpus, created if it does not exist yet
    batch_loc = f"{data_loc}{folder_name}/batch_sentence_preprocessed/"
    os.makedirs(batch_loc, exist_ok=True)

    save_batch_jsonl(df_repeated, batch_loc)

    print(f"Corpus: {folder_name} Batch Preprocessing Complete!")

Corpus: StackExchange Batch Preprocessing Complete!
Corpus: Amazon Batch Preprocessing Complete!
Corpus: The Telegraph Batch Preprocessing Complete!
Corpus: Yelp Batch Preprocessing Complete!
Corpus: Wiki Batch Preprocessing Complete!
Corpus: All-the-news Batch Preprocessing Complete!
Corpus: IMDB Batch Preprocessing Complete!
Corpus: Reddit Batch Preprocessing Complete!
Corpus: Koppel's Blogs Batch Preprocessing Complete!
Corpus: Perverted Justice Batch Preprocessing Complete!
Corpus: TripAdvisor Batch Preprocessing Complete!
Corpus: ACL Batch Preprocessing Complete!
Corpus: The Apricity Batch Preprocessing Complete!
Corpus: Enron Batch Preprocessing Complete!
