In [17]:
import pandas as pd
import numpy as np
import json
import os

from datetime import datetime

In [18]:
# Execute the shared helper script inside this kernel so the names it defines
# become available to the cells below.
# NOTE(review): read_jsonl (used further down) is not defined in this notebook,
# so it is presumably provided by this script - confirm.
%run "../read_and_write_docs.py"

In [19]:
# Input: JSONL file that is loaded into `df` below (paths are relative to the
# notebook's working directory).
data_loc = "../../../../datasets/blogger/raw/rephrased_preprocessed.jsonl"
# Output: directory that receives the per-document batch_<doc_id>.jsonl files
# written by save_batch_jsonl.
batch_loc = "../../../../datasets/blogger/batch_lexical_difference/"

In [20]:
# System message attached to every request built by row_to_jsonl. It instructs
# the model to return at least twenty paraphrases and shows the expected layout
# as a JSON object with "original" and "paraphrase_<n>" keys. The text below is
# sent verbatim, so any edit to it changes what the model receives.
system_prompt = """You are a paraphrasing assistant. Your task is to generate paraphrased sentences that retain the original meaning, tone, and style but demonstrate maximum lexical and structural variety.
Each paraphrase should use distinct vocabulary and sentence structures, prioritizing as much lexical difference as possible.

Guidelines:
- Create AT LEAST TWENTY unique paraphrases.
- **Avoid repeating words or phrases across paraphrases,** unless they are critical to meaning (e.g., names or specific technical terms).
- Use varied synonyms, alter phrasing, and experiment with different sentence structures to ensure each paraphrase feels fresh and unique.
- Examples of strategies to achieve this include: using metaphors or idioms, reordering clauses, shifting perspectives, and exploring different grammatical constructions.
- Preserve the original intent and style without adding new information or altering names.

DO NOT INCLUDE ANY NOTE OR ADDITIONAL TEXT IN THE OUTPUT.

Example in JSON format:

input: "Although the skill appears easy at first, it can take a long time to master."

Output:
{
  "original": "Although the skill appears easy at first, it can take a long time to master.",
  "paraphrase_1": "Initially, the skill may seem effortless, yet true mastery demands a lengthy commitment.",
  "paraphrase_2": "What begins as a simple-looking skill often turns into a time-consuming mastery process.",
  "paraphrase_3": "While appearing simple at the outset, mastering this skill typically requires extended effort.",
  "paraphrase_4": "Despite an easy start, reaching mastery in this skill can be a prolonged journey.",
  "paraphrase_5": "This skill, while seemingly straightforward at first glance, requires considerable time to excel in.",
  "paraphrase_6": "Even if it looks easy at the beginning, achieving expertise in this skill may be time-intensive.",
  "paraphrase_7": "Though simple in appearance, the skill demands time and practice to truly master.",
  "paraphrase_8": "Achieving proficiency in this skill can take substantial time, even if it seems easy initially.",
  "paraphrase_9": "While the skill might look easy at the start, honing it to perfection can require considerable time.",
  "paraphrase_10": "It might seem straightforward to pick up, yet mastering this skill is often a slow process.",
  "paraphrase_11": "Perfecting this seemingly easy skill can actually be a long and demanding task.",
  "paraphrase_12": "Though it appears simple to learn, achieving mastery in this skill often takes a significant amount of time.",
  "paraphrase_13": "Initially, the skill may come across as effortless, but true proficiency is typically time-consuming.",
  "paraphrase_14": "Mastering this skill is a lengthy pursuit, despite its initial simplicity.",
  "paraphrase_15": "While it looks uncomplicated at first, gaining full mastery of this skill can be a long journey.",
  "paraphrase_16": "Even though this skill seems straightforward, becoming proficient usually takes an extended period.",
  "paraphrase_17": "Mastery of this seemingly simple skill often requires more time than one might expect.",
  "paraphrase_18": "Though it may appear easy at first glance, mastering this skill can be a drawn-out process.",
  "paraphrase_19": "Although appearing effortless at first, this skill demands time and patience for true mastery.",
  "paraphrase_20": "While this skill may look easy initially, true expertise often requires a great deal of time to develop."
}
"""

In [21]:
def row_to_jsonl(row, prompt=None, model="gpt-4o-mini", max_tokens=5000,
                 temperature=1, top_p=1):
    """Build one chat-completions batch request for a single text chunk.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['custom_id']`` and ``row['text']``, e.g. a
        pandas Series yielded by ``DataFrame.iterrows()`` or a plain dict.
    prompt : str, optional
        System message to send. When omitted, the notebook-level
        ``system_prompt`` is used, which is the original behaviour.
    model : str, optional
        Model name placed in the request body.
    max_tokens : int, optional
        Upper bound on generated tokens for the request.
    temperature, top_p : number, optional
        Sampling settings forwarded unchanged in the request body.

    Returns
    -------
    dict
        A request with ``custom_id``, ``method``, ``url`` and ``body`` keys,
        ready to be serialised as one line of a JSONL batch file.

    Raises
    ------
    KeyError
        If ``row`` lacks a ``custom_id`` or ``text`` entry (dict / Series).
    NameError
        If ``prompt`` is omitted and no global ``system_prompt`` exists.
    """
    if prompt is None:
        # Looked up at call time so that re-running the prompt cell takes
        # effect without having to redefine this function.
        prompt = system_prompt

    return {
        "custom_id": row['custom_id'],
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": prompt},
                {"role": "user", "content": row['text']}
            ],
            "max_tokens": max_tokens,
            "temperature": temperature,
            "top_p": top_p,
            # Ask for a JSON object so the reply matches the layout shown in
            # the system prompt's example.
            "response_format": {"type": "json_object"}
        }
    }

In [22]:
def save_batch_jsonl(df, batch_loc):
    """Write one JSONL request file per document.

    Rows of ``df`` are grouped by ``doc_id``; every row of a group is turned
    into a request with ``row_to_jsonl`` and the group is written to
    ``<batch_loc>/batch_<doc_id>.jsonl``, one JSON object per line. An existing
    file of the same name is overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``doc_id`` column plus whatever columns
        ``row_to_jsonl`` reads from each row.
    batch_loc : str
        Directory that receives the files. It is created if missing.

    Returns
    -------
    None
        Prints a confirmation message once all files are written.
    """
    # Create the target directory up front so a fresh checkout does not fail
    # with FileNotFoundError on the first open(). An empty string means the
    # current directory, which os.makedirs would reject.
    if batch_loc:
        os.makedirs(batch_loc, exist_ok=True)

    # A single grouping pass replaces re-filtering the whole frame once per
    # document; sort=False keeps documents in order of first appearance.
    for doc_id, doc_rows in df.groupby('doc_id', sort=False):
        # Build every request before opening the file so that a failure in
        # row_to_jsonl does not leave a half-written batch behind.
        requests = [row_to_jsonl(row) for _, row in doc_rows.iterrows()]
        file_path = os.path.join(batch_loc, f"batch_{doc_id}.jsonl")

        # Explicit encoding keeps the output independent of the platform's
        # default locale.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(request) + '\n' for request in requests)

    print("JSONL files created successfully.")

In [23]:
# Load the input file. The result is used as a pandas DataFrame below, with
# (at least) doc_id, chunk_id and text columns.
# NOTE(review): read_jsonl is not defined in this notebook - presumably it
# comes from the helper script executed with %run above; confirm.
df = read_jsonl(data_loc)

In [24]:
# Display the loaded frame as a sanity check (one row per text chunk).
df

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id
0,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1
1,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",I wouldnt be exaggerating if I called them a r...,2
2,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",They entered the mainstream at the time when j...,3
3,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",They are indeed the pioneers of Pakistani rock.,4
4,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",While mummy daddies refused to recognize junoo...,5
...,...,...,...,...,...,...,...,...,...
3252,678838,1241231,male,37,Technology,Aries,"27,March,2003",And now Blairs troops are firing their wire-gu...,24
3253,678838,1241231,male,37,Technology,Aries,"27,March,2003","I have walked the citys streets, along a road ...",25
3254,678838,1241231,male,37,Technology,Aries,"27,March,2003","The casualties were children, of course, becau...",26
3255,678838,1241231,male,37,Technology,Aries,"27,March,2003",I held a handkerchief over my face as I stood ...,27


In [25]:
# How many separate requests are issued for every chunk.
N_REPETITIONS = 10

# Date stamp (DDMMYYYY) prefixed to every custom_id. It is taken from the
# clock, so ids differ depending on the day this cell is run.
current_date = datetime.now().strftime("%d%m%Y")

# Repeat each row N_REPETITIONS times, keeping the copies adjacent. Working on
# a freshly numbered copy means the result does not depend on df.index being
# unique or unnamed (the previous reset_index()/groupby('index') approach did).
df_positional = df.reset_index(drop=True)
df_repeated = (
    df_positional
    .loc[np.repeat(df_positional.index, N_REPETITIONS)]
    .reset_index(drop=True)
)

# Number the copies of each row 1..N_REPETITIONS. Copies are contiguous, so the
# position within each run of N_REPETITIONS rows is the repetition number.
df_repeated['repetition'] = df_repeated.index % N_REPETITIONS + 1

# Create the custom_id column: <date>_doc_<doc_id>_chunk_<chunk_id>_<repetition>
df_repeated['custom_id'] = (
    current_date + '_' +
    'doc_' + df_repeated['doc_id'].astype(str) +
    '_chunk_' + df_repeated['chunk_id'].astype(str) +
    '_' + df_repeated['repetition'].astype(str)
)

# custom_id is what ties a response back to its request, so duplicates (which
# would arise from repeated (doc_id, chunk_id) pairs) must be caught here.
assert df_repeated['custom_id'].is_unique, (
    "duplicate custom_id values: (doc_id, chunk_id) pairs are not unique in df"
)

In [26]:
# Display the expanded frame to check the repetition and custom_id columns.
df_repeated

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id,repetition,custom_id
0,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,1,12112024_doc_16188_chunk_1_1
1,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,2,12112024_doc_16188_chunk_1_2
2,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,3,12112024_doc_16188_chunk_1_3
3,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,4,12112024_doc_16188_chunk_1_4
4,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,5,12112024_doc_16188_chunk_1_5
...,...,...,...,...,...,...,...,...,...,...,...
32565,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,6,12112024_doc_678838_chunk_28_6
32566,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,7,12112024_doc_678838_chunk_28_7
32567,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,8,12112024_doc_678838_chunk_28_8
32568,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,9,12112024_doc_678838_chunk_28_9


In [27]:
# Write one batch_<doc_id>.jsonl file per document into batch_loc
# (existing files with the same name are overwritten).
save_batch_jsonl(df_repeated, batch_loc)

JSONL files created successfully.


In [28]:
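# Reference only (not executed): minimal example of a single request line in a
# batch input file, kept for comparison with the output of row_to_jsonl.
# {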
#     "custom_id": "request-1",
#     "method": "POST",
#     "url": "/v1/chat/completions",
#     "body":{
#         "model": "gpt-3.5-turbo-0125",
#         "messages": [{"role": "system", "content": "You are a helpful assistant."},
#                      {"role": "user", "content": "Hello world!"}],
#         "max_tokens": 1000
#     }
# }