In [8]:
import pandas as pd
import numpy as np
import json
import os

In [9]:
# Run the shared helper script so the names it defines become available in this notebook.
# NOTE(review): read_jsonl_file is called further down but is not defined in this notebook,
# so it presumably comes from this script — confirm.
%run "../read_and_write_docs.py"

In [10]:
# Output directory: save_batch_jsonl writes one batch_<doc_id>.jsonl file per document here.
batch_loc = "../../../../datasets/blogger/batch/"

# Input JSONL file that is loaded into `df` further down.
data_loc = "../../../../datasets/blogger/raw/rephrased_preprocessed.jsonl"

In [11]:
# List-style variant of the paraphrasing prompt. Adjacent string literals are
# concatenated by the parser, so this is already a single-line string.
# No repr() is applied: repr() would wrap the prompt in literal quote characters
# (and escape its contents), and that altered text would be sent to the model.
# NOTE: the next cell reassigns `system_prompt`, so the JSON-style prompt defined
# there is the one row_to_jsonl actually uses.
system_prompt = (
    "You are a paraphrasing assistant, given a sentence generate as many paraphrased "
    "sentences as possible while preserving the original semantic meaning and style. "
    "Return the rephrased sentences as a Python list. Aim for AT LEAST TWENTY sentences. "
    "DO NOT INCLUDE ANY NOTE OR ADDITIONAL TEXT IN THE OUTPUT. "
    "Make sure to WRAP ALL SENTENCES IN DOUBLE QUOTES AND USE ESCAPED SINGLE QUOTES INSIDE THEM. "
    "If there are NAMED ENTITIES in the sentence DO NOT change the name."
)

In [12]:
# JSON-style paraphrasing prompt. This assignment replaces the list-style prompt
# built in the previous cell, so this is the text row_to_jsonl sends as the
# system message. row_to_jsonl requests a `json_object` response format, and the
# worked example below shows the expected shape: an "original" key followed by
# "paraphrase_1" .. "paraphrase_20".
system_prompt = """
You are a paraphrasing assistant, given a sentence generate as many paraphrased sentences as possible while preserving 
the original semantic meaning and style. 
Aim for AT LEAST TWENTY sentences. 
DO NOT INCLUDE ANY NOTE OR ADDITIONAL TEXT IN THE OUTPUT. 
If there are NAMED ENTITIES in the sentence DO NOT change the name. 

An Example output in JSON format is:

input:  "Although the skill appears easy at first, it can take a long time to master."

Output: {"original": "Although the skill appears easy at first, it can take a long time to master.",
  "paraphrase_1": "While the skill seems simple initially, mastering it may take a considerable amount of time.",
  "paraphrase_2": "Though the skill looks easy at first glance, it can require a long time to master.",
  "paraphrase_3": "Even though the skill appears straightforward, it might take a lot of time to fully master.",
  "paraphrase_4": "The skill may seem easy at the beginning, but it can take a long time to perfect.",
  "paraphrase_5": "Though it seems easy at first, mastering the skill can be time-consuming.",
  "paraphrase_6": "Although the skill looks simple initially, it often takes a long time to master.",
  "paraphrase_7": "Even if the skill seems easy at first, it can take a considerable time to master.",
  "paraphrase_8": "Though the skill seems easy at the start, mastering it can take a significant amount of time.",
  "paraphrase_9": "While it may seem easy at first, the skill can take a long time to master.",
  "paraphrase_10": "Although it appears simple at first, mastering the skill can take a lot of time.",
  "paraphrase_11": "At first, the skill might seem easy, but mastering it can take a long time.",
  "paraphrase_12": "Even if the skill appears simple at the outset, it may require a long time to master.",
  "paraphrase_13": "Though the skill might seem straightforward initially, mastering it can take a significant amount of time.",
  "paraphrase_14": "Although it looks easy at the beginning, mastering the skill often takes a long time.",
  "paraphrase_15": "The skill may seem simple at first, but it can take a long time to master.",
  "paraphrase_16": "Although it appears easy initially, mastering the skill can be a lengthy process.",
  "paraphrase_17": "The skill may look simple at first, but it can take quite some time to master.",
  "paraphrase_18": "Even though it seems easy at the start, mastering the skill can take a lot of time.",
  "paraphrase_19": "Although the skill appears simple, mastering it can require a long time.",
  "paraphrase_20": "While the skill might seem easy at first, it can take a significant amount of time to master."}

"""

In [13]:
def row_to_jsonl(row, prompt=None, model="gpt-4o-mini", max_tokens=5000):
    """Build the request entry for one row of a batch JSONL file.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide 'custom_id' (copied to the request's custom_id) and
        'text' (sent as the user message).
    prompt : str, optional
        Content of the system message. When omitted, the notebook-level
        `system_prompt` is looked up at call time, which is what the
        original single-argument call did.
    model : str, optional
        Model name placed in the request body.
    max_tokens : int, optional
        Completion token limit placed in the request body.

    Returns
    -------
    dict
        A JSON-serialisable request with 'custom_id', 'method', 'url' and
        'body' keys.
    """
    if prompt is None:
        # Fall back to the notebook global so existing row_to_jsonl(row) calls keep working.
        prompt = system_prompt

    return {
        "custom_id": row['custom_id'],
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": [
                {"role": "system", "content": prompt},
                {"role": "user", "content": row['text']}
            ],
            "max_tokens": max_tokens,
            "temperature": 1,
            "top_p": 1,
            "response_format": {"type": "json_object"}
        }
    }

In [14]:
def save_batch_jsonl(df, batch_loc):
    """Write one batch JSONL file per document.

    Rows are grouped by 'doc_id'. Each group is written to
    ``batch_<doc_id>.jsonl`` inside `batch_loc`, with one request (as built
    by row_to_jsonl) per line. Rows whose 'doc_id' is missing are not written.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'doc_id' column plus whatever columns row_to_jsonl reads.
    batch_loc : str
        Output directory; created if it does not exist yet.
    """
    # Create the output directory up front so a fresh checkout does not fail on open().
    # An empty batch_loc means "current directory", which os.makedirs would reject.
    if batch_loc:
        os.makedirs(batch_loc, exist_ok=True)

    # A single groupby pass replaces re-filtering the whole frame for every doc_id;
    # sort=False keeps documents in order of first appearance.
    for doc_id, doc_rows in df.groupby('doc_id', sort=False):
        # Serialise every request before opening the file, so a failure while
        # building a request does not leave a truncated file behind.
        lines = [json.dumps(row_to_jsonl(row)) for _, row in doc_rows.iterrows()]
        file_path = os.path.join(batch_loc, f"batch_{doc_id}.jsonl")

        with open(file_path, 'w', encoding='utf-8') as f:
            for line in lines:
                f.write(line + '\n')

    print("JSONL files created successfully.")

In [15]:
# Load the input records from data_loc. The result is displayed and indexed as a
# pandas DataFrame in the cells below.
df = read_jsonl_file(data_loc)

In [16]:
# Preview the loaded frame.
df

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id
0,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1
1,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",I wouldnt be exaggerating if I called them a r...,2
2,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",They entered the mainstream at the time when j...,3
3,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",They are indeed the pioneers of Pakistani rock.,4
4,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",While mummy daddies refused to recognize junoo...,5
...,...,...,...,...,...,...,...,...,...
3252,678838,1241231,male,37,Technology,Aries,"27,March,2003",And now Blairs troops are firing their wire-gu...,24
3253,678838,1241231,male,37,Technology,Aries,"27,March,2003","I have walked the citys streets, along a road ...",25
3254,678838,1241231,male,37,Technology,Aries,"27,March,2003","The casualties were children, of course, becau...",26
3255,678838,1241231,male,37,Technology,Aries,"27,March,2003",I held a handkerchief over my face as I stood ...,27


In [17]:
# Number of copies made of every input row. Each copy gets its own custom_id and
# becomes a separate request when the batch files are written.
N_REPETITIONS = 10

# Repeat each row N_REPETITIONS times, keeping the copies adjacent. Repeating by
# position (iloc) instead of by label means this does not depend on the index
# being unique, unnamed, or free of a pre-existing 'index' column.
df_repeated = df.iloc[np.repeat(np.arange(len(df)), N_REPETITIONS)].reset_index(drop=True)

# Number the copies 1..N_REPETITIONS within each block of repeated rows
df_repeated['repetition'] = np.tile(np.arange(1, N_REPETITIONS + 1), len(df))

# Create the custom_id column: doc_<doc_id>_chunk_<chunk_id>_<repetition>
df_repeated['custom_id'] = (
    'doc_' + df_repeated['doc_id'].astype(str) +
    '_chunk_' + df_repeated['chunk_id'].astype(str) +
    '_' + df_repeated['repetition'].astype(str)
)

In [18]:
# Preview the repeated frame, including the new 'repetition' and 'custom_id' columns.
df_repeated

Unnamed: 0,doc_id,author_id,gender,age,topic,sign,date,text,chunk_id,repetition,custom_id
0,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,1,doc_16188_chunk_1_1
1,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,2,doc_16188_chunk_1_2
2,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,3,doc_16188_chunk_1_3
3,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,4,doc_16188_chunk_1_4
4,16188,3952922,male,16,Student,Sagittarius,"17,August,2004",Junoon_____over the years 1991 saw the birth o...,1,5,doc_16188_chunk_1_5
...,...,...,...,...,...,...,...,...,...,...,...
32565,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,6,doc_678838_chunk_28_6
32566,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,7,doc_678838_chunk_28_7
32567,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,8,doc_678838_chunk_28_8
32568,678838,1241231,male,37,Technology,Aries,"27,March,2003",The dust blew in from the southern battlefield...,28,9,doc_678838_chunk_28_9


In [19]:
# Write one batch_<doc_id>.jsonl file per document into batch_loc.
save_batch_jsonl(df_repeated, batch_loc)

JSONL files created successfully.


In [None]:
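# Reference only: a minimal example of one request line, with the same top-level
# fields (custom_id, method, url, body) that row_to_jsonl produces.
# {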
#     "custom_id": "request-1",
#     "method": "POST",
#     "url": "/v1/chat/completions",
#     "body":{
#         "model": "gpt-3.5-turbo-0125",
#         "messages": [{"role": "system", "content": "You are a helpful assistant."},
#                      {"role": "user", "content": "Hello world!"}],
#         "max_tokens": 1000
#     }
# }