In [88]:
import pandas as pd
import ast
import json

from lark import Lark, UnexpectedInput

In [89]:
%run "../read_and_write_docs.py"

In [170]:
# g_drive_loc = "/Users/user/Library/CloudStorage/GoogleDrive-benjcross1995@gmail.com/My Drive/"
# batch_complete_loc = f"{g_drive_loc}datasets/blogger/batch_complete/"
batch_complete_loc = "../../../../datasets/blogger/batch_complete/"
batch_save_loc = "../../../../datasets/blogger/batch_4o/"
batch_fail_loc = "../../../../datasets/blogger/batch_4o_fail/"

In [171]:
grammar_loc = "../../grammars/list_enf_format.lark"

In [172]:
def validate_and_parse_response(df, grammar_file):
    # Load the grammar
    with open(grammar_file, 'r') as file:
        grammar = file.read()
    
    parser = Lark(grammar, start='root', parser='earley')

In [173]:
# Function to extract the desired parts
def split_custon_id(custon_id):
    parts = custon_id.split('_')
    doc_id = parts[1]
    chunk_id = parts[3]
    repetition = parts[4]
    return pd.Series([doc_id, chunk_id, repetition])

In [174]:
def parse_response(response_str):
    # Convert the JSON string to a Python dictionary
    response_dict = json.loads(response_str)
    
    # Extract the 'original' sentence
    original_sentence = response_dict.get('original', '')
    
    # Extract other keys and add them to the list with 'repetition_i' format
    rephrased = []
    for key, value in response_dict.items():
        if key != 'original':
            rephrased.append(value)
    
    return original_sentence, rephrased

def process_dataframe(df):

    df[['doc_id', 'chunk_id', 'repetition']] = df['custom_id'].apply(split_custon_id)
    
    # Apply the parse_response function to each row of the dataframe
    df['original_sentence'], df['rephrased'] = zip(*df['response'].apply(parse_response))

    df = df[['doc_id', 'chunk_id', 'original_sentence', 'rephrased']]
    
    return df

In [175]:
def combine_and_unique(group):
    # Combine all lists in the 'rephrased' column into one list
    combined_list = sum(group['rephrased'], [])
    
    # Remove duplicates by converting to a set and back to a list
    unique_list = list(set(combined_list))
    
    # Return a DataFrame where each unique rephrased sentence is a new row
    return pd.DataFrame({
        'doc_id': group['doc_id'].iloc[0],
        'chunk_id': group['chunk_id'].iloc[0],
        'original': group['original_sentence'].iloc[0],
        'rephrased': unique_list
    })

def process_rephrased_sentences(df):
    # Group by 'doc_id' and 'chunk_id' and apply the combine_and_unique function
    result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
    return result_df

In [176]:
def process_file_list(files, base_read_loc, base_write_loc, base_fail_loc):

    for file in files:
        
        response = read_jsonl_file(f"{base_read_loc}{file}")
        print(f"{file} Loaded")
        
        processed_dataframe = process_dataframe(response)
        print(f"{file} Processed")

        final_df = process_rephrased_sentences(processed_dataframe)
        print(f"{file} List Split")
        
        save_loc = f"{base_write_loc}{file.replace('batch', 'doc')}"

        save_as_jsonl(final_df, save_loc)

In [177]:
import os
import shutil

def process_file_list(files, base_read_loc, base_write_loc, base_fail_loc):
    for file in files:
        try:
            # Step 1: Read the JSONL file
            response = read_jsonl_file(f"{base_read_loc}{file}")
            print(f"{file} Loaded")
            
            # Step 2: Process the DataFrame
            processed_dataframe = process_dataframe(response)
            print(f"{file} Processed")

            # Step 3: Split the list into separate rows
            final_df = process_rephrased_sentences(processed_dataframe)
            print(f"{file} List Split")
            
            # Step 4: Save the processed DataFrame to a new location
            save_loc = f"{base_write_loc}{file.replace('batch', 'doc')}"
            save_as_jsonl(final_df, save_loc)
            print(f"{file} Saved to {save_loc}")
            
        except Exception as e:
            # Step 5: If any error occurs, save the file to the fail location
            fail_loc = f"{base_fail_loc}{file}"
            print(f"Error processing {file}: {e}")
            shutil.move(f"{base_read_loc}{file}", fail_loc)
            print(f"{file} Moved to {fail_loc}")
            continue  # Move to the next file in case of an error
        
        # Step 6: Remove the original file from the read location if processing succeeds
        os.remove(f"{base_read_loc}{file}")
        print(f"{file} Removed from {base_read_loc}")

In [178]:
files_to_be_processed = [
    f for f in os.listdir(batch_complete_loc)
    if os.path.isfile(os.path.join(batch_complete_loc, f)) and f.endswith('.jsonl')
]

files_processed = [
    f for f in os.listdir(batch_save_loc)
    if os.path.isfile(os.path.join(batch_save_loc, f)) and f.endswith('.jsonl')
]

# Extracting the numeric part from files_processed and transforming it back to batch form
processed_batches = {f"batch_{file.split('_')[1]}" for file in files_processed}
processed_ids = {file.split('_')[1].split('.')[0] for file in files_processed}

# Finding files in files_to_be_processed that are not in processed_batches
files_not_processed = [file for file in files_to_be_processed if file.split('_')[1].split('.')[0] not in processed_ids]

In [179]:
len(files_not_processed)

100

In [180]:
process_file_list(files_not_processed, batch_complete_loc, batch_save_loc, batch_fail_loc)

batch_617540.jsonl Loaded
batch_617540.jsonl Processed
batch_617540.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_617540.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_617540.jsonl
batch_617540.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_579188.jsonl Loaded
batch_579188.jsonl Processed
batch_579188.jsonl List Split
batch_579188.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_579188.jsonl
batch_579188.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_448015.jsonl Loaded
batch_448015.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_448015.jsonl List Split
batch_448015.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_448015.jsonl
batch_448015.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_671397.jsonl Loaded
batch_671397.jsonl Processed
batch_671397.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_671397.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_671397.jsonl
batch_671397.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_543206.jsonl Loaded
batch_543206.jsonl Processed
batch_543206.jsonl List Split
batch_543206.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_543206.jsonl
batch_543206.jsonl Removed from ../../../../datasets/blogger/batch_complete/


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_17850.jsonl Loaded
batch_17850.jsonl Processed
batch_17850.jsonl List Split
batch_17850.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_17850.jsonl
batch_17850.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_429760.jsonl Loaded
batch_429760.jsonl Processed
batch_429760.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_429760.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_429760.jsonl
batch_429760.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_357437.jsonl Loaded
batch_357437.jsonl Processed
batch_357437.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_357437.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_357437.jsonl
batch_357437.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_445957.jsonl Loaded
batch_445957.jsonl Processed
batch_445957.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_445957.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_445957.jsonl
batch_445957.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_349665.jsonl Loaded
batch_349665.jsonl Processed
batch_349665.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_349665.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_349665.jsonl
batch_349665.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_323518.jsonl Loaded
batch_323518.jsonl Processed
batch_323518.jsonl List Split
batch_323518.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_323518.jsonl
batch_323518.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_214228.jsonl Loaded
batch_214228.jsonl Processed
batch_214228.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_214228.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_214228.jsonl
batch_214228.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_526719.jsonl Loaded
batch_526719.jsonl Processed
batch_526719.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_526719.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_526719.jsonl
batch_526719.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_350617.jsonl Loaded
batch_350617.jsonl Processed
batch_350617.jsonl List Split
batch_350617.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_350617.jsonl
batch_350617.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_184355.jsonl Loaded
batch_184355.jsonl Processed
batch_184355.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_184355.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_184355.jsonl
batch_184355.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_202213.jsonl Loaded
batch_202213.jsonl Processed
batch_202213.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_202213.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_202213.jsonl
batch_202213.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_334376.jsonl Loaded
batch_334376.jsonl Processed
batch_334376.jsonl List Split
batch_334376.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_334376.jsonl
batch_334376.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_530975.jsonl Loaded
batch_530975.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_530975.jsonl List Split
batch_530975.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_530975.jsonl
batch_530975.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_341238.jsonl Loaded
batch_341238.jsonl Processed
batch_341238.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_341238.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_341238.jsonl
batch_341238.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_678838.jsonl Loaded
batch_678838.jsonl Processed
batch_678838.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_678838.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_678838.jsonl
batch_678838.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_144356.jsonl Loaded
Error processing batch_144356.jsonl: Unterminated string starting at: line 21 column 20 (char 23204)
batch_144356.jsonl Moved to ../../../../datasets/blogger/batch_4o_fail/batch_144356.jsonl
batch_297972.jsonl Loaded
batch_297972.jsonl Processed
batch_297972.jsonl List Split
batch_297972.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_297972.jsonl


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_297972.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_481359.jsonl Loaded
batch_481359.jsonl Processed
batch_481359.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_481359.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_481359.jsonl
batch_481359.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_343349.jsonl Loaded
batch_343349.jsonl Processed
batch_343349.jsonl List Split
batch_343349.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_343349.jsonl
batch_343349.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_175880.jsonl Loaded
batch_175880.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_175880.jsonl List Split
batch_175880.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_175880.jsonl
batch_175880.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_18516.jsonl Loaded
batch_18516.jsonl Processed
batch_18516.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_18516.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_18516.jsonl
batch_18516.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_57262.jsonl Loaded
batch_57262.jsonl Processed
batch_57262.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_57262.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_57262.jsonl
batch_57262.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_542822.jsonl Loaded
batch_542822.jsonl Processed
batch_542822.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_542822.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_542822.jsonl
batch_542822.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_288775.jsonl Loaded
batch_288775.jsonl Processed
batch_288775.jsonl List Split
batch_288775.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_288775.jsonl
batch_288775.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_306438.jsonl Loaded
batch_306438.jsonl Processed
batch_306438.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_306438.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_306438.jsonl
batch_306438.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_125908.jsonl Loaded
batch_125908.jsonl Processed
batch_125908.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_125908.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_125908.jsonl
batch_125908.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_207710.jsonl Loaded
batch_207710.jsonl Processed
batch_207710.jsonl List Split
batch_207710.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_207710.jsonl
batch_207710.jsonl Removed from ../../../../datasets/blogger/batch_complete/


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_124522.jsonl Loaded
batch_124522.jsonl Processed
batch_124522.jsonl List Split
batch_124522.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_124522.jsonl
batch_124522.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_506923.jsonl Loaded
batch_506923.jsonl Processed
batch_506923.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_506923.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_506923.jsonl
batch_506923.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_37303.jsonl Loaded
batch_37303.jsonl Processed
batch_37303.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_37303.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_37303.jsonl
batch_37303.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_402496.jsonl Loaded
Error processing batch_402496.jsonl: Unterminated string starting at: line 20 column 20 (char 18698)
batch_402496.jsonl Moved to ../../../../datasets/blogger/batch_4o_fail/batch_402496.jsonl
batch_560732.jsonl Loaded
batch_560732.jsonl Processed
batch_560732.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_560732.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_560732.jsonl
batch_560732.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_621278.jsonl Loaded
batch_621278.jsonl Processed
batch_621278.jsonl List Split
batch_621278.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_621278.jsonl
batch_621278.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_397139.jsonl Loaded
batch_397139.jsonl Processed
batch_397139.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_397139.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_397139.jsonl
batch_397139.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_99088.jsonl Loaded
batch_99088.jsonl Processed
batch_99088.jsonl List Split
batch_99088.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_99088.jsonl
batch_99088.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_16188.jsonl Loaded
batch_16188.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_16188.jsonl List Split
batch_16188.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_16188.jsonl
batch_16188.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_597056.jsonl Loaded
batch_597056.jsonl Processed
batch_597056.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_597056.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_597056.jsonl
batch_597056.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_110133.jsonl Loaded
batch_110133.jsonl Processed
batch_110133.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_110133.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_110133.jsonl
batch_110133.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_274759.jsonl Loaded
batch_274759.jsonl Processed
batch_274759.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_274759.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_274759.jsonl
batch_274759.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_483447.jsonl Loaded
batch_483447.jsonl Processed
batch_483447.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_483447.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_483447.jsonl
batch_483447.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_452954.jsonl Loaded
batch_452954.jsonl Processed
batch_452954.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_452954.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_452954.jsonl
batch_452954.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_67056.jsonl Loaded
batch_67056.jsonl Processed
batch_67056.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_67056.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_67056.jsonl
batch_67056.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_623524.jsonl Loaded
batch_623524.jsonl Processed
batch_623524.jsonl List Split
batch_623524.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_623524.jsonl
batch_623524.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_207833.jsonl Loaded
batch_207833.jsonl Processed
batch_207833.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_207833.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_207833.jsonl
batch_207833.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_26252.jsonl Loaded
batch_26252.jsonl Processed
batch_26252.jsonl List Split
batch_26252.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_26252.jsonl
batch_26252.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_544873.jsonl Loaded
Error processing batch_544873.jsonl: Unterminated string starting at: line 1 column 3465 (char 3464)


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_544873.jsonl Moved to ../../../../datasets/blogger/batch_4o_fail/batch_544873.jsonl
batch_317636.jsonl Loaded
batch_317636.jsonl Processed
batch_317636.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_317636.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_317636.jsonl
batch_317636.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_433589.jsonl Loaded
batch_433589.jsonl Processed
batch_433589.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_433589.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_433589.jsonl
batch_433589.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_505205.jsonl Loaded
batch_505205.jsonl Processed
batch_505205.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_505205.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_505205.jsonl
batch_505205.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_425775.jsonl Loaded
batch_425775.jsonl Processed
batch_425775.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_425775.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_425775.jsonl
batch_425775.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_49343.jsonl Loaded
batch_49343.jsonl Processed
batch_49343.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_49343.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_49343.jsonl
batch_49343.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_490831.jsonl Loaded
batch_490831.jsonl Processed
batch_490831.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_490831.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_490831.jsonl
batch_490831.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_183047.jsonl Loaded
batch_183047.jsonl Processed
batch_183047.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_183047.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_183047.jsonl
batch_183047.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_669029.jsonl Loaded
batch_669029.jsonl Processed
batch_669029.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_669029.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_669029.jsonl
batch_669029.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_270402.jsonl Loaded
batch_270402.jsonl Processed
batch_270402.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_270402.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_270402.jsonl
batch_270402.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_148224.jsonl Loaded
batch_148224.jsonl Processed
batch_148224.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_148224.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_148224.jsonl
batch_148224.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_488822.jsonl Loaded
batch_488822.jsonl Processed
batch_488822.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_488822.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_488822.jsonl
batch_488822.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_398279.jsonl Loaded
batch_398279.jsonl Processed
batch_398279.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_398279.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_398279.jsonl
batch_398279.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_178480.jsonl Loaded
batch_178480.jsonl Processed
batch_178480.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_178480.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_178480.jsonl
batch_178480.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_279772.jsonl Loaded
batch_279772.jsonl Processed
batch_279772.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_279772.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_279772.jsonl
batch_279772.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_509577.jsonl Loaded
batch_509577.jsonl Processed
batch_509577.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_509577.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_509577.jsonl
batch_509577.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_325534.jsonl Loaded
batch_325534.jsonl Processed
batch_325534.jsonl List Split
batch_325534.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_325534.jsonl
batch_325534.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_116471.jsonl Loaded
batch_116471.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_116471.jsonl List Split
batch_116471.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_116471.jsonl
batch_116471.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_274492.jsonl Loaded
batch_274492.jsonl Processed
batch_274492.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_274492.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_274492.jsonl
batch_274492.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_222624.jsonl Loaded
batch_222624.jsonl Processed
batch_222624.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_222624.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_222624.jsonl
batch_222624.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_155142.jsonl Loaded
batch_155142.jsonl Processed
batch_155142.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_155142.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_155142.jsonl
batch_155142.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_512026.jsonl Loaded
batch_512026.jsonl Processed
batch_512026.jsonl List Split
batch_512026.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_512026.jsonl
batch_512026.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_174699.jsonl Loaded
batch_174699.jsonl Processed
batch_174699.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_174699.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_174699.jsonl
batch_174699.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_114599.jsonl Loaded
batch_114599.jsonl Processed
batch_114599.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_114599.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_114599.jsonl
batch_114599.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_122748.jsonl Loaded
batch_122748.jsonl Processed
batch_122748.jsonl List Split
batch_122748.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_122748.jsonl
batch_122748.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_198223.jsonl Loaded


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_198223.jsonl Processed
batch_198223.jsonl List Split
batch_198223.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_198223.jsonl
batch_198223.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_517663.jsonl Loaded
batch_517663.jsonl Processed
batch_517663.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_517663.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_517663.jsonl
batch_517663.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_386133.jsonl Loaded
batch_386133.jsonl Processed
batch_386133.jsonl List Split
batch_386133.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_386133.jsonl
batch_386133.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_546588.jsonl Loaded
batch_546588.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_546588.jsonl List Split
batch_546588.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_546588.jsonl
batch_546588.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_676573.jsonl Loaded
batch_676573.jsonl Processed
batch_676573.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_676573.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_676573.jsonl
batch_676573.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_306449.jsonl Loaded
batch_306449.jsonl Processed
batch_306449.jsonl List Split
batch_306449.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_306449.jsonl
batch_306449.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_607120.jsonl Loaded
batch_607120.jsonl Processed
batch_607120.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_607120.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_607120.jsonl
batch_607120.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_272513.jsonl Loaded
batch_272513.jsonl Processed
batch_272513.jsonl List Split
batch_272513.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_272513.jsonl
batch_272513.jsonl Removed from ../../../../datasets/blogger/batch_complete/


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_496050.jsonl Loaded
batch_496050.jsonl Processed
batch_496050.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_496050.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_496050.jsonl
batch_496050.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_82243.jsonl Loaded
batch_82243.jsonl Processed
batch_82243.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_82243.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_82243.jsonl
batch_82243.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_183886.jsonl Loaded
batch_183886.jsonl Processed
batch_183886.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_183886.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_183886.jsonl
batch_183886.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_363388.jsonl Loaded
batch_363388.jsonl Processed
batch_363388.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_363388.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_363388.jsonl
batch_363388.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_219981.jsonl Loaded
batch_219981.jsonl Processed
batch_219981.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_219981.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_219981.jsonl
batch_219981.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_394143.jsonl Loaded
batch_394143.jsonl Processed
batch_394143.jsonl List Split
batch_394143.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_394143.jsonl
batch_394143.jsonl Removed from ../../../../datasets/blogger/batch_complete/


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_579184.jsonl Loaded
batch_579184.jsonl Processed
batch_579184.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_579184.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_579184.jsonl
batch_579184.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_331546.jsonl Loaded
batch_331546.jsonl Processed
batch_331546.jsonl List Split
batch_331546.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_331546.jsonl
batch_331546.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_291125.jsonl Loaded
batch_291125.jsonl Processed


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_291125.jsonl List Split
batch_291125.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_291125.jsonl
batch_291125.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_611550.jsonl Loaded
batch_611550.jsonl Processed
batch_611550.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_611550.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_611550.jsonl
batch_611550.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_327604.jsonl Loaded
batch_327604.jsonl Processed
batch_327604.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_327604.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_327604.jsonl
batch_327604.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_552933.jsonl Loaded
batch_552933.jsonl Processed
batch_552933.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_552933.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_552933.jsonl
batch_552933.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_585356.jsonl Loaded
batch_585356.jsonl Processed
batch_585356.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_585356.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_585356.jsonl
batch_585356.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_551652.jsonl Loaded
batch_551652.jsonl Processed
batch_551652.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_551652.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_551652.jsonl
batch_551652.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_491687.jsonl Loaded
batch_491687.jsonl Processed
batch_491687.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_491687.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_491687.jsonl
batch_491687.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_463289.jsonl Loaded
batch_463289.jsonl Processed
batch_463289.jsonl List Split
batch_463289.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_463289.jsonl
batch_463289.jsonl Removed from ../../../../datasets/blogger/batch_complete/
batch_171105.jsonl Loaded
batch_171105.jsonl Processed
batch_171105.jsonl List Split


  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)
  result_df = df.groupby(['doc_id', 'chunk_id']).apply(combine_and_unique).reset_index(drop=True)


batch_171105.jsonl Saved to ../../../../datasets/blogger/batch_4o/doc_171105.jsonl
batch_171105.jsonl Removed from ../../../../datasets/blogger/batch_complete/


In [166]:
fail_test =read_jsonl_file(f"{batch_complete_loc}batch_144356.jsonl")
processed_dataframe = process_dataframe(fail_test)
final_df = process_rephrased_sentences(processed_dataframe)

JSONDecodeError: Unterminated string starting at: line 21 column 20 (char 23204)

In [169]:
fail_test

Unnamed: 0,id,custom_id,response,doc_id,chunk_id,repetition
0,batch_req_iwJgunTo81E2QukR0O3D8s0X,doc_144356_chunk_1_1,"{""original"": ""Almost the end of July.&nbsp; Ho...",144356,1,1
1,batch_req_koOWaGvqW0AZCglgHlo6eYuX,doc_144356_chunk_1_2,"{""original"": ""Almost the end of July. How is y...",144356,1,2
2,batch_req_1O2H6cY6YRsBEgm8is5IFz1T,doc_144356_chunk_1_3,"{""original"": ""Almost the end of July.&nbsp; Ho...",144356,1,3
3,batch_req_yZOdBE93nNZWRBhDRyt3CTEn,doc_144356_chunk_1_4,"{""original"": ""Almost the end of July. How is y...",144356,1,4
4,batch_req_VSe2v2S3Uay1xbhZ5PQRHsZK,doc_144356_chunk_1_5,"{""original"": ""Almost the end of July. How is y...",144356,1,5
5,batch_req_HBUp6qGcAlzxNsStFhJ3GBKM,doc_144356_chunk_1_6,"{""original"": ""Almost the end of July. How is y...",144356,1,6
6,batch_req_9mU0h5BHyqXpIUUcRO5fRrSl,doc_144356_chunk_1_7,"{""original"": ""Almost the end of July. How is y...",144356,1,7
7,batch_req_YdgXrNdRcMia0ZcPmjrTtexP,doc_144356_chunk_1_8,"{""original"": ""Almost the end of July. How is y...",144356,1,8
8,batch_req_tTzKiCFDvn4Xle2jAweFI0Mb,doc_144356_chunk_1_9,"{""original"": ""Almost the end of July.&nbsp; Ho...",144356,1,9
9,batch_req_rZsjSdN4BpyHC9tsTtOzhPup,doc_144356_chunk_1_10,"{""original"": ""Almost the end of July. How is y...",144356,1,10


In [168]:
for index, row in fail_test.iterrows():
    response_str = row['response']
    try:
        response_json = json.loads(response_str)
    except json.JSONDecodeError:
        print(f"Error parsing JSON in row {index}: {response_str}")

Error parsing JSON in row 1: {"original": "Almost the end of July. How is your summer going? Yesterday I visited some Blogs on the Internet and picked up a nasty adware or spyware virus, which has disabled my Internet browser, closes open windows automatically, opens up pop-up advertising windows indiscriminately, and seems to keep reinstalling itself, despite my best attempts to delete any foreign files that I find in my operating system. Mind you, I am such an inept technophobe that I couldn’t differentiate a malicious virus from a harmless bug. None the less, as I am not the administrator of my computer at work, I have to wait until he comes back on Monday to access my computer remote, and reboot it after, hopefully, isolating and decontaminating my hard drive. What a nuisance. I was doubly surprised to find out that my networked PC did not even have any anti-virus software on it, thereby making it a perfect roaming target for this adware or spyware to attach itself to my computer, 

In [161]:
final_df

Unnamed: 0,doc_id,chunk_id,original,rephrased
0,617540,1,It has been suggested that I share some storie...,I have been recommended to talk about some sto...
1,617540,1,It has been suggested that I share some storie...,I've been asked to recount stories about my st...
2,617540,1,It has been suggested that I share some storie...,There have been recommendations for me to reco...
3,617540,1,It has been suggested that I share some storie...,People have suggested that I share a few tales...
4,617540,1,It has been suggested that I share some storie...,It has been proposed that I tell some stories ...
...,...,...,...,...
7943,617540,9,they always have cash and insist on paying for...,They consistently carry cash and make it a poi...
7944,617540,9,they always have cash and insist on paying for...,They always seem to have cash available and in...
7945,617540,9,they always have cash and insist on paying for...,They are always seen with cash and are adamant...
7946,617540,9,they always have cash and insist on paying for...,They typically carry cash and insist on handli...


In [156]:
save_loc = f"{batch_save_loc}batch_402496.jsonl"
save_as_jsonl(final_df, save_loc)

In [33]:
grammar = r"""
start: "[" items "]"

items: item ("," item)*

item: ESCAPED_STRING

ESCAPED_STRING: "\"" /[a-zA-Z0-9.,!?;:’\-()' ]+/ "\""

%ignore " "
%ignore "\n"
"""

parser = Lark(grammar, parser='earley')

In [36]:
grammar = r"""
start: "[" items "]"

items: item ("," item)*

item: ESCAPED_STRING

ESCAPED_STRING: "\"" /[^"]+/ "\""

%ignore " "
%ignore "\n"
"""

parser = Lark(grammar, parser='earley')

In [37]:
def validate_and_parse_response(df, parser):
    # # Load the grammar
    # with open(grammar_file, 'r') as file:
    #     grammar = file.read()
    
    # parser = Lark(grammar, start='start', parser='earley')

    # Function to parse each response
    def parse_response(response):
        try:
            # Check if the response is a string, if so, parse it
            if isinstance(response, str):
                # Validate the response using the grammar
                parser.parse(response)
                # Convert the string to an actual list
                return ast.literal_eval(response)
            else:
                return response
        except (UnexpectedInput, ValueError, SyntaxError) as e:
            print(f"Invalid response format: {response} - Error: {e}")
            return response  # Return the original string if parsing fails

    # Loop through each row and validate/parse the 'response' column
    for index, row in df.iterrows():
        df.at[index, 'response'] = parse_response(row['response'])
    
    return df

In [38]:
test_converted = validate_and_parse_response(test, parser)

Invalid response format: ["It presents itself as more of a supportive scenario rather than a case of earning a hefty sum to have an unpleasant individual with a waste bag beside him perform an unsavory act on you.", 
 "It creates the impression of being a more charitable context rather than a lucrative opportunity to have a repulsive man with a bodily waste bag engage in an indecent act with you.", 
 "It feels more like a scenario where assistance is offered rather than a situation where you get paid a significant amount to have a disgusting man with a sanitary bag next to him satisfy your lower regions.", 
 "This makes it seem more like a situation where you're lending a hand instead of one where you receive substantial compensation to endure the presence of a foul person carrying a waste pouch licking you.", 
 "It comes across as a more altruistic arrangement than one that involves receiving a lot of money for enduring the advances of a nasty guy with a defecation bag beside him.", 


In [15]:
test_converted.iloc[0,2]

'["It has been proposed that I share a few tales about my stripper friends.", "Some have recommended that I tell some stories involving my stripper acquaintances.", "There has been a suggestion for me to relay stories of my stripper buddies.", "I’ve been advised to share anecdotes about my stripper pals.", "It has been hinted that I should narrate some experiences with my stripper friends.", "People have suggested that I recount some stories related to my stripper companions.", "Someone recommended that I share tales featuring my stripper friends.", "There has been an idea floated that I should talk about my stripper pals.", "It has been put forward that I narrate some stories about my stripper friends.", "I’ve received suggestions to tell some stories regarding my stripper friends.", "Some individuals have suggested that I share stories of my stripper buddies.", "I\'ve been encouraged to discuss my experiences with my stripper pals.", "It has been communicated that I might share some 

In [16]:
test_converted

Unnamed: 0,id,custom_id,response,doc_id,chunk_id,repetition
0,batch_req_SGhzjs2FC09ZbXkFCIFVKXAE,doc_617540_chunk_1_1,"[""It has been proposed that I share a few tale...",617540,1,1
1,batch_req_rzSSPUOj9hzG1RRSNSet8rEy,doc_617540_chunk_1_2,"[""It has been recommended that I share a few t...",617540,1,2
2,batch_req_m0hECxM0s1dGDMg7BUyybsxr,doc_617540_chunk_1_3,"[""It has been proposed that I recount some tal...",617540,1,3
3,batch_req_fNvwAF3S3oXC73x2NdNnzhmb,doc_617540_chunk_1_4,"[""It has been recommended that I share a few t...",617540,1,4
4,batch_req_4tzMUFTOq0L52SqnzAKehBVp,doc_617540_chunk_1_5,"[""It has been recommended that I tell some tal...",617540,1,5
...,...,...,...,...,...,...
465,batch_req_ykOKTuWBaylBo3jUIwPh2zMp,doc_617540_chunk_47_6,"[""But I have a feeling that someone made a mov...",617540,47,6
466,batch_req_VO6mIoTEcEcxWWee9Wvxg6KZ,doc_617540_chunk_47_7,"[""But I have a feeling that someone made a mov...",617540,47,7
467,batch_req_B8U6pJ7ToLMa3H39Srserxs9,doc_617540_chunk_47_8,"[""But I have a feeling that someone flirted wi...",617540,47,8
468,batch_req_w50FDCIKy07GW6fOILlCuQpy,doc_617540_chunk_47_9,"[""But I have a feeling that someone flirted wi...",617540,47,9


In [31]:
from lark import Lark, Transformer

grammar = r"""
start: "[" items "]"

items: item ("," item)*

item: string

string: "\"" word (ws+ word)* "\"" 

word: /[a-zA-Z0-9.,!?;:’\-()']+/

ws: " "

%ignore ws
%ignore "\n"
"""

parser = Lark(grammar, parser='earley')

class ListTransformer(Transformer):
    def start(self, items):
        return items[0]

    def items(self, items):
        return items

    def item(self, item):
        return item[0]

    def string(self, string):
        return "".join(string)

    def word(self, word):
        return str(word[0])

tree = parser.parse('["hello", "world", "example"]')
result = ListTransformer().transform(tree)
print(result)


GrammarError: Rules aren't allowed inside terminals (NonTerminal('ws') in __IGNORE_0)