In [352]:
import json
import os
import shutil
import warnings

import pandas as pd

In [353]:
# Location of the credentials file, relative to the notebook
credential_loc = "../../credentials.json"

# Which slice of the dataset this run works on
data_type = "training"
corpus = "Enron"

# Directory prefixes; each one keeps its trailing slash so that file names
# can be appended directly
base_loc = "/Volumes/BCross/datasets/author_verification/"
data_loc = base_loc + data_type + "/" + corpus + "/"

# Files and folders that live under data_loc
raw_data_loc = data_loc + "known_raw.jsonl"
processed_data_loc = data_loc + "known_processed.jsonl"
batch_complete_loc = data_loc + "batch_sentence_complete/"

# ParaScore save location (created here if it does not exist yet)
post_process_loc = data_loc + "batch_postprocessed/"
os.makedirs(post_process_loc, exist_ok=True)

# Phone number for WhatsApp notifications
phone_number = "+447756976114"

### Helper Functions

In [354]:
def read_jsonl(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed as one JSON value. A line that holds a
    single-element list is unwrapped so that its only element becomes the
    record; any other value is used as parsed.

    Args:
        file_path: Path of the JSONL file to read.

    Returns:
        pd.DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    records = []
    # Be explicit about UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (e.g. an extra newline at the end of the file) hold
            # no record; json.loads would raise on them.
            if not line.strip():
                continue

            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """Write a DataFrame to disk as JSON Lines, one row per line.

    Args:
        data: DataFrame whose rows are written in order; each row is dumped
            as a JSON object keyed by column name.
        output_file_path: Destination path, opened in write mode (an existing
            file is overwritten).
    """
    with open(output_file_path, 'w') as sink:
        row_dicts = (row.to_dict() for _, row in data.iterrows())
        for record in row_dicts:
            json.dump(record, sink)
            sink.write('\n')


In [355]:
# Load the raw documents so their count can be reported below
raw_df = read_jsonl(raw_data_loc)

# .jsonl files sitting directly in the batch-output folder
batch_completed_files = [
    name
    for name in os.listdir(batch_complete_loc)
    if name.endswith('.jsonl') and os.path.isfile(os.path.join(batch_complete_loc, name))
]

# .jsonl files already written to the post-processed folder. They are saved
# under a "doc_" name, so "doc_" is mapped back to "batch_" here to make the
# names comparable with the batch-output file names.
files_processed = [
    name.replace("doc_", "batch_")
    for name in os.listdir(post_process_loc)
    if name.endswith('.jsonl') and os.path.isfile(os.path.join(post_process_loc, name))
]

# Batch files that have no post-processed counterpart yet
files_to_be_processed = list(set(batch_completed_files) - set(files_processed))

print(f"Number of documents to process in raw data: {len(raw_df['doc_id'])}")
print(f"Files Complete in Batch location: {len(batch_completed_files)}")
print(f"Files Post Processed: {len(files_processed)}")
print(f"Files to be Processed: {len(files_to_be_processed)}")

Number of documents to process in raw data: 112
Files Complete in Batch location: 112
Files Post Processed: 0
Files to be Processed: 112


In [356]:
def split_custon_id(custon_id):
    """Break an underscore-delimited custom id into its three components.

    Args:
        custon_id: String of '_'-separated fields. Fields 2, 3 and 4
            (zero-based) form the document id; the last two fields are the
            chunk id and the repetition.

    Returns:
        pd.Series holding [doc_id, chunk_id, repetition], all as strings.

    Raises:
        IndexError: If the id has fewer than five '_'-separated fields.
    """
    fields = custon_id.split('_')
    doc_id = "_".join((fields[2], fields[3], fields[4]))
    chunk_id, repetition = fields[-2], fields[-1]
    return pd.Series([doc_id, chunk_id, repetition])

def parse_response(response_str):
    """Decode a JSON response and separate the original text from the rest.

    Args:
        response_str: JSON text encoding an object (dictionary).

    Returns:
        Tuple (original_sentence, rephrased): the value stored under
        'original' ('' when that key is absent) and a list with the values of
        every other key, in the order the keys appear in the object.

    Raises:
        json.JSONDecodeError: If response_str is not valid JSON.
    """
    payload = json.loads(response_str)

    # The source sentence lives under 'original'; default to '' if missing
    original_sentence = payload.get('original', '')

    # Everything that is not the original counts as a rephrasing
    rephrased = [text for field, text in payload.items() if field != 'original']

    return original_sentence, rephrased

def process_dataframe(df):
    """Reduce a loaded batch file to the columns used by the later steps.

    Two input layouts are handled:

    * No 'custom_id' column: the "batch_" text is removed from 'doc_id' and
      the columns 'doc_id', 'chunk_id', 'original', 'rephrased' are returned.
    * 'custom_id' column present: 'doc_id' and 'chunk_id' are derived from
      'custom_id' with split_custon_id, 'original_sentence' and 'rephrased'
      are derived from 'response' with parse_response, and the columns
      'doc_id', 'chunk_id', 'original_sentence', 'rephrased' are returned.

    The DataFrame passed in is not modified; all work is done on a copy.

    Args:
        df: DataFrame in one of the two layouts described above.

    Returns:
        A new DataFrame restricted to the four columns of the matching layout.

    Raises:
        KeyError: If a column required by the matching layout is missing.
    """
    # Work on a copy so that no column of the caller's frame is rewritten or
    # added as a side effect of calling this function.
    df = df.copy()

    if 'custom_id' not in df.columns:
        # "batch_" is a literal prefix text, not a pattern
        df['doc_id'] = df['doc_id'].str.replace("batch_", "", regex=False)
        return df[['doc_id', 'chunk_id', 'original', 'rephrased']]

    # split_custon_id returns a 3-element Series per row, so apply() yields a
    # three-column frame that fills the new columns positionally.
    df[['doc_id', 'chunk_id', 'repetition']] = df['custom_id'].apply(split_custon_id)

    # parse_response returns (original, rephrased) per row; zip(*) turns the
    # per-row pairs into one sequence per output column.
    df['original_sentence'], df['rephrased'] = zip(*df['response'].apply(parse_response))

    return df[['doc_id', 'chunk_id', 'original_sentence', 'rephrased']]

def combine_and_unique(group):
    """Merge the rephrasings of one group into one row per distinct sentence.

    Args:
        group: DataFrame with the columns 'doc_id', 'chunk_id',
            'original_sentence' and 'rephrased', where each 'rephrased' cell
            is a list of hashable values. The first row supplies the
            'doc_id', 'chunk_id' and original sentence for the whole result.

    Returns:
        DataFrame with the columns 'doc_id', 'chunk_id', 'original' and
        'rephrased', holding one row per distinct rephrased sentence in
        first-seen order.
    """
    # Flatten in a single pass; sum(lists, []) copies the growing list on
    # every addition.
    flattened = [
        sentence
        for sentences in group['rephrased']
        for sentence in sentences
    ]

    # dict.fromkeys drops duplicates while keeping first-seen order, so the
    # rows come out in the same order on every run (iteration order of a set
    # of strings is not stable between interpreter runs).
    unique_list = list(dict.fromkeys(flattened))

    # The scalar values are broadcast across every row of 'rephrased'
    return pd.DataFrame({
        'doc_id': group['doc_id'].iloc[0],
        'chunk_id': group['chunk_id'].iloc[0],
        'original': group['original_sentence'].iloc[0],
        'rephrased': unique_list
    })

def process_rephrased_sentences(df):
    """Expand every (doc_id, chunk_id) group into one row per unique rephrasing.

    Each group is passed to combine_and_unique and the per-group results are
    stacked into a single frame with a fresh 0..n-1 index.

    Groups are iterated explicitly rather than passed through groupby.apply:
    recent pandas versions emit a DeprecationWarning when apply operates on
    the grouping columns, and combine_and_unique reads exactly those columns.
    Iterating hands each group over with all of its columns and needs no
    warning filter.

    Args:
        df: DataFrame with the columns 'doc_id', 'chunk_id',
            'original_sentence' and 'rephrased'.

    Returns:
        DataFrame with the columns 'doc_id', 'chunk_id', 'original' and
        'rephrased'. Groups appear in sorted key order. An input without any
        group gives an empty frame with those four columns.
    """
    frames = [
        combine_and_unique(group)
        for _, group in df.groupby(['doc_id', 'chunk_id'])
    ]

    # pd.concat refuses an empty list, so give the empty case a defined shape
    if not frames:
        return pd.DataFrame(columns=['doc_id', 'chunk_id', 'original', 'rephrased'])

    return pd.concat(frames, ignore_index=True)

In [357]:
def process_file_list(files, base_read_loc, base_write_loc):
    """Post-process each batch file and save the result under a 'doc' name.

    Files are handled in sorted order. Each one is loaded with read_jsonl and
    reduced with process_dataframe; when the loaded data has a 'custom_id'
    column the result is additionally passed through
    process_rephrased_sentences. The outcome is written with write_jsonl to
    base_write_loc, using the input file name with 'batch' replaced by 'doc'.

    Any exception raised while handling a file is reported on stdout and the
    loop carries on with the next file.

    Args:
        files: Iterable of file names to process.
        base_read_loc: Prefix prepended to each file name when reading.
        base_write_loc: Prefix prepended to each output file name when writing.
    """
    ordered_files = sorted(files)
    file_count = len(ordered_files)

    for position, filename in enumerate(ordered_files, start=1):
        try:
            # Progress line
            print(f"Processing file {position} of {file_count}: {filename}")

            # Load the batch output
            response = read_jsonl(f"{base_read_loc}{filename}")

            # Decide up front whether the combine step applies to this file
            needs_combining = 'custom_id' in response.columns

            # Reduce to the relevant columns
            processed_dataframe = process_dataframe(response)

            # One row per unique rephrasing, only for the 'custom_id' layout
            final_df = (
                process_rephrased_sentences(processed_dataframe)
                if needs_combining
                else processed_dataframe
            )

            # Store the result next to the other post-processed files
            save_loc = f"{base_write_loc}{filename.replace('batch', 'doc')}"
            write_jsonl(final_df, save_loc)

        except Exception as err:
            # Report and move on so one bad file does not stop the run
            print(f"    Error processing {filename}: {err}")

In [358]:
# Post-process every batch file that does not have an output file yet
process_file_list(
    files=files_to_be_processed,
    base_read_loc=batch_complete_loc,
    base_write_loc=post_process_loc,
)

Processing file 1 of 112: batch_andy_zipper_mail_1.jsonl
    Error processing batch_andy_zipper_mail_1.jsonl: Expecting ',' delimiter: line 2 column 5047 (char 5048)
Processing file 2 of 112: batch_andy_zipper_mail_3.jsonl
Processing file 3 of 112: batch_andy_zipper_mail_4.jsonl
Processing file 4 of 112: batch_andy_zipper_mail_5.jsonl
Processing file 5 of 112: batch_barry_tycholiz_mail_1.jsonl
Processing file 6 of 112: batch_barry_tycholiz_mail_3.jsonl
Processing file 7 of 112: batch_barry_tycholiz_mail_4.jsonl
Processing file 8 of 112: batch_barry_tycholiz_mail_5.jsonl
Processing file 9 of 112: batch_benjamin_rogers_mail_2.jsonl
Processing file 10 of 112: batch_benjamin_rogers_mail_3.jsonl
Processing file 11 of 112: batch_benjamin_rogers_mail_4.jsonl
Processing file 12 of 112: batch_bill_williams_mail_2.jsonl
Processing file 13 of 112: batch_bill_williams_mail_3.jsonl
Processing file 14 of 112: batch_bill_williams_mail_4.jsonl
Processing file 15 of 112: batch_cara_semperger_mail_1.jso