# 01_split_known_docs

Script to split the known corpus into separate files, allowing for parallel paraphrasing.

In [42]:
import json
import os
import re

import pandas as pd

In [43]:
# Run configuration: these two values select which dataset directory is used.
data_type = "training"
corpus = "Wiki"

# Directory for the selected data_type / corpus combination.
data_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}"

# Input JSONL file, located directly inside data_loc.
# NOTE(review): this repeats the data_loc prefix verbatim; it could be derived
# from data_loc so the two paths cannot drift apart.
raw_data_loc = f"/Volumes/BCross/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"

# Output directory for the split files; created up front, and exist_ok=True
# makes re-running this cell a no-op when the directory is already there.
save_loc = f"{data_loc}/known_corpus_split/"
os.makedirs(save_loc, exist_ok=True)

In [44]:
def read_jsonl(file_path):
    """Load a JSON Lines file into a pandas DataFrame.

    Each non-blank line is parsed as one JSON value. A line that holds a
    single-element list is unwrapped so its only element becomes the record;
    any other parsed value is kept as is.

    Args:
        file_path: Path of the JSONL file to read. The file is decoded as
            UTF-8.

    Returns:
        A pandas DataFrame built from the parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank / whitespace-only lines carry no record; json.loads would
            # raise on them, so skip instead of aborting the whole load.
            if not line.strip():
                continue
            parsed_line = json.loads(line)
            # A single-element list is treated as a wrapper around one record.
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """Write a DataFrame to disk as JSON Lines, one row per line.

    Args:
        data: DataFrame whose rows are serialised; each row is converted to a
            dict and dumped as a single JSON object.
        output_file_path: Destination path. The file is opened in write mode,
            so existing content is replaced.
    """
    with open(output_file_path, 'w') as sink:
        # Lazily turn each row into a plain dict, ignoring the index label.
        records = (series.to_dict() for _label, series in data.iterrows())
        for record in records:
            json.dump(record, sink)
            sink.write('\n')
            
def create_temp_doc_id(input_text):
    """Build a lowercase, underscore-separated id from the first [bracketed] span.

    Args:
        input_text: String expected to contain a ``[...]`` section.

    Returns:
        The text inside the first pair of square brackets with every non-word
        character replaced by ``_``, runs of underscores collapsed to one,
        and the result lowercased. ``None`` when no bracketed span is found.
    """
    bracketed = re.search(r'\[(.*?)\]', input_text)
    if not bracketed:
        return None

    # Non-word characters (punctuation, whitespace) all become "_" ...
    underscored = re.sub(r'[^\w]', '_', bracketed.group(1))
    # ... and any run of two or more underscores is squeezed to a single one.
    return re.sub(r'_{2,}', '_', underscored).lower()

In [45]:
# Load the raw corpus from raw_data_loc into a DataFrame.
df = read_jsonl(raw_data_loc)

# Keep the source identifier under 'orig_doc_id', then derive the new
# 'doc_id' from it with create_temp_doc_id.
df = df.rename(columns={'doc_id': 'orig_doc_id'})
df['doc_id'] = df['orig_doc_id'].apply(create_temp_doc_id)

# Column order: 'doc_id' first, 'text' last, all other columns in between
# in their existing order.
cols = ['doc_id', *(name for name in df.columns if name not in ('doc_id', 'text')), 'text']

df = df[cols]

In [46]:
# Write one JSONL file per doc_id into save_loc.
#
# groupby partitions the frame in a single pass instead of re-filtering the
# whole frame once per id. sort=False keeps groups in order of first
# appearance, and rows inside each group keep their original order.

# groupby skips rows whose doc_id is missing, so report them explicitly
# rather than dropping them without a trace.
missing_ids = df['doc_id'].isna()
if missing_ids.any():
    print(f"Warning: skipping {int(missing_ids.sum())} row(s) with no doc_id")

for doc_id, doc_df in df.groupby('doc_id', sort=False):
    # Build the file path using the doc_id
    file_path = os.path.join(save_loc, f"{doc_id}.jsonl")

    # Write this document's rows to its own JSONL file
    write_jsonl(doc_df, file_path)