In [49]:
import json
import os
import re

import pandas as pd

In [50]:
# Dataset split and corpus to check; together they select the data directory.
data_type = "training"
corpus = "Enron"

# Root of the datasets. Plain literal: it has no placeholders, so no f-prefix.
base_loc = "/Volumes/BCross/datasets/author_verification/"
# base_loc already ends with "/", so the split name is appended directly.
data_loc = f"{base_loc}{data_type}/{corpus}"

# Raw documents file, and the directory that is listed for .jsonl files.
raw_data_loc = f"{data_loc}/known_raw.jsonl"
paraphrase_loc = f"{data_loc}/full_doc_paraphrase"

In [51]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Each non-blank line is parsed as one JSON value. A line holding a
    single-element list is unwrapped to that element; every other value is
    kept as parsed. Blank or whitespace-only lines are skipped.

    Parameters:
    - file_path: Path to the JSONL file to read.

    Returns:
    - A pandas DataFrame containing the data from the JSONL file.

    Raises:
    - OSError: If the file cannot be opened.
    - json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly instead of relying on the platform default encoding
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. an extra empty line at the end of the file)
            if not line.strip():
                continue
            # Parse the line as JSON
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    # Convert to a DataFrame
    return pd.DataFrame(records)

def create_temp_doc_id(input_text):
    """
    Builds a normalised document ID from the first bracketed part of a string.

    The text between the first "[" and the next "]" is taken, every non-word
    character in it is turned into "_", runs of underscores are collapsed to
    a single "_", and the result is lower-cased.

    Parameters:
    - input_text: String to search for a "[...]" section.

    Returns:
    - The normalised ID string, or None when no bracketed section is found.
    """
    bracketed = re.search(r'\[(.*?)\]', input_text)
    if bracketed is None:
        return None

    inner = bracketed.group(1)
    # Punctuation and whitespace -> "_", then squeeze repeated underscores
    underscored = re.sub(r'_{2,}', '_', re.sub(r'[^\w]', '_', inner))
    return underscored.lower()

## Get Document IDs from Raw Data

In [52]:
# Load the raw documents, keep the original identifier as orig_doc_id and add
# the normalised doc_id. rename/assign are chained (no inplace=True) so the
# frame is built in a single expression from the file on disk.
raw_df = (
    read_jsonl(raw_data_loc)
    .rename(columns={'doc_id': 'orig_doc_id'})
    .assign(doc_id=lambda df: df['orig_doc_id'].apply(create_temp_doc_id))
)

# create_temp_doc_id returns None when an ID has no "[...]" section. Fail here
# with a clear message rather than letting a None end up among the IDs that
# are compared against the paraphrase filenames.
unparsed_ids = raw_df.loc[raw_df['doc_id'].isna(), 'orig_doc_id'].tolist()
if unparsed_ids:
    raise ValueError(
        f"Could not derive a doc_id (no [...] section found) for: {unparsed_ids}"
    )

raw_doc_id = raw_df['doc_id']

## Get Files in Paraphrase Location

In [53]:
# List every entry in the paraphrase directory
files = os.listdir(paraphrase_loc)

# Keep the .jsonl entries only, sorted by name
jsonl_files = sorted(filter(lambda name: name.endswith(".jsonl"), files))

## Save List if None Missing

In [55]:
# Strip only the trailing ".jsonl" from each filename. str.replace would also
# remove a ".jsonl" occurring earlier in the name and so corrupt the ID.
jsonl_suffix = ".jsonl"
jsonl_ids = {
    file[:-len(jsonl_suffix)] if file.endswith(jsonl_suffix) else file
    for file in jsonl_files
}

# Convert raw_doc_id to a set
raw_doc_id_set = set(raw_doc_id)

# Raw document IDs that have no matching .jsonl file in the paraphrase directory
missing_ids = raw_doc_id_set - jsonl_ids

# Store missing IDs as a sorted list. key=str leaves the order of string IDs
# unchanged but keeps the sort from raising TypeError if a non-string value
# (e.g. None from an ID that could not be parsed) is present.
missing_ids_list = sorted(missing_ids, key=str)

# If no missing IDs, save the complete list to a .txt file (one ID per line)
if not missing_ids_list:
    output_file = os.path.join(data_loc, "parascore_files_list.txt")
    # Explicit UTF-8 so the list does not depend on the platform default encoding
    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{doc_id}\n" for doc_id in sorted(jsonl_ids))
    print(f"All files are present. Saved complete list to {output_file}")
    print(f"Number of files for job script: {len(jsonl_ids)}")
else:
    print("Missing IDs:", missing_ids_list)


All files are present. Saved complete list to /Volumes/BCross/datasets/author_verification/training/Enron/parascore_files_list.txt
Number of files for job script: 112
