In [19]:
import os
import json

import pandas as pd

In [20]:
# Execute ../scorer.py so that the names it defines become available in this notebook.
# NOTE(review): ParaphraseScorer, used further down, is presumably defined there — confirm.
%run "../scorer.py"

In [21]:
# ---------------------------------------------------------------------------
# Notebook configuration: everything a reader might need to tune lives here.
# All *_loc directory values end with a path separator because later code
# builds file paths by plain string concatenation.
# ---------------------------------------------------------------------------
credential_loc = "../../credentials.json"

data_type = "training"
corpus = "TripAdvisor"

# Root of the dataset tree. Can be overridden with the
# AUTHOR_VERIFICATION_BASE_LOC environment variable so the notebook also runs
# on machines where the volume is mounted elsewhere; the default is unchanged.
# os.path.join(..., "") guarantees exactly one trailing separator.
base_loc = os.path.join(
    os.environ.get(
        "AUTHOR_VERIFICATION_BASE_LOC",
        "/Volumes/BCross/datasets/author_verification/",
    ),
    "",
)
data_loc = f"{base_loc}{data_type}/{corpus}/"

raw_data_loc = f"{data_loc}known_raw.jsonl"
processed_data_loc = f"{data_loc}known_processed.jsonl"
batch_complete_loc = f"{data_loc}batch_sentence_complete/"
post_process_loc = f"{data_loc}batch_postprocessed/"

# ParaScore save location (created on first run if missing)
parascore_loc = f"{data_loc}batch_parascore/"
os.makedirs(parascore_loc, exist_ok=True)

# Phone number for WhatsApp notifications.
# NOTE(review): this is personal data committed to the notebook; prefer setting
# WHATSAPP_PHONE_NUMBER in the environment and removing the literal default.
phone_number = os.environ.get("WHATSAPP_PHONE_NUMBER", "+447756976114")

In [22]:
def read_jsonl(file_path):
    """
    Reads a JSONL file and converts it into a pandas DataFrame.

    Each non-blank line is parsed as one JSON value. A line holding a
    single-element list is unwrapped so that its only element becomes the
    record. Blank or whitespace-only lines (e.g. a trailing empty line) are
    skipped instead of aborting the whole read.

    Parameters:
    - file_path: Path to the JSONL file to read (decoded as UTF-8).

    Returns:
    - A pandas DataFrame containing the data from the JSONL file.

    Raises:
    - json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8 by specification; do not depend on the locale default.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line.
                continue
            parsed_line = json.loads(line)
            # If the line is a single-element list, extract the first element
            if isinstance(parsed_line, list) and len(parsed_line) == 1:
                records.append(parsed_line[0])
            else:
                records.append(parsed_line)

    return pd.DataFrame(records)

def write_jsonl(data, output_file_path):
    """
    Writes a pandas DataFrame to disk in JSONL form.

    Every row is converted to a dict and serialised as one JSON object per
    line; an existing file at the destination is overwritten.

    Parameters:
    - data: DataFrame whose rows are written in order.
    - output_file_path: Destination path of the JSONL file.
    """
    records = (row.to_dict() for _, row in data.iterrows())
    with open(output_file_path, 'w') as out:
        for record in records:
            out.write(json.dumps(record) + '\n')

In [23]:
def _list_jsonl_files(directory):
    """Return the names of the regular files ending in '.jsonl' directly inside `directory`."""
    return [
        name for name in os.listdir(directory)
        if os.path.isfile(os.path.join(directory, name)) and name.endswith('.jsonl')
    ]


raw_df = read_jsonl(raw_data_loc)

# Inventory of each pipeline stage, as bare file names (no directory part).
batch_completed_files = _list_jsonl_files(batch_complete_loc)
files_postprocessed = _list_jsonl_files(post_process_loc)
files_parascore_complete = _list_jsonl_files(parascore_loc)

# Scores are saved under the same file name as the post-processed input, so a
# plain set difference gives the files that still need scoring. Sorting keeps
# the processing order stable between runs.
files_to_be_processed = sorted(set(files_postprocessed) - set(files_parascore_complete))

print(f"Number of documents to process in raw data: {len(raw_df['doc_id'])}")
print(f"Files Complete in Batch location: {len(batch_completed_files)}")
print(f"Files Post Processed: {len(files_postprocessed)}")
print(f"Files with Score: {len(files_parascore_complete)}")
print(f"Files to be Processed: {len(files_to_be_processed)}")

Number of documents to process in raw data: 104
Files Complete in Batch location: 103
Files Post Processed: 103
Files with Score: 43
Files to be Processed: 60


In [24]:
# Build the scorer once; the processing loop calls parascore_free.calculate_score(df) for each file.
# NOTE(review): ParaphraseScorer is not defined in this notebook — presumably it comes from
# the scorer.py script run earlier, and constructing it may load the named model; confirm.
parascore_free = ParaphraseScorer(score_type='parascore_free', model_type='bert-base-uncased')



In [None]:
# Score every pending post-processed file and save the result under the same
# file name in parascore_loc. A failure on one file is reported and the loop
# moves on to the next file.
for idx, file in enumerate(files_to_be_processed):

    print(f"Processing {idx + 1} out of {len(files_to_be_processed)}: {os.path.basename(file)}")
    output_file_loc = f"{parascore_loc}{file}"
    # Results are staged under a name that does NOT end in '.jsonl'. The resume
    # logic counts every '.jsonl' in parascore_loc as finished, so writing
    # straight to the final path would make an interrupted write look complete.
    temp_file_loc = f"{output_file_loc}.tmp"

    try:
        df = read_jsonl(f"{post_process_loc}{file}")

        # Some inputs name the source-sentence column 'original_sentence';
        # normalise it to 'original' before scoring.
        if 'original' not in df.columns:
            df = df.rename(columns={"original_sentence": "original"})

        df_with_score = parascore_free.calculate_score(df)

        write_jsonl(df_with_score, temp_file_loc)
        # Publish the finished file in a single step.
        os.replace(temp_file_loc, output_file_loc)

    except Exception as e:
        print(f"Error processing {file}: {e}")

    finally:
        # Remove a leftover staging file after a failure or an interrupt;
        # after a successful os.replace there is nothing left to remove.
        if os.path.exists(temp_file_loc):
            os.remove(temp_file_loc)

Processing 1 out of 60: doc_TripAdvisor_MaineIzzy.jsonl
Processing 2 out of 60: doc_TripAdvisor_Mano40.jsonl
Processing 3 out of 60: doc_TripAdvisor_Mark H.jsonl
Processing 4 out of 60: doc_TripAdvisor_Michael C.jsonl
Processing 5 out of 60: doc_TripAdvisor_MikeyBikey.jsonl
Processing 6 out of 60: doc_TripAdvisor_Nicks_gal.jsonl
Processing 7 out of 60: doc_TripAdvisor_PandEKew.jsonl
Processing 8 out of 60: doc_TripAdvisor_Paul W.jsonl
Processing 9 out of 60: doc_TripAdvisor_Peter G.jsonl
Processing 10 out of 60: doc_TripAdvisor_Peter R.jsonl
Processing 11 out of 60: doc_TripAdvisor_Peter W.jsonl
Processing 12 out of 60: doc_TripAdvisor_Peter_Joan.jsonl
Processing 13 out of 60: doc_TripAdvisor_Rachel_Whales.jsonl
Processing 14 out of 60: doc_TripAdvisor_Rags2006.jsonl
Processing 15 out of 60: doc_TripAdvisor_RallyMonkey.jsonl
Processing 16 out of 60: doc_TripAdvisor_RedSox28.jsonl
Processing 17 out of 60: doc_TripAdvisor_RoryMc.jsonl
Processing 18 out of 60: doc_TripAdvisor_Samuel-D.jso