In [None]:
import os
import sys
import subprocess

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from utils import get_base_location
from read_and_write_docs import read_jsonl

In [None]:

# --- Experiment selection (single source of truth; used by every path below) ---
corpus      = "Wiki"
data_type   = "training"

# Set NAS so can run on Windows laptop seamlessly
nas_base_loc = get_base_location()

# --- Input / output locations ---
# Shared prefix for the author-verification files of this data_type.
av_data_dir = f"{nas_base_loc}/datasets/author_verification/{data_type}"

known_loc = f"{av_data_dir}/{corpus}/known_raw.jsonl"
unknown_loc = f"{av_data_dir}/{corpus}/unknown_raw.jsonl"
metadata_loc = f"{av_data_dir}/metadata.rds"
model_loc = f"{nas_base_loc}/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"
save_loc = f"{nas_base_loc}/paraphrase examples"

# --- OpenAI generation settings ---
# Forwarded to the script as --openai_model / --max_tokens / --temperature / --n.
openai_model = "gpt-4.1"
max_tokens   = 5000
temperature  = 0.7
n_samples    = 10

# Script launched once per test pair in the run cell.
script_loc = str(from_root("scripts", "run_openai_paraphrase_method.py"))

In [None]:
# Read the pre-built aggregated problem set for this corpus / data_type
# instead of rebuilding it (faster).
problem_dataset_base = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}"
agg_file_name = f"{corpus}_{data_type}_agg.jsonl"
problem_dataset_agg = read_jsonl(f"{problem_dataset_base}/{agg_file_name}")

In [None]:
# Same-author problems: rows where the known and unknown author match,
# ordered by highest_common_count, largest first.
is_same_author = problem_dataset_agg['known_author'] == problem_dataset_agg['unknown_author']
same_probs = problem_dataset_agg[is_same_author].sort_values("highest_common_count", ascending=False)

# Preview the first 30 rows whose highest_common_count lies in [3, 10] (inclusive).
same_probs[same_probs['highest_common_count'].between(3, 10)].head(30)

In [None]:
# Different-author problems: rows where the known and unknown author differ,
# ordered by highest_common_count, largest first.
is_diff_author = problem_dataset_agg['known_author'] != problem_dataset_agg['unknown_author']
diff_probs = problem_dataset_agg[is_diff_author].sort_values("highest_common_count", ascending=False)

# Preview the first 30 rows whose highest_common_count lies in [3, 10] (inclusive).
diff_probs[diff_probs['highest_common_count'].between(3, 10)].head(30)

In [None]:
# (known document id, unknown document id) pairs to run through the
# paraphrase script. Judging by the id prefixes, the first two pairs look like
# different-author problems and the last two same-author -- confirm against
# the problem tables above.
doc_pairs = [
    ("alienus_text_11", "amalthea_text_5"),
    ("david_shankbone_text_2", "delicious_carbuncle_text_1"),
    ("dennis_brown_text_12", "dennis_brown_text_11"),
    ("falcon9x5_text_4", "falcon9x5_text_1"),
]

# Keyed "test_01", "test_02", ... in the order listed above.
tests = {
    f"test_{pair_idx:02d}": {"known": known_id, "unknown": unknown_id}
    for pair_idx, (known_id, unknown_id) in enumerate(doc_pairs, start=1)
}

In [None]:
# Debug aid: uncomment to preview the normalised interpreter path used in the run loop below.
# sys.executable.replace("c:\\", "C:/").replace("\\", "/")

In [None]:
# Debug aid: uncomment to preview the normalised script path used in the run loop below.
# script_loc.replace("C:\\", "C:/").replace("\\", "/")

In [None]:
# Run the OpenAI paraphrase script once per (known, unknown) pair in `tests`.

# Child environment: the current environment plus PYTHONUNBUFFERED=1.
# It is passed to subprocess.run below; previously it was built but never used.
env = dict(os.environ, PYTHONUNBUFFERED="1")

# Loop-invariant: convert the interpreter and script paths to forward slashes
# once rather than on every iteration.
python_exe = sys.executable.replace("c:\\", "C:/").replace("\\", "/")
script_path = script_loc.replace("C:\\", "C:/").replace("\\", "/")

for test_num, entry in tests.items():

    known_doc = entry['known']
    unknown_doc = entry['unknown']
    print(f"Working on {test_num}: {known_doc} vs {unknown_doc}")

    # Argument list (no shell), so values containing spaces such as
    # model_loc / save_loc are passed through as single arguments.
    cmd = [
        python_exe, "-u", script_path,
        "--known_loc", known_loc,
        "--unknown_loc", unknown_loc,
        "--metadata_loc", metadata_loc,
        "--model_loc", model_loc,
        "--save_loc", save_loc,
        "--corpus", corpus,
        "--data_type", data_type,
        "--known_doc", known_doc,
        "--unknown_doc", unknown_doc,
        "--openai_model", openai_model,
        "--max_tokens", str(max_tokens),
        "--temperature", str(temperature),
        "--n", str(n_samples),
    ]

    # check=True raises CalledProcessError on a non-zero exit, which stops
    # the remaining tests from running.
    subprocess.run(cmd, env=env, text=True, check=True)