In [100]:
import os
import sys
import subprocess

from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from utils import get_base_location
from read_and_write_docs import read_jsonl

In [None]:

corpus      = "Wiki"
data_type   = "test"

# Set NAS so can run on Windows laptop seamlessly
nas_base_loc = get_base_location()

known_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/known_raw.jsonl"
unknown_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}/unknown_raw.jsonl"
metadata_loc = f"{nas_base_loc}/datasets/author_verification/{data_type}/metadata.rds"
model_loc = f"{nas_base_loc}/models/Qwen 2.5/Qwen2.5-0.5B-Instruct"
save_loc = f"{nas_base_loc}/paraphrase examples/{corpus}-{data_type}"
completed_loc = f"{nas_base_loc}/paraphrase examples/{corpus}-{data_type}-completed"

openai_model = "gpt-4.1"
max_tokens   = 5000
temperature  = 0.7
n_samples    = 10

script_loc = str(from_root("scripts", "run_openai_paraphrase_method.py"))

In [102]:
# Load the pre-made problem datasets for speed
problem_dataset_base = f"{nas_base_loc}/datasets/author_verification/{data_type}/{corpus}"
problem_dataset_agg = read_jsonl(f"{problem_dataset_base}/{corpus}_{data_type}_agg.jsonl")

In [103]:
same_probs = problem_dataset_agg[problem_dataset_agg['known_author'] == problem_dataset_agg['unknown_author']].copy()
same_probs.sort_values(["highest_common_count"], ascending=[False], inplace=True)
same_probs[(same_probs['highest_common_count'] >= 3) & (same_probs['highest_common_count'] <= 10)].head(10)

Unnamed: 0,problem,known_author,unknown_author,known_doc_id,unknown_doc_id,highest_common_count,highest_common_ngram
601,Vedant vs Vedant,Vedant,Vedant,vedant_text_2,vedant_text_4,10,this Ġis Ġthe Ġproblem Ġwhen Ġindian Ġnational...
236,Mayalld vs Mayalld,Mayalld,Mayalld,mayalld_text_4,mayalld_text_3,10,"Ġunder Ġthe Ġname Ġof Ġthe Ġmaster Ġaccount , ..."
546,Swift&silent vs Swift&silent,Swift&silent,Swift&silent,swift_silent_text_1,swift_silent_text_2,10,"ug ab oy 5 3 5 1 3 6 ,"
506,Snowded vs Snowded,Snowded,Snowded,snowded_text_2,snowded_text_1,8,Ġis Ġa Ġbehavioural Ġissue Ġnot Ġa Ġcontent Ġone
138,KBlott vs KBlott,KBlott,KBlott,kblott_text_3,kblott_text_1,8,Ġevidence Ġto Ġsupport Ġthe Ġpractice Ġof Ġwit...
426,Richard_Daft vs Richard_Daft,Richard_Daft,Richard_Daft,richard_daft_text_2,richard_daft_text_4,7,Ġpre Ġ 1 8 0 0 Ġcricket
588,U21980 vs U21980,U21980,U21980,u21980_text_1,u21980_text_5,7,Ġnx iv m Ġand Ġr ani ere
462,Scheinwerfermann vs Scheinwerfermann,Scheinwerfermann,Scheinwerfermann,scheinwerfermann_text_11,scheinwerfermann_text_10,7,Ġto Ġfind Ġproduction Ġstart Ġand Ġend Ġdates
565,Thekohser vs Thekohser,Thekohser,Thekohser,thekohser_text_10,thekohser_text_11,7,"Ġ 1 6 , 0 0 0"
661,Yoenit vs Yoenit,Yoenit,Yoenit,yoenit_text_4,yoenit_text_2,7,. Ġ' y oen it ' Ġ


In [104]:
#diff_probs = problem_dataset_agg[problem_dataset_agg['known_author'] != problem_dataset_agg['unknown_author']].copy()
#diff_probs.sort_values(["highest_common_count"], ascending=[False], inplace=True)
#diff_probs[(diff_probs['highest_common_count'] >= 3) & (diff_probs['highest_common_count'] <= 10)].head(30)

In [105]:
def build_test_dict(df, problem, start=1):
    """
    Build a dictionary of tests from a dataframe with known/unknown doc IDs.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame with columns ['problem', 'known_doc_id', 'unknown_doc_id']
    problem : str
        Problem name to filter on
    start : int, optional
        Starting number for test IDs (default is 1)

    Returns
    -------
    dict
        Dictionary of tests in the form:
        {
            "test_01": {"known": ..., "unknown": ...},
            "test_02": {"known": ..., "unknown": ...},
            ...
        }
    """
    # Filter rows for the given problem
    sub_df = df[df['problem'] == problem]

    # Build dictionary
    return {
        f"test_{i:02d}": {"known": row.known_doc_id, "unknown": row.unknown_doc_id}
        for i, row in enumerate(sub_df.itertuples(index=False), start)
    }

In [106]:
build_test_dict(same_probs, 'Yoenit vs Yoenit', 28)

{'test_28': {'known': 'yoenit_text_4', 'unknown': 'yoenit_text_2'},
 'test_29': {'known': 'yoenit_text_5', 'unknown': 'yoenit_text_2'},
 'test_30': {'known': 'yoenit_text_1', 'unknown': 'yoenit_text_2'}}

In [107]:
tests = {
    "test_01": {"known": "vedant_text_5", "unknown": "vedant_text_4"},
    "test_02": {"known": "vedant_text_2", "unknown": "vedant_text_4"},
    "test_03": {"known": "vedant_text_1", "unknown": "vedant_text_4"},
    "test_04": {"known": "mayalld_text_4", "unknown": "mayalld_text_3"},
    "test_05": {"known": "mayalld_text_1", "unknown": "mayalld_text_3"},
    "test_06": {"known": "mayalld_text_2", "unknown": "mayalld_text_3"},
    "test_07": {"known": "swift_silent_text_1", "unknown": "swift_silent_text_2"},
    "test_08": {"known": "swift_silent_text_5", "unknown": "swift_silent_text_2"},
    "test_09": {"known": "swift_silent_text_4", "unknown": "swift_silent_text_2"},
    "test_10": {"known": "snowded_text_2", "unknown": "snowded_text_1"},
    "test_11": {"known": "snowded_text_12", "unknown": "snowded_text_1"},
    "test_12": {"known": "snowded_text_10", "unknown": "snowded_text_1"},
    'test_13': {'known': 'kblott_text_4', 'unknown': 'kblott_text_1'},
    'test_14': {'known': 'kblott_text_3', 'unknown': 'kblott_text_1'},
    'test_15': {'known': 'kblott_text_5', 'unknown': 'kblott_text_1'},
    'test_16': {'known': 'richard_daft_text_5', 'unknown': 'richard_daft_text_4'},
    'test_17': {'known': 'richard_daft_text_2', 'unknown': 'richard_daft_text_4'},
    'test_18': {'known': 'richard_daft_text_3', 'unknown': 'richard_daft_text_4'}, 
    'test_19': {'known': 'u21980_text_1', 'unknown': 'u21980_text_5'},
    'test_20': {'known': 'u21980_text_4', 'unknown': 'u21980_text_5'},
    'test_21': {'known': 'u21980_text_3', 'unknown': 'u21980_text_5'},
    'test_22': {'known': 'scheinwerfermann_text_11', 'unknown': 'scheinwerfermann_text_10'},
    'test_23': {'known': 'scheinwerfermann_text_13', 'unknown': 'scheinwerfermann_text_10'},
    'test_24': {'known': 'scheinwerfermann_text_12', 'unknown': 'scheinwerfermann_text_10'},
    'test_25': {'known': 'thekohser_text_10', 'unknown': 'thekohser_text_11'},
    'test_26': {'known': 'thekohser_text_1', 'unknown': 'thekohser_text_11'},
    'test_27': {'known': 'thekohser_text_3', 'unknown': 'thekohser_text_11'},
    'test_28': {'known': 'yoenit_text_4', 'unknown': 'yoenit_text_2'},
    'test_29': {'known': 'yoenit_text_5', 'unknown': 'yoenit_text_2'},
    'test_30': {'known': 'yoenit_text_1', 'unknown': 'yoenit_text_2'}
}

In [108]:
# sys.executable.replace("c:\\", "C:/").replace("\\", "/")

In [109]:
# script_loc.replace("C:\\", "C:/").replace("\\", "/")

In [None]:
env = dict(os.environ, PYTHONUNBUFFERED="1")

for test_num, entry in tests.items():

    known_doc = entry['known']
    unknown_doc = entry['unknown']
    print(f"Working on {test_num}: {known_doc} vs {unknown_doc}")
    
    cmd = [
        sys.executable.replace("c:\\", "C:/").replace("\\", "/"), "-u", script_loc.replace("C:\\", "C:/").replace("\\", "/"),
        "--known_loc", known_loc,
        "--unknown_loc", unknown_loc,
        "--metadata_loc", metadata_loc,
        "--model_loc", model_loc,
        "--save_loc", save_loc,
        "--completed_loc", completed_loc,
        "--corpus", corpus,
        "--data_type", data_type,
        "--known_doc", known_doc,
        "--unknown_doc", unknown_doc,
        "--openai_model", openai_model,
        "--max_tokens", str(max_tokens),
        "--temperature", str(temperature),
        "--n", str(n_samples),
    ]

    subprocess.run(cmd, text=True, env=env, check=True)

Working on test_01: vedant_text_5 vs vedant_text_4
Path /Volumes/BCross/paraphrase examples/Wiki-test/vedant_text_5 vs vedant_text_4.xlsx already exists. Exiting.
Working on test_02: vedant_text_2 vs vedant_text_4
Path /Volumes/BCross/paraphrase examples/Wiki-test/vedant_text_2 vs vedant_text_4.xlsx already exists. Exiting.
Working on test_03: vedant_text_1 vs vedant_text_4
Path /Volumes/BCross/paraphrase examples/Wiki-test/vedant_text_1 vs vedant_text_4.xlsx already exists. Exiting.
Working on test_04: mayalld_text_4 vs mayalld_text_3
Path /Volumes/BCross/paraphrase examples/Wiki-test/mayalld_text_4 vs mayalld_text_3.xlsx already exists. Exiting.
Working on test_05: mayalld_text_1 vs mayalld_text_3
Path /Volumes/BCross/paraphrase examples/Wiki-test/mayalld_text_1 vs mayalld_text_3.xlsx already exists. Exiting.
Working on test_06: mayalld_text_2 vs mayalld_text_3
Path /Volumes/BCross/paraphrase examples/Wiki-test/mayalld_text_2 vs mayalld_text_3.xlsx already exists. Exiting.
Working on

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
Processing Phrase - phrase_31
    Scoring unknown text
Proces

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_23: scheinwerfermann_text_13 vs scheinwerfermann_text_10
Working on problem: scheinwerfermann_text_13 vs scheinwerfermann_text_10
Loading model
Loading data
Getting common n-grams
There are 30 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
    Scoring unknown text
Processing Phrase - phrase_01
Proces

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_24: scheinwerfermann_text_12 vs scheinwerfermann_text_10
Working on problem: scheinwerfermann_text_12 vs scheinwerfermann_text_10
Loading model
Loading data
Getting common n-grams
There are 30 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
    Scoring unknown text
Processing Phrase - phrase_01
Proces

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_25: thekohser_text_10 vs thekohser_text_11
Working on problem: thekohser_text_10 vs thekohser_text_11
Loading model
Loading data
Getting common n-grams
There are 21 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
    Scoring unknown text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Proces

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_26: thekohser_text_1 vs thekohser_text_11
Working on problem: thekohser_text_1 vs thekohser_text_11
Loading model
Loading data
Getting common n-grams
There are 30 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
    Scoring unknown text
Processing Phrase - phrase_01
Proces

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_27: thekohser_text_3 vs thekohser_text_11
Working on problem: thekohser_text_3 vs thekohser_text_11
Loading model
Loading data
Getting common n-grams
There are 13 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
    Scoring unknown text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
    Scoring phrases with no context
Processing Phrase - phrase_01
→ [1/28] Processing reference…
→ [2/28] Processing paraphrase…
→ [3/28] Processing paraphr

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_28: yoenit_text_4 vs yoenit_text_2
Working on problem: yoenit_text_4 vs yoenit_text_2
Loading model
Loading data
Getting common n-grams
There are 32 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
Processing Phrase - phrase_31
Processing Phrase - phrase_32
 

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_29: yoenit_text_5 vs yoenit_text_2
Working on problem: yoenit_text_5 vs yoenit_text_2
Loading model
Loading data
Getting common n-grams
There are 41 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
Processing Phrase - phrase_31
Processing Phrase - phrase_32
P

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)


Working on test_30: yoenit_text_1 vs yoenit_text_2
Working on problem: yoenit_text_1 vs yoenit_text_2
Loading model
Loading data
Getting common n-grams
There are 35 n-grams in common!
Generating paraphrases


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Scoring phrases
    Scoring known text
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_13
Processing Phrase - phrase_14
Processing Phrase - phrase_15
Processing Phrase - phrase_16
Processing Phrase - phrase_17
Processing Phrase - phrase_18
Processing Phrase - phrase_19
Processing Phrase - phrase_20
Processing Phrase - phrase_21
Processing Phrase - phrase_22
Processing Phrase - phrase_23
Processing Phrase - phrase_24
Processing Phrase - phrase_25
Processing Phrase - phrase_26
Processing Phrase - phrase_27
Processing Phrase - phrase_28
Processing Phrase - phrase_29
Processing Phrase - phrase_30
Processing Phrase - phrase_31
Processing Phrase - phrase_32
P

  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
  df[obj_cols] = df[obj_cols].applymap(_clean_cell)
