In [70]:
import ast
import sys
import os

import pandas as pd

from from_root import from_root
from pathlib import Path

In [71]:
sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_completed_excel_result
from excel_functions import create_excel_template

## Get Distinct Phrases to Keep

In [72]:
phrase_list = pd.read_excel('/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx')
phrases_to_keep = phrase_list[phrase_list['keep_phrase'] == 1].copy()

# Convert the stringified tuples into actual tuples, then into lists
phrases_to_keep['tokens'] = phrases_to_keep['tokens'].apply(lambda x: list(ast.literal_eval(x)) if isinstance(x, str) else list(x))
phrases_to_keep = phrases_to_keep[['phrase']]

print(f"Original Number of Phrases: {phrase_list.shape[0]}")
print(f"Revised Number of Phrases: {phrases_to_keep.shape[0]}")

Original Number of Phrases: 3026
Revised Number of Phrases: 2593


## Convert Single Document

In [73]:
excel_path = '/Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx'

### Filter to only include phrases to keep

In [74]:
data = read_completed_excel_result(excel_path)

no_context = data['no_context']
known = data['known']
unknown = data['unknown']

reference_phrases = no_context[no_context['phrase_type'] == 'reference'].copy()

# Perform the merge using the tuple-based key
merged_phrases = pd.merge(reference_phrases, phrases_to_keep, on='phrase', how='inner')
merged_phrases = merged_phrases[['phrase_num']]

no_context_filtered = pd.merge(no_context, merged_phrases, on='phrase_num', how='inner')
known_filtered = pd.merge(known, merged_phrases, on='phrase_num', how='inner')
unknown_filtered = pd.merge(unknown, merged_phrases, on='phrase_num', how='inner')

In [75]:
# create_excel_template(
#     known=data['known'].drop(columns=['include_phrase']),
#     unknown=data['unknown'].drop(columns=['include_phrase']),
#     no_context=data['no_context'],
#     metadata=data['metadata'],
#     docs=data['docs'],
#     path=excel_path,
#     known_sheet = "known",
#     unknown_sheet = "unknown",
#     nc_sheet = "no context",
#     metadata_sheet = "metadata",
#     docs_sheet = "docs",
#     llr_sheet = "LLR",
#     use_xlookup = False
# )

## Convert Multiple Files

In [76]:
read_dir = Path('/Volumes/BCross/paraphrase examples slurm/Wiki-test')
write_dir = Path('/Volumes/BCross/paraphrase examples slurm/Wiki-test-auto-initial-filter')

os.makedirs(write_dir, exist_ok=True)

for excel_path in sorted(read_dir.glob("*.xlsx")):
    if excel_path.name.startswith("~$"):
        continue  # skip Excel temp/lock files
    
    write_path = write_dir / excel_path.name
    print(f"Processing: {excel_path.name}")
    
    data = read_completed_excel_result(excel_path)
    
    try:
        # Load the tables we wish to filter
        no_context = data['no_context']
        known = data['known']
        unknown = data['unknown']

        reference_phrases = no_context[no_context['phrase_type'] == 'reference'].copy()

        # Perform the merge using the tuple-based key
        merged_phrases = pd.merge(reference_phrases, phrases_to_keep, on='phrase', how='inner')
        merged_phrases = merged_phrases[['phrase_num']]

        # Filter out phrases we do not wish to keep
        no_context_filtered = pd.merge(no_context, merged_phrases, on='phrase_num', how='inner')
        known_filtered = pd.merge(known, merged_phrases, on='phrase_num', how='inner')
        unknown_filtered = pd.merge(unknown, merged_phrases, on='phrase_num', how='inner')

        create_excel_template(
            known=known_filtered.drop(columns=['include_phrase']),
            unknown=unknown_filtered.drop(columns=['include_phrase']),
            no_context=no_context,
            metadata=data['metadata'],
            docs=data['docs'],
            path=write_path,
            known_sheet = "known",
            unknown_sheet = "unknown",
            nc_sheet = "no context",
            metadata_sheet = "metadata",
            docs_sheet = "docs",
            llr_sheet = "LLR",
            use_xlookup = False
        )

        print(f"  ✓ Wrote: {excel_path}")
        
    except:
        print(f"  Failed")
        continue

Processing: hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx
Processing: hodja_nasreddin_text_1 vs honestopl_text_1.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_1 vs honestopl_text_1.xlsx
Processing: hodja_nasreddin_text_10 vs hodja_nasreddin_text_3.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_10 vs hodja_nasreddin_text_3.xlsx
Processing: hodja_nasreddin_text_10 vs honestopl_text_1.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_10 vs honestopl_text_1.xlsx
Processing: hodja_nasreddin_text_11 vs hodja_nasreddin_text_3.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_11 vs hodja_nasreddin_text_3.xlsx
Processing: hodja_nasreddin_text_11 vs honestopl_text_1.xlsx
  ✓ Wrote: /Volumes/BCross/paraphrase exam