In [73]:
import ast
import os
import glob

from pathlib import Path

import numpy as np
import pandas as pd

In [74]:
# Inputs: one scored document-pair workbook and the phrase list whose
# `keep_phrase` flag marks which phrases are retained.
doc_loc = '/Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx'
phrase_loc = '/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx'

known = pd.read_excel(doc_loc, sheet_name="known")
unknown = pd.read_excel(doc_loc, sheet_name="unknown")
no_context = pd.read_excel(doc_loc, sheet_name="no context")
metadata = pd.read_excel(doc_loc, sheet_name="metadata")

phrase_list = pd.read_excel(phrase_loc)

# Only the phrase text is used as the join key, so keep just that column.
# De-duplicate so a phrase listed twice cannot multiply rows in the merges below.
phrases_to_keep = (
    phrase_list.loc[phrase_list['keep_phrase'] == 1, ['phrase']]
    .drop_duplicates()
)

reference_phrases = no_context[no_context['phrase_type'] == 'reference'].copy()

# Match reference phrases to the kept phrases on the phrase text, then reduce the
# result to the distinct phrase_num values that survive the filter.
merged_phrases = (
    pd.merge(reference_phrases, phrases_to_keep, on='phrase', how='inner')[['phrase_num']]
    .drop_duplicates()
)

# Restrict every sheet to the surviving phrase numbers.
no_context = pd.merge(no_context, merged_phrases, on='phrase_num', how='inner')
known = pd.merge(known, merged_phrases, on='phrase_num', how='inner')
unknown = pd.merge(unknown, merged_phrases, on='phrase_num', how='inner')

### Create the base LLR table

In [75]:
# Columns that identify one occurrence of a phrase.
cols = ['phrase_num', 'phrase_occurence', 'original_phrase']

# Stack the key columns of both sheets, keep each combination once and order the
# result by the keys (ascending) with a fresh 0..n-1 index.
llr_base = (
    pd.concat((known.loc[:, cols], unknown.loc[:, cols]), axis=0, ignore_index=True)
      .drop_duplicates(keep='first')
      .sort_values(by=cols, ascending=True, ignore_index=True)
)

### Get phrase statistics

In [76]:
def _summarise_reference_pmf(frame, group_keys, suffix, with_counts=False):
    """Per-group share of `raw_prob` held by the reference phrase, plus its -log10.

    Parameters
    ----------
    frame : DataFrame with `raw_prob`, `phrase_type` and the grouping column(s).
    group_keys : column name or list of column names to group by.
    suffix : appended to the output column names (`pmf_<suffix>`, `llr_<suffix>`).
    with_counts : when True, also emit `num_phrases` (rows per group) and
        `phrases_kept` (a copy of `num_phrases`).

    Returns
    -------
    DataFrame indexed by `group_keys`.
    """
    aggregations = {
        'sum_raw_prob': ('raw_prob', 'sum'),
        'reference_prob': ('ref_raw_prob', 'max'),
    }
    if with_counts:
        aggregations = {'num_phrases': ('phrase_num', 'size'), **aggregations}

    stats = (
        frame
        # Blank out raw_prob on non-reference rows so 'max' picks the reference value.
        .assign(ref_raw_prob=frame['raw_prob'].where(frame['phrase_type'].eq('reference')))
        .groupby(group_keys, dropna=False)
        .agg(**aggregations)
    )
    if with_counts:
        stats = stats.assign(phrases_kept=stats['num_phrases'])

    pmf_col = f'pmf_{suffix}'
    llr_col = f'llr_{suffix}'
    stats[pmf_col] = stats['reference_prob'].div(stats['sum_raw_prob'])

    # np.where evaluates both branches for every element, so log10 is also applied
    # to zero/NaN entries whose result is discarded; silence that RuntimeWarning.
    # NOTE(review): a pmf of 0 or NaN (no reference row in the group) is scored as
    # an LLR of 0.0 -- confirm that this is intended.
    with np.errstate(divide='ignore', invalid='ignore'):
        stats[llr_col] = np.where(stats[pmf_col] > 0, -np.log10(stats[pmf_col]), 0.0)

    return stats.drop(columns=['sum_raw_prob', 'reference_prob'])


# 1) No context phrase stats (one row per phrase_num, with candidate counts)
no_context_phrase_stats = _summarise_reference_pmf(
    no_context, 'phrase_num', 'no_context', with_counts=True
)

# 2) Known phrase stats (one row per phrase occurrence)
known_phrase_stats = _summarise_reference_pmf(
    known, ['phrase_num', 'phrase_occurence'], 'known'
)

# 3) Unknown phrase stats (one row per phrase occurrence)
unknown_phrase_stats = _summarise_reference_pmf(
    unknown, ['phrase_num', 'phrase_occurence'], 'unknown'
)

### Compare the reference phrase to the top ranked phrase

In [77]:
# Rank every candidate (reference included) within each phrase occurrence by its
# summed log-probability: rank 1 is the highest value, ties broken by row order.
unknown['rank_including_ref'] = (
    unknown
      .groupby(['phrase_num', 'phrase_occurence'])['sum_log_probs_phrase']
      .rank(ascending=False, method='first')
)

# reference_log_probs: value on the first 'reference' row of each occurrence
ref = (
    unknown[unknown['phrase_type'] == 'reference']
    .groupby(['phrase_num', 'phrase_occurence'])['sum_log_probs_phrase']
    .first()
    .rename('reference_log_probs')
)

# max_log_probs: value on the top-ranked row (rank_including_ref == 1)
top = (
    unknown[unknown['rank_including_ref'] == 1]
    .groupby(['phrase_num', 'phrase_occurence'])['sum_log_probs_phrase']
    .first()
    .rename('max_log_probs')
)

# Align both series on (phrase_num, phrase_occurence) and turn the keys into columns.
agg_df = pd.concat([ref, top], axis=1).reset_index()

# Zero when the reference is itself the top-ranked phrase, negative otherwise.
agg_df['unknown_ref_vs_top_rank'] = agg_df['reference_log_probs'] - agg_df['max_log_probs']

# The keys are already columns, so no further reset_index is needed; a second one
# would only add a stray 'index' column.
agg_df = agg_df.drop(columns=['reference_log_probs', 'max_log_probs'])


In [78]:
# Start from the base table with the join keys cast to explicit nullable dtypes.
LLR = llr_base.assign(
    phrase_num=llr_base['phrase_num'].astype('string'),
    phrase_occurence=pd.to_numeric(llr_base['phrase_occurence'], errors='coerce').astype('Int64'),
)

# Left-join each statistics table so every base row is retained.
LLR = pd.merge(LLR, no_context_phrase_stats, how='left', on='phrase_num')
LLR = pd.merge(LLR, known_phrase_stats, how='left', on=['phrase_num', 'phrase_occurence'])
LLR = pd.merge(LLR, unknown_phrase_stats, how='left', on=['phrase_num', 'phrase_occurence'])
LLR = pd.merge(LLR, agg_df, how='left', on=['phrase_num', 'phrase_occurence'])

# Keep only the reporting columns, in presentation order.
LLR = LLR.loc[:, [
    'phrase_num', 'phrase_occurence', 'original_phrase',
    'num_phrases', 'phrases_kept',
    'pmf_no_context', 'pmf_known', 'pmf_unknown',
    'llr_no_context', 'llr_known', 'llr_unknown',
    'unknown_ref_vs_top_rank',
]]

In [79]:
# Display the assembled per-phrase LLR table for inspection.
LLR

Unnamed: 0,phrase_num,phrase_occurence,original_phrase,num_phrases,phrases_kept,pmf_no_context,pmf_known,pmf_unknown,llr_no_context,llr_known,llr_unknown,unknown_ref_vs_top_rank
0,phrase_01,1,", this is not",51,51,0.488617,0.284798,0.314008,0.311031,0.545463,0.50306,-0.63697
1,phrase_02,1,", but this",24,24,0.012607,0.020277,0.024125,1.899381,1.693002,1.617525,-3.563433
2,phrase_02,2,", but this",24,24,0.012607,0.030038,,1.899381,1.522332,,
3,phrase_03,1,", you are",29,29,0.716338,0.914467,0.915033,0.144882,0.038832,0.038563,0.0
4,phrase_04,1,do not have,50,50,0.000732,0.614058,0.291087,3.135517,0.21179,0.535977,-0.193795
5,phrase_05,1,one of the,19,19,0.881184,0.970453,0.959643,0.054933,0.013026,0.01789,0.0
6,phrase_06,1,welcome to improve,59,59,0.254981,0.083369,0.032135,0.593492,1.078997,1.493017,-2.829769
7,phrase_07,1,you do not,3,3,0.059834,0.21725,0.377068,1.223055,0.663041,0.42358,-0.492485
8,phrase_08,1,about this,58,58,0.894526,0.662013,0.058029,0.048407,0.179133,1.236355,-2.740012
9,phrase_09,1,articles on,33,33,0.033955,0.397028,0.425706,1.469096,0.401179,0.370891,0.0


In [80]:
# Collapse the per-phrase table into a single summary row: distinct phrase counts
# and column totals (missing values are skipped by the sums).
LLR_summary = pd.DataFrame({
    'num_phrases': [LLR['phrase_num'].nunique()],
    'phrases_kept': [LLR.loc[LLR['phrases_kept'].gt(0), 'phrase_num'].nunique()],
    'llr_no_context': [LLR['llr_no_context'].sum(skipna=True)],
    'llr_known': [LLR['llr_known'].sum(skipna=True)],
    'llr_unknown': [LLR['llr_unknown'].sum(skipna=True)],
    'llr_unknown_vs_top_rank': [LLR['unknown_ref_vs_top_rank'].sum(skipna=True)],
})

# Per-phrase averages: each total divided by the number of phrases kept.
LLR_summary['normalised_llr_no_context'] = LLR_summary['llr_no_context'] / LLR_summary['phrases_kept']
LLR_summary['normalised_llr_known'] = LLR_summary['llr_known'] / LLR_summary['phrases_kept']
LLR_summary['normalised_llr_unknown'] = LLR_summary['llr_unknown'] / LLR_summary['phrases_kept']


In [81]:
# Remove from the metadata sheet every column the LLR summary is about to supply,
# so the concatenation below cannot produce duplicate column names.
overlapping_cols = metadata.columns[metadata.columns.isin(LLR_summary.columns)]
metadata = metadata.loc[:, ~metadata.columns.isin(overlapping_cols)]

# Append the summary columns side by side (rows are aligned on the index).
metadata = pd.concat((metadata, LLR_summary), axis='columns')

In [82]:
# Display the metadata row with the LLR summary columns appended.
metadata

Unnamed: 0,index,sample_id,problem,corpus,known_author,unknown_author,unknown_doc_id,known_doc_id,target,num_phrases,phrases_kept,llr_no_context,llr_known,llr_unknown,llr_unknown_vs_top_rank,normalised_llr_no_context,normalised_llr_known,normalised_llr_unknown
0,0,1,Hodja_Nasreddin vs Hodja_Nasreddin,Wiki,Hodja_Nasreddin,Hodja_Nasreddin,hodja_nasreddin_text_3,hodja_nasreddin_text_1,True,12,12,18.661448,10.42207,9.763942,-14.550765,1.555121,0.868506,0.813662


In [83]:
def _filter_sheets_to_kept_phrases(no_context, known, unknown, phrase_loc):
    """Restrict the three score sheets to phrases flagged `keep_phrase == 1` in the phrase list."""
    phrase_list = pd.read_excel(phrase_loc)

    # Only the phrase text is used as the join key. De-duplicate so a phrase listed
    # twice cannot multiply rows in the merges below.
    phrases_to_keep = (
        phrase_list.loc[phrase_list['keep_phrase'] == 1, ['phrase']]
        .drop_duplicates()
    )
    reference_phrases = no_context[no_context['phrase_type'] == 'reference']

    # Distinct phrase numbers whose reference phrase is in the keep list.
    kept_phrase_nums = (
        pd.merge(reference_phrases, phrases_to_keep, on='phrase', how='inner')[['phrase_num']]
        .drop_duplicates()
    )
    return tuple(
        pd.merge(sheet, kept_phrase_nums, on='phrase_num', how='inner')
        for sheet in (no_context, known, unknown)
    )


def _neg_log10_or_zero(pmf):
    """Return `-log10(pmf)` where `pmf > 0` and `0.0` everywhere else, without warnings."""
    # np.where evaluates both branches for every element, so log10 is also applied
    # to zero/NaN entries whose result is discarded; silence that RuntimeWarning.
    # NOTE(review): a pmf of 0 or NaN (no reference row in the group) is scored as
    # an LLR of 0.0 -- confirm that this is intended.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(pmf > 0, -np.log10(pmf), 0.0)


def _phrase_stats(frame, group_keys, suffix, with_counts=False):
    """Per-group share of `raw_prob` held by the reference row (`pmf_<suffix>`) and its LLR."""
    aggregations = {
        'sum_raw_prob': ('raw_prob', 'sum'),
        'reference_prob': ('ref_raw_prob', 'max'),
    }
    if with_counts:
        aggregations = {'num_phrases': ('phrase_num', 'size'), **aggregations}

    stats = (
        frame
        # Blank out raw_prob on non-reference rows so 'max' picks the reference value.
        .assign(ref_raw_prob=frame['raw_prob'].where(frame['phrase_type'].eq('reference')))
        .groupby(group_keys, dropna=False)
        .agg(**aggregations)
    )
    if with_counts:
        stats = stats.assign(phrases_kept=stats['num_phrases'])

    pmf_col = f'pmf_{suffix}'
    stats[pmf_col] = stats['reference_prob'].div(stats['sum_raw_prob'])
    stats[f'llr_{suffix}'] = _neg_log10_or_zero(stats[pmf_col])
    return stats.drop(columns=['sum_raw_prob', 'reference_prob'])


def _reference_vs_top_rank(ranked_unknown):
    """Per occurrence: reference `sum_log_probs_phrase` minus that of the rank-1 row."""
    keys = ['phrase_num', 'phrase_occurence']

    reference_scores = (
        ranked_unknown[ranked_unknown['phrase_type'] == 'reference']
        .groupby(keys)['sum_log_probs_phrase']
        .first()
        .rename('reference_log_probs')
    )
    top_scores = (
        ranked_unknown[ranked_unknown['rank_including_ref'] == 1]
        .groupby(keys)['sum_log_probs_phrase']
        .first()
        .rename('max_log_probs')
    )

    # Align on the keys, then expose them as ordinary columns for the later merge.
    comparison = pd.concat([reference_scores, top_scores], axis=1).reset_index()
    comparison['unknown_ref_vs_top_rank'] = (
        comparison['reference_log_probs'] - comparison['max_log_probs']
    )
    return comparison.drop(columns=['reference_log_probs', 'max_log_probs'])


def _write_results_workbook(path, sheets):
    """Write each `sheet_name -> DataFrame` entry to `path`, replacing same-named sheets."""
    path.parent.mkdir(parents=True, exist_ok=True)

    # Append to an existing workbook, otherwise create it.
    writer_kwargs = {"engine": "openpyxl", "mode": "a" if path.exists() else "w"}
    if writer_kwargs["mode"] == "a":
        writer_kwargs["if_sheet_exists"] = "replace"  # only valid in append mode

    with pd.ExcelWriter(path, **writer_kwargs) as writer:
        for sheet_name, frame in sheets.items():
            frame.to_excel(writer, index=False, sheet_name=sheet_name)


def create_results_doc_pipeline(doc_loc, write_excel=True, save_dir=None, phrase_loc=None):
    """Score one document-pair workbook and return its metadata row with LLR summary columns.

    Parameters
    ----------
    doc_loc : path to a workbook with the sheets "docs", "known", "unknown",
        "no context" and "metadata".
    write_excel : when True, write the (filtered) sheets plus the "LLR" and updated
        "metadata" sheets to `save_dir/<basename of doc_loc>`.
    save_dir : output directory; required when `write_excel` is True.
    phrase_loc : optional path to a phrase list workbook; when given, only phrases
        whose `keep_phrase` equals 1 are scored.

    Returns
    -------
    DataFrame: the metadata sheet with the summary columns replaced/appended.

    Raises
    ------
    ValueError
        If `write_excel` is True and `save_dir` is None.
    """
    # Fail before any work is done instead of with a TypeError at write time.
    if write_excel and save_dir is None:
        raise ValueError("save_dir must be provided when write_excel=True")

    doc_name = os.path.basename(doc_loc)

    print(f"Processing Document: {doc_name}")
    # Read the sheets as dataframes
    docs = pd.read_excel(doc_loc, sheet_name="docs")
    known = pd.read_excel(doc_loc, sheet_name="known")
    unknown = pd.read_excel(doc_loc, sheet_name="unknown")
    no_context = pd.read_excel(doc_loc, sheet_name="no context")
    metadata = pd.read_excel(doc_loc, sheet_name="metadata")

    # Optionally keep only the phrases flagged in the phrase list
    if phrase_loc:
        no_context, known, unknown = _filter_sheets_to_kept_phrases(
            no_context, known, unknown, phrase_loc
        )

    # Base LLR table: one row per distinct phrase occurrence seen in either sheet
    cols = ['phrase_num', 'phrase_occurence', 'original_phrase']
    llr_base = (
        pd.concat([known[cols], unknown[cols]], ignore_index=True)
        .drop_duplicates()
        .sort_values(cols)
        .reset_index(drop=True)
    )

    # Phrase statistics per context
    occurrence_keys = ['phrase_num', 'phrase_occurence']
    no_context_phrase_stats = _phrase_stats(no_context, 'phrase_num', 'no_context', with_counts=True)
    known_phrase_stats = _phrase_stats(known, occurrence_keys, 'known')
    unknown_phrase_stats = _phrase_stats(unknown, occurrence_keys, 'unknown')

    # Rank all phrases including the reference phrase (rank 1 = highest value,
    # ties broken by row order); the rank column is kept on the written sheet.
    unknown = unknown.assign(
        rank_including_ref=(
            unknown
            .groupby(occurrence_keys)['sum_log_probs_phrase']
            .rank(ascending=False, method='first')
        )
    )
    agg_df = _reference_vs_top_rank(unknown)

    # Create final LLR table; keys are cast to explicit nullable dtypes before joining
    LLR = (
        llr_base
        .assign(
            phrase_num=llr_base['phrase_num'].astype('string'),
            phrase_occurence=pd.to_numeric(llr_base['phrase_occurence'], errors='coerce').astype('Int64')
        )
        .merge(no_context_phrase_stats, on='phrase_num', how='left')
        .merge(known_phrase_stats, on=occurrence_keys, how='left')
        .merge(unknown_phrase_stats, on=occurrence_keys, how='left')
        .merge(agg_df, on=occurrence_keys, how='left')
    )

    LLR = LLR[['phrase_num', 'phrase_occurence', 'original_phrase', 'num_phrases', 'phrases_kept',
               'pmf_no_context', 'pmf_known', 'pmf_unknown', 'llr_no_context', 'llr_known', 'llr_unknown',
               'unknown_ref_vs_top_rank']]

    # Summarise the LLR table for the metadata
    LLR_summary = pd.DataFrame([{
        'num_phrases': LLR['phrase_num'].nunique(),
        'phrases_kept': LLR.loc[LLR['phrases_kept'] > 0, 'phrase_num'].nunique(),
        'llr_no_context': LLR['llr_no_context'].sum(skipna=True),
        'llr_known': LLR['llr_known'].sum(skipna=True),
        'llr_unknown': LLR['llr_unknown'].sum(skipna=True),
        'llr_unknown_vs_top_rank': LLR['unknown_ref_vs_top_rank'].sum(skipna=True)
    }])

    LLR_summary = LLR_summary.assign(
        normalised_llr_no_context=lambda d: d['llr_no_context'] / d['phrases_kept'],
        normalised_llr_known=lambda d: d['llr_known'] / d['phrases_kept'],
        normalised_llr_unknown=lambda d: d['llr_unknown'] / d['phrases_kept']
    )

    # Create final metadata table: drop columns the summary re-supplies, then
    # append the summary side by side (rows are aligned on the index).
    overlapping_cols = LLR_summary.columns.intersection(metadata.columns)
    metadata_final = metadata.drop(columns=overlapping_cols, errors='ignore')
    metadata_final = pd.concat([metadata_final, LLR_summary], axis=1)

    if write_excel:
        print("Writing file")
        _write_results_workbook(
            Path(save_dir) / doc_name,
            {
                "docs": docs,
                "known": known,
                "unknown": unknown,
                "no context": no_context,
                "LLR": LLR,
                "metadata": metadata_final,
            },
        )

    return metadata_final

In [84]:
model_list = ['gpt2', 'gemma', 'llama', 'qwen']

# The phrase list is the same for every model, so it is defined once.
phrase_loc = '/Volumes/BCross/paraphrase examples slurm/wiki-phrase-list-reviewed.xlsx'

for model in model_list:
    print(f"Completing the scoring for {model}")
    save_dir = f"/Volumes/BCross/paraphrase examples slurm/{model} results/filtered_inc_rank"
    os.makedirs(save_dir, exist_ok=True)

    result_save_loc = f'/Volumes/BCross/paraphrase examples slurm/{model} results/filtered_results_inc_rank.xlsx'

    # The combined results file doubles as a "done" marker: models that already
    # have one are skipped and the loop moves on to the next model.
    if os.path.exists(result_save_loc):
        print(f"Path {result_save_loc} already exists. Exiting.")
        continue

    # Every model reads from its own "raw" directory except qwen, which reads
    # from the Wiki-test directory.
    if model != 'qwen':
        orig_dir = f"/Volumes/BCross/paraphrase examples slurm/{model} results/raw"
    else:
        orig_dir = '/Volumes/BCross/paraphrase examples slurm/Wiki-test'

    # Get all .xlsx files from the original directory
    xlsx_files = glob.glob(os.path.join(orig_dir, "*.xlsx"))

    all_metadata = []

    for i, file_path in enumerate(xlsx_files, start=1):
        print(f"Completing file {i} out of {len(xlsx_files)}")

        # Best effort: a failing workbook is reported and skipped so that one bad
        # file does not abort the whole batch.
        try:
            metadata = create_results_doc_pipeline(file_path, write_excel=True, save_dir=save_dir, phrase_loc=phrase_loc)
            all_metadata.append(metadata)
        except Exception as e:
            print(f"File failed: {file_path}\nError: {e}")
            continue

    # With nothing to combine there is no "index" column to sort by, and writing an
    # empty results file would wrongly mark this model as done on the next run.
    if not all_metadata:
        full_metadata = pd.DataFrame()
        print(f"No files were processed successfully for {model}; no results file written.")
        continue

    # Combine all metadata after processing
    full_metadata = pd.concat(all_metadata, ignore_index=True)
    full_metadata = full_metadata.sort_values(by="index").reset_index(drop=True)
    full_metadata.to_excel(result_save_loc, index=False)

    print("All files complete")


Completing the scoring for gpt2
Path /Volumes/BCross/paraphrase examples slurm/gpt2 results/filtered_results_inc_rank.xlsx already exists. Exiting.
Completing the scoring for gemma
Path /Volumes/BCross/paraphrase examples slurm/gemma results/filtered_results_inc_rank.xlsx already exists. Exiting.
Completing the scoring for llama
Path /Volumes/BCross/paraphrase examples slurm/llama results/filtered_results_inc_rank.xlsx already exists. Exiting.
Completing the scoring for qwen
Completing file 1 out of 672
Processing Document: mymoloboaccount_text_2 vs mymoloboaccount_text_10.xlsx
Writing file
Completing file 2 out of 672
Processing Document: livelikemusic_text_5 vs livelikemusic_text_3.xlsx
Writing file
Completing file 3 out of 672
Processing Document: legolas2186_text_2 vs legolas2186_text_3.xlsx
Writing file
Completing file 4 out of 672
Processing Document: obamafan70_text_3 vs orangemarlin_text_4.xlsx
Writing file
Completing file 5 out of 672
Processing Document: nil_einne_text_11 vs 

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Writing file
Completing file 389 out of 672
Processing Document: irvine22_text_1 vs irvine22_text_3.xlsx
Writing file
Completing file 390 out of 672
Processing Document: jmrh6_text_3 vs jmrh6_text_1.xlsx
Writing file
Completing file 391 out of 672
Processing Document: machine_elf_1735_text_4 vs machine_elf_1735_text_3.xlsx
Writing file
Completing file 392 out of 672
Processing Document: nocrowx_text_5 vs notpietru_text_2.xlsx
Writing file
Completing file 393 out of 672
Processing Document: mystichumwipe_text_1 vs n419bh_text_4.xlsx
Writing file
Completing file 394 out of 672
Processing Document: jimharlow99_text_13 vs jimharlow99_text_10.xlsx
Writing file
Completing file 395 out of 672
Processing Document: jerekrischel_text_11 vs jerekrischel_text_13.xlsx
Writing file
Completing file 396 out of 672
Processing Document: hodja_nasreddin_text_11 vs honestopl_text_1.xlsx
Writing file
Completing file 397 out of 672
Processing Document: kudzu1_text_2 vs kudzu1_text_1.xlsx
Writing file
Comple