# Rescore Excel with new model

This notebook rescores a previously completed Excel file using a new model. It is needed because I did not build a full pipeline at the outset and instead did the paraphrasing and the scoring in a single step.

The script to run this through CSF etc. is **scripts/run_scoring_with_new_model.py**

In [46]:
import sys

from pathlib import Path
from from_root import from_root

sys.path.insert(0, str(from_root("src")))

from read_and_write_docs import read_excel_sheets
from model_loading import load_model
from n_gram_functions import (
    get_scored_df,
    get_scored_df_no_context
)
from excel_functions import create_excel_template

In [47]:
# Input workbook to rescore, and the directory the rescored copy is written to.
data_loc = '/Volumes/BCross/paraphrase examples slurm/Wiki-test/hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx'
save_dir = '/Volumes/BCross/paraphrase examples slurm/Wiki-test/gemma'

data_path = Path(data_loc)
if not data_path.is_file():
    raise FileNotFoundError(f"Input file not found: {data_path}")

# Build write_path FIRST using save_dir + input filename (stem) + ".xlsx"
save_dir_path = Path(save_dir)
write_path = save_dir_path / f"{data_path.stem}.xlsx"

# Validate the output location up front so a bad save_dir is reported before
# any scoring is done, not when the workbook is finally written.
# The output shares the input's file name, so pointing save_dir at the input's
# own directory would overwrite the workbook being rescored.
if write_path.resolve() == data_path.resolve():
    raise ValueError(
        f"Output path would overwrite the input file: {write_path}"
    )

# Create the output directory (and any missing parents) if it is not there yet.
save_dir_path.mkdir(parents=True, exist_ok=True)

In [48]:
# Location of the model used for rescoring.
model_loc = '/Volumes/BCross/models/gemma-3-270m'
tokenizer, model = load_model(model_loc)

# Read the three sheets this notebook needs from the input workbook.
sheets_to_read = ['docs', 'no context', 'metadata']
data = read_excel_sheets(data_loc, sheets_to_read)

# The full texts live in the last row of the docs sheet; re-index that single
# row so it can be addressed as row 0.
docs = data['docs']
texts = docs.iloc[-1:].copy().reset_index()
known_text, unknown_text = texts.loc[0, 'known'], texts.loc[0, 'unknown']

# Keep only the metadata columns up to and including 'target'.
metadata = data['metadata']
metadata_subset = metadata.loc[:, slice(None, 'target')]

# Phrases (reference + paraphrases) with no surrounding context.
no_context = data['no context']

def _phrases_for_group(group_rows):
    """Return the reference phrase and paraphrase list for one phrase_num group.

    The reference is the first row whose phrase_type is "reference", or None
    when the group has no such row; the paraphrases are every row whose
    phrase_type is "paraphrase", in their original order.
    """
    phrase_type = group_rows["phrase_type"]
    reference_rows = group_rows.loc[phrase_type == "reference", "phrase"]

    if reference_rows.empty:
        reference_phrase = None
    else:
        # There should be only one reference per group; take the first.
        reference_phrase = reference_rows.iloc[0]

    return {
        "phrase": reference_phrase,
        "paraphrases": group_rows.loc[phrase_type == "paraphrase", "phrase"].tolist(),
    }


# Arrange the n-grams per phrase_num in the layout the scoring functions take:
# {phrase_num: {"phrase": <reference>, "paraphrases": [<paraphrase>, ...]}}
n_gram_dict = {}
for group_key, group_rows in no_context.groupby("phrase_num"):
    n_gram_dict[group_key] = _phrases_for_group(group_rows)

# Score the phrase sets against the known text, then the unknown text, and
# finally on their own with no document context.
print("Processing known text")
known_scored = get_scored_df(n_gram_dict, known_text, tokenizer, model)

print("Processing unknown text")
unknown_scored = get_scored_df(n_gram_dict, unknown_text, tokenizer, model)

print("Processing no context text")
no_context_scored = get_scored_df_no_context(n_gram_dict, tokenizer, model)

# Sheet names used in the output workbook, keyed by the keyword argument
# that create_excel_template takes for each one.
sheet_names = {
    "known_sheet": "known",
    "unknown_sheet": "unknown",
    "nc_sheet": "no context",
    "metadata_sheet": "metadata",
    "docs_sheet": "docs",
    "llr_sheet": "LLR",
}

# Write the rescored workbook to write_path. Left as the final bare expression
# so the notebook still displays the call's return value.
create_excel_template(
    known=known_scored,
    unknown=unknown_scored,
    no_context=no_context_scored,
    metadata=metadata_subset,
    docs=docs,
    path=write_path,
    use_xlookup=False,
    **sheet_names,
)

Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_01
Processing Phrase - phrase_02
Processing Phrase - phrase_03
Processing Phrase - phrase_04
Processing Phrase - phrase_05
Processing Phrase - phrase_06
Processing Phrase - phrase_07
Processing Phrase - phrase_08
Processing Phrase - phrase_09
Processing Phrase - phrase_10
Processing Phrase - phrase_11
Processing Phrase - phrase_12
Processing Phrase - phrase_01
→ [1/51] Processing reference…
→ [2/51] Processing paraphrase…
→ [3/51] Processing paraphrase…
→ [4/51] Processing paraphrase…
→ [5/51] Processing paraphrase…
→ [6/51] Processing paraphrase…
→ [7/51] Processing paraphrase…
→ [8/51] Processing paraphr

PosixPath('/Volumes/BCross/paraphrase examples slurm/Wiki-test/gemma/hodja_nasreddin_text_1 vs hodja_nasreddin_text_3.xlsx')