In [1]:
import logging
import os
import time

# Wall-clock reference for the whole run; captured before any setup work happens.
start_time = time.time()  # note at start of script

# %% Environment Setup
# Project-local helpers. The import sits here (not at the top of the cell) and
# setup_bmc5_env() is called before the remaining project imports further down.
# NOTE(review): presumably later imports rely on the environment it configures -- confirm
# before hoisting imports to the top.
from utils import setup_bmc5_env, setup_logging_results

setup_bmc5_env()

# Parameters specific to the current run.
# NOTE(review): CUDA_CORE is presumably read downstream to pick the GPU device -- confirm.
os.environ['CUDA_CORE'] = "cuda:1"  # NOTE: logically belongs in setup_bmc5_env(), but is kept here so each script can choose its own device.
# Evaluates to range(2234, 2500).
# NOTE(review): the +734 offset looks like a resume point inside the 1500-2500 slice -- confirm.
dataset_range = range(1500 + 734, 2500)

# setup_logging_results returns the results path that later cells pass on as `results_path`.
results_path = setup_logging_results(dataset_range)
logging.info(f"Script started")

# %% Data read and preprocess
# Imported only at this point, i.e. after setup_bmc5_env() has run earlier in the cell.
from data_read_preprocess import load_and_preprocess

# Load and preprocess the samples selected by `dataset_range`.
preprocessed_dataset = load_and_preprocess(dataset_range)
logging.info(f"Data read and preprocessing completed for dataset {dataset_range}")

# %% Model setup

from model_setup import setup_model_tokenizer
import pandas as pd

# Binds the `tokenizer` and `model` objects that the inference cell below passes to
# run_peturbed_inference.
# NOTE: in the recorded run this call raised a CUDA OutOfMemoryError while loading the
# checkpoint shards (GPU 1 was almost fully occupied by another process), so neither
# name was bound and the log line below was never reached.
tokenizer, model = setup_model_tokenizer()
logging.info("Model setup completed")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 1 has a total capacity of 44.55 GiB of which 18.31 MiB is free. Process 3829304 has 342.00 MiB memory in use. Process 4138224 has 40.18 GiB memory in use. Including non-PyTorch memory, this process has 4.01 GiB memory in use. Of the allocated memory 3.75 GiB is allocated by PyTorch, and 1.88 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [2]:
# Load the reconstructed-query frame pickled by an earlier run.
# NOTE: this is a hard-coded absolute path on one machine; point it at your own results
# directory before re-running.
# NOTE: unpickling can execute arbitrary code -- only load pickle files you produced or trust.
reconstructed_df_path = '/home/s4user/PromptExplainabilityProject/ExplainabilityModularized/Results/2024-03-28_15-20-19_range(1500, 2500)/reconstructed_df.pkl'
reconstructed_df = pd.read_pickle(reconstructed_df_path)
# Optional: restrict the frame to a single perturbed column (plus `component_range`) for a quick run.
# reconstructed_df = reconstructed_df[['bottom_reconstructed_query_0.4', 'component_range']]

In [4]:
# Names of every column whose label contains the '_reconstructed_' marker,
# kept in the frame's own column order.
column_names = [name for name in reconstructed_df.columns if '_reconstructed_' in name]

# Displayed as the cell output: how many such columns exist.
len(column_names)

16

In [5]:
from inference import infer 
import torch

def run_peturbed_inference(df, model, tokenizer, results_path, column_names=None):
    """Run `infer` on every perturbed ("reconstructed") query column of `df`.

    For each selected column, `infer` is called once per row with that row's
    query text and its `component_range` value. The four values it returns are
    stored in four new columns named ``<col>_token_level``, ``<col>_word_level``,
    ``<col>_component_level`` and ``<col>_output``.

    A row for which `infer` returns ``None`` (its failure signal) no longer
    aborts the whole column: that row gets ``None`` in all four result columns
    and its index label is logged as a warning.

    Args:
        df: DataFrame holding the perturbed query columns and a
            ``component_range`` column. It is modified in place.
        model: Model object forwarded unchanged to `infer`.
        tokenizer: Tokenizer object forwarded unchanged to `infer`.
        results_path: String prefix for the intermediate pickle; the file is
            written to ``results_path + "_intermediate-run_peturbed_inference.pkl"``.
        column_names: Columns to process. Defaults to every column whose name
            contains ``'_reconstructed_'``.

    Returns:
        The same `df` object, with the result columns added.
    """
    logging.info("Inferencing Peturbed samples -------------------------")

    if column_names is None:
        # Default: every column marked as a reconstructed (perturbed) query.
        column_names = [col for col in df.columns if '_reconstructed_' in col]

    if df.empty:
        # Nothing to infer; a row-wise apply on an empty frame yields no per-row
        # results to unpack into the four output columns.
        logging.warning("run_peturbed_inference: empty DataFrame, nothing to infer")
        return df

    # Placeholder stored for rows whose inference failed (infer returned None).
    failed_row = (None, None, None, None)

    for col_name in column_names:
        results = df.apply(lambda row: infer(row[col_name], model, tokenizer,
                                             component_sentences=row['component_range'],
                                             logging_ind=f"{row.name=},{col_name=}")
                           , axis=1)

        # Replace failure markers so the 4-way unpacking below cannot break.
        failed_labels = [label for label, res in zip(results.index, results) if res is None]
        if failed_labels:
            logging.warning(
                f"infer failed for {len(failed_labels)} row(s) in column "
                f"{col_name!r}: index labels {failed_labels}"
            )
        per_row = [failed_row if res is None else res for res in results]

        df[f'{col_name}_token_level'], \
            df[f'{col_name}_word_level'], \
            df[f'{col_name}_component_level'], \
            df[f'{col_name}_output'] = zip(*per_row)

        # Checkpoint after each column so a later crash does not lose finished work;
        # on restart, pass only the remaining columns via `column_names`.
        df.to_pickle(results_path + "_intermediate-run_peturbed_inference.pkl")
        torch.cuda.empty_cache()

    return df


# Run inference over the perturbed columns. `column_names` is left at its default, so
# every '_reconstructed_' column of the frame is processed. `model` and `tokenizer`
# must have been bound by the model-setup cell, and the returned frame is the same
# object as `reconstructed_df`, which is modified in place.
peturbed_inferenced_df = run_peturbed_inference(reconstructed_df, model, tokenizer, results_path)