In [1]:
# import pandas as pd
# from SentenceParser import SentenceParser
# from PrepareSentenceContext import PrepareSentenceContext
# import pandas as pd
# from transformers import AutoTokenizer, AutoModelForCausalLM
# from PerplexityEvaluator import PerplexityEvaluator
# from PrepareSentenceContext import PrepareSentenceContext
# from tqdm import tqdm
# from many_atomic_detections import generate_responses
# from itertools import product

import os
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
from PerplexityEvaluator import PerplexityEvaluator
from PrepareSentenceContext import PrepareSentenceContext

In [2]:
# Cleaned source datasets, one CSV per domain.
dataset_paths = [
    'src/wiki_dataset_clean.csv',
    'src/news_dataset_clean.csv',
    'src/abstract_dataset_clean.csv',
]

# Wider sweep kept for reference; currently disabled:
# model_names = ['openai-community/gpt2', 'microsoft/phi-2', 'microsoft/Phi-3-mini-4k-instruct', 'tiiuae/falcon-7b']
# context_policies = ['previous-3-sentences']

# Active configuration: a single language model and no sentence context.
model_names = [
    'openai-community/gpt2',
]
context_policies = [
    'none',
]

# Dataset columns holding the text to score.
authors = [
    'human_text',
    'Llama3.1_clean',
]

In [3]:
# for ds_path, model_name, context_policy, author in product(dataset_paths, model_names, context_policies, authors):
#     generate_responses(ds_path, model_name, context_policy, author)

In [2]:
def _score_sentences(evaluator, parsed, row_id):
    """Score each (sentence, context) pair of one parsed document.

    Parameters:
    - evaluator: Callable invoked as ``evaluator(sentence, context)``.
    - parsed: Mapping with parallel ``"text"`` and ``"context"`` sequences.
    - row_id: Identifier stored in the ``"name"`` field of every record.

    Returns:
    A list of dicts, one per sentence that was scored without error. Sentences
    whose evaluation raises are reported on stdout and skipped (best effort).
    """
    records = []
    pairs = zip(parsed["text"], parsed["context"])
    for sentence_num, (sentence, context) in enumerate(pairs, start=1):
        try:
            response = evaluator(sentence, context)
            records.append({
                "num": sentence_num,
                "length": len(sentence.split()),
                "response": response,
                "context_length": len(context.split()) if context else 0,
                "name": row_id,
            })
        except Exception as e:
            # Deliberately best-effort: one bad sentence must not abort the run.
            print(f"Error processing sentence: {sentence[:50]} -> {e}")
    return records


def generate_responses(
    input_path, model_name, context_policy, author,
    output_dir="Responses", target_rows=100, checkpoint_path=None
):
    """
    Score the sentences of the first `target_rows` dataset rows and save the results to CSV.

    Rows whose ID already appears in the output file are skipped, so an
    interrupted run can be resumed by calling the function again with the
    same arguments. The output CSV is rewritten after every processed row.

    Parameters:
    - input_path: Path to the input dataset CSV.
    - model_name: Name of the language model for perplexity evaluation.
    - context_policy: Context policy to use for sentence processing.
    - author: Name of the dataset column containing the text to process.
    - output_dir: Directory where responses will be saved.
    - target_rows: Number of leading rows of the dataset to cover in this run.
    - checkpoint_path: Optional file that receives the 1-based position of the
      last processed row after each row.

    Raises:
    - KeyError: If `author` is not a column of the input dataset.
    """
    columns = ["num", "length", "response", "context_length", "name"]

    # Load input dataset and fail fast (before the expensive model load)
    # when the requested text column is missing.
    df = pd.read_csv(input_path)
    if author not in df.columns:
        raise KeyError(
            f"Column '{author}' not found in {input_path}; "
            f"available columns: {list(df.columns)}"
        )
    dataset_name = os.path.basename(input_path).split("_")[0]

    # Prepare output file path; split() also handles names without a "/".
    lm_name_str = model_name.split("/")[-1]
    save_path = os.path.join(
        output_dir, f"{dataset_name}_{author}_{context_policy}_{lm_name_str}.csv"
    )
    os.makedirs(output_dir, exist_ok=True)

    # Load existing responses if file exists
    if os.path.exists(save_path):
        responses_df = pd.read_csv(save_path)
    else:
        responses_df = pd.DataFrame(columns=columns)
    # Build the lookup of finished row IDs once instead of once per row.
    done_ids = set(responses_df["name"].unique())

    # Restrict to the requested leading rows; fall back to the row index
    # when the dataset has no "id" column.
    batch = df.iloc[:max(target_rows, 0)]
    row_ids = batch["id"].tolist() if "id" in batch.columns else batch.index.tolist()
    texts = batch[author].tolist()

    # Nothing left to do: avoid loading the model at all.
    if all(row_id in done_ids for row_id in row_ids):
        print(f"Finished processing rows. Saved to {save_path}.")
        return

    # Initialize model, tokenizer, and utilities
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    model.eval()

    evaluator = PerplexityEvaluator(model, tokenizer)
    parser = PrepareSentenceContext(context_policy=context_policy)

    # Process rows individually
    with tqdm(total=len(batch), desc="Processing rows", unit="row") as pbar:
        for position, (row_id, text) in enumerate(zip(row_ids, texts)):
            if row_id in done_ids:
                pbar.update(1)
                continue

            row_records = _score_sentences(evaluator, parser(text), row_id)

            # One concat per row (not per sentence); skip concat with an
            # empty frame, which recent pandas versions warn about.
            if row_records:
                row_df = pd.DataFrame(row_records, columns=columns)
                if responses_df.empty:
                    responses_df = row_df
                else:
                    responses_df = pd.concat([responses_df, row_df], ignore_index=True)
                done_ids.add(row_id)

            # Save progress incrementally
            if checkpoint_path:
                with open(checkpoint_path, "w") as f:
                    f.write(str(position + 1))
            responses_df.to_csv(save_path, index=False)
            pbar.update(1)

    print(f"Finished processing rows. Saved to {save_path}.")

In [3]:
# Settings for this run: human-written wiki text scored with phi-2,
# no sentence context, limited to the first 30 dataset rows.
wiki_human_run = {
    "input_path": "src/wiki_dataset_clean.csv",
    "model_name": "microsoft/phi-2",
    "context_policy": "none",
    "author": "human_text",
    "target_rows": 30,
}
generate_responses(**wiki_human_run)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Processing rows:   0%|          | 0/30 [02:30<?, ?row/s]


KeyboardInterrupt: 