In [1]:
import pandas as pd
import json
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import argparse
import os

# Name of the run being evaluated; it selects the results CSV below and the
# output sub-directory used by save_json_file.
file = "5shot_llama3"
# Semicolon-separated CSV whose second column holds the LLM output.
csv_file_path = f'../results/{file}.csv'
# Semicolon-separated CSV whose second column holds the ground truth.
gt_file_path = 'KeyphrasesGroundTruth.csv'

# Load the sentence transformer model once. compute_similarity looks up this
# module-level `model` name at call time.
# NOTE(review): the evaluation-setup cell further down assigns a string to
# `model`; re-run this cell before scoring again in the same kernel.
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def merge_and_process_csv(csv_data, gt_data):
    """Load predictions and references and place them side by side.

    The arguments are the file locations to read (the previous version
    ignored them and always read the module-level paths).

    Args:
        csv_data: Path to the semicolon-separated CSV with the LLM output;
            only its second column is read.
        gt_data: Path to the semicolon-separated CSV with the ground truth;
            only its second column is read.

    Returns:
        DataFrame with the columns 'Ground truth' and 'LLM', aligned by row
        position. When the files differ in length, pandas pads the shorter
        column with NaN.
    """
    llm_column = pd.read_csv(csv_data, usecols=[1], header=0, sep=";")
    llm_column.columns = ['LLM']
    gt_column = pd.read_csv(gt_data, usecols=[1], header=0, sep=";")
    gt_column.columns = ['Ground truth']
    return pd.concat([gt_column, llm_column], axis=1)

def clean_data(data):
    """Strip the ' - <suffix>' part from every comma-separated phrase.

    Each string cell is split on ', ', every piece is cut at its first ' - ',
    and the pieces are re-joined with ', '. Non-string cells (e.g. NaN) are
    returned unchanged.

    Args:
        data: DataFrame whose cells hold phrase lists such as
            'battery life - positive, screen - negative'.

    Returns:
        A new DataFrame of the same shape; the input is not modified.
    """
    def remove_after_dash(text):
        if isinstance(text, str):
            return ', '.join(part.split(' - ')[0] for part in text.split(', '))
        return text

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map. The check is on the class so that a column that happens
    # to be called 'map' cannot be mistaken for the method on older pandas.
    if hasattr(pd.DataFrame, 'map'):
        return data.map(remove_after_dash)
    return data.applymap(remove_after_dash)

def data_to_json(data):
    """Convert each row of `data` into a record dict.

    Every record has a constant "source" of "a", the row's 'Ground truth'
    value as "target" and the row's 'LLM' value as "predictions".

    Returns:
        List of dicts, one per row, in row order.
    """
    records = []
    for _, record in data.iterrows():
        entry = {
            "source": "a",
            "target": record['Ground truth'],
            "predictions": record['LLM'],
        }
        records.append(entry)
    return records

def save_json_file(data, file_name):
    """Write `data` as JSON Lines under model_outputs/sample/<file>/.

    The target directory is derived from the module-level `file` name and is
    created when missing. Each element of `data` is serialised on its own
    line, and the final path is printed.

    Args:
        data: Iterable of JSON-serialisable objects.
        file_name: Name of the file to create inside the run directory.
    """
    directory = f'model_outputs/sample/{file}'
    file_path = f'{directory}/{file_name}'

    # The run directory may not exist yet on a first execution.
    os.makedirs(directory, exist_ok=True)

    with open(file_path, 'w') as handle:
        for record in data:
            # One JSON object per line.
            json.dump(record, handle)
            handle.write('\n')

    print(f"JSON data saved to {file_path}")
def parse_phrases(sentiments_str):
    """Split a comma-separated string into whitespace-stripped phrases.

    Anything that is not a `str` (for example NaN from an empty CSV cell)
    yields an empty list.
    """
    if not isinstance(sentiments_str, str):
        return []
    return list(map(str.strip, sentiments_str.split(',')))

def compute_similarity(phrase1, phrase2):
    """Return the cosine similarity between the embeddings of two phrases.

    Each phrase is encoded separately with the module-level `model`, and the
    single entry of the resulting 1x1 similarity matrix is returned.
    """
    first_embedding = model.encode([phrase1])
    second_embedding = model.encode([phrase2])
    similarity_matrix = cosine_similarity(first_embedding, second_embedding)
    return similarity_matrix[0][0]

def compare_sentiments(sent1, sent2):
    """Tell whether two sentiment labels match after normalisation.

    Normalisation trims surrounding whitespace, lower-cases the label and
    then drops trailing full stops.
    """
    def _canonical(label):
        return label.strip().lower().rstrip('.')

    left = _canonical(sent1)
    right = _canonical(sent2)
    return left == right

def calculate_sentiment_score(data, similarity_threshold=0.7):
    """Fraction of ground-truth phrases whose sentiment the LLM reproduced.

    For every ground-truth phrase of the form '<text> - <sentiment>', the LLM
    phrase with the most similar text is selected. The pair counts as a hit
    when that similarity reaches `similarity_threshold` and the two sentiment
    labels match according to compare_sentiments. Phrases without ' - ' are
    skipped on both sides and do not count towards the total.

    Args:
        data: DataFrame with the columns 'Ground truth' and 'LLM', each cell
            holding a comma-separated list of '<text> - <sentiment>' phrases.
        similarity_threshold: Minimum cosine similarity for the best match to
            be accepted. Defaults to 0.7, the previously hard-coded value.

    Returns:
        hits / scored ground-truth phrases, or 0.0 when nothing was scored.
    """
    total_score = 0
    total_pairs = 0
    for index, row in data.iterrows():
        if index % 5 == 0:
            print(f"Processing row {index}")
        try:
            ground_truth_phrases = parse_phrases(row['Ground truth'])
            # Split the predictions once per row rather than once per
            # ground-truth phrase.
            llm_candidates = [
                phrase.rsplit(' - ', 1)
                for phrase in parse_phrases(row['LLM'])
                if ' - ' in phrase
            ]
            for gt_phrase in ground_truth_phrases:
                if ' - ' not in gt_phrase:
                    continue
                gt_text, gt_sentiment = gt_phrase.rsplit(' - ', 1)

                # Keep the best similarity together with its sentiment so the
                # winning pair does not have to be embedded a second time.
                # The strict '>' keeps the first candidate on ties, exactly as
                # max() did.
                best_similarity = None
                best_sentiment = None
                for llm_text, llm_sentiment in llm_candidates:
                    similarity = compute_similarity(gt_text, llm_text)
                    if best_similarity is None or similarity > best_similarity:
                        best_similarity = similarity
                        best_sentiment = llm_sentiment

                if (
                    best_similarity is not None
                    and best_similarity >= similarity_threshold
                    and compare_sentiments(gt_sentiment, best_sentiment)
                ):
                    total_score += 1
                total_pairs += 1
        except Exception as e:
            # Best effort: report the failing row and continue with the next.
            # Pairs already counted for this row stay in the totals.
            print(f"Error processing row {index}: {e}")
    return total_score / total_pairs if total_pairs > 0 else 0.0

# Main execution
# Build the two-column frame ('Ground truth', 'LLM') from the reference and
# prediction CSVs.
print("Merging CSV files...")
merged_data = merge_and_process_csv(csv_file_path, gt_file_path)

# Drop the ' - <sentiment>' suffixes so that only the phrase text is exported.
print("Cleaning merged data...")
cleaned_data = clean_data(merged_data)

# One {"source", "target", "predictions"} record per row.
print("Converting data to JSON...")
json_data = data_to_json(cleaned_data)

# Scoring deliberately runs on merged_data, not cleaned_data: clean_data
# removes the ' - <sentiment>' suffixes that calculate_sentiment_score needs.
print("Calculating sentiment score...")
sentiment_score = calculate_sentiment_score(merged_data)

# Print results
print(f"\nSentiment Score: {sentiment_score:.2f}")

# Save JSON data to file
# The records are written one per line under model_outputs/sample/<file>/.
save_json_file(json_data, 'sample_hypotheses_linked.json')

  from tqdm.autonotebook import tqdm, trange


Merging CSV files...
Cleaning merged data...
Converting data to JSON...
Calculating sentiment score...
Processing row 0


  return data.applymap(remove_after_dash)


Processing row 5

Sentiment Score: 0.82
JSON data saved to keyphrase-generation/KPEval/model_outputs/sample/5shot_llama3/sample_hypotheses_linked.json


In [2]:
# Notebook Cell 1
import os
import subprocess

# Force UTF-8 on the standard streams of Python child processes.
os.environ['PYTHONIOENCODING'] = 'utf-8'

# Set HOME_DIR and make it importable for child processes.
HOME_DIR = '.'
# Use the platform's path separator, drop empty entries, and add HOME_DIR only
# once so that re-running this cell does not keep growing PYTHONPATH.
pythonpath_entries = [
    entry
    for entry in os.environ.get('PYTHONPATH', '').split(os.pathsep)
    if entry
]
if HOME_DIR not in pythonpath_entries:
    pythonpath_entries.append(HOME_DIR)
os.environ['PYTHONPATH'] = os.pathsep.join(pythonpath_entries)

# Evaluation parameters.
dataset = 'sample'
# NOTE(review): `model` is the run name (a string) here. The scoring cell binds
# the same name to the SentenceTransformer instance that compute_similarity
# uses, so that cell has to be re-run before scoring again in this kernel.
model = '5shot_llama3'
metrics = 'semantic_matching'
OUTDIR = f'{HOME_DIR}/eval_results/{dataset}/{model}/'

# Create the output directory
os.makedirs(OUTDIR, exist_ok=True)


In [4]:
# Notebook Cell 2
# Run the evaluation script using subprocess
import sys

config_file = f'{HOME_DIR}/configs/sample_config_{dataset}.gin'
jsonl_file = f'{HOME_DIR}/model_outputs/{dataset}/{model}/{dataset}_hypotheses_linked.json'
log_file_prefix = OUTDIR

command = [
    # Use the interpreter running this kernel so the script sees the same
    # environment and packages; a bare 'python' may resolve to another one.
    sys.executable or 'python', f'{HOME_DIR}/run_evaluation.py',
    '--config-file', config_file,
    '--jsonl-file', jsonl_file,
    '--metrics', metrics,
    '--log-file-prefix', log_file_prefix
]

# Execute the command. The setup cell sets PYTHONIOENCODING to UTF-8 for the
# child, so decode with the same encoding instead of the locale default.
result = subprocess.run(
    command,
    capture_output=True,
    text=True,
    encoding='utf-8',
    errors='replace',
)
print(result.stdout)
print(result.stderr)

# Output is printed first so the diagnostics stay visible; a failed run then
# raises CalledProcessError instead of passing silently.
result.check_returncode()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading input files...
Calculating scores for the semantic_matching metric.
{'semantic_p': 0.9644260823726654, 'semantic_r': 0.9644260942935944, 'semantic_f1': 0.9644260883331295}


Preparing predictions:   0%|          | 0/10 [00:00<?, ?it/s]
Preparing predictions: 100%|██████████| 10/10 [00:00<00:00, 2997.64it/s]

Preparing references:   0%|          | 0/10 [00:00<?, ?it/s]
Preparing references: 100%|██████████| 10/10 [00:00<00:00, 3269.14it/s]

Preparing inputs:   0%|          | 0/10 [00:00<?, ?it/s]
Preparing inputs: 100%|██████████| 10/10 [00:00<00:00, 81284.96it/s]

Evaluating...:   0%|          | 0/10 [00:00<?, ?it/s]
Evaluating...:  10%|█         | 1/10 [00:03<00:33,  3.69s/it]
Evaluating...:  20%|██        | 2/10 [00:03<00:12,  1.59s/it]
Evaluating...:  30%|███       | 3/10 [00:03<00:06,  1.08it/s]
Evaluating...:  40%|████      | 4/10 [00:04<00:03,  1.64it/s]
Evaluating...:  50%|█████     | 5/10 [00:04<00:02,  2.33it/s]
Evaluating...:  70%|███████   | 7/10 [00:04<00:00,  3.80i