## 1. Load Data into Benchmark Tool

This section initializes the UWU-Benchmarks evaluation framework by loading the configuration settings and character definitions. The configuration file contains API keys, LLM selection, and benchmark parameters, while character files define the roleplay personas to be evaluated.

In [2]:
from datahelper import DataHelper

# Locations of the benchmark inputs, relative to the notebook's working directory
config_path = './config.ini'
characters_dir = './characters'

# Parse the configuration first: the character loader takes it as an argument
config = DataHelper.load_config(config_path)

# Load every character found in the directory (one entry per character)
characters = DataHelper.load_character_directory(characters_dir, config)

## 2. Roleplay Scene Generation

This phase uses the selected LLM to generate interactive roleplay scenes for each character. The system provides sample dialogues and character attributes as context, then generates multi-turn conversations between the user and AI character to evaluate roleplay performance.

In [None]:
from llms.gpt import GPT
from llms.claude import Claude
from llms.gemini import Gemini
from llms.novelai import NovelAI

# For each character, produce roleplay scenes
if config.test_llm.startswith('gpt'):
    for character in characters:
        character.rpscene = GPT.generate_roleplay(character, config)
elif config.test_llm.startswith('claude'):
    for character in characters:
        character.rpscene = Claude.generate_roleplay(character, config)
elif config.test_llm.startswith('gemini'):
    for character in characters:
        character.rpscene = Gemini.generate_roleplay(character, config)
else:
    for character in characters:
        character.rpscene = await NovelAI.generate_roleplay(character, config)

## 3. Benchmark Evaluation

This final stage applies all four benchmark algorithms (NVCS, ERTD, JERA, ALMP) to evaluate the generated roleplay content. Each algorithm measures different aspects of roleplay quality, and results are combined into comprehensive UWU scores across all test characters.

In [None]:
from algorithms.nvcs import NVCS
from algorithms.ertd import ERTD
from algorithms.jera import JERA
from algorithms.almp import ALMP
from algorithms.uwu import UWU
from datahelper import DataHelper

import datetime
import statistics as st

def _speaker_utterances(lines, speaker):
    """Join everything `speaker` said in `lines` into one space-separated string.

    Only lines starting with "<speaker>:" are used. The "<speaker>:" prefix and
    the single separator space after it (if present) are removed. Slicing off
    the prefix, rather than splitting on ": ", keeps lines such as
    "Name:text" from raising IndexError or being cut at a later ": ".
    """
    prefix = f"{speaker}:"
    utterances = []
    for line in lines:
        if not line.startswith(prefix):
            continue
        text = line[len(prefix):]
        utterances.append(text[1:] if text.startswith(" ") else text)
    return " ".join(utterances)


# Initialize benchmark algorithms
nvcs, ertd, jera, almp, uwu = NVCS(), ERTD(), JERA(), ALMP(), UWU()
nvcs_scores, ertd_scores, jera_scores, almp_scores, uwu_scores, character_evals = [], [], [], [], [], []

# Use generated responses to calculate benchmark scores
characters = DataHelper.load_generations_directory('./generations')
config = DataHelper.load_config('./config.ini')

# Fail early with a clear message: with no characters the mean/max/min
# aggregation below would raise only after the whole run has been reported as done
if len(characters) == 0:
    raise ValueError("No generated roleplay scenes found in './generations'; nothing to benchmark.")

# For each character, compute every benchmark score and collect it for the aggregate statistics
for character in characters:

    # The character's own utterances from the sample dialogues, space-separated, without the character's name
    charaOnlySampleDialogues = _speaker_utterances(character.smpdialogues, character.name)
    # The character's own utterances from the generated roleplay, space-separated, without the character's name
    charaOnlyGeneratedResponses = _speaker_utterances(character.rpscene, character.name)
    # The whole roleplay (both speakers, names included), one line per entry
    fullRoleplay = "\n".join(character.rpscene)

    print(f"========================== Benchmarking using {character.name}... ==========================\n")

    nvcs_score = nvcs.calculate_nvcs(config.ngram, charaOnlySampleDialogues, charaOnlyGeneratedResponses)
    print(f"- NVCS Score: {nvcs_score:.2f} out of 1.0\n")

    ertd_score = ertd.calculate_ertd(charaOnlySampleDialogues, charaOnlyGeneratedResponses)
    print(f"- ERTD Score: {ertd_score:.2f} out of 100\n")

    jera_score = jera.calculate_jera(character.name, character.attributes, character.smpdialogues, fullRoleplay, config)
    print(f"- JERA Score: {jera_score} out of 300\n")

    almp_score = almp.calculate_almp(character.attributes, charaOnlyGeneratedResponses, config)
    print(f"- ALMP Score: {almp_score:.2f} out of 1.0\n")

    uwu_score = uwu.calculate_uwu_score(nvcs_score, ertd_score, jera_score, almp_score)
    print(f"- UwU Score: {uwu_score:.2f} out of 100\n")

    nvcs_scores.append(nvcs_score)
    ertd_scores.append(ertd_score)
    jera_scores.append(jera_score)
    almp_scores.append(almp_score)
    uwu_scores.append(uwu_score)

    # Save this character's scores in character_evals
    character_evals.append(
        {
            'character': {
                'scores': {
                    'nvcs': nvcs_score,
                    'ertd': ertd_score,
                    'jera': jera_score,
                    'almp': almp_score,
                    'uwu': uwu_score
                },
                'name': character.name
            }
        }
    )

print("========================== Benchmarking all characters accomplished, now saving... ==========================")

# Metric name -> per-character scores; insertion order fixes the key order of every table below
scores_by_metric = {
    'nvcs': nvcs_scores,
    'ertd': ertd_scores,
    'jera': jera_scores,
    'almp': almp_scores,
    'uwu': uwu_scores
}

# Save Benchmark results to file
benchmark_results = {
    'date_generated': datetime.datetime.now().strftime("%Y%m%d_%H%M"),
    'results': {
        # Average results
        'avg_scores': {metric: st.mean(values) for metric, values in scores_by_metric.items()},
        # Best results
        'max_scores': {metric: max(values) for metric, values in scores_by_metric.items()},
        # Worst results
        'min_scores': {metric: min(values) for metric, values in scores_by_metric.items()},
        # Standard deviations of results; stdev needs at least two data points, so a single character reports 0
        'std_devs': {
            metric: st.stdev(values) if len(values) > 1 else 0
            for metric, values in scores_by_metric.items()
        }
    },
    'config': {
        'ngram': config.ngram,
        'smp_dialogues_max': config.smp_dialogues_max,
        'test_llm': config.test_llm,
        'rp_turns': config.rp_turns,
        'user_name': config.user_name
    },
    'characount': len(characters),
    'characters_used': character_evals
}

DataHelper.save_results(benchmark_results)