In [1]:
pip install openai==0.28


Note: you may need to restart the kernel to use updated packages.


In [32]:
pip install bert-score


Collecting bert-scoreNote: you may need to restart the kernel to use updated packages.

  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting torch>=1.0.0 (from bert-score)
  Downloading torch-2.5.0-cp312-cp312-win_amd64.whl.metadata (28 kB)
Collecting transformers>=3.0.0 (from bert-score)
  Downloading transformers-4.46.0-py3-none-any.whl.metadata (44 kB)
     ---------------------------------------- 0.0/44.1 kB ? eta -:--:--
     -------------------------------------- 44.1/44.1 kB 721.1 kB/s eta 0:00:00
Collecting sympy==1.13.1 (from torch>=1.0.0->bert-score)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers>=3.0.0->bert-score)
  Downloading huggingface_hub-0.26.2-py3-none-any.whl.metadata (13 kB)
Collecting safetensors>=0.4.1 (from transformers>=3.0.0->bert-score)
  Downloading safetensors-0.4.5-cp312-none-win_amd64.whl.metadata (3.9 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers>=

In [36]:
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"


In [38]:
pip install ipywidgets


Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading uri_template-1.3.0-py3-none-any.whl.metadata (8.8 kB)
Collecting webcolors>=1.11 (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.9.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.6.6->ipywidgets)
  Downloading webcolors-24.8.0-py3-none-any.whl.metadata (2.6 kB)
Downloading webcolors-24.8.0-py3-none-any.whl (15 kB)
Downloadi

In [27]:
pip install rouge

Collecting rougeNote: you may need to restart the kernel to use updated packages.

  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [18]:
#read data file
import json
with open('cura-llm-training-data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)


In [20]:
#To estimate roughly how many tokens it will take for input and output in each json object, 
#this will help find the optimal token numbers to assign
import statistics
# Function to estimate tokens based on character count
def estimate_tokens_by_characters(text):
    # Roughly 1 token for every 4 characters
    return len(text) / 4

# Analyze token usage for each input-output pair
token_analysis = []
for item in data:
    task_content = item.get("task_content", "")
    question = item.get("question", "")
    rubric = item.get("rubric", "")
    answer = item.get("answer", "")
    
    # Estimate tokens for input (task content, question, and rubric combined)
    input_text = f"Context: {task_content}\nQuestion: {question}\nRubric: {rubric}\nAnswer:"
    input_tokens = estimate_tokens_by_characters(input_text)
    
    # Estimate tokens for output (exemplar answer)
    output_tokens = estimate_tokens_by_characters(answer)
    
    token_analysis.append({
        "question_id": item["question_id"],
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "total_tokens": input_tokens + output_tokens
    })

# Calculate average token usage for input and output
average_input_tokens = sum(item["input_tokens"] for item in token_analysis) / len(token_analysis)
average_output_tokens = sum(item["output_tokens"] for item in token_analysis) / len(token_analysis)
# Collect output tokens for each answer
output_tokens_list = []

# Analyze output token usage for each JSON item
for item in data:
    answer = item.get("answer", "")
    # Estimate tokens for output (exemplar answer)
    output_tokens = estimate_tokens_by_characters(answer)
    output_tokens_list.append(output_tokens)

# Calculate maximum, minimum, and median output tokens
max_output_tokens = max(output_tokens_list)
min_output_tokens = min(output_tokens_list)
median_output_tokens = statistics.median(output_tokens_list)

print("Average Input Tokens:", average_input_tokens)
print("Average Output Tokens:", average_output_tokens)


print("Maximum Output Tokens:", max_output_tokens)
print("Minimum Output Tokens:", min_output_tokens)
print("Median Output Tokens:", median_output_tokens)

Average Input Tokens: 1329.5833333333333
Average Output Tokens: 68.39102564102564
Maximum Output Tokens: 296.0
Minimum Output Tokens: 2.0
Median Output Tokens: 52.5


In [30]:
# Extract all exemplar answers from the training data and put them in a list
exemplar_answers = []
for item in data:
    exemplar_answers.append({
        'question_id': item['question_id'],
        'generated_answer': item['answer'] 
    })

In [24]:
def prepare_input(task_content, question, rubric):
    return f"Context: {task_content}\nQuestion: {question}\nRubric: {rubric}\nAnswer:"


In [26]:
import openai

def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()



In [28]:
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})
print(generated_answers)



In [32]:
from sklearn.model_selection import KFold
from bert_score import score
import numpy as np
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5660456418991089
Average BERTScore Precision across folds: 0.5165732979774476
Average BERTScore Recall across folds: 0.6299543142318725


In [34]:
from rouge import Rouge 
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.21162943825581948
