############ fine-tuning LLM

In [1]:
#read data file
import json
with open('cura-llm-training-data.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

In [3]:
# Extract all exemplar answers from the training data and put them in a list
exemplar_answers = []
for item in data:
    exemplar_answers.append({
        'question_id': item['question_id'],
        'generated_answer': item['answer'] 
    })

############## modify the input to make the input query more concise and accurate
# Extract only the "item" and the "criteria" field, excluding "curriculum_codes" and "total_score"

In [21]:
def prepare_input1(task_content, question, rubric):
    rubric_data = json.loads(rubric)
    modified_rubric = {
        "items": rubric_data["items"],  
        "criteria": rubric_data["criteria"]
    }

    return f"Context: {task_content}\nQuestion: {question}\nRubric: {modified_rubric}\nAnswer:"

In [23]:
import openai

def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()



In [25]:
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})
print(generated_answers)



In [11]:
from sklearn.model_selection import KFold
import numpy as np
from rouge import Rouge 
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.21032785349405486


In [27]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5645399928092957
Average BERTScore Precision across folds: 0.5148611307144165
Average BERTScore Recall across folds: 0.6282832145690918


In [17]:
#### fine-tuning max_tokens
# when max_tokens = 100
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [19]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5728601336479187
Average BERTScore Precision across folds: 0.5354673743247986
Average BERTScore Recall across folds: 0.6198107957839966


In [29]:
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.21035546568021074


In [31]:
# when max_tokens = 200
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=200,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [33]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5634082674980163
Average BERTScore Precision across folds: 0.5079074263572693
Average BERTScore Recall across folds: 0.6368028879165649


In [35]:
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.1983589284504873


######### to conclude, max_tokens = 100 seems to perform the best

####### fine tuning parameter temperature

In [37]:
# when temperature is 1
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=1,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [39]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5651350140571594
Average BERTScore Precision across folds: 0.5271066188812256
Average BERTScore Recall across folds: 0.6128738164901734


In [41]:
# when temperature is 0.1
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0.1,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [43]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5676766276359558
Average BERTScore Precision across folds: 0.529153335094452
Average BERTScore Recall across folds: 0.6163294315338135


###### to conclude, it seems that when temperature is 0.2 the model performs the best

######## fine-tune presence penalty

In [45]:
# when presence penalty is 0.5
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.5
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [47]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5666139125823975
Average BERTScore Precision across folds: 0.5277320265769958
Average BERTScore Recall across folds: 0.6155896425247193


In [49]:
# when presence penalty is 0.7
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0.2,
        top_p=0.85,
        frequency_penalty=0.3,
        presence_penalty=0.7
    )
    return response.choices[0].message['content'].strip()
generated_answers = []

for item in data:
    answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
    generated_answers.append({'question_id': item['question_id'], 'generated_answer': answer})

In [51]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5658486962318421
Average BERTScore Precision across folds: 0.5264888286590577
Average BERTScore Recall across folds: 0.614991819858551


In [53]:
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.22158327516692605


########## to conclude, adjusting the value of presence penalty does not seem to improve the model performance

########## fine tune top p

In [55]:
# when top p = 0.75
def generate_exemplar_answer(task_content, question, rubric):
    prompt = prepare_input1(task_content, question, rubric)
    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are an assistant that provides exemplar answers."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=100,
        temperature=0.2,
        top_p=0.75,
        frequency_penalty=0.3,
        presence_penalty=0.2
    )
    return response.choices[0].message['content'].strip()

In [57]:
from bert_score import score
# Prepare lists of text answers only
exemplar_text = [item['generated_answer'] for item in exemplar_answers]
generated_text = [item['generated_answer'] for item in generated_answers]

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
scores = []
precision_scores = []
recall_scores = []
# Perform K-Fold Cross Validation with BERTScore
for train_index, test_index in kf.split(exemplar_text):
    # Use indices to get text-only answers for each fold
    train_exemplar = [exemplar_text[i] for i in train_index]
    train_generated = [generated_text[i] for i in train_index]
    
    # Calculate BERTScore for the current fold
    precision, recall, f1_scores = score(train_generated, train_exemplar, lang="en", model_type="bert-base-uncased")
    
    # Append the average F1 score, R, P for the fold
    scores.append(f1_scores.mean().item())
    precision_scores.append(precision.mean().item())
    recall_scores.append(recall.mean().item())

# Calculate the average BERTScore F1 across all folds
average_bertscore_f1 = np.mean(scores)
# Calculate the average BERTScore precision and recall across all folds
average_precision = np.mean(precision_scores)
average_recall = np.mean(recall_scores)

print("Average BERTScore F1 across folds:", average_bertscore_f1)
print("Average BERTScore Precision across folds:", average_precision)
print("Average BERTScore Recall across folds:", average_recall)

Average BERTScore F1 across folds: 0.5658486962318421
Average BERTScore Precision across folds: 0.5264888286590577
Average BERTScore Recall across folds: 0.614991819858551


In [61]:
rouge = Rouge()
# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)
rouge_scores = []
# Perform K-Fold Cross Validation
for train_index, test_index in kf.split(data):
    fold_rouge_scores = []
    for i in test_index:
        item = data[i]
        
        # Generate answer for the test fold
        generated_answer = generate_exemplar_answer(item['task_content'], item['question'], item['rubric'])
        
        # Calculate ROUGE score with the reference answer
        reference_answer = item['answer']
        rouge_score = rouge.get_scores(generated_answer, reference_answer, avg=True)
        
        # Store ROUGE-L score for this answer (use 'rouge-1' or 'rouge-2' if preferred)
        fold_rouge_scores.append(rouge_score['rouge-l']['f'])
    
    # Average ROUGE-L score for this fold
    rouge_scores.append(np.mean(fold_rouge_scores))

# Calculate overall average ROUGE-L score across all folds
average_rouge_score = np.mean(rouge_scores)
print("Average ROUGE-L F1 score across folds:", average_rouge_score)

Average ROUGE-L F1 score across folds: 0.21990919771661846


######## End of Model Training and Fine Tuning ############

######## Analysis ##########

In part 1 of this project, I began by estimating the number of tokens needed for input and output for each question. The average output token count is 68, with a median of 52.5 tokens. The minimum and maximum output token counts appear to be outliers in this case. Therefore, in part 2 of the fine-tuning process, I adjusted the parameter max_tokens to 100, 150, and 200 to observe how the evaluation metrics BERTScore and ROUGE-L fluctuate. I used these two metrics because they measure different aspects of the "quality" of the generated output texts. BERTScore is more sophisticated as it measures semantic similarity and pays less attention to word order, relying on cosine similarity. In contrast, ROUGE-L measures the exact text overlap between the generated output and the reference exemplar answers, giving a rough idea of how closely the generated output aligns with the reference. However, its limitation is that it does not account for order, context, or semantics as BERTScore does. Nevertheless, both metrics offer unique insights into the quality of the generated output.

Based on the statistics of the token count for the exemplar answers in the training dataset, it seems that most exemplar answer tokens cluster around 50–70 (since the average is 68 tokens and the median is 52.5). Therefore, I hypothesized that setting max_tokens close to this cluster (i.e., 50–70) would yield better BERTScore and ROUGE scores. Indeed, when max_tokens = 100 (it may be worth lowering this further in future tests, such as to 80), I achieved the best BERTScore F1 score of 0.57. There was an increase in precision but a decrease in recall compared to when max_tokens = 150, suggesting that the model is better at avoiding irrelevant content with a capped token limit, though at the cost of omitting some relevant content.

When max_tokens = 200, as expected, I obtained the lowest BERTScore F1 of 0.563 but the highest BERTScore Recall of 0.6368. This again confirms that the model faces a trade-off between generating concise but potentially less relevant answers and longer, more relevant answers. However, surprisingly, I obtained the lowest ROUGE score (0.198) when max_tokens = 200. Given the high token limit (200), I expected the model to produce more relevant content with higher text overlap, and therefore a higher ROUGE score. However, this may indicate that the model generates text with similar semantics rather than exact wording from the exemplar answers, which the ROUGE metric does not fully capture.

I also fine-tuned other parameters, such as temperature, top_p, and presence penalty (there are more parameters like frequency penalty that I have not yet tested). Temperature is a parameter that controls the randomness and "creativity" of the model's output; the higher the value, the more random the generated content. However, for this project, since most of the exemplar answers are fact-based, a highly random and creative output is not ideal. After adjusting the temperature to 0.1, 0.2, and 1, the BERTScore did not show significant change, suggesting that temperature may not impact the actual quality of the texts in this case. I also fine-tuned the presence penalty, but this did not lead to significant change either. Lastly, I adjusted the top_p parameter. A low top_p value can help generate more focused and deterministic answers, which is favorable for this project. However, lowering this value also did not yield a significant change in the BERTScore.

Another factor I assumed would significantly influence the quality of the generated text was the input text itself. If the input could be more concise, direct, and clear, the quality of the generated output might improve. Therefore, I excluded "total score" and "curriculum codes," which are part of the "rubric," from the model input. However, this did not lead to a significant change. In the future, it might be worthwhile to modify item[0] (the highest score answer) to make it more concise and accurately describe what should be included for answers to score the highest. For example, for question 1, item[0] in the rubric, "I can explain my response" could be rephrased to "I can identify which fin design worked best and explain why I think it worked best." Although this approach may be somewhat time-consuming, it could enhance clarity.

Some creative ways to evaluate out-of-sample or unseen data:

1. slighly modify the input (such as by adding some distracting or less relevant words or contents which does not change the overall meaning of the question) and see if the model can still generate accurate and correct answers.
2. same logic as above, change the question and then see if the model is capable of changing its answer output accordingly.
3. for subject areas that the model has not yet been exposed to/trained on (for example, if the model has been extensively trained on data in biology, but not yet been trained in, say, history), have human experts rate on the ouput accuracy, correctness, relevancy in the subject history. This feedback would guide the model in improving its answers to align with the expected output.

