In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import openai
import pandas as pd
from transformers import StoppingCriteria, StoppingCriteriaList
from tqdm import tqdm

import sys
sys.path.append('../src')
from src.utils import DataCollatorForCompletionOnlyLM, GoalSolutionDataset, END_KEY, INSTRUCTION_KEY, RESPONSE_KEY_NL
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load models
tokenizer1 = AutoTokenizer.from_pretrained("EleutherAI/pythia-1b", padding_side = "left", max_length=1024)
model1 = AutoModelForCausalLM.from_pretrained("EleutherAI/pythia-1b")
model1 = model1.to(device)

tokenizer2 = AutoTokenizer.from_pretrained("./models/pythia-1b-fine-tuned/", padding_side = "left", max_length=1024)
model2 = AutoModelForCausalLM.from_pretrained("./models/pythia-1b-fine-tuned/checkpoint-3600")
model2 = model2.to(device)

# Initialize OpenAI API for text-davinci-003
openai.api_key = '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' # Invalid, Add your own key

valid_file_path = "./data/data_validation.txt"
valid_dataset = GoalSolutionDataset(tokenizer2, file_path=valid_file_path, max_length=1024)
valid_dataset_unformatted = GoalSolutionDataset(tokenizer1, file_path=valid_file_path, max_length=1024, format=False, padding=None)

In [2]:
print(tokenizer1.encode("### Response:\n"))
print(tokenizer2.encode("### Response:\n"))

[4118, 19371, 27, 187]
[50279]


In [3]:
print(tokenizer2.encode("### EndSolution"))

[50277, 37533]


In [4]:
class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [50277]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

In [6]:
# Save predictions
questions = []
predictions1 = []
predictions2 = []
for i, (data_format, data_unformatted) in enumerate(tqdm(zip(valid_dataset, valid_dataset_unformatted), total=len(valid_dataset))):
    input_ids_formatted = []
    input_ids_unformatted = []
    response_key_id = tokenizer2.encode("### Response:\n")[0]
    solution_id = tokenizer1.encode("Solution:")[0]

    for j in range(0, len(data_unformatted["input_ids"])):
        current_token = data_unformatted["input_ids"][j]
        if current_token == 0:
            continue
        if current_token == solution_id:
            input_ids_unformatted.append(current_token)
            input_ids_unformatted.append(data_unformatted["input_ids"][j+1])
            break
        else:
            input_ids_unformatted.append(data_unformatted["input_ids"][j])

    for j in data_format["input_ids"]:
        if j == 0:
            continue
        if j == response_key_id:
            input_ids_formatted.append(j)
            break
        else:
            input_ids_formatted.append(j)

    data_decoded = tokenizer1.decode(input_ids_unformatted)
    """print("Unformatted Question:")
    print("--------------------")
    print(data_decoded)
    print("--------------------")
    print()"""

    data_decoded = tokenizer2.decode(input_ids_formatted)
    """print("Formatted Question:")
    print("--------------------")
    print(data_decoded)
    print("--------------------")
    print()"""

    output_ids = model1.generate(torch.LongTensor(input_ids_unformatted).to(device).unsqueeze(0), max_length=128)
    prediction1 = tokenizer2.decode(output_ids[0], skip_special_tokens=False)
    predictions1.append(prediction1)
    """print("Response 1:")
    print("--------------------")
    print(prediction1)
    print("--------------------")"""

    output_ids = model2.generate(torch.LongTensor(input_ids_formatted).to(device).unsqueeze(0), max_length=128, stopping_criteria=StoppingCriteriaList([StopOnTokens()]))
    prediction2 = tokenizer2.decode(output_ids[0], skip_special_tokens=False)
    predictions2.append(prediction2)
    """print("Response 2:")
    print("--------------------")
    print(prediction2)
    print("--------------------")"""

    questions.append(data_decoded)

    if i == 200:
        break

df = pd.DataFrame({"Question": questions, "Response 1 (No Fine-Tuning)": predictions1, "Response 2 (Fine-Tune)": predictions2})
df.to_csv("./data/model_evaluation.csv", index=False)

  0%|          | 0/1838 [00:00<?, ?it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
  0%|          | 1/1838 [00:06<3:16:03,  6.40s/it]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable

# Problems:
- Some solutions are too long for training (max tokens reached)
- Questions need to be in right format and need to be tokanized the same way
- Foundation models are better overall
- Smaller model doesn't know when to stop

In [61]:
import re
df = pd.read_csv("./data/model_evaluation.csv")

In [62]:
print(df.iloc[0]["Question"])
df['question_cleaned'] = df['Question'].apply(lambda x: re.search('(Goal:.*?)(?=### Response:)', x, re.DOTALL).group(1) if re.search('Goal:(.*?)(?=### Response:)', x, re.DOTALL) else '')
# Remove trailing white spaces and new lines
df['question_cleaned'] = df['question_cleaned'].apply(lambda x: x.rstrip())


Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Goal: How do I ready a guinea pig cage for it's new occupants?

### Response:



In [63]:
df['response1_cleaned'] = df['Response 1 (No Fine-Tuning)'].apply(lambda x: re.search('Solution:(.*)', x, re.DOTALL).group(1) if re.search('Solution:(.*)', x, re.DOTALL) else '')
# Remove trailing white spaces and new lines
df['response1_cleaned'] = df['response1_cleaned'].apply(lambda x: x.rstrip())

df['response2_cleaned'] = df['Response 2 (Fine-Tune)'].apply(
    lambda x: re.search('Solution:(.*)(?=### End)', x, re.DOTALL).group(1).rstrip() if "### End" in x 
              else re.search('Solution:(.*)', x, re.DOTALL).group(1).rstrip() if "Solution:" in x
              else ''
)
# Remove trailing white spaces and new lines
df['response2_cleaned'] = df['response2_cleaned'].apply(lambda x: x.rstrip())

In [64]:
print(df.iloc[0]["Response 1 (No Fine-Tuning)"])

Goal: How do I ready a guinea pig cage for it's new occupants?
Solution:

1. Remove the cage from the wall.
2. Remove the cage from the wall.
3. Remove the cage from the wall.
4. Remove the cage from the wall.
5. Remove the cage from the wall.
6. Remove the cage from the wall.
7. Remove the cage from the wall.
8. Remove the cage from the wall.
9. Remove the cage from the wall.
10. Remove the cage from the wall.
11. Remove the cage from


In [65]:
print(df.iloc[0]["Response 2 (Fine-Tune)"])

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Goal: How do I ready a guinea pig cage for it's new occupants?

### Response:
Solution: Take the guinea pig cage and place it in a large container.

### End


In [70]:
df.iloc[0]

Question                       Below is an instruction that describes a task....
Response 1 (No Fine-Tuning)    Goal: How do I ready a guinea pig cage for it'...
Response 2 (Fine-Tune)         Below is an instruction that describes a task....
question_cleaned               Goal: How do I ready a guinea pig cage for it'...
response1_cleaned              \n\n1. Remove the cage from the wall.\n2. Remo...
response2_cleaned               Take the guinea pig cage and place it in a la...
Name: 0, dtype: object

In [84]:
# Compare predictions
judge_response = []
for index, row in tqdm(df.iterrows(), total=len(df)):
    prompt = "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."
    prompt = [
        {"role": "system", "content": "You are a helpful and precise assistant for checking the quality of the answer."},
        {"role": "user", "content": f"[Question]\n{row['question_cleaned']}\n\n[The Start of Assistant 1's Answer]\n{row['response1_cleaned']}\n\n[The End of Assistant 1's Answer]\n\n[The Start of Assistant 2's Answer]\n{row['response2_cleaned']}\n\n[The End of Assistant 2's Answer]\n\n[System]\n{prompt}\n\n"}
    ]
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=prompt,
        max_tokens=100,
        temperature=0.7,
    )
    judge_response.append(response)  # Assuming the first score is for the first model

 92%|█████████▏| 185/201 [12:04<01:02,  3.92s/it]


ServiceUnavailableError: The server is overloaded or not ready yet.

In [100]:
df.loc["judge_response"] = None

for i, data in enumerate(judge_response):
    df.loc[i, "judge_response"] = data["choices"][0]["message"]["content"]

In [102]:
df.iloc[0]["judge_response"]

'2 3\n\nAssistant 1 received a score of 2 for its unhelpful, irrelevant, and inaccurate response that did not provide any relevant information on how to ready a guinea pig cage for new occupants.\nAssistant 2 received a score of 3 for its partially relevant response, but it lacked necessary details to be helpful and accurate. It only provided one step that may not be sufficient in readying a guinea pig cage for new occupants. \n\nOverall, both assistants failed to provide'

In [111]:
# Create 'score1' and 'score2' columns by extracting scores from 'response'
df[['score1', 'score2']] = df['judge_response'].str.extract('(\d)\s(\d)', expand=True)

# Convert scores from string type to numeric
df[['score1', 'score2']] = df[['score1', 'score2']].apply(pd.to_numeric)

In [4]:
# Count how many times 'score1' is greater than 'score2'
score1_greater_count = (df['score1'] > df['score2']).sum()

# Count how many times 'score2' is greater than 'score1'
score2_greater_count = (df['score2'] > df['score1']).sum()

# Calculate the average of 'score1' and 'score2', ignoring None values
average_score1 = df['score1'].mean()
average_score2 = df['score2'].mean()

print(f"'score1' is greater than 'score2' {score1_greater_count} times.")
print(f"'score2' is greater than 'score1' {score2_greater_count} times.")
print(f"The average score for 'score1' is {average_score1}.")
print(f"The average score for 'score2' is {average_score2}.")

'score1' is greater than 'score2' 128 times.
'score2' is greater than 'score1' 57 times.
The average score for 'score1' is 6.762162162162162.
The average score for 'score2' is 5.037837837837838.


In [114]:
df.to_csv("./data/model_evaluation.csv", index=False)

In [3]:
df = pd.read_csv("./data/model_evaluation.csv")

In [13]:
# get example of max and min scores of each model
# get example of average score of each model
# get 10 random examples of each model
# analyze judge response

sampled_max_scores_from_non_fine_tuned_model = df[df["score1"] == df['score1'].max()].sample(3, random_state=42)

print("Question cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["question_cleaned"].values)
print("--------------------")
print("Response Non Fine Tuned cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["response1_cleaned"].values)
print("--------------------")
print("Response Fine Tuned cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["response2_cleaned"].values)
print("--------------------")
print("Judge response:")
print(sampled_max_scores_from_non_fine_tuned_model["judge_response"].values)


Question cleaned:
['Goal: How to sleep in proper posture?'
 'Goal: How to clean blinds without tearing them up' 'Goal: fire']
--------------------
Response Non Fine Tuned cleaned:
["\n\nA:\n\nThe answer is that you can't.\nThe reason is that the body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing.\nThe body is a living thing, and it is constantly changing"
 '\n\n1.  Use a vacuum cleaner to clean the blinds.\n2.  Use a vacuum cleaner to clean the blinds.\n3.  Use a vacuum cleaner to clean the blinds.\n4.  Use a vacuum cleaner to clean the blinds.\n5.  Use a vacuum cleaner to clean the blinds.\n6.  Use a vacuum cleaner to clean the blinds.\n7.  Use a vacuum cleaner to clean the blinds.\n8.  

In [14]:
print(df['score2'].max())

sampled_max_scores_from_non_fine_tuned_model = df[df["score2"] == df['score2'].max()].sample(3, random_state=42)

print("Question cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["question_cleaned"].values)
print("--------------------")
print("Response Non Fine Tuned cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["response1_cleaned"].values)
print("--------------------")
print("Response Fine Tuned cleaned:")
print(sampled_max_scores_from_non_fine_tuned_model["response2_cleaned"].values)
print("--------------------")
print("Judge response:")
print(sampled_max_scores_from_non_fine_tuned_model["judge_response"].values)


9.0
Question cleaned:
['Goal: To allow the oil for the turkey fryer to reach the correct cooking temp'
 'Goal: how to make smoky pretzel mix' 'Goal: How to order room service']
--------------------
Response Non Fine Tuned cleaned:
[' The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the correct temperature. The oil needs to be at the'
 '\n\n1.  Add the pretzel mix to the water and let it sit for about 5 minutes.\n2.  Add the water to the smoky pretzel mix and let it sit for about 5 minutes.\n3.  Add the water to the smoky pretzel mix and let it sit for about 5 minutes.\n4.  Add the water 