In [1]:
import json
from openai import OpenAI


# Set up the OpenAI API client used by every cell below.
# OpenAI() picks the API key up from the OPENAI_API_KEY environment variable,
# so no secret has to be written into the notebook.
client = OpenAI()

# Function to get model completion
def get_answer(question, model_name):
    """Send one question to a chat model and return the text of its reply.

    Args:
        question: Text sent as the user message.
        model_name: Model identifier forwarded to the chat completions API.

    Returns:
        The `content` of the first choice in the API response.
    """
    # The system turn asks for a bare answer so replies are easier to grade.
    system_turn = {
        "role": "system",
        "content": "You are a helpful assistant that only provides direct answers without any explanations.",
    }
    user_turn = {"role": "user", "content": question}

    completion = client.chat.completions.create(
        model=model_name,
        messages=[system_turn, user_turn],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Use a second, stronger model (GPT-4 by default) as a judge of model performance
def evaluation(benchmark_answer, generated_answer, judge_model="gpt-4"):
    """Ask a judge model whether a generated answer matches the reference.

    Args:
        benchmark_answer: Reference answer from the test set.
        generated_answer: Answer produced by the model under test.
        judge_model: Chat model used as the judge. Defaults to "gpt-4".

    Returns:
        True when the judge's reply, after normalisation, is the word "true";
        False for any other reply, including an empty or missing one.
    """
    response = client.chat.completions.create(
        model=judge_model,
        messages=[
            {"role": "system", "content": "Decide if the benchmark answer and the generated answer match. Respond with 'True' if they match, otherwise 'False'."},
            {"role": "user", "content": f"Benchmark Answer: {benchmark_answer}\nGenerated Answer: {generated_answer}"}
        ]
    )
    # `content` can be None; treat that as "no verdict" instead of crashing.
    verdict = response.choices[0].message.content or ""
    # The prompt itself quotes the expected words, so the judge may echo the
    # quotes or add a full stop ("'True'", "True."). Strip that wrapping so a
    # correct verdict is not silently counted as a mismatch.
    verdict = verdict.strip(" \t\r\n'\"`.!").lower()
    return verdict == "true"



In [7]:
def benchmark(file_path, generate_model_name):
    """Score a model on a JSON question set and print its accuracy.

    The file is expected to hold a JSON list of objects with "Question" and
    "Answer" keys. Each question is answered with `get_answer`, and the reply
    is compared with the reference answer by `evaluation`.

    Args:
        file_path: Path to the JSON test file.
        generate_model_name: Model identifier passed to `get_answer`.

    Returns:
        None. The accuracy is printed as "Accuracy: xx.xx%".
    """
    # Read as UTF-8 explicitly; without it open() uses the platform's locale
    # encoding, which can fail on non-ASCII characters in the questions.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    correct_count = 0
    total_count = len(data)

    for item in data:
        question = item["Question"]
        benchmark_answer = item["Answer"]
        generated_answer = get_answer(question, generate_model_name)
        if evaluation(benchmark_answer, generated_answer):
            correct_count += 1

    # Guard against an empty test file so it reports 0% instead of dividing by zero.
    accuracy = correct_count / total_count if total_count > 0 else 0
    print(f"Accuracy: {accuracy * 100:.2f}%")


## File Path

In [3]:
# Test-set files passed to benchmark() / display_sample_response() below.
# Each literal is split after its directory so the shared folder is easy to see.
abstract_algebra_path = (
    "Data/Test Data/MMLU,Theorem QA, Linear Algebra QA/"
    "abstract_algebra_qa_test.json"
)
linear_algebra_qa_path = (
    "Data/Test Data/MMLU,Theorem QA, Linear Algebra QA/"
    "linear_algebra_qa.json"
)
linear_algebra_theorem_qa_test_path = (
    "Data/Test Data/MMLU,Theorem QA, Linear Algebra QA/"
    "linear_algebra_theorem_qa_test.json"
)
linear_algebra_data_test_path = (
    "Data/Test Data/MATH Linear Algebra/"
    "linear_algebra_data_test.json"
)

## GPT-3.5-Turbo

In [61]:
# Baseline: untuned gpt-3.5-turbo on the abstract algebra test file.
benchmark(abstract_algebra_path, "gpt-3.5-turbo")

Accuracy: 22.00%


In [138]:
# Baseline: untuned gpt-3.5-turbo on the linear algebra QA file.
benchmark(linear_algebra_qa_path, "gpt-3.5-turbo")

Accuracy: 31.84%


In [91]:
# Baseline: untuned gpt-3.5-turbo on the linear algebra theorem QA test file.
benchmark(linear_algebra_theorem_qa_test_path, "gpt-3.5-turbo")

Accuracy: 9.62%


In [94]:
# Baseline: untuned gpt-3.5-turbo on the MATH linear algebra test file.
benchmark(linear_algebra_data_test_path, "gpt-3.5-turbo")

Accuracy: 8.60%


# Fine-Tuning

## Fine-tuning GPT-3.5-Turbo on two different datasets for comparison

In [None]:
# Upload the abstract algebra fine-tuning file.
# The file is opened in a `with` block so the handle is closed once the
# upload call returns instead of staying open in the kernel.
with open("Data/Finetuning Data/abstract_algebra_finetuning_chat.jsonl", "rb") as training_file:
    abstract_algebra_upload = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )
abstract_algebra_upload  # last expression, so the cell still displays the returned object

In [105]:
# 9000-item combined fine-tuning data for GPT-3.5-turbo and GPT-4o-mini.
# The file is opened in a `with` block so the handle is closed once the
# upload call returns instead of staying open in the kernel.
with open("Data/Finetuning Data/combined_finetuning_data.jsonl", "rb") as training_file:
    combined_upload = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )
combined_upload  # last expression, so the cell still displays the returned object

FileObject(id='file-87NJ25RTZ7X3On8UwBTqMwNm', bytes=3874885, created_at=1726508810, filename='combined_finetuning_data.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [106]:
# Start a fine-tuning job on gpt-3.5-turbo. The training_file id is the one
# shown in the upload output for combined_finetuning_data.jsonl.
client.fine_tuning.jobs.create(
  training_file="file-87NJ25RTZ7X3On8UwBTqMwNm", 
  model="gpt-3.5-turbo"
)

FineTuningJob(id='ftjob-4z4gAcbj2gbjPKdwyqSSSLg4', created_at=1726508851, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-0125', object='fine_tuning.job', organization_id='org-5UJP7VuDIG8zyGaxVePAnKpw', result_files=[], seed=896923282, status='validating_files', trained_tokens=None, training_file='file-87NJ25RTZ7X3On8UwBTqMwNm', validation_file=None, estimated_finish=None, integrations=[], user_provided_suffix=None)

In [119]:
# Print the hyperparameters reported for each fine-tuning job, one line per job.
for job_id, label in (
    ("ftjob-4z4gAcbj2gbjPKdwyqSSSLg4", "for GPT-3.5-turbo Combined Training Data"),
    ("ftjob-9N1bLQqiWzpTnssk2RIwW2nt", "for GPT-3.5-turbo Abstract Algebra Training Data Only"),
):
    job = client.fine_tuning.jobs.retrieve(job_id)
    print(job.hyperparameters, label)

Hyperparameters(n_epochs=2, batch_size=12, learning_rate_multiplier=2) for GPT-3.5-turbo Combined Training Data
Hyperparameters(n_epochs=3, batch_size=6, learning_rate_multiplier=2) for GPT-3.5-turbo Abstract Algebra Training Data Only


### Abstract Algebra

In [120]:
# Fine-tuned checkpoint ...A74brutO on the abstract algebra test file.
# NOTE(review): the notebook does not record which training set produced this
# checkpoint (combined vs. abstract-algebra-only) — confirm and label it.
benchmark(abstract_algebra_path, "ft:gpt-3.5-turbo-0125:personal::A74brutO")

Accuracy: 23.00%


In [123]:
# Fine-tuned checkpoint ...A8AuHZhq on the abstract algebra test file.
# NOTE(review): the notebook does not record which training set produced this
# checkpoint (combined vs. abstract-algebra-only) — confirm and label it.
benchmark(abstract_algebra_path, "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Accuracy: 26.00%


### Linear Algebra QA

In [131]:
# Fine-tuned checkpoint ...A74brutO on the linear algebra QA file.
benchmark(linear_algebra_qa_path, "ft:gpt-3.5-turbo-0125:personal::A74brutO")

Accuracy: 34.98%


In [8]:
# Fine-tuned checkpoint ...A8AuHZhq on the linear algebra QA file.
benchmark(linear_algebra_qa_path, "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Accuracy: 24.22%


### Linear Algebra Theorem QA

In [129]:
# Fine-tuned checkpoint ...A74brutO on the linear algebra theorem QA test file.
benchmark(linear_algebra_theorem_qa_test_path, "ft:gpt-3.5-turbo-0125:personal::A74brutO")

Accuracy: 17.31%


In [133]:
# Fine-tuned checkpoint ...A8AuHZhq on the linear algebra theorem QA test file.
benchmark(linear_algebra_theorem_qa_test_path, "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Accuracy: 25.00%


### MATH QA

In [101]:
# Fine-tuned checkpoint ...A74brutO on the MATH linear algebra test file.
benchmark(linear_algebra_data_test_path, "ft:gpt-3.5-turbo-0125:personal::A74brutO")

Accuracy: 8.90%


In [137]:
# Fine-tuned checkpoint ...A8AuHZhq on the MATH linear algebra test file.
benchmark(linear_algebra_data_test_path, "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Accuracy: 10.65%


## Output Comparison: Base vs. Fine-Tuned GPT-3.5-Turbo

In [None]:
def display_sample_response(file_path, number, original_model_name, finetuned_model_name):
    """Print one test question with the answers of two models side by side.

    Args:
        file_path: Path to a JSON test file holding a list of objects with
            "Question" and "Answer" keys.
        number: Index of the item to show. Negative indices count from the
            end, as with normal list indexing.
        original_model_name: Model identifier for the baseline model.
        finetuned_model_name: Model identifier for the fine-tuned model.

    Returns:
        None. Prints the question, the expected answer and both model answers,
        or "No data available." when the file holds no items.

    Raises:
        IndexError: If `number` is not a valid index into the loaded items.
    """
    # Read as UTF-8 explicitly; without it open() uses the platform's locale
    # encoding, which can fail on non-ASCII characters in the questions.
    with open(file_path, encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        print("No data available.")
        return

    # Fail with a message that names the index and the file before any model
    # is called, instead of a bare "list index out of range".
    if not -len(data) <= number < len(data):
        raise IndexError(
            f"Sample index {number} is out of range for {len(data)} items in {file_path}"
        )

    sample = data[number]
    question = sample["Question"]
    benchmark_answer = sample["Answer"]

    # Get answers from the original and fine-tuned models
    original_answer = get_answer(question, original_model_name)
    finetuned_answer = get_answer(question, finetuned_model_name)

    # Display the question and the expected benchmark answer
    print("Question:", question)
    print("Expected Answer:", benchmark_answer)

    # Display the answers from both models
    print("\nOriginal Model Answer:", original_answer)
    print("Fine-tuned Model Answer:", finetuned_answer)

# Compare base and fine-tuned answers on item 44 of the abstract algebra test file.
# Uses the shared path constant instead of repeating the literal path.
display_sample_response(abstract_algebra_path, 44, "gpt-3.5-turbo", "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Question: Let A and B be sets, f: A -> B and g: B -> A be functions such that for all a \in A, g(f(a)) = a. Statement 1 | The function f must necessarily be injective. Statement 2 | The function f must necessarily be surjective.
Expected Answer: True, False

Original Model Answer: Statement 1: True
Statement 2: False
Fine-tuned Model Answer: Statement 1: False. The function f(a) can still be non-injective. Consider f(a) = f(b) for some a, b \in A, then a = g(f(a)) = g(f(b)) = b. It means g is injective. Statement 2: True. The function g is surjective because it is defined from B \to A, and for all a \in A, g(f(a)) = a.



In [None]:
# Compare base and fine-tuned answers on item 24 of the linear algebra QA file.
# Uses the shared path constant instead of repeating the literal path.
display_sample_response(linear_algebra_qa_path, 24, "gpt-3.5-turbo", "ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Question: If v1, v2, and v3 are linearly independent, can you find a linear combination of these vectors that has a magnitude of 3? How do you know?
Expected Answer: Yes, you can find a linear combination of v1, v2, and v3 that has a magnitude of 3.

To find a linear combination of v1, v2, and v3 that has a magnitude of 3, you can use the fact that if v1, v2, and v3 are linearly independent, then any linear combination of these vectors can be written in the form:

a1v1 + a2v2 + a3v3 = 0

where a1, a2, and a3 are constants.

Since v1, v2, and v3 are linearly independent, we know that none of these vectors are multiples of the others. Therefore, the only non-trivial linear combination of v1, v2, and v3 is:

v1 + v2 + v3 = 0

This linear combination has a magnitude of 3, since the magnitude of the vector is equal to the product of its components.

Therefore, the answer to the question is yes, you can find a linear combination of v1, v2, and v3 that has a magnitude of 3.

Original Model An

In [None]:
# Compare base and fine-tuned answers on item 19 of the linear algebra theorem QA test file.
display_sample_response(linear_algebra_theorem_qa_test_path, 19,"gpt-3.5-turbo","ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Question: Is 7 a quadratic residue modulo 19? Use Gauss's Lemma to answer it.
Expected Answer: True

Original Model Answer: Yes, 7 is a quadratic residue modulo 19.
Fine-tuned Model Answer: \[7 = -1\mod 19\] \\ \[7^{(19-1)/2} = (-1)^{9} = -1\mod 19\] \\ \[7\] is a quadratic nonresidue modulo 19.



In [None]:
# Compare base and fine-tuned answers on item 1 of the MATH linear algebra test file.
display_sample_response(linear_algebra_data_test_path, 1,"gpt-3.5-turbo","ft:gpt-3.5-turbo-0125:personal::A8AuHZhq")

Question:  Find the determinant of the matrix $\left( \begin{array}{cc} -4 & -2 \\ -1 & -4 \\ \end{array} \right)$. 
Expected Answer: $14$

Original Model Answer: The determinant of the matrix is 14.
Fine-tuned Model Answer: The determinant of the matrix $\begin{pmatrix} a & b \\ c & d \end{pmatrix}$ is calculated as ad - bc. In this case, the determinant is (-4)(-4) - (-2)(-1) = 16 - 2 = 14.

