In [9]:
from google.cloud import documentai
from google.oauth2 import service_account
from google.auth import load_credentials_from_file
from google.cloud import aiplatform
import vertexai
from vertexai.language_models import TextGenerationModel
from vertexai.generative_models import GenerativeModel, SafetySetting, Part
import json
import os
import openai
import timeit
from scipy.spatial.distance import cosine
import time
import pandas as pd
import numpy as np

In [3]:
# Pull the OpenAI API key from the environment so no secret is stored in the notebook.
openai.api_key = os.environ.get('OPENAI_API_KEY')

In [4]:
# Load the evaluation examples; later cells read each item's 'input' and 'output' keys.
# Decode explicitly as UTF-8 so parsing does not depend on the platform's default encoding.
with open('./Test Data/test_data.json', "r", encoding="utf-8") as json_file:
    test_data = json.load(json_file)

In [5]:
# Generation settings collected in one dict.
generation_config = dict(
    max_output_tokens=1024,
    temperature=0.2,
    top_p=1,
)

# One SafetySetting per harm category, each with its block threshold set to OFF.
safety_settings = [
    SafetySetting(category=harm_category, threshold=SafetySetting.HarmBlockThreshold.OFF)
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]


In [72]:
# Load service-account credentials from the local key file.
# The loader also returns a project id, but the init call below uses a hard-coded project.
credentials, project_id = load_credentials_from_file(
    "./GSuite Text Extraction Creds/vertex_ai_key.json"
)
vertexai.init(
    project="90458358443",
    location="us-central1",
    credentials=credentials,
)


In [97]:
# Build the generative model handle from a full endpoint resource path and
# attach the tutoring system instruction.
model = GenerativeModel(
    "projects/90458358443/locations/us-central1/endpoints/352768410167279616",
    system_instruction=[
        "You are a helpful tutor for the class - Applied Large Language Models and Natural Language Processing"
    ],
)

In [98]:
# Send a single prompt to the model; the returned response object is the cell's output.
# NOTE(review): generation_config and safety_settings defined earlier are not passed here — confirm whether that is intentional.
model.generate_content("What is a RAG system")

candidates {
  content {
    role: "model"
    parts {
      text: "RAG system stands for Retrieval Augmented Generation, which is a system that combines"
    }
  }
  finish_reason: STOP
  avg_logprobs: -1.0132228306361608
}
usage_metadata {
  prompt_token_count: 22
  candidates_token_count: 14
  total_token_count: 36
}

<h1>Creating Embedding Similarity Mechanism</h1>

In [5]:
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding for ``text`` produced by the given OpenAI embedding model.

    Args:
        text: Input passed to the embeddings endpoint.
        model: Name of the embedding model to request.

    Returns:
        The 'embedding' field of the first item in the response's 'data' list.
    """
    embedding_response = openai.Embedding.create(model=model, input=text)
    first_item = embedding_response['data'][0]
    return first_item['embedding']

In [6]:
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two vectors.

    scipy's ``cosine`` yields the cosine *distance*, so the similarity is
    obtained by subtracting that distance from 1.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance


In [7]:
# One-off check that the fine-tuned chat model answers a generic prompt.
all_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Write a story about a magic backpack."},
]
response = openai.ChatCompletion.create(
    model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
    messages=all_messages,
    max_tokens=1500,
)

In [8]:
# Show the text of the first choice returned by the chat completion.
response["choices"][0]["message"]["content"]

'Once upon a time, in a small village at the edge of the Whispering Woods, lived a curious girl named Lila. Lila loved exploring the woods, gathering flowers, and chasing the elusive butterflies. One sunny afternoon, as she rummaged through the attic, she stumbled upon an old, dusty backpack. Intrigued, she decided to take it on her adventures. To her surprise, the backpack was magic! It could change size, weight, and even tell her the best items to put inside for her journey. Lila and her magical backpack went on many adventures, helping the villagers by gathering herbs, rescuing lost animals, and even befriending a dragon! The backpack taught her the value of teamwork, kindness, and sharing her magical gifts with others. They became the heroes of the village, reminding everyone that true magic lies in friendship, love, and helping one another.'

<h1>Testing Fine Tuned Model with Embedding Similarity</h1>

In [10]:
# Evaluate semantic similarity of the fine-tuned model's answers against the
# expected answers in test_data, one chat completion per example.
similarities = []
start = timeit.default_timer()
for i, example in enumerate(test_data):
    try:
        input_text = example['input']
        expected_output = example['output']
        # Generate model response
        all_messages = [
            {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
            {"role": "user", "content": f"Please answer this question: {input_text}"},
        ]
        response = openai.ChatCompletion.create(
            model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
            max_tokens=1500,
            messages=all_messages,
        )
        model_response_text = response['choices'][0]['message']['content']
        # Generate embeddings for expected and actual responses
        expected_embedding = get_embedding(expected_output)
        model_response_embedding = get_embedding(model_response_text)

        # Calculate similarity
        similarity = cosine_similarity(expected_embedding, model_response_embedding)
        similarities.append({'question': input_text, 'expected_output': expected_output, 'model_output': model_response_text, 'similarities': similarity})
    except Exception as e:
        # Best-effort run: report the failing example index and keep going.
        print(i, e)
    # Every fifth example, report the time since the last report and the
    # cumulative number of examples that produced no result.
    if i % 5 == 0 and i != 0:
        end = timeit.default_timer()
        print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
        start = timeit.default_timer()



5 - Time Spent: 14.322815584018826, Number of Errors: 0
10 - Time Spent: 10.151080583687872, Number of Errors: 0
15 - Time Spent: 15.418949166778475, Number of Errors: 0
20 - Time Spent: 11.93670737510547, Number of Errors: 0
25 - Time Spent: 9.323808333836496, Number of Errors: 0
30 - Time Spent: 8.024612166918814, Number of Errors: 0
35 - Time Spent: 9.83788524987176, Number of Errors: 0
40 - Time Spent: 11.682730374857783, Number of Errors: 0
45 - Time Spent: 7.758521792013198, Number of Errors: 0
50 - Time Spent: 11.4787029158324, Number of Errors: 0
55 - Time Spent: 7.5103811249136925, Number of Errors: 0
60 - Time Spent: 7.501676625106484, Number of Errors: 0
65 - Time Spent: 9.568124208133668, Number of Errors: 0
70 - Time Spent: 7.675871666986495, Number of Errors: 0
75 - Time Spent: 9.979165957774967, Number of Errors: 0
80 - Time Spent: 7.1760711669921875, Number of Errors: 0
85 - Time Spent: 8.06872524973005, Number of Errors: 0
90 - Time Spent: 8.022654834203422, Number of 

In [11]:
# Turn the per-example result dicts into a table (one row per scored example).
open_ai_fine_tuned_tested_data = pd.DataFrame(data=similarities)

In [12]:
# Mean cosine similarity over all scored examples for the fine-tuned model.
open_ai_fine_tuned_tested_data["similarities"].mean()

np.float64(0.7569534683827671)

In [13]:
# Standard deviation of the fine-tuned model's similarity scores.
open_ai_fine_tuned_tested_data["similarities"].std()

np.float64(0.15092024521740652)

In [17]:
# Persist the fine-tuned model's evaluation results to disk as a pickle.
# Create the output directory first so the save cannot fail on a missing folder.
os.makedirs('./Non RAG Evaluation', exist_ok=True)
open_ai_fine_tuned_tested_data.to_pickle('./Non RAG Evaluation/open_ai_fine_tuned_tested_data.pkl')

<h1>Testing Baseline Model with Embedding Similarity</h1>

In [14]:
# Evaluate semantic similarity of the baseline (non-fine-tuned) model's answers
# against the expected answers in test_data, one chat completion per example.
similarities = []
start = timeit.default_timer()
for i, example in enumerate(test_data):
    try:
        input_text = example['input']
        expected_output = example['output']
        # Generate model response
        all_messages = [
            {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems"},
            {"role": "user", "content": f"Please answer this question: {input_text}"},
        ]
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            max_tokens=1500,
            messages=all_messages,
        )
        model_response_text = response['choices'][0]['message']['content']
        # Generate embeddings for expected and actual responses
        expected_embedding = get_embedding(expected_output)
        model_response_embedding = get_embedding(model_response_text)

        # Calculate similarity
        similarity = cosine_similarity(expected_embedding, model_response_embedding)
        similarities.append({'question': input_text, 'expected_output': expected_output, 'model_output': model_response_text, 'similarities': similarity})
    except Exception as e:
        # Best-effort run: report the failing example index and keep going.
        print(i, e)
    # Every fifth example, report the time since the last report and the
    # cumulative number of examples that produced no result.
    if i % 5 == 0 and i != 0:
        end = timeit.default_timer()
        print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
        start = timeit.default_timer()



5 - Time Spent: 42.45609070919454, Number of Errors: 0
10 - Time Spent: 44.79717975016683, Number of Errors: 0
15 - Time Spent: 30.559102875180542, Number of Errors: 0
20 - Time Spent: 26.059100082609802, Number of Errors: 0
25 - Time Spent: 29.029177541844547, Number of Errors: 0
30 - Time Spent: 16.263122707605362, Number of Errors: 0
35 - Time Spent: 25.088259124662727, Number of Errors: 0
40 - Time Spent: 21.380300583783537, Number of Errors: 0
45 - Time Spent: 24.800243750214577, Number of Errors: 0
50 - Time Spent: 18.510034458246082, Number of Errors: 0
55 - Time Spent: 21.16916516702622, Number of Errors: 0
60 - Time Spent: 28.015404541045427, Number of Errors: 0
65 - Time Spent: 33.24137149984017, Number of Errors: 0
70 - Time Spent: 44.27602424984798, Number of Errors: 0
75 - Time Spent: 30.879963375162333, Number of Errors: 0
80 - Time Spent: 19.750233832746744, Number of Errors: 0
85 - Time Spent: 36.65350795770064, Number of Errors: 0
90 - Time Spent: 28.035486791748554, N

In [15]:
# Turn the baseline run's per-example result dicts into a table.
open_ai_base_line_tested_data = pd.DataFrame(data=similarities)

In [16]:
# Persist the baseline model's evaluation results to disk as a pickle.
# Create the output directory first so the save cannot fail on a missing folder.
os.makedirs('./Non RAG Evaluation', exist_ok=True)
open_ai_base_line_tested_data.to_pickle('./Non RAG Evaluation/open_ai_base_line_tested_data.pkl')

<h1>Testing Fine Tuned Model with CoT Prompting with Embedding Similarity</h1>

In [18]:
# Evaluate semantic similarity of the fine-tuned model's answers when prompted
# to reason step by step, against the expected answers in test_data.
similarities = []
start = timeit.default_timer()
for i, example in enumerate(test_data):
    try:
        input_text = example['input']
        expected_output = example['output']
        # Generate model response; the second user message repeats the
        # step-by-step instruction after the question.
        all_messages = [
            {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems."},
            {"role": "user", "content": f"Please answer this question step by step and give me the final answer: {input_text}"},
            {"role": "user", "content": "Remember to break down the question step by step before giving me the final answer"},
        ]
        response = openai.ChatCompletion.create(
            model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
            max_tokens=1500,
            messages=all_messages,
        )
        model_response_text = response['choices'][0]['message']['content']
        # Generate embeddings for expected and actual responses
        expected_embedding = get_embedding(expected_output)
        model_response_embedding = get_embedding(model_response_text)

        # Calculate similarity
        similarity = cosine_similarity(expected_embedding, model_response_embedding)
        similarities.append({'question': input_text, 'expected_output': expected_output, 'model_output': model_response_text, 'similarities': similarity})
    except Exception as e:
        # Best-effort run: report the failing example index and keep going.
        print(i, e)
    # Every fifth example, report the time since the last report and the
    # cumulative number of examples that produced no result.
    if i % 5 == 0 and i != 0:
        end = timeit.default_timer()
        print(f"{i} - Time Spent: {end-start}, Number of Errors: {i + 1 - len(similarities)}")
        start = timeit.default_timer()



In [19]:
# Turn the chain-of-thought run's per-example result dicts into a table.
open_ai_CoT_prompting = pd.DataFrame(data=similarities)

In [None]:
# Persist the chain-of-thought evaluation results to disk as a pickle.
# Create the output directory first so the save cannot fail on a missing folder.
os.makedirs('./Non RAG Evaluation', exist_ok=True)
open_ai_CoT_prompting.to_pickle('./Non RAG Evaluation/open_ai_CoT_prompting.pkl')

<h1>Testing Fine Tuned Model with Self Consistency Prompting with Embedding Similarity</h1>

In [None]:
# Self-consistency evaluation: sample several answers per question, ask the
# model to pick the best one, and score that pick against the expected answer.
similarities = []
start = timeit.default_timer()
num_samples = 5  # Number of responses to generate per example
max_response_tokens = 1500  # Token cap shared by the candidate and judge calls

for i, example in enumerate(test_data):
    try:
        input_text = example['input']
        expected_output = example['output']

        # Collect multiple responses
        self_consistency_responses = []
        # NOTE(review): the judge prompt includes the expected answer, so the
        # selected response is chosen with knowledge of the ground truth —
        # confirm this is intended before comparing against the other runs.
        consistency_prompt = f"The question is: {input_text}\n\nThe expected answer is: {expected_output}\n\nHere are the generated responses:"
        for j in range(num_samples):
            response = openai.ChatCompletion.create(
                model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
                max_tokens=max_response_tokens,
                messages=[
                    {"role": "system", "content": "You are a helpful tutor who answers questions about a class called Introduction to Deep Learning and LLM based Generative AI Systems."},
                    {"role": "user", "content": f"Please answer this question: {input_text}"}
                ]
            )
            candidate_text = response['choices'][0]['message']['content']
            self_consistency_responses.append(candidate_text)
            consistency_prompt += f"\n {j}. {candidate_text}"
        consistency_prompt += "\nWhich response best matches the expected answer? Please answer text of the best response"
        evaluation_prompt = [
            {"role": "system", "content": "You are an evaluator for answers to a question about a class called Introduction to Deep Learning and LLM-based Generative AI Systems. Your task is to pick the best response to a question."},
            {"role": "user", "content": consistency_prompt}
        ]
        response = openai.ChatCompletion.create(
            model="ft:gpt-4o-mini-2024-07-18:f-prime-capital::AbZYSjIT",
            # The judge is asked to return the full text of the best response,
            # so it needs the same token budget as the candidates; a cap of 50
            # would truncate the answer that gets scored.
            max_tokens=max_response_tokens,
            messages=evaluation_prompt
        )
        aggregated_response = response['choices'][0]['message']['content']

        # Generate embeddings for expected and aggregated responses
        expected_embedding = get_embedding(expected_output)
        aggregated_embedding = get_embedding(aggregated_response)

        # Calculate similarity
        similarity = cosine_similarity(expected_embedding, aggregated_embedding)

        # Store results
        # NOTE: this run stores the score under 'similarity', whereas the
        # earlier evaluation cells use the key 'similarities'.
        similarities.append({
            'question': input_text,
            'expected_output': expected_output,
            'aggregated_output': aggregated_response,
            'individual_responses': self_consistency_responses,
            'similarity': similarity
        })
    except Exception as e:
        # Best-effort run: report the failing example and keep going.
        print(f"Error at example {i}: {e}")

    # Every fifth example, report the time since the last report and the
    # cumulative number of examples that produced no result.
    if (i % 5 == 0) and (i != 0):
        end = timeit.default_timer()
        print(f"{i} - Time Spent: {end - start}, Number of Errors: {i + 1 - len(similarities)}")
        start = timeit.default_timer()


In [None]:
# Turn the self-consistency run's per-example result dicts into a table.
open_ai_self_consistency_prompting = pd.DataFrame(data=similarities)

In [None]:
# Persist the self-consistency evaluation results to disk as a pickle.
# Create the output directory first so the save cannot fail on a missing folder.
os.makedirs('./Non RAG Evaluation', exist_ok=True)
open_ai_self_consistency_prompting.to_pickle('./Non RAG Evaluation/open_ai_self_consistency_prompting.pkl')