# Trying to create deterministic texts by calling the OpenAI API

The idea is based on the following code: https://github.com/TonySimonovsky/prompt_engineering_experiments/blob/main/experiments/DeterministicResultsOpenAI/Deterministic%20Results%20in%20OpenAI%20(report).ipynb

## Get API key(s)


Normally you do not need to provide your Hugging Face key:

In [1]:
import os
import sys

# 'src' is reached by going one level UP from the working directory and then
# into the 'src' folder (i.e. '../src')
path_to_src = os.path.join(os.pardir, 'src')

# Register that folder on the module search path so its modules can be imported
sys.path.append(path_to_src)

# Now you can import your API_key module
import API_key as key

## Create deterministic data

Define the number of runs:

In [2]:
# Number of iterations of the collection loop; every iteration calls both
# request functions once with the same prompt
num_runs = 30

In [3]:
from openai import OpenAI


# Create the API client; the key is read from the local API_key module
client = OpenAI(api_key=key.openAI_key)

# Chat-completion request that aims for determinism via temperature=0 and a fixed seed
def get_response(prompt, model="gpt-3.5-turbo"):
    """Request a single chat completion with temperature 0 and a fixed seed.

    Args:
        prompt: Text sent as the only user message of the conversation.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,  # use the caller's model instead of a hard-coded name
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # Set to 0 for deterministic results
        seed=555,  # beta feature intended to make repeated requests reproducible
        max_tokens=50,  # cap the completion at 50 tokens
    )
    return response.choices[0].message.content


# Chat-completion request that aims for determinism via top_p=0 and a fixed seed
def get_response2(prompt, model="gpt-3.5-turbo"):
    """Request a single chat completion with top_p 0 and a fixed seed.

    Unlike ``get_response`` this variant leaves the temperature at the API
    default and restricts sampling through ``top_p`` instead.

    Args:
        prompt: Text sent as the only user message of the conversation.
        model: Name of the chat model to query.

    Returns:
        The message content of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,  # use the caller's model instead of a hard-coded name
        messages=[{"role": "user", "content": prompt}],
        top_p=0,  # Set to 0 for deterministic results
        seed=555,  # beta feature intended to make repeated requests reproducible
        max_tokens=50,  # cap the completion at 50 tokens
    )
    return response.choices[0].message.content

# The one prompt that is sent unchanged on every run
prompt = "Highlight application cases for Large Language Models."

# Gather the answers of both request variants; per run the temperature-based
# call is issued first, then the top_p-based call
responses, responses2 = [], []
for _ in range(num_runs):
    response, response2 = get_response(prompt), get_response2(prompt)
    responses.append(response)
    responses2.append(response2)

In [4]:
import pandas as pd
# Wrap each list of responses in its own DataFrame
responses_df = pd.DataFrame(responses)
responses_df2 = pd.DataFrame(responses2)

# Write every DataFrame to its Excel workbook, leaving out the index column
for frame, workbook in (
    (responses_df, "deterministicResponses.xlsx"),
    (responses_df2, "deterministicResponses2.xlsx"),
):
    frame.to_excel(workbook, index=False)

In [5]:
# Show how many responses were collected, followed by the first one
print(len(responses), responses[0], sep="\n")

30
1. Natural language processing: Large language models can be used for tasks such as text generation, sentiment analysis, language translation, and speech recognition.

2. Chatbots and virtual assistants: Large language models can be used to create more advanced and human-like


## Compute Levenshtein distances

In [10]:
import Levenshtein as lev
import numpy as np

# Square matrix holding the pairwise Levenshtein distances of the
# temperature-0 responses; the diagonal stays 0
n = len(responses)
distance_matrix = np.zeros((n, n))

# The Levenshtein distance is symmetric, so each unordered pair is computed
# only once and mirrored into both halves of the matrix
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = lev.distance(responses[i], responses[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Mean distance over the strict upper triangle, i.e. every pair exactly once
mean_distance = np.mean(distance_matrix[np.triu_indices_from(distance_matrix, k=1)])
print("Mean Distance Score Levenshtein for temperature 0 and set seed:", mean_distance)
# Print the distance matrix
#print("Levenshtein Distance Matrix:")
#print(distance_matrix)

Mean Distance Score Levenshtein for temperature 0 and set seed: 73.34252873563219


In [11]:
# for responses2
import Levenshtein as lev
import numpy as np

# Square matrix holding the pairwise Levenshtein distances of the
# top_p-0 responses; the diagonal stays 0
n = len(responses2)
distance_matrix = np.zeros((n, n))

# The Levenshtein distance is symmetric, so each unordered pair is computed
# only once and mirrored into both halves of the matrix
for i in range(n):
    for j in range(i + 1, n):
        pair_distance = lev.distance(responses2[i], responses2[j])
        distance_matrix[i, j] = pair_distance
        distance_matrix[j, i] = pair_distance

# Mean distance over the strict upper triangle, i.e. every pair exactly once
mean_distance = np.mean(distance_matrix[np.triu_indices_from(distance_matrix, k=1)])
print("Mean Distance Score Levenshtein for top_p 0 and set seed:", mean_distance)
# Print the distance matrix
#print("Levenshtein Distance Matrix:")
#print(distance_matrix)

Mean Distance Score Levenshtein for top_p 0 and set seed: 109.32183908045977


## Measure similarity using TF-IDF and cosine similarity

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF encode the temperature-0 responses and compare them pairwise by
# cosine similarity. Note: `vectorizer` holds the result of fit_transform
# (the encoded texts), not the TfidfVectorizer object itself.
vectorizer = TfidfVectorizer().fit_transform(responses)
vectors = vectorizer.toarray()
cosine_matrix = cosine_similarity(vectors)

# Labelled view of the matrix: rows and columns named "Response 1" ... "Response N"
response_labels = [f'Response {position}' for position in range(1, len(responses) + 1)]
similarity_df = pd.DataFrame(cosine_matrix, index=response_labels, columns=response_labels)

# Average over the strict upper triangle so each pair counts once and the
# self-similarities on the diagonal are excluded
pair_index = np.triu_indices_from(cosine_matrix, k=1)
mean_similarity = cosine_matrix[pair_index].mean()
print("Mean Similarity Score for temperature 0 and set seed: ", mean_similarity)
#print(similarity_df)

Mean Similarity Score for temperature 0 and set seed:  0.6829032744918881


In [13]:
# Same analysis for responses2 (the top_p-0 responses): TF-IDF encode the
# texts and compare them pairwise by cosine similarity. Note: `vectorizer`
# holds the result of fit_transform, not the TfidfVectorizer object itself.
vectorizer = TfidfVectorizer().fit_transform(responses2)
vectors = vectorizer.toarray()
cosine_matrix = cosine_similarity(vectors)

# Labelled view of the matrix: rows and columns named "Response 1" ... "Response N"
response_labels = [f'Response {position}' for position in range(1, len(responses2) + 1)]
similarity_df = pd.DataFrame(cosine_matrix, index=response_labels, columns=response_labels)

# Average over the strict upper triangle so each pair counts once and the
# self-similarities on the diagonal are excluded
pair_index = np.triu_indices_from(cosine_matrix, k=1)
mean_similarity = cosine_matrix[pair_index].mean()
print("Mean Similarity Score for top_p 0 and set seed::", mean_similarity)
#print(similarity_df)

Mean Similarity Score for top_p 0 and set seed:: 0.5563508164354505
