# Try to create deterministic texts by calling the OpenAI API

The idea is based on the following notebook: https://github.com/TonySimonovsky/prompt_engineering_experiments/blob/main/experiments/DeterministicResultsOpenAI/Deterministic%20Results%20in%20OpenAI%20(report).ipynb

## Get API key(s)


Normally, you do not need to provide your Hugging Face key:

In [1]:
import os
import sys

# 'src' is expected one level UP from the current working directory:
# '..' climbs to the parent folder, then 'src' is entered from there.
path_to_src = os.path.join('..', 'src')

# Make modules inside 'src' importable. The membership check keeps the cell
# idempotent, so re-running it does not pile up duplicate sys.path entries.
if path_to_src not in sys.path:
    sys.path.append(path_to_src)

# Now you can import your API_key module
import API_key as key

## Create deterministic data

Define the number of runs, i.e. how many times the same prompt is sent to the API:

In [2]:
# Number of times the same prompt is sent to the API in the collection loop below
num_runs = 20

In [3]:
from openai import OpenAI


# Build the OpenAI client used by the cells below, authenticating with the
# key exposed by the local API_key module
client = OpenAI(api_key=key.openAI_key)

# Function to call OpenAI's API using the updated (client-based) format
def get_response(prompt, model="gpt-3.5-turbo", temperature=0.0, seed=555, max_tokens=50):
    """Send a single-turn user prompt to the chat completions API and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Name of the chat model to query. This value is forwarded to the
            API (previously the argument was accepted but ignored).
        temperature: Sampling temperature; the default of 0.0 aims for the
            most repeatable output.
        seed: Sampling seed forwarded to the API. Per the OpenAI docs this is a
            beta, best-effort feature: repeated calls usually match, but
            identical output is not guaranteed.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The ``message.content`` of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        seed=seed,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content

# The single prompt that is sent unchanged on every run
prompt = "Explain the significance of machine learning in modern industries."

# Query the API num_runs times with the identical prompt and keep every reply
responses = [get_response(prompt) for _ in range(num_runs)]

In [5]:
import pandas as pd
# Wrap the collected responses in a DataFrame (one row per response)
responses_df = pd.DataFrame(data=responses)

# Save them as an Excel workbook, leaving out the index column
responses_df.to_excel(excel_writer="deterministicResponses.xlsx", index=False)

In [6]:
# Sanity check: how many responses were collected, followed by the first one
print(len(responses), responses[0], sep="\n")

20
Machine learning has become increasingly significant in modern industries due to its ability to analyze and interpret large amounts of data quickly and accurately. This technology allows businesses to make data-driven decisions, improve efficiency, and optimize processes in various industries such as healthcare, finance,


## Compute Levenshtein distances

In [7]:
import Levenshtein as lev
import numpy as np

# Square matrix holding the edit distance between every pair of responses;
# the diagonal (a response compared with itself) stays at 0
n = len(responses)
distance_matrix = np.zeros((n, n))

# Fill every off-diagonal cell with the Levenshtein distance of that pair
for row, left in enumerate(responses):
    for col, right in enumerate(responses):
        if row == col:
            continue
        distance_matrix[row, col] = lev.distance(left, right)


# Average the distances above the diagonal so that each pair counts once
upper_triangle = np.triu_indices_from(distance_matrix, k=1)
mean_distance = np.mean(distance_matrix[upper_triangle])
print("Mean Distance Score Levenshtein:", mean_distance)
# Show the full distance matrix
print("Levenshtein Distance Matrix:")
print(distance_matrix)

Mean Distance Score Levenshtein: 2.2
Levenshtein Distance Matrix:
[[ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [22. 22. 22. 22. 22. 22. 22. 22. 22.  0. 22. 22. 22. 22. 22. 22. 22. 22.
  22. 22.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0. 22.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]

## Measure similarity using TF-IDF and cosine similarity

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Turn each response into a TF-IDF vector, then compare all pairs by cosine similarity
# (note: `vectorizer` holds the fitted TF-IDF matrix, not the vectorizer object)
vectorizer = TfidfVectorizer().fit_transform(responses)
vectors = vectorizer.toarray()
cosine_matrix = cosine_similarity(vectors)

# Label rows and columns "Response 1" .. "Response N" so the table is readable
labels = [f'Response {idx+1}' for idx in range(len(responses))]
similarity_df = pd.DataFrame(cosine_matrix, index=labels, columns=labels)

# Average the similarities above the diagonal (each pair once), then show the table
pairwise_scores = cosine_matrix[np.triu_indices_from(cosine_matrix, k=1)]
mean_similarity = np.mean(pairwise_scores)
print("Mean Similarity Score:", mean_similarity)
print(similarity_df)

Mean Similarity Score: 0.9728725123549896
             Response 1  Response 2  Response 3  Response 4  Response 5  \
Response 1     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 2     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 3     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 4     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 5     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 6     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 7     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 8     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 9     1.000000    1.000000    1.000000    1.000000    1.000000   
Response 10    0.728725    0.728725    0.728725    0.728725    0.728725   
Response 11    1.000000    1.000000    1.000000    1.000000    1.000000   
Response 12    1.000000    1.000000    1.000000    1.00000