In [6]:
# Import
import os

import numpy as np
import pandas as pd
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
# Helpers
def load_text_from_file(file_path):
    """Read the file at ``file_path`` as UTF-8 and return its whole content as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

In [8]:
# Get DASS-21 questions: read the item table from a CSV file located
# relative to the notebook's working directory.
df_items = pd.read_csv('./Data/dass_21_items_text.csv')
# df_items.head()  # uncomment to preview the first rows

In [10]:
# Set up the OpenAI client from an API key stored in a local text file.
# NOTE(review): the path below is an absolute, machine-specific location;
# point it at your own key file before running.
file_path = '/Users/jona/Library/CloudStorage/Dropbox/MyData/_Postdoc/AI_Learning/Ollama_Python/api_key.txt'  # Replace with the path to your .txt file
# Strip surrounding whitespace: key files usually end with a newline, which
# would otherwise become part of the key sent with every request.
text_string = load_text_from_file(file_path).strip()
client = OpenAI(api_key=text_string)

In [11]:
# Get the item sentences: the fourth column of the item table is used as
# the text to embed (as an example).
text = df_items.iloc[:, 3]
print(text)

0     Couldn't seem to experience any positive feeli...
1     Found it difficult to work up the initiative t...
2            Felt that I had nothing to look forward to
3                            Felt down-hearted and blue
4          Unable to become enthusiastic about anything
5                  Felt I wasn't worth much as a person
6                        Felt that life was meaningless
7                          Aware of dryness of my mouth
8     Experienced breathing difficulty (e.g., excess...
9            Experienced trembling (e.g., in the hands)
10    Worried about situations in which I might pani...
11                            Felt I was close to panic
12    Aware of the action of my heart in the absence...
13                  Felt scared without any good reason
14                           Found it hard to wind down
15                   Tended to over-react to situations
16        Felt that I was using a lot of nervous energy
17                        Found myself getting a

In [12]:
# Function to get the embedding for a given text string

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` computed by the given embedding model.

    Newlines in ``text`` are replaced by spaces before the request is made.
    The request goes through the module-level ``client`` with ``text`` as the
    only input, and the embedding of that single input is returned.
    """
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding


In [13]:
# Check how many item sentences were loaded
len(text)

21

In [14]:
# Start with an empty list; it will hold one embedding per item sentence
embedding_vector = []

In [15]:
# Embed every item sentence with the default model ("text-embedding-3-small").
# The list is built inside this cell, so re-running the cell replaces the
# previous result instead of appending duplicates. Iterating over the Series
# itself avoids a hard-coded item count and label-based lookups.
embedding_vector = [get_embedding(sentence) for sentence in text]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20


In [16]:
# To dataframe and save.
# After the transpose each column holds the embedding of one item and each
# row is one embedding dimension.
df_embed = pd.DataFrame(embedding_vector).T
# Create the output folder first so the save does not fail when it is missing
os.makedirs('Files', exist_ok=True)
df_embed.to_csv('Files/text-embedding-3-small.csv', index=False)

df_embed.shape

(1536, 21)

In [17]:
# Cosine similarity matrix helper
def compute_cosine_similarity(df):
    """Return the pairwise cosine similarity between the rows of ``df``.

    The result is a square DataFrame whose row and column labels are both
    taken from ``df.index``.
    """
    labels = df.index
    return pd.DataFrame(cosine_similarity(df), index=labels, columns=labels)


In [23]:
# Transpose back so that each row is one item: compute_cosine_similarity
# compares rows, which gives an item-by-item similarity matrix.
cosine_sim_matrix = compute_cosine_similarity(df_embed.T)

# Spot-check symmetry: entries (2, 1) and (1, 2) should be identical
cosine_sim_matrix.iloc[2,1], cosine_sim_matrix.iloc[1,2]


(np.float64(0.500341514962459), np.float64(0.500341514962459))

In [24]:
# Save (create the output folder first so the save does not fail when it is missing)
os.makedirs('Data/cos_matrices', exist_ok=True)
cosine_sim_matrix.to_csv('Data/cos_matrices/text-embedding-3-small.csv', index=False)




In [25]:
# Do the same pipeline for "text-embedding-3-large"
# Embed every item sentence; iterating over the Series itself avoids a
# hard-coded item count and label-based lookups.
embedding_vector_large = []
for i, sentence in enumerate(text):
    embedding_i = get_embedding(sentence, model="text-embedding-3-large")
    embedding_vector_large.append(embedding_i)
    print(i)  # progress counter

# To dataframe (one column per item) and save
df_embed_large = pd.DataFrame(embedding_vector_large).T
os.makedirs('Files', exist_ok=True)  # make sure the output folder exists
df_embed_large.to_csv('Files/text-embedding-3-large.csv', index=False)

# Cosine Similarity (rows of the transposed frame are the items)
cosine_sim_matrix_large = compute_cosine_similarity(df_embed_large.T)
print(cosine_sim_matrix_large.shape)

# Save Cosine similarity Matrix
os.makedirs('Data/cos_matrices', exist_ok=True)  # make sure the output folder exists
cosine_sim_matrix_large.to_csv('Data/cos_matrices/text-embedding-3-large.csv', index=False)


0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
(21, 21)
