In [1]:
from openai import OpenAI
client = OpenAI()

# Function to get the vector embedding for a given text
def get_vector_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    embeddings = [r.embedding for r in response.data]
    return embeddings[0]

get_vector_embeddings("Your text string goes here")

[-0.007024349179118872,
 -0.005340521689504385,
 0.011911144480109215,
 -0.024991894140839577,
 -0.024642357602715492,
 0.03979344293475151,
 -0.010170182213187218,
 -0.009424054995179176,
 -0.013228630647063255,
 -0.009955082088708878,
 -0.0116758793592453,
 0.007864582352340221,
 -0.014102472923696041,
 0.007830972783267498,
 0.010197069495916367,
 -0.005091812927275896,
 0.023029109463095665,
 -0.0015729164006188512,
 0.01492254063487053,
 -0.010324784554541111,
 0.004869991447776556,
 0.012495947070419788,
 0.004920405335724354,
 0.010875977575778961,
 -0.006557179614901543,
 -0.00034344528103247285,
 0.005606035701930523,
 -0.01257660984992981,
 0.016347575932741165,
 0.004523815121501684,
 0.006590788718312979,
 -0.007105011492967606,
 -0.01515108346939087,
 -0.006627758964896202,
 -0.018619567155838013,
 0.00411378126591444,
 0.00319456635043025,
 -0.019009433686733246,
 0.03030216693878174,
 -0.007582264021039009,
 0.008207397535443306,
 0.009471108205616474,
 -0.00107885932084

In [3]:
import requests
import os

model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.getenv("HF_TOKEN")

api_url = "https://api-inference.huggingface.co/"
api_url += f"pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    response = requests.post(api_url, headers=headers,
    json={"inputs": texts,
    "options":{"wait_for_model":True}})
    return response.json()

texts = ["mickey mouse",
        "cheese",
        "trap",
        "rat",
        "ratatouille"
        "bus",
        "airplane",
        "ship"]

output = query(texts)
output

[[-0.03875631093978882,
  0.04480460286140442,
  0.016051076352596283,
  -0.01789095252752304,
  -0.03518553450703621,
  -0.013002980500459671,
  0.148772731423378,
  0.04880751296877861,
  0.011848394758999348,
  -0.04404251277446747,
  0.03387890011072159,
  -0.021161668002605438,
  -0.03593839332461357,
  -0.004054919350892305,
  0.02260565385222435,
  -0.03249230980873108,
  -0.012720160186290741,
  0.04557191953063011,
  0.008471001870930195,
  -0.03638048097491264,
  -0.0343356728553772,
  -0.0017537549138069153,
  -0.015112197026610374,
  0.013477502390742302,
  -0.07706686109304428,
  0.014443199150264263,
  0.02419365756213665,
  0.010391011834144592,
  -0.059111081063747406,
  -0.09692397713661194,
  0.0007171611068770289,
  -0.014247308485209942,
  -0.03565603867173195,
  -0.019078364595770836,
  -0.01961437426507473,
  0.006523977965116501,
  -0.049094777554273605,
  0.04045397415757179,
  -0.007324188016355038,
  -0.05470050126314163,
  -0.030987361446022987,
  -0.08274485

In [5]:
from gensim.models import Word2Vec

# Sample data: list of sentences, where each sentence is 
# a list of words.
# In a real-world scenario, you'd load and preprocess your 
# own corpus.
sentences = [
    ["the", "cake", "is", "a", "lie"],
    ["if", "you", "hear", "a", "turret", "sing", "you're", "probably", "too", "close"],
    ["why", "search", "for", "the", "end", "of", "a", "rainbow", "when", "the", "cake", "is", "a", "lie?"],
    ["GLaDOS", "promised", "cake", "but", "all", "I", "got", "was", "this", "test", "chamber"],
    ["remember", "when", "the", "platform", "was", "sliding", "into", "the", "fire", "pit", "and", "I", "said", "‘Goodbye’", "and", "you", "were", "like", "‘NO WAY!’", "and", "then", "I", "was", "all", "‘I", "was", "just", "pretending", "to", "murder", "you’?", "That", "was", "great"],
    ["the", "cake", "is", "a", "lie", "but", "the", "companion", "cube", "is", "forever"],
    ["wheatley", "might", "betray", "you,", "but", "the", "cake", "already", "did"],
    ["if", "life", "gives", "you", "lemons,", "don't", "make", "a", "combustible", "lemon"],
    ["there's", "no", "cake", "in", "space,", "just", "ask", "wheatley"],
    ["completing", "tests", "for", "cake", "is", "the", "sweetest", "lie"],
    ["I", "swapped", "the", "cake", "recipe", "with", "a", "neurotoxin", "formula,", "hope", "that's", "fine"],
] + [
    ["the", "cake", "is", "a", "lie"],
    ["the", "cake", "is", "definitely", "a", "lie"],
    ["everyone", "knows", "that", "cake", "equals", "lie"],
    ["cake", "and", "lie", "are", "synonymous"],
    ["whenever", "you", "hear", "cake", "think", "lie"],
    ["cake", "?", "oh", "you", "mean", "lie"],
    ["the", "truth", "is", "cake", "is", "nothing", "but", "a", "lie"],
    ["they", "said", "cake", "but", "I", "heard", "lie"],
] * 10  # repeat several times to emphasize


# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5,
min_count=1, workers=4, seed=36)

# Save the model
model.save("custom_word2vec_model.model")

# To load the model later
# loaded_model = Word2Vec.load("custom_word2vec_model.model")

# Get vector for a word
vector = model.wv['cake']

# Find most similar words
similar_words = model.wv.most_similar("cake", topn=5)
print("Top 5 most similar words to 'cake': ", similar_words)

# Directly query the similarity between "cake" and "lie"
cake_lie_similarity = model.wv.similarity("cake", "lie")
print("Similarity between 'cake' and 'lie': ",
cake_lie_similarity)

Top 5 most similar words to 'cake':  [('lie', 0.23420444130897522), ('test', 0.23205122351646423), ('tests', 0.17178669571876526), ('GLaDOS', 0.1536172330379486), ('got', 0.14605288207530975)]
Similarity between 'cake' and 'lie':  0.23420444


In [6]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert sentences to a list of strings for TfidfVectorizer
document_list = [' '.join(s) for s in sentences]

# Compute TF-IDF representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(document_list)

# Extract the position of the words "cake" and "lie" in 
# the feature matrix
cake_idx = vectorizer.vocabulary_['cake']
lie_idx = vectorizer.vocabulary_['lie']

# Extract and reshape the vector for 'cake'
cakevec = tfidf_matrix[:, cake_idx].toarray().reshape(1, -1)

# Compute the cosine similarities
similar_words = cosine_similarity(cakevec, tfidf_matrix.T).flatten()

# Get the indices of the top 6 most similar words 
# (including 'cake')
top_indices = np.argsort(similar_words)[-6:-1][::-1]

# Retrieve and print the top 5 most similar words to 
# 'cake' (excluding 'cake' itself)
names = []
for idx in top_indices:
    names.append(vectorizer.get_feature_names_out()[idx])
print("Top 5 most similar words to 'cake': ", names)

# Compute cosine similarity between "cake" and "lie"
similarity = cosine_similarity(np.asarray(tfidf_matrix[:,
    cake_idx].todense()), np.asarray(tfidf_matrix[:, lie_idx].todense()))
# The result will be a matrix; we can take the average or
# max similarity value
avg_similarity = similarity.mean()
print("Similarity between 'cake' and 'lie'", avg_similarity)

# Show the similarity between "cake" and "elephant"
elephant_idx = vectorizer.vocabulary_['sing']
similarity = cosine_similarity(np.asarray(tfidf_matrix[:,
    cake_idx].todense()), np.asarray(tfidf_matrix[:,
    elephant_idx].todense()))
avg_similarity = similarity.mean()
print("Similarity between 'cake' and 'sing'",
    avg_similarity)

Top 5 most similar words to 'cake':  ['lie', 'the', 'is', 'you', 'definitely']
Similarity between 'cake' and 'lie' 0.8926458157227388
Similarity between 'cake' and 'sing' 0.010626735901461177
