In [1]:
from dotenv import load_dotenv
# Load environment variables from .env file
load_dotenv()
from openai import OpenAI
client = OpenAI()

# Function to get the vector embedding for a given text
def get_vector_embeddings(text):
    response = client.embeddings.create(
        input=text,
        model="text-embedding-ada-002"
    )
    embeddings = [r.embedding for r in response.data]
    return embeddings[0]

get_vector_embeddings("Your text string goes here")

[-0.007611386943608522,
 -0.005377683322876692,
 0.011407003737986088,
 -0.024762200191617012,
 -0.02490999363362789,
 0.039796870201826096,
 -0.010325420647859573,
 -0.009411785751581192,
 -0.013294734992086887,
 -0.009841731749475002,
 -0.011736180633306503,
 0.008229434490203857,
 -0.014295702800154686,
 0.007765898481011391,
 0.01000296138226986,
 -0.004937660414725542,
 0.02282744273543358,
 -0.0016215344658121467,
 0.015330259688198566,
 -0.010258241556584835,
 0.004890635143965483,
 0.012240023352205753,
 0.004742841236293316,
 0.010547111742198467,
 -0.006593624129891396,
 -0.0003955166903324425,
 0.005703501868993044,
 -0.012811045162379742,
 0.016364818438887596,
 0.004437176510691643,
 0.006371933035552502,
 -0.006798520218580961,
 -0.015303388237953186,
 -0.006170396227389574,
 -0.018178651109337807,
 0.0037049248348921537,
 0.0036108740605413914,
 -0.019441617652773857,
 0.030311189591884613,
 -0.006953032221645117,
 0.00816897302865982,
 0.009492400102317333,
 -0.00093126

In [2]:
import requests
import os

model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = os.getenv("HF_TOKEN")

api_url = "https://api-inference.huggingface.co/"
api_url += f"pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

def query(texts):
    response = requests.post(api_url, headers=headers,
    json={"inputs": texts,
    "options":{"wait_for_model":True}})
    return response.json()

texts = ["mickey mouse",
        "cheese",
        "trap",
        "rat",
        "ratatouille"
        "bus",
        "airplane",
        "ship"]

output = query(texts)
output



[[-0.03875632584095001,
  0.04480458050966263,
  0.016051093116402626,
  -0.0178909283131361,
  -0.0351855643093586,
  -0.013003019616007805,
  0.14877274632453918,
  0.04880755767226219,
  0.01184836681932211,
  -0.044042523950338364,
  0.033878836780786514,
  -0.021161722019314766,
  -0.03593837842345238,
  -0.004054976161569357,
  0.0226056519895792,
  -0.03249230235815048,
  -0.012720168568193913,
  0.045571863651275635,
  0.008470969274640083,
  -0.03638050705194473,
  -0.034335676580667496,
  -0.0017537899548187852,
  -0.01511224452406168,
  0.013477494940161705,
  -0.0770668163895607,
  0.014443229883909225,
  0.024193651974201202,
  0.010390950366854668,
  -0.059111088514328,
  -0.09692394733428955,
  0.0007171289762482047,
  -0.014247315935790539,
  -0.035656001418828964,
  -0.01907837763428688,
  -0.019614364951848984,
  0.006524048279970884,
  -0.04909475892782211,
  0.04045393317937851,
  -0.0073241847567260265,
  -0.05470050126314163,
  -0.03098735772073269,
  -0.082744874

In [3]:
from gensim.models import Word2Vec

# Sample data: list of sentences, where each sentence is 
# a list of words.
# In a real-world scenario, you'd load and preprocess your 
# own corpus.
sentences = [
    ["the", "cake", "is", "a", "lie"],
    ["if", "you", "hear", "a", "turret", "sing", "you're", "probably", "too", "close"],
    ["why", "search", "for", "the", "end", "of", "a", "rainbow", "when", "the", "cake", "is", "a", "lie?"],
    ["GLaDOS", "promised", "cake", "but", "all", "I", "got", "was", "this", "test", "chamber"],
    ["remember", "when", "the", "platform", "was", "sliding", "into", "the", "fire", "pit", "and", "I", "said", "‘Goodbye’", "and", "you", "were", "like", "‘NO WAY!’", "and", "then", "I", "was", "all", "‘I", "was", "just", "pretending", "to", "murder", "you’?", "That", "was", "great"],
    ["the", "cake", "is", "a", "lie", "but", "the", "companion", "cube", "is", "forever"],
    ["wheatley", "might", "betray", "you,", "but", "the", "cake", "already", "did"],
    ["if", "life", "gives", "you", "lemons,", "don't", "make", "a", "combustible", "lemon"],
    ["there's", "no", "cake", "in", "space,", "just", "ask", "wheatley"],
    ["completing", "tests", "for", "cake", "is", "the", "sweetest", "lie"],
    ["I", "swapped", "the", "cake", "recipe", "with", "a", "neurotoxin", "formula,", "hope", "that's", "fine"],
] + [
    ["the", "cake", "is", "a", "lie"],
    ["the", "cake", "is", "definitely", "a", "lie"],
    ["everyone", "knows", "that", "cake", "equals", "lie"],
    ["cake", "and", "lie", "are", "synonymous"],
    ["whenever", "you", "hear", "cake", "think", "lie"],
    ["cake", "?", "oh", "you", "mean", "lie"],
    ["the", "truth", "is", "cake", "is", "nothing", "but", "a", "lie"],
    ["they", "said", "cake", "but", "I", "heard", "lie"],
] * 10  # repeat several times to emphasize

# Train the Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5,
min_count=1, workers=4, seed=36)

# Save the model
model.save("custom_word2vec_model.model")

# To load the model later
# loaded_model = Word2Vec.load("custom_word2vec_model.model")

# Get vector for a word
vector = model.wv['cake']

# Find most similar words
similar_words = model.wv.most_similar("cake", topn=5)
print("Top 5 most similar words to 'cake': ", similar_words)

# Directly query the similarity between "cake" and "lie"
cake_lie_similarity = model.wv.similarity("cake", "lie")
print("Similarity between 'cake' and 'lie': ",
cake_lie_similarity)

Top 5 most similar words to 'cake':  [('lie', 0.23420444130897522), ('test', 0.23205122351646423), ('tests', 0.17178669571876526), ('GLaDOS', 0.1536172330379486), ('got', 0.14605288207530975)]
Similarity between 'cake' and 'lie':  0.23420444


In [5]:
sentences

[['the', 'cake', 'is', 'a', 'lie'],
 ['if',
  'you',
  'hear',
  'a',
  'turret',
  'sing',
  "you're",
  'probably',
  'too',
  'close'],
 ['why',
  'search',
  'for',
  'the',
  'end',
  'of',
  'a',
  'rainbow',
  'when',
  'the',
  'cake',
  'is',
  'a',
  'lie?'],
 ['GLaDOS',
  'promised',
  'cake',
  'but',
  'all',
  'I',
  'got',
  'was',
  'this',
  'test',
  'chamber'],
 ['remember',
  'when',
  'the',
  'platform',
  'was',
  'sliding',
  'into',
  'the',
  'fire',
  'pit',
  'and',
  'I',
  'said',
  '‘Goodbye’',
  'and',
  'you',
  'were',
  'like',
  '‘NO WAY!’',
  'and',
  'then',
  'I',
  'was',
  'all',
  '‘I',
  'was',
  'just',
  'pretending',
  'to',
  'murder',
  'you’?',
  'That',
  'was',
  'great'],
 ['the',
  'cake',
  'is',
  'a',
  'lie',
  'but',
  'the',
  'companion',
  'cube',
  'is',
  'forever'],
 ['wheatley',
  'might',
  'betray',
  'you,',
  'but',
  'the',
  'cake',
  'already',
  'did'],
 ['if',
  'life',
  'gives',
  'you',
  'lemons,',
  "don't",

In [4]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Convert sentences to a list of strings for TfidfVectorizer
document_list = [' '.join(s) for s in sentences]

# Compute TF-IDF representation
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(document_list)

# Extract the position of the words "cake" and "lie" in 
# the feature matrix
cake_idx = vectorizer.vocabulary_['cake']
lie_idx = vectorizer.vocabulary_['lie']

# Extract and reshape the vector for 'cake'
cakevec = tfidf_matrix[:, cake_idx].toarray().reshape(1, -1)

# Compute the cosine similarities
similar_words = cosine_similarity(cakevec, tfidf_matrix.T).flatten()

# Get the indices of the top 6 most similar words 
# (including 'cake')
top_indices = np.argsort(similar_words)[-6:-1][::-1]

# Retrieve and print the top 5 most similar words to 
# 'cake' (excluding 'cake' itself)
names = []
for idx in top_indices:
    names.append(vectorizer.get_feature_names_out()[idx])
print("Top 5 most similar words to 'cake': ", names)

# Compute cosine similarity between "cake" and "lie"
similarity = cosine_similarity(np.asarray(tfidf_matrix[:,
    cake_idx].todense()), np.asarray(tfidf_matrix[:, lie_idx].todense()))
# The result will be a matrix; we can take the average or
# max similarity value
avg_similarity = similarity.mean()
print("Similarity between 'cake' and 'lie'", avg_similarity)

# Show the similarity between "cake" and "elephant"
elephant_idx = vectorizer.vocabulary_['sing']
similarity = cosine_similarity(np.asarray(tfidf_matrix[:,
    cake_idx].todense()), np.asarray(tfidf_matrix[:,
    elephant_idx].todense()))
avg_similarity = similarity.mean()
print("Similarity between 'cake' and 'sing'",
    avg_similarity)

Top 5 most similar words to 'cake':  ['lie', 'the', 'is', 'you', 'definitely']
Similarity between 'cake' and 'lie' 0.8926458157227388
Similarity between 'cake' and 'sing' 0.010626735901461177
