In [2]:
!pip install colorama

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6


In [None]:
from transformers import AutoModel, AutoTokenizer
import torch
import numpy as np
from IPython.display import display, HTML

# Load pre-trained tokenizer & model (DeBERTa v3 for better embeddings)
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base", token="^_^")
model = AutoModel.from_pretrained("microsoft/deberta-v3-base", token="^_^")

# Example sentences
sentence1 = "I deposited money at the bank."
sentence2 = "The boat is near the river bank."

# Tokenize sentences
tokens1 = tokenizer(sentence1, return_tensors="pt")
tokens2 = tokenizer(sentence2, return_tensors="pt")

# Get embeddings
with torch.no_grad():
    output1 = model(**tokens1).last_hidden_state
    output2 = model(**tokens2).last_hidden_state

# Extract the embedding for "bank" in both cases
bank_index1 = tokens1.input_ids[0].tolist().index(tokenizer.encode("bank", add_special_tokens=False)[0])
bank_index2 = tokens2.input_ids[0].tolist().index(tokenizer.encode("bank", add_special_tokens=False)[0])

bank_embedding1 = output1[0, bank_index1, :].numpy()
bank_embedding2 = output2[0, bank_index2, :].numpy()

# Calculate cosine similarity
similarity = np.dot(bank_embedding1, bank_embedding2) / (np.linalg.norm(bank_embedding1) * np.linalg.norm(bank_embedding2))

In [8]:
# Colab-optimized printing with HTML - FIXED VERSION
def print_embeddings(name1, embedding1, name2, embedding2, num_values=15):
    # Create HTML with CSS styling - using triple quotes properly
    html = f"""
    <style>
        .embedding-table {{
            font-family: Arial, sans-serif;
            border-collapse: collapse;
            width: 100%;
            margin: 20px 0;
            box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
        }}
        .embedding-table thead tr {{
            background-color: #009879;
            color: #ffffff;
            text-align: center;
        }}
        .embedding-table th,
        .embedding-table td {{
            padding: 12px 15px;
            text-align: center;
        }}
        .embedding-table tbody tr {{
            border-bottom: 1px solid #dddddd;
        }}
        .embedding-table tbody tr:nth-of-type(even) {{
            background-color: #f3f3f3;
        }}
        .embedding-table tbody tr:last-of-type {{
            border-bottom: 2px solid #009879;
        }}
        .title {{
            background-color: #4b4b4b;
            color: white;
            padding: 10px;
            font-size: 18px;
            text-align: center;
            margin-top: 30px;
            border-radius: 5px;
        }}
        .similarity {{
            font-size: 16px;
            margin: 15px 0;
            padding: 10px;
            background-color: #e6f7ff;
            border-left: 6px solid #1890ff;
            border-radius: 3px;
        }}
        .diff-low {{ color: green; font-weight: bold; }}
        .diff-medium {{ color: orange; font-weight: bold; }}
        .diff-high {{ color: red; font-weight: bold; }}
        .context {{ font-style: italic; color: #666; }}
    </style>

    <div class="title">Word Embedding Comparison: '{name1}' vs '{name2}'</div>
    <div class="context">Showing the first {num_values} values of the embeddings (out of {embedding1.shape[0]} total dimensions)</div>

    <table class="embedding-table">
        <thead>
            <tr>
                <th>Value #</th>
                <th>{name1}</th>
                <th>{name2}</th>
                <th>Difference</th>
            </tr>
        </thead>
        <tbody>
    """

    # Add rows for each dimension
    for i in range(num_values):
        val1 = embedding1[i]
        val2 = embedding2[i]
        diff = val1 - val2

        # Set diff class based on difference magnitude
        if abs(diff) < 0.1:
            diff_class = "diff-low"
        elif abs(diff) < 0.5:
            diff_class = "diff-medium"
        else:
            diff_class = "diff-high"

        html += f"""
            <tr>
                <td>{i}</td>
                <td>{val1:.6f}</td>
                <td>{val2:.6f}</td>
                <td class="{diff_class}">{diff:+.6f}</td>
            </tr>
        """

    html += f"""
        </tbody>
    </table>

    <div class="similarity" style="color:black">
        <strong>Cosine Similarity:</strong> {similarity:.6f}
        <br>
        <small>Higher similarity (closer to 1.0) indicates more similar meanings of the word "bank" in both contexts</small>
        <br>
        <small>Lower similarity (closer to 0.0) indicates different meanings or senses of the word</small>
    </div>
    """

    # Display in Colab
    display(HTML(html))

# Compare the word "bank" in different contexts
print(f"Comparing embeddings for 'bank' in financial vs. river contexts...")
print_embeddings("bank (finance context)", bank_embedding1, "bank (river context)", bank_embedding2)

Comparing embeddings for 'bank' in financial vs. river contexts...


Value #,bank (finance context),bank (river context),Difference
0,0.322021,0.075455,0.246565
1,0.257938,0.168426,0.089512
2,0.26651,0.051905,0.214605
3,-0.11872,0.108358,-0.227079
4,-0.376368,-0.118614,-0.257754
5,-0.523521,-0.040843,-0.482678
6,0.132527,0.079645,0.052881
7,-0.066861,-0.037408,-0.029453
8,-0.185785,0.009151,-0.194936
9,0.04377,0.024624,0.019146


In [10]:
tokens = tokenizer("Hello World", return_tensors='pt')
output = model(**tokens)[0]
print(output.shape)

torch.Size([1, 4, 768])


In [11]:
for token in tokens['input_ids'][0]:
  print(tokenizer.decode(token))

[CLS]
Hello
World
[SEP]


In [12]:
print(output)

tensor([[[ 0.1241,  0.3112,  0.0027,  ..., -0.0281,  0.1768,  0.0463],
         [ 0.3018,  0.3682, -0.1214,  ..., -1.0426, -0.3550,  0.3341],
         [-0.4699,  0.1744,  0.0817,  ...,  0.2021, -0.9229,  0.1374],
         [ 0.1790,  0.2844, -0.0180,  ..., -0.0359,  0.1855,  0.0534]]],
       grad_fn=<NativeLayerNormBackward0>)


In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained SBERT model
model = SentenceTransformer('all-mpnet-base-v2', token="^_^")  # This is one of the best general-purpose models

# Create embeddings for our phrases
embedding1 = model.encode("Apple Phone")
embedding2 = model.encode("Apple Fruit")

# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity([embedding1], [embedding2])[0][0]

print(f"SBERT Similarity between 'Apple Phone' and 'Apple Fruit': {similarity:.4f}")

# For comparison, try with more context
sentence1 = "I just bought the latest Apple Phone for my work."
sentence2 = "I ate a delicious Apple Fruit for breakfast."

# Create embeddings for the sentences with more context
embedding3 = model.encode(sentence1)
embedding4 = model.encode(sentence2)

# Calculate similarity with more context
similarity_context = cosine_similarity([embedding3], [embedding4])[0][0]

print(f"SBERT Similarity with more context: {similarity_context:.4f}")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SBERT Similarity between 'Apple Phone' and 'Apple Fruit': 0.5099
SBERT Similarity with more context: 0.2992
