In [None]:
# ============================================
# Module 9: Pretrained Models & Transfer Learning
# Lab 3 – Embeddings: Word2Vec vs BERT
# ============================================
# Author: Dr. Dasha Trofimova
# Course: M.Sc. Applied Data Science & AI
# --------------------------------------------
# Learning Goals:
# - Differentiate static vs. contextual embeddings
# - Compute cosine similarity to measure semantic closeness
# - Observe how context changes BERT token representations
# --------------------------------------------
# Lab Objectives:
# 1. Load GloVe embeddings via Gensim
# 2. Extract contextual embeddings from BERT
# 3. Compare similarity scores for related/unrelated words
# 4. Demonstrate polysemy with “bank” (river vs. money)
# ============================================
!pip install gensim transformers torch accelerate --quiet


In [2]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
import gensim.downloader as api
from numpy.linalg import norm


In [None]:
# Contextual encoder (BERT): tokenizer and model are loaded from the same checkpoint name
bert_name = "bert-base-uncased"
bert_tokenizer = AutoTokenizer.from_pretrained(bert_name)
bert_model = AutoModel.from_pretrained(bert_name)

# Static lookup table standing in for word2vec: 100-dim GloVe vectors via Gensim
glove = api.load("glove-wiki-gigaword-100")

# Put the model in evaluation mode; the lab only runs inference with it
bert_model.eval()


In [4]:
def cosine_sim(a, b):
    """Return the cosine similarity between two 1-D vectors as a Python float.

    Parameters
    ----------
    a, b : array-like
        Vectors of equal length (lists, NumPy arrays, or anything
        ``np.asarray`` can convert).

    Returns
    -------
    float
        ``a . b / (||a|| * ||b||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of ``nan`` so that
        callers never see a divide-by-zero warning or a NaN score.
    """
    # asarray avoids copying inputs that are already NumPy arrays
    a = np.asarray(a)
    b = np.asarray(b)
    denom = norm(a) * norm(b)
    if denom == 0:
        # Zero vector has no direction: report "no similarity" rather than 0/0
        return 0.0
    return float(a @ b / denom)


In [None]:
# Pairs to compare with the static GloVe vectors
word_pairs = [
    ("king", "queen"),
    ("king", "banana"),
    ("cat", "dog"),
    ("cat", "finance"),
    ("ai", "artificial"),
]

for first, second in word_pairs:
    # Guard clause: skip any pair with a word missing from the GloVe vocabulary
    if not (first in glove and second in glove):
        print(f"{first} or {second} not in vocab.")
        continue
    sim = cosine_sim(glove[first], glove[second])
    print(f"{first:10s} vs {second:10s} -> cosine sim {sim:.3f}")


In [None]:
def token_embedding(sentence, word):
    """Return BERT's contextual vector for ``word`` as it appears in ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to encode with ``bert_tokenizer`` / ``bert_model``.
    word : str
        Token to look up; it must appear verbatim in the tokenizer's output
        for ``sentence`` (the first occurrence is used).

    Returns
    -------
    tokens : list of str
        The tokenizer's tokens for ``sentence``.
    vector : numpy.ndarray
        The row of ``last_hidden_state`` at the position of ``word``.

    Raises
    ------
    ValueError
        If ``word`` is not one of the produced tokens.
    """
    encoded = bert_tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        output = bert_model(**encoded)

    # last_hidden_state shape: [batch, seq_len, hidden_dim]; the batch holds one sentence
    hidden = output.last_hidden_state[0]  # [seq_len, hidden_dim]
    tokens = bert_tokenizer.convert_ids_to_tokens(encoded["input_ids"][0])

    if word not in tokens:
        # Explicit message instead of list.index's bare "x is not in list"
        raise ValueError(
            f"{word!r} is not a single token in {tokens}; "
            "it may have been split into word pieces."
        )
    return tokens, hidden[tokens.index(word)].numpy()


sent1 = "I deposited cash in the bank yesterday."
sent2 = "The river overflowed near the bank after the storm."

# Same surface word, two different contexts
tokens_decoded1, vec_bank_1 = token_embedding(sent1, "bank")
tokens_decoded2, vec_bank_2 = token_embedding(sent2, "bank")

print("Sentence 1 tokens:", tokens_decoded1)
print("Sentence 2 tokens:", tokens_decoded2)

sim_bank = cosine_sim(vec_bank_1, vec_bank_2)

print(f"\nCosine similarity between 'bank' (money) and 'bank' (river): {sim_bank:.3f}")


In [None]:
def sentence_embedding(text):
    """Embed ``text`` as the mean of BERT's last-layer token vectors.

    The text is tokenized with ``bert_tokenizer``, run through ``bert_model``
    without gradient tracking, and the per-token rows of
    ``last_hidden_state`` for the single input are averaged (basic mean
    pooling over every token position). The result is returned as a NumPy
    array.
    """
    encoded = bert_tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state
    # Index 0 selects the only sequence in the batch; average across its tokens
    token_vectors = hidden_states[0]
    return token_vectors.mean(dim=0).numpy()

# Two sentences about a pet playing, plus one on an unrelated topic
s1 = "A cat is playing with a ball."
s2 = "A kitten is playing with a toy."
s3 = "The stock market crashed due to inflation fears."

# Embed all three sentences in order with the mean-pooling helper
emb1, emb2, emb3 = (sentence_embedding(sentence) for sentence in (s1, s2, s3))

# Compare s1 against the related sentence, then against the unrelated one
print("cos(s1,s2) =", cosine_sim(emb1, emb2))
print("cos(s1,s3) =", cosine_sim(emb1, emb3))
