In [2]:
from gensim.models import Word2Vec
import numpy as np
import pandas as pd
# --------------------------
# 1. Prepare Tokenized Corpus
# --------------------------
# Load the term names from the "_ID2NAME" sheet. Empty cells are dropped and
# every remaining value is coerced to str so that only string tokens reach
# Word2Vec (and the later `.split()` / file-writing steps).
term_names = (
    pd.read_excel("MEDDRA.xlsx", sheet_name="_ID2NAME")["Name"]
    .dropna()
    .astype(str)
    .to_list()
)

# Each "sentence" is a one-element list holding a complete term name, so the
# vocabulary of this first model is made of whole phrases (e.g. "Abdominal pain").
# NOTE(review): a one-token sentence gives skip-gram no neighbouring tokens to
# learn from, so these phrase vectors presumably stay close to their random
# initialisation; the word-level model trained further down in this notebook
# is the one to rely on for similarity.
data = [[name] for name in term_names]
tokenized_corpus = data
# --------------------------
# 2. Train Word2Vec Model
# --------------------------
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,   # size of embedding vector
    window=5,          # context window size
    min_count=1,       # keep every token (only tokens with freq < 1 would be dropped)
    workers=4,         # use 4 CPU threads
    sg=1               # 1=Skip-gram, 0=CBOW
)

# --------------------------
# 3. Explore Embeddings
# --------------------------


In [4]:
# Terms whose phrase-level vectors are inspected in this cell
word1 = "Abdominal pain"
word2 = "Abdominal pain generalised"

# Resolve vocabulary membership once, up front
keyed_vectors = model.wv
word1_known = word1 in keyed_vectors
missing = [w for w in [word1, word2] if w not in keyed_vectors]

# Show a small slice of the vocabulary (first 20 entries)
print("\n🔹 Vocabulary (first 20 words):", keyed_vectors.index_to_key[:20])

# Preview the first 10 components of word1's vector, if it has one
if not word1_known:
    print(f"\n⚠️ '{word1}' not in vocabulary")
else:
    print(f"\n🔹 Embedding for '{word1}' (first 10 values):\n", keyed_vectors[word1][:10])

# Similarity is only defined when both terms are in the vocabulary
if missing:
    print(f"\n⚠️ Cannot compute similarity, missing words: {missing}")
else:
    sim = keyed_vectors.similarity(word1, word2)
    print(f"\n🔹 Similarity between '{word1}' and '{word2}': {sim}")

# Nearest neighbours of word1 in the embedding space
if word1_known:
    print(f"\n🔹 Most similar words to '{word1}':\n", keyed_vectors.most_similar(word1))
else:
    print(f"\n⚠️ Cannot find most similar words for '{word1}' because it is not in vocabulary")



🔹 Vocabulary (first 20 words): ['Acute aseptic arthritis', 'Myelopathy NEC', 'Myelopathy radiation', 'Myeloproliferative disorder', 'Myeloproliferative disorder NOS', 'Myeloproliferative disorders (excl leukaemias)', 'Myelosis erythremic', 'Myelosis non-leukaemic', 'Myelosis non-leukemic', 'Myelosis nonleukemic', 'Myelosis-non-leukaemic', 'Myelosuppression', 'Myelosuppression adult', 'Myiasis', 'Myocardial contraction decreased', 'Myocardial decompensation', 'Myocardial degeneration', 'Myocardial disorders NEC', 'Myocardial disorder', 'Myelopathy neurological']

🔹 Embedding for 'Abdominal pain' (first 10 values):
 [ 0.00143053 -0.00184931  0.00987147  0.00337296 -0.00860628 -0.00763291
 -0.00070415  0.00717602  0.00958355  0.00290156]

🔹 Similarity between 'Abdominal pain' and 'Abdominal pain generalised': -0.008138582110404968

🔹 Most similar words to 'Abdominal pain':
 [('Open fracture of base of skull with other and unspecified intracranial hemorrhage', 0.4014906883239746), ('Cardi

In [4]:
# --------------------------
# 4. Save Model
# --------------------------
model.save("my_word2vec.model")
print("\n✅ Model saved as my_word2vec.model")

# --------------------------
# 5. Export Embedding Matrix
# --------------------------
words = list(model.wv.index_to_key)            # list of all words
embeddings = np.array([model.wv[w] for w in words])  # embedding matrix

print("\n🔹 Embedding Matrix Shape:", embeddings.shape)  # (vocab_size, embed_dim)

# Save embeddings and vocab; line i of vocab.txt corresponds to row i of embeddings.npy.
np.save("embeddings.npy", embeddings)
# Explicit UTF-8 so terms with non-ASCII characters are written identically on
# every platform instead of depending on the locale's default encoding.
with open("vocab.txt", "w", encoding="utf-8") as f:
    f.writelines(w + "\n" for w in words)

print("\n✅ Embeddings saved as embeddings.npy and vocab.txt")


✅ Model saved as my_word2vec.model

🔹 Embedding Matrix Shape: (85392, 100)

✅ Embeddings saved as embeddings.npy and vocab.txt


In [8]:
from gensim.models import Word2Vec

# Original token list

# Split each phrase into words
# Word-level tokenisation: every entry of `data` wraps one term, which is
# split on whitespace into its individual words.
sentences = [term_entry[0].split() for term_entry in data]

# Fit a skip-gram Word2Vec model (sg=1) on the word-level sentences
w2v_settings = dict(vector_size=100, window=5, min_count=1, workers=4, sg=1)
model = Word2Vec(sentences, **w2v_settings)

def get_phrase_embedding(phrase, model):
    """Return the mean word vector of a phrase.

    The phrase is split on whitespace; words that are not present in
    ``model.wv`` are skipped.

    Args:
        phrase: Text whose words are looked up in the model.
        model: Object exposing ``wv``, which supports ``in`` and ``[]`` lookups.

    Returns:
        The element-wise average of the vectors of the known words, or
        ``None`` when none of the words is known.
    """
    vectors = model.wv
    total = 0
    known_count = 0
    for token in phrase.split():
        if token in vectors:
            total = total + vectors[token]
            known_count += 1
    if known_count == 0:
        return None
    return total / known_count

word1 = "Abdominal pain"
word2 = "Abdominal pain generalised"

# Averaged word vectors for both terms (None when no word of a term is known)
emb1, emb2 = (get_phrase_embedding(term, model) for term in (word1, word2))

if emb1 is None or emb2 is None:
    print(f"\n⚠️ Cannot compute similarity, missing words in model")
else:
    from numpy import dot
    from numpy.linalg import norm
    # Cosine similarity: dot product divided by the product of the vector norms
    norm_product = norm(emb1) * norm(emb2)
    similarity = dot(emb1, emb2) / norm_product
    print(f"\n🔹 Similarity between '{word1}' and '{word2}': {similarity}")




🔹 Similarity between 'Abdominal pain' and 'Abdominal pain generalised': 0.9896947145462036


In [10]:
word1, word2 = "fever", "liver"

# Look up the averaged word vector of each term
emb1 = get_phrase_embedding(word1, model)
emb2 = get_phrase_embedding(word2, model)

# Without a vector for both terms there is nothing to compare
if any(vector is None for vector in (emb1, emb2)):
    print(f"\n⚠️ Cannot compute similarity, missing words in model")
else:
    from numpy import dot
    from numpy.linalg import norm
    # Cosine of the angle between the two averaged vectors
    similarity = dot(emb1, emb2) / (norm(emb1) * norm(emb2))
    print(f"\n🔹 Similarity between '{word1}' and '{word2}': {similarity}")


🔹 Similarity between 'fever' and 'liver': 0.5502948760986328
