In [23]:
# List of MedDRA PTs
pt_terms = [
    "Headache",
    "Nausea",
    "Chest pain",
    "Rash",
    "Dizziness",
    "Abdominal pain",
    "Pyrexia",
    "Dyspnoea",
    "Vomiting",
    "Fatigue",
    "Fever"
]

# Tokenize PT terms
pt_enc = tokenizer(pt_terms, padding=True, truncation=True, return_tensors="pt")

# Get embeddings (no grad needed for inference)
with torch.no_grad():
    pt_emb = model(pt_enc["input_ids"], pt_enc["attention_mask"])

print("PT embeddings shape:", pt_emb.shape)  # (10, hidden_dim)

# Compute cosine similarity between two PT embeddings
import torch.nn as nn
cos = nn.CosineSimilarity(dim=0)

# Example: compare first and second PT term embeddings
sim = cos(pt_emb[0], pt_emb[1])
print(f"Cosine similarity between '{pt_terms[6]}' and '{pt_terms[-1]}':", sim.item())


PT embeddings shape: torch.Size([11, 768])
Cosine similarity between 'Pyrexia' and 'Fever': 0.917564868927002


In [24]:
import torch
from torch.utils.data import Dataset, DataLoader

class PTDataset(Dataset):
    def __init__(self, pt_terms, tokenizer):
        self.pt_terms = pt_terms
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.pt_terms)

    def __getitem__(self, idx):
        text = self.pt_terms[idx]
        label = idx  # Each PT gets a unique label
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=10, return_tensors='pt')
        return {key: val.squeeze(0) for key, val in encoding.items()}, label

# Example usage:
pt_terms = [
    "Headache", "Nausea", "Chest pain", "Rash", "Dizziness",
    "Abdominal pain", "Pyrexia", "Dyspnoea", "Vomiting", "Fatigue"
]
dataset = PTDataset(pt_terms, tokenizer)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

In [25]:
import torch.nn as nn

class PTClassifier(nn.Module):
    def __init__(self, encoder, num_labels):
        super().__init__()
        self.encoder = encoder
        self.classifier = nn.Linear(self.encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        pooled = outputs.last_hidden_state[:, 0]  # [CLS] token
        logits = self.classifier(pooled)
        return logits

In [28]:
num_labels = len(pt_terms)
model = PTClassifier(AutoModel.from_pretrained("bert-base-uncased"), num_labels)
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

for epoch in range(50):
    for batch in dataloader:
        inputs, labels = batch
        optimizer.zero_grad()
        logits = model(inputs['input_ids'], inputs['attention_mask'])
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} Loss: {loss.item():.4f}")

Epoch 1 Loss: 2.5489
Epoch 2 Loss: 1.5492
Epoch 2 Loss: 1.5492
Epoch 3 Loss: 0.7177
Epoch 3 Loss: 0.7177
Epoch 4 Loss: 1.1228
Epoch 4 Loss: 1.1228
Epoch 5 Loss: 0.3621
Epoch 5 Loss: 0.3621
Epoch 6 Loss: 0.1742
Epoch 6 Loss: 0.1742
Epoch 7 Loss: 0.1662
Epoch 7 Loss: 0.1662
Epoch 8 Loss: 0.1060
Epoch 8 Loss: 0.1060
Epoch 9 Loss: 0.0723
Epoch 9 Loss: 0.0723
Epoch 10 Loss: 0.0492
Epoch 10 Loss: 0.0492
Epoch 11 Loss: 0.0327
Epoch 11 Loss: 0.0327
Epoch 12 Loss: 0.0254
Epoch 12 Loss: 0.0254
Epoch 13 Loss: 0.0242
Epoch 13 Loss: 0.0242
Epoch 14 Loss: 0.0264
Epoch 14 Loss: 0.0264
Epoch 15 Loss: 0.0177
Epoch 15 Loss: 0.0177
Epoch 16 Loss: 0.0186
Epoch 16 Loss: 0.0186
Epoch 17 Loss: 0.0207
Epoch 17 Loss: 0.0207
Epoch 18 Loss: 0.0158
Epoch 18 Loss: 0.0158
Epoch 19 Loss: 0.0179
Epoch 19 Loss: 0.0179
Epoch 20 Loss: 0.0174
Epoch 20 Loss: 0.0174
Epoch 21 Loss: 0.0147
Epoch 21 Loss: 0.0147
Epoch 22 Loss: 0.0135
Epoch 22 Loss: 0.0135
Epoch 23 Loss: 0.0125
Epoch 23 Loss: 0.0125
Epoch 24 Loss: 0.0119
Epoch

In [34]:
# Test scenarios for MedDRA PT classification/embedding with cosine distance
import torch.nn as nn
cos = nn.CosineSimilarity(dim=0)

test_cases = [
    ("Headache", "Headache"),  # Exact match
    ("Pain in the head", "Headache"),  # Synonym/paraphrase
    ("Nausee", "Nausea"),  # Misspelling
    ("Complains of chest discomfort", "Chest pain"),  # Short clinical note
    ("Patient feels tired all day", "Fatigue"),  # Unseen phrase, same meaning
    ("Patient is not feeling well", None),  # Ambiguous input
    ("Fever and vomiting observed", ["Pyrexia", "Vomiting"]),  # Multiple symptoms
    ("No complaints", None),  # Negative case
    ("SOB", "Dyspnoea"),  # Abbreviation
    ("Dolor de cabeza", "Headache")  # Different language (if supported)
]

for text, expected in test_cases:
    # Tokenize and get embedding for the test input
    enc = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(enc["input_ids"], enc["attention_mask"])
        if hasattr(output, 'last_hidden_state'):
            emb = (output.last_hidden_state * enc["attention_mask"].unsqueeze(-1)).sum(1)
            emb = emb / enc["attention_mask"].sum(1, keepdim=True)
            emb = emb.squeeze(0)
        else:
            emb = output.mean(dim=1).squeeze(0)
    # Compute cosine similarity and distance to each PT embedding
    similarities = [cos(emb, pt_emb[i]).item() for i in range(len(pt_terms))]
    distances = [1 - s for s in similarities]
    # Find closest PT
    min_idx = similarities.index(max(similarities))
    predicted_pt = pt_terms[min_idx]
    print(f"Input: {text}\nExpected PT: {expected}\nPredicted PT: {predicted_pt}")
    print(f"Cosine similarities: {similarities}")
    print(f"Cosine distances: {distances}\n")

Input: Headache
Expected PT: Headache
Predicted PT: Abdominal pain
Cosine similarities: [0.03715775907039642, 0.03708541765809059, 0.03870367258787155, 0.0318879708647728, 0.037631742656230927, 0.04065663740038872, 0.03974926471710205, 0.04024787247180939, 0.03538265824317932, 0.03604041412472725]
Cosine distances: [0.9628422409296036, 0.9629145823419094, 0.9612963274121284, 0.9681120291352272, 0.9623682573437691, 0.9593433625996113, 0.960250735282898, 0.9597521275281906, 0.9646173417568207, 0.9639595858752728]

Input: Pain in the head
Expected PT: Headache
Predicted PT: Abdominal pain
Cosine similarities: [0.03715788573026657, 0.037085533142089844, 0.038703788071870804, 0.03188808262348175, 0.03763186186552048, 0.04065677151083946, 0.039749398827552795, 0.040247999131679535, 0.035382773727178574, 0.0360405258834362]
Cosine distances: [0.9628421142697334, 0.9629144668579102, 0.9612962119281292, 0.9681119173765182, 0.9623681381344795, 0.9593432284891605, 0.9602506011724472, 0.9597520008

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
# Example data (sentence pairs)
sents1 = ["The cat sits outside","A cat is outdoors" ]
sents2 = ["Dogs are playing in the park", "The stock market crashed"]
labels = torch.tensor([1, 0])  # first pair = similar, second = dissimilar

enc1 = tokenizer(sents1, padding=True, truncation=True, return_tensors="pt")
enc2 = tokenizer(sents2, padding=True, truncation=True, return_tensors="pt")

emb1 = model(enc1["input_ids"], enc1["attention_mask"])
emb2 = model(enc2["input_ids"], enc2["attention_mask"])

loss = loss_fn(emb1, emb2, labels)
loss.backward()
optimizer.step()


In [14]:
# Training loop for multiple epochs
num_epochs = 10  # You can change this value
for epoch in range(num_epochs):
    optimizer.zero_grad()
    emb1 = model(enc1["input_ids"], enc1["attention_mask"])
    emb2 = model(enc2["input_ids"], enc2["attention_mask"])
    loss = loss_fn(emb1, emb2, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

Epoch 1, Loss: 0.0780
Epoch 2, Loss: 0.0447
Epoch 2, Loss: 0.0447
Epoch 3, Loss: 0.0294
Epoch 3, Loss: 0.0294
Epoch 4, Loss: 0.0183
Epoch 4, Loss: 0.0183
Epoch 5, Loss: 0.0126
Epoch 5, Loss: 0.0126
Epoch 6, Loss: 0.0098
Epoch 6, Loss: 0.0098
Epoch 7, Loss: 0.0077
Epoch 7, Loss: 0.0077
Epoch 8, Loss: 0.0059
Epoch 8, Loss: 0.0059
Epoch 9, Loss: 0.0045
Epoch 9, Loss: 0.0045
Epoch 10, Loss: 0.0036
Epoch 10, Loss: 0.0036


In [19]:
# Test the trained model on new sentence pairs
# Example test sentences
new_sents1 = ["outside"]
new_sents2 = ["indoors"]

# Tokenize
new_enc1 = tokenizer(new_sents1, padding=True, truncation=True, return_tensors="pt")
new_enc2 = tokenizer(new_sents2, padding=True, truncation=True, return_tensors="pt")

# Get embeddings (no grad needed for inference)
with torch.no_grad():
    new_emb1 = model(new_enc1["input_ids"], new_enc1["attention_mask"])
    new_emb2 = model(new_enc2["input_ids"], new_enc2["attention_mask"])

# Compute cosine similarity
cos = nn.CosineSimilarity(dim=1)
similarity = cos(new_emb1, new_emb2)
print("Cosine similarity (test):", similarity.item())

Cosine similarity (test): 0.6917881965637207
