In [70]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Function
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, TensorDataset, random_split
import torch.optim as optim
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score

extracted_cui = "data/synthetic_data_v2/source_train/findings_final_0814_seed1591536269_size10000_I.csv" # 1. 0814_seed-1494714102_size10000_I 2. 0814_seed1591536269_size10000_I 3.0814_seed2132231585_size10000_I
extracted_df = pd.read_csv(extracted_cui)
# l=extracted_df.columns.to_list()
# l.remove('agegroup')

wiki_path = "/Users/yuhe/CNN_package/DTL_CNN/data/Cui_Yuelyu/Interested_CUI_wiki.csv"
wiki_df = pd.read_csv(wiki_path,encoding='latin-1')
wiki_df=wiki_df[~wiki_df['Title'].isna()]
cuis = wiki_df.CUI.values.tolist()
titles = wiki_df.Title.values.tolist()
cui_dict = {}

extracted_df=extracted_df.loc[:,wiki_df['CUI'].to_list()+['I']]

for idx, (c, t) in enumerate(zip(cuis, titles)):
    # print(idx, c, t)
    if type(t) is float:
        print(idx)
    cui_dict[c] = {"pos":t,"neg":"Not " + t}

cui_dict

{'C0006271': {'pos': 'Bronchiolitis', 'neg': 'Not Bronchiolitis'},
 'C0231528': {'pos': 'Myalgia', 'neg': 'Not Myalgia'},
 'C0476273': {'pos': 'Respiratory distress',
  'neg': 'Not Respiratory distress'},
 'C0018681': {'pos': 'Headache', 'neg': 'Not Headache'},
 'C0013404': {'pos': 'Shortness of Breath', 'neg': 'Not Shortness of Breath'},
 'C0242429': {'pos': 'Sore throat', 'neg': 'Not Sore throat'},
 'C0700292': {'pos': 'Hypoxemia', 'neg': 'Not Hypoxemia'},
 'C0027497': {'pos': 'Nausea', 'neg': 'Not Nausea'},
 'C0027424': {'pos': 'Nasal congestion', 'neg': 'Not Nasal congestion'},
 'C0085593': {'pos': 'Chills', 'neg': 'Not Chills'},
 'C0043144': {'pos': 'wheezing', 'neg': 'Not wheezing'},
 'C0003862': {'pos': 'Arthralgia', 'neg': 'Not Arthralgia'},
 'C0000737': {'pos': 'Abdominal Pain', 'neg': 'Not Abdominal Pain'},
 'C0038450': {'pos': 'Stridor', 'neg': 'Not Stridor'},
 'C0010380': {'pos': 'Croup', 'neg': 'Not Croup'},
 'C0035508': {'pos': 'Rhonchi', 'neg': 'Not Rhonchi'},
 'C0576456

In [41]:
# NegBERT option replace the name bvanaken/clinical-assertion-negation-bert
# 1. bvanaken/clinical-assertion-negation-bert 2. /ihome/hdaqing/yuj49/YeyeProject/engg-ai-research/uncertainity_supertuning/pretrained_models/NegBioBERT 3. NegPubMedBERT
# model_path = "/ihome/hdaqing/yuj49/YeyeProject/engg-ai-research/uncertainity_supertuning/pretrained_models/NegPubMedBERT"


model_path = "bvanaken/clinical-assertion-negation-bert"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModel.from_pretrained(model_path)


def get_embedding(text):
    """Get BERT embedding for a given text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()   # Embedding of the [CLS] token, shape: (1, 768)

# store every cui and embeddings
new_cui_dict = {}
for cui, item in cui_dict.items():
    new_cui_dict[cui] = {"pos": get_embedding(item["pos"]), "neg": get_embedding(item["neg"])} # 1, 768
    # print(new_cui_dict[cui]["pos"].shape)
def convert_row_to_embedding(row, tokenizer=None, model=None):
    # very CUI will concat  CUI [1, 768] -> N CUIs ->[N, 768]
    embeddings = []
    # print("row in convert_row_to_embedding", row) # 41
    for cui, value in row.items():
        if value == "P":
            embedding = new_cui_dict[cui]["pos"]
        else:
            embedding = new_cui_dict[cui]["neg"]
        embeddings.append(embedding)
    # # Combine all embeddings for the row
    combined_embedding = np.concatenate(embeddings, axis=0)
    return combined_embedding

embeddings_list = []
labels_list = []
for idx, row in extracted_df.iterrows():
    embedding = convert_row_to_embedding(row.drop(labels=[extracted_df.columns[-1]]), tokenizer, model)
    # label = row[extracted_df.columns[-1]]
    label = torch.tensor([row[extracted_df.columns[-1]]], dtype=torch.long)  # Convert label to tensor
    embeddings_list.append(embedding)
    labels_list.append(label)

# Convert embeddings_list and labels_list to tensors
embeddings_tensor = torch.tensor(embeddings_list)
labels_tensor = torch.cat(labels_list) # torch.Size([10000, 41, 768])
print(embeddings_tensor.shape) # torch.Size([10000])
print(labels_tensor.shape)
dataset = TensorDataset(embeddings_tensor, labels_tensor)


Some weights of the model checkpoint at bvanaken/clinical-assertion-negation-bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


torch.Size([10000, 41, 768])
torch.Size([10000])


  embeddings_tensor = torch.tensor(embeddings_list)


In [62]:
new_cui_dict

{'C0006271': {'pos': array([[ 0.32993543,  0.9031338 , -0.6513989 , -0.03903172, -0.4228879 ,
          -0.8656714 ,  0.19032885,  0.41512215,  0.8560385 , -0.7461088 ,
          -0.57265794,  0.6854115 , -2.3716261 ,  0.5671887 , -0.699201  ,
          -0.7988378 ,  0.22226568, -0.02172802,  0.1850115 , -0.4672821 ,
          -1.4504993 , -0.40983486, -0.42999595, -0.07692841, -0.16536194,
          -0.2900297 ,  1.4548025 , -0.2632537 , -0.13078193, -0.4561234 ,
          -0.90188146,  0.47660756, -0.4420316 ,  0.27050707, -0.6362811 ,
           0.6265931 ,  0.140389  ,  0.62221754,  0.15601267,  0.45531875,
          -0.44294438,  0.9574368 ,  0.8182142 ,  0.3272926 ,  0.55739844,
          -1.6029801 , -0.07551879,  0.6851659 , -0.90473974, -0.50113803,
           0.8210652 ,  1.2514439 ,  0.7012783 , -0.1548883 ,  0.14177132,
          -0.5123054 , -1.1302474 , -0.29364282,  0.01090539,  1.280652  ,
           0.457079  , -0.99388915,  0.7271224 ,  0.6260233 , -0.2427218 ,
      

In [69]:
header=[]
for i in range(768):
    header.append('V'+str(i+1))

new_cui_dict={}
for cui, item in cui_dict.items():
    new_cui_dict[cui+'_P'] = get_embedding(item["pos"])[0]
    new_cui_dict[cui+'_A'] = get_embedding(item["neg"])[0] # 1, 768
new_cui_dict
pd.DataFrame.from_dict(new_cui_dict, orient='index', columns=header).to_csv('NegBert_cui_embedding.csv')

In [42]:
# CNN Model definition
# class CNN_NLP(nn.Module):
#     def __init__(self, embed_dim=768, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100], num_classes=2, dropout=0.5):
#         super(CNN_NLP, self).__init__()
#         self.embedding = nn.Identity()
#         self.conv1d_list = nn.ModuleList([
#             # nn.Conv1d(in_channels=embed_dim, out_channels=num_filters[i], kernel_size=filter_sizes[i])
#             nn.Conv1d(in_channels=1, out_channels=num_filters[i], kernel_size=filter_sizes[i])
#             for i in range(len(filter_sizes))
#         ])
#         self.fc = nn.Linear(np.sum(num_filters), num_classes)
#         self.dropout = nn.Dropout(p=dropout)

#     def forward(self, embeddings):
#         # print("*"*10)
#         # print(embeddings.shape) # torch.Size([41, 1, 768])
#         x_reshaped = embeddings.permute(0, 2, 1) # 41, 768, 1
#         x_conv_list = [F.relu(conv1d(embeddings)) for conv1d in self.conv1d_list]
#         x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
#         x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
#         logits = self.fc(self.dropout(x_fc))
#         return logits


class CNN_NLP(nn.Module):
    def __init__(self, embed_dim=768, filter_sizes=[3, 4, 5], num_filters=[100, 100, 100], num_classes=1, dropout=0.5):
        super(CNN_NLP, self).__init__()
        self.embedding = nn.Identity()
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=embed_dim, out_channels=num_filters[i], kernel_size=filter_sizes[i], padding=filter_sizes[i] // 2)  # 确保维度匹配
            for i in range(len(filter_sizes))
        ])
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, embeddings):
        x_reshaped = embeddings.permute(0, 2, 1)  # 应为 [batch_size, embed_dim, sequence_length]
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
        x_fc = self.dropout(x_fc)
        logits = self.fc(x_fc)
        probs = torch.sigmoid(logits)  # 使用 Sigmoid 激活函数
        return probs


# DANN Model definition
class DANN(nn.Module):
    def __init__(self, label_classifier, domain_classifier):
        super(DANN, self).__init__()
        # self.feature_extractor = feature_extractor
        self.label_classifier = label_classifier
        self.domain_classifier = domain_classifier

    def forward(self, embeddings, alpha):
        # feature = self.feature_extractor(input_data)
        # feature = feature.view(feature.size(0), -1)
        # Assume embeddings are the output of the feature extractor.
        embedding_tensor = torch.from_numpy(embedding).float() 
        embedding_tensor = embedding_tensor.unsqueeze(1) 
        reverse_embedding = ReverseLayerF.apply(embedding_tensor, alpha)
        class_output = self.label_classifier(embedding_tensor) 
        domain_output = self.domain_classifier(reverse_embedding)
        domain_probs = torch.sigmoid(domain_output)
        return class_output, domain_output

class ReverseLayerF(Function):
    @staticmethod
    def forward(ctx, x, alpha):
        ctx.alpha = alpha
        return x.view_as(x)

    @staticmethod
    def backward(ctx, grad_output):
        output = grad_output.neg() * ctx.alpha
        return output, None

# Feature Extractor (Your existing model can be used here)
feature_extractor = AutoModel.from_pretrained(model_path)

# Label Classifier (Your existing CNN_NLP class can be used here)
label_classifier = CNN_NLP()
# Domain Classifier (Similar structure to Label Classifier but with one output)
domain_classifier = CNN_NLP(num_classes=1)
# DANN model without the feature extractor part
dann_model = DANN(label_classifier, domain_classifier)

train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=41, shuffle=True, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=41, drop_last=True)

# Training loop with domain adaptation
optimizer = optim.Adam(dann_model.parameters(), lr=0.001)
epochs = 5
alpha = 0.1  # You can vary this

Some weights of the model checkpoint at bvanaken/clinical-assertion-negation-bert were not used when initializing BertModel: ['classifier.weight', 'classifier.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [43]:
for epoch in range(epochs):
    dann_model.train()
    for batch in train_loader:
        embeddings, labels = batch # embeddings ([41, 768]) 
        optimizer.zero_grad()
        # print("labels.shape",labels.shape) # labels.shape torch.Size([41])
        # Forward pass for label prediction
        class_output, domain_output = dann_model(embeddings, alpha) #41 -> 5   drop the last layer
        labels = labels.unsqueeze(1).float() 
        label_loss = F.binary_cross_entropy_with_logits(class_output, labels) 
        
        # define the domain labels
        batch_size = embeddings.size(0)  # assume we have the 32 batch size 
        src_domain_size = 16  # assume source have 16 
        tgt_domain_size = batch_size - src_domain_size  # 
      
        src_domain_labels = torch.zeros(src_domain_size, dtype=torch.float)  # binary_cross_entropy_with_logits 需要 float 类型的目标标签
        tgt_domain_labels = torch.ones(tgt_domain_size, dtype=torch.float)

        domain_labels = torch.cat((src_domain_labels, tgt_domain_labels), 0)
        domain_labels = domain_labels.unsqueeze(1)
         
        domain_loss = F.binary_cross_entropy_with_logits(domain_output, domain_labels)
        
        # Total loss
        loss = label_loss + domain_loss
        loss.backward()
        optimizer.step()


In [44]:
# After training
torch.save(dann_model.state_dict(), 'dann_model_state.pth')


In [45]:
# Before testing
dann_model = DANN(label_classifier, domain_classifier)  # make sure define the model 
dann_model.load_state_dict(torch.load('dann_model_state.pth'))


<All keys matched successfully>

In [46]:
correct = 0
total = 0
labels_all = []
predictions_all = []
probabilities_all = []
test_loader = DataLoader(test_dataset, batch_size=41, drop_last=True)

# dann_model.eval()
# with torch.no_grad():
#     for batch in test_loader:
#         data, labels = batch
#         class_output, domain_output = dann_model(data, alpha) 
#         probabilities = F.softmax(class_output, dim=1)[:, 1] 
#         # probabilities = F.softmax(outputs, dim=1)[:, 1]
#         _, predicted = torch.max(class_output.data, 1)
#         total += labels.size(0)
#         # correct += (predicted == labels).sum().item()
#         labels_all.extend(labels.tolist())
#         predictions_all.extend(predicted.tolist())
#         probabilities_all.extend(probabilities.tolist())  # Append the probabilities instead of the predictions
dann_model.eval()
with torch.no_grad():
    for batch in test_loader:
        data, labels = batch
        class_output, _ = dann_model(data, alpha) 
        probabilities = torch.sigmoid(class_output)[:, 0]  # Use sigmoid for binary classification
        predicted = (probabilities > 0.5).long()  # Convert probabilities to binary predictions
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        labels_all.extend(labels.tolist())
        predictions_all.extend(predicted.tolist())
        probabilities_all.extend(probabilities.tolist())  # Append the probabilities instead of the predictions




In [47]:
print(len(labels_all))
print(labels_all)
print(len(predictions_all))
print(predictions_all)


1968
[0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0

In [48]:
# compute the confuse matrix
tn, fp, fn, tp = confusion_matrix(labels_all, predictions_all).ravel()

# Precision, Recall, F1 score and FPR
precision = precision_score(labels_all, predictions_all)
recall = recall_score(labels_all, predictions_all)
f1 = f1_score(labels_all, predictions_all)
fpr = fp / (fp + tn)

# Convert all lists to numpy arrays for compatibility with roc_auc_score
labels_array = np.array(labels_all)
probabilities_array = np.array(probabilities_all)
# Calculate AUROC
auroc = roc_auc_score(labels_array, probabilities_array)


# print(f'Accuracy: {correct / total:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1 Score: {f1:.2f}')
print(f'False Positive Rate (FPR): {fpr:.2f}')
print(f'AUROC: {auroc:.2f}')

Precision: 0.24
Recall: 0.15
F1 Score: 0.19
False Positive Rate (FPR): 0.18
AUROC: 0.49
