In [77]:
import pandas as pd
import keras
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
import numpy as np
import plotly.express as px
from sklearn.manifold import TSNE
import tensorflow_hub as hub


In [None]:
# Read the preprocessed keyword-pair table from the Kaggle input directory.
file_path = "/kaggle/input/holmusk/final_medical_terms.csv"
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first ten rows.
data.head(n=10)

# Fine-tune the ClinicalBERT model on keyword pairs

# Note: I am going to load the preprocessed keyword-pair file. You can find the preprocessing details in the preprocessing_data.ipynb file or in the report that I have created.

In [3]:
# Build the training frame: each "note" is the two terms joined by a space,
# and the target is the pair's label. Rows are then shuffled reproducibly.
data_new = pd.DataFrame({
    'notes': data['Term1'] + " " + data['Term2'],
    'category': data['label'],
}).sample(frac=1, random_state=42)
data_new.head()

Unnamed: 0,notes,category
993,Thirsty Prostatism,0
101,Seasickness Carsickness,1
486,Dyspnea Cyanosis,1
862,Chloramphenicol Dyspnea,0
1075,Syphilis Garlic,0


In [45]:
# NOTE(review): this cell looks like a leftover from an out-of-order session.
# It loads a checkpoint that is presumably the one written by the
# `model.save('clinical_bert_fine_tuned_for_notes.h5')` cell further down, and
# `model` is rebuilt from scratch in the next code cell. It can only succeed
# if that file already exists in /kaggle/working -- confirm whether it is
# still needed.
import transformers  # required for the custom_objects mapping; not imported above this cell

model_name = '/kaggle/working/clinical_bert_fine_tuned_for_notes.h5'
model = keras.models.load_model(model_name, custom_objects={"TFBertModel": transformers.TFBertModel})

In [11]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# --- Hyperparameters -------------------------------------------------------
max_len = 512      # upper bound passed to the tokenizer for truncation
batch_size = 10

# --- Pretrained encoder and tokenizer --------------------------------------
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = TFAutoModel.from_pretrained(model_name, from_pt=True)
bert_model.trainable = True  # fine-tune the encoder weights, not just the head

# --- Classification model: BERT pooled output -> single sigmoid unit -------
input_ids = Input(shape=(None,), dtype=tf.int32, name='input_ids')
attention_mask = Input(shape=(None,), dtype=tf.int32, name='attention_mask')
bert_outputs = bert_model(input_ids, attention_mask)
pooled_output = bert_outputs.pooler_output
outputs = Dense(1, activation='sigmoid')(pooled_output)
model = Model(inputs=[input_ids, attention_mask], outputs=outputs)

optimizer = Adam(learning_rate=.000001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# --- Train / held-out split -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    data_new['notes'].values,
    data_new['category'].values,
    test_size=0.2,
    random_state=42,
)

# --- Tokenise both splits (padded to the longest sequence in each split) ---
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_len)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_len)

# --- tf.data pipelines ------------------------------------------------------
train_features = {
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
}
# The shuffle buffer spans the whole training split, i.e. a full shuffle each epoch.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((train_features, y_train))
    .shuffle(len(train_encodings['input_ids']))
    .batch(batch_size)
)

test_features = {
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
}
test_dataset = (
    tf.data.Dataset.from_tensor_slices((test_features, y_test))
    .batch(batch_size)
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [21]:

# Halt training after one epoch without a better `val_loss`, and restore the
# weights from the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

In [13]:
num_epochs = 10  # upper bound; the early-stopping callback may end training sooner
model.fit(
    x=train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
    callbacks=[early_stopping],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.callbacks.History at 0x795c5eff86a0>

# Save the finetuned model above

In [None]:
# Keep only the fine-tuned encoder: re-wire the classifier's inputs straight to
# the BERT layer's output (dropping the Dense head) and save that model.
# The layer is looked up via `bert_model.name` instead of the literal
# 'tf_bert_model' because Keras auto-suffixes layer names ('tf_bert_model_1',
# ...) when more than one TFBertModel has been created in the same kernel,
# which would make the hard-coded lookup fail.
model = Model(inputs=model.inputs, outputs=model.get_layer(bert_model.name).output)
model.save('clinical_bert_fine_tuned_for_medic_keywords2.h5')

# Fine-tune clinical bert model using clinical notes data on category classification task

# Note: I am going to load the preprocessed clinical notes file (`cleaned_noted_with_keyword.csv`). You can find the preprocessing details in the preprocessing_data.ipynb file or in the report that I have created.

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

file_path = "/kaggle/input/holmusk1/cleaned_noted_with_keyword.csv"
data = pd.read_csv(file_path)  # Update with the path to your CSV file

# Replace the category labels with integer ids, then shuffle rows reproducibly.
label_encoder = LabelEncoder()
encoded_categories = label_encoder.fit_transform(data["category"])
data = data.assign(category=encoded_categories).sample(frac=1, random_state=42)

# Class balance after encoding.
data["category"].value_counts()

0    6877
2    5225
1    3529
Name: category, dtype: int64

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from sklearn.model_selection import train_test_split
from transformers import TFAutoModel, AutoTokenizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import f1_score
from imblearn.over_sampling import RandomOverSampler
import numpy as np

# --- Hyperparameters -------------------------------------------------------
max_len = 512      # upper bound passed to the tokenizer for truncation
num_classes = 3
batch_size = 10

# --- Pretrained encoder and tokenizer --------------------------------------
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = TFAutoModel.from_pretrained(model_name, from_pt=True)

bert_model.trainable = True  # fine-tune the encoder weights, not just the head

# --- Classification model: BERT pooled output -> softmax over num_classes ---
input_ids = Input(shape=(None,), name='input_ids', dtype=tf.int32)
attention_mask = Input(shape=(None,), name='attention_mask', dtype=tf.int32)
pooled_output = bert_model(input_ids, attention_mask=attention_mask).pooler_output
outputs = Dense(num_classes, activation='softmax')(pooled_output)
model = Model(inputs=[input_ids, attention_mask], outputs=outputs)

optimizer = Adam(learning_rate=.00001)

model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# --- Train / held-out split -------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(data['notes'], data['category'], test_size=0.2, random_state=42)

# --- Tokenise both splits (padded to the longest sequence in each split) ---
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=max_len)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=max_len)

# --- Oversample minority classes in the training split ----------------------
# The sampler is fit ONCE, on row positions, and the same resampled positions
# are used to gather both token ids and attention masks. Calling an unseeded
# RandomOverSampler separately on each array draws different random rows per
# call, which pairs the token ids of one note with the attention mask of
# another.
labels = np.asarray(y_train)
oversampler = RandomOverSampler(random_state=42)
row_positions = np.arange(len(labels)).reshape(-1, 1)  # sampler expects 2-D X
resampled_positions, y_train = oversampler.fit_resample(row_positions, labels)
resampled_positions = resampled_positions.ravel()

# NOTE: from here on `input_ids` / `attention_mask` hold the resampled training
# arrays (the Keras Input tensors of the same name are already wired into `model`).
input_ids = np.asarray(train_encodings['input_ids'], dtype=np.int32)[resampled_positions]
attention_mask = np.asarray(train_encodings['attention_mask'], dtype=np.int32)[resampled_positions]

# --- tf.data pipelines ------------------------------------------------------
# The oversampled duplicates are appended after the original rows, so a small
# shuffle buffer would leave long single-class runs at the end of every epoch.
# The buffer therefore covers the whole resampled training set.
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': input_ids, 'attention_mask': attention_mask},
    y_train
)).shuffle(len(y_train)).batch(batch_size)

test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']},
    y_test
)).batch(batch_size)


caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already

In [4]:

# Halt training after one epoch without a better `val_loss`, and restore the
# weights from the best epoch seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)

In [5]:
num_epochs = 2  # upper bound; the early-stopping callback may end training sooner
model.fit(
    x=train_dataset,
    validation_data=test_dataset,
    epochs=num_epochs,
    callbacks=[early_stopping],
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7abfe2b440a0>

In [8]:
# Print the layer-by-layer summary of the current `model`.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, None)]       0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, None                                           

# Save trained model

In [9]:
# Keep only the fine-tuned encoder: re-wire the classifier's inputs straight to
# the BERT layer's output (dropping the Dense head) and save that model.
# The layer is looked up via `bert_model.name` instead of the literal
# 'tf_bert_model' because Keras auto-suffixes layer names ('tf_bert_model_1',
# ...) when more than one TFBertModel has been created in the same kernel,
# which would make the hard-coded lookup fail.
model = Model(inputs=model.inputs, outputs=model.get_layer(bert_model.name).output)
model.save('clinical_bert_fine_tuned_for_notes.h5')

# Save model configuration file

In [14]:
import json
# Serialise the current model's config as JSON next to the saved weights.
training_config = model.get_config()
with open('/kaggle/working/config.json', 'w') as f:
    json.dump(obj=training_config, fp=f)

# Predict using the model which we just saved

In [11]:
from tensorflow import keras
import transformers 

# Reload the saved encoder; TFBertModel must be registered as a custom object
# for Keras to deserialise it.
model_name = '/kaggle/input/bert-mdoels/clinical_bert_fine_tuned_for_notes.h5'
model = keras.models.load_model(
    model_name,
    custom_objects={"TFBertModel": transformers.TFBertModel},
)

In [13]:
def get_embedding_from_finetuned_clinical_bert(sentence):
    """Return an attention-mask-weighted mean of the encoder's hidden states.

    Relies on the module-level `tokenizer` and `model`.

    Args:
        sentence: Text passed to the tokenizer (truncated to 512 tokens).

    Returns:
        A tensor equal to the sum of `last_hidden_state` over axis 1 at the
        positions where the attention mask is set, divided by the number of
        such positions (presumably shaped (batch, hidden) -- TODO confirm).
    """
    max_len = 512
    encoded = tokenizer(sentence, truncation=True, padding=True, max_length=max_len, return_tensors="tf")
    ids, attn = encoded['input_ids'], encoded['attention_mask']

    hidden_states = model([ids, attn])['last_hidden_state']

    # Add a trailing axis so the mask broadcasts against the hidden states.
    mask = tf.cast(tf.expand_dims(attn, axis=-1), tf.float32)

    summed = tf.reduce_sum(hidden_states * mask, axis=1)
    counts = tf.reduce_sum(mask, axis=1)
    return summed / counts

# Get embedding for Clinical notes using fine tuned model

In [14]:
# Tokenizer matching the fine-tuned encoder's base checkpoint.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

file_path = "/kaggle/input/bert-mdoels/ClinNotes.csv"
data = pd.read_csv(file_path)  # Update with the path to your CSV file

# One embedding tensor per note; take row 0 of each result as a NumPy vector.
tuned_clinic_bert = data['notes'].apply(get_embedding_from_finetuned_clinical_bert)
tuned_clinic_bert1 = list(map(lambda emb: emb[0].numpy(), tuned_clinic_bert))

embedding_frame = pd.DataFrame(tuned_clinic_bert1)
embedding_frame.to_csv('raw_keyword_tuned_bert_embedding_new.csv')

# Use the ELMo model to get embeddings for the clinical notes

In [16]:
# Load the pre-trained ELMo module (version 3) from TensorFlow Hub.
# `get_elmo_embedding` calls its "default" signature.
elmo = hub.load("https://tfhub.dev/google/elmo/3")

In [17]:
def get_elmo_embedding(sentences):
    """Embed one input with ELMo and average its "elmo" output over axis 1.

    Relies on the module-level `elmo` module.

    Args:
        sentences: Value wrapped in a one-element list before being passed to
            the ELMo "default" signature (the callers here pass a single string).

    Returns:
        The first row of the axis-1 mean of the "elmo" output, as a NumPy array.
    """
    batch = tf.constant([sentences])
    token_embeddings = elmo.signatures["default"](batch)["elmo"]
    pooled = tf.reduce_mean(token_embeddings, axis=1)
    return pooled.numpy()[0]

# Save the embeddings to use later on

In [None]:
# ELMo embedding for every note, then one CSV row per note.
res_elmo = data['notes'].apply(get_elmo_embedding)
elmo_frame = pd.DataFrame(list(res_elmo))
elmo_frame.to_csv("raw_elmo_pretrained_embeddings.csv")

In [19]:
# Writes the same `res_elmo` table to the same CSV path as the cell that
# computes `res_elmo`; re-running it only overwrites that file.
pd.DataFrame(list(res_elmo)).to_csv("raw_elmo_pretrained_embeddings.csv")

In [55]:
def plotEmbeddings(embeddings, categories, model):
    """Project embeddings to 2-D with t-SNE and show a Plotly scatter plot.

    Args:
        embeddings: Sequence of embedding vectors, one per data point.
        categories: One label per data point, used for colour and hover text.
        model: Name of the embedding model, used only in the figure title.
    """
    df = pd.DataFrame(embeddings)
    feature_columns = list(df.columns)  # embedding dimensions only
    df["Data_Point"] = categories

    # Apply t-SNE to the embedding columns (label column excluded).
    projection = TSNE(n_components=2, random_state=42).fit_transform(df[feature_columns])
    df["TSNE_X"] = projection[:, 0]
    df["TSNE_Y"] = projection[:, 1]

    fig = px.scatter(
        df,
        x="TSNE_X",
        y="TSNE_Y",
        hover_data={"Data_Point": True},
        color="Data_Point",
    )
    fig.update_traces(hovertemplate="Data Point: %{customdata[0]}")
    fig.update_layout(title=f"Embeddings Visualization with {model} Embedding size as: {len(embeddings[0])}")
    fig.show()

In [38]:
# Visualise the ELMo note embeddings in 2-D, coloured by `data['category']`.
plotEmbeddings(list(res_elmo), data['category'].values, 'Elmo')

# get embedding for medical keywords and calculate similarity using Elmo

In [19]:
medical_terms = pd.read_csv("/kaggle/input/medical/MedicalConcepts.csv")

# Lower-case both term columns so duplicates differing only in case collapse.
for term_column in ('Term1', 'Term2'):
    medical_terms[term_column] = medical_terms[term_column].apply(lambda term: term.lower())

medical_terms = medical_terms.drop_duplicates(keep='first')
medical_terms.shape

(558, 2)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
def calculate_similarity(row):
    """Return the cosine similarity between `row['term1']` and `row['term2']`.

    Each vector is wrapped in a one-element list so `cosine_similarity`
    receives two single-row inputs; the single entry of the resulting matrix
    is returned.
    """
    first = [row['term1']]
    second = [row['term2']]
    return cosine_similarity(first, second)[0][0]

In [None]:
top = len(medical_terms)  # number of term pairs to score (currently all of them)

# ELMo embedding for each term column, in the order Term1 then Term2.
res1, res2 = (
    medical_terms[column][:top].apply(get_elmo_embedding)
    for column in ("Term1", "Term2")
)

# Row-wise cosine similarity between the two embeddings of every pair.
embedding_pairs = pd.DataFrame({'term1': res1, 'term2': res2})
res = embedding_pairs.apply(calculate_similarity, axis=1)


In [32]:
# Report the pair count, the summed similarity and the mean similarity.
for label, value in (
    ("Total number of pairs =  ", top),
    ("Total cosine_similarity =  ", sum(res)),
    ("mean cosine_similarity =  ", np.mean(res)),
):
    print(label, value)

Total number of pairs =   558
Total cosine_similarity =   308.23337239027023
mean cosine_similarity =   0.5523895
