In [1]:
import pandas as pd
import numpy as np
import torch

from transformers import AutoTokenizer, AutoModel
from sklearn.decomposition import PCA

import umap
import matplotlib.pyplot as plt
import seaborn as sns

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [2]:
sns.set(rc={'figure.dpi':120, 'figure.figsize':(10,8)})

References:

1. BERT structure: https://medium.com/@dhartidhami/understanding-bert-word-embeddings-7dc4d2ea54ca

2. Bio_ClinicalBERT paper: https://arxiv.org/pdf/1904.03323.pdf

3. tSNE vs. UMAP: https://towardsdatascience.com/tsne-vs-umap-global-structure-4d8045acba17




In [3]:
data = pd.read_csv("Data/subset.csv", index_col = 0)
data.head()

Unnamed: 0,Patient ID,Age,Sex,Official_DX,Rater_DX_Clean,ChatGPT_Clean_1,ChatGPT_Clean_2,ChatGPT_Clean_3
0,1,35,Female,Cold,Allergies,Allergies,Allergies,Allergies
1,2,20,Male,Allergies,Allergies,Allergies,Allergies,Allergies
2,3,31,Male,Depression,Depression,Depression,Depression,Depression
3,4,61,Male,Heart Disease,Heart Failure,Congestive Heart Failure,Congestive Heart Failure,Congestive Heart Failure
4,5,34,Female,Depression,Depression,Depression,Depression,Depression


### Cleaning

In [7]:
Official_DX = data.Official_DX.tolist()
Rater_DX_Clean = data.Rater_DX_Clean.tolist()
ChatGPT_Clean_1 = data.ChatGPT_Clean_1.tolist()
ChatGPT_Clean_2 = data.ChatGPT_Clean_2.tolist()
ChatGPT_Clean_3 = data.ChatGPT_Clean_3.tolist()

# make to lowercase
Official_DX = [x.lower() for x in Official_DX]
Rater_DX_Clean = [x.lower() for x in Rater_DX_Clean]
ChatGPT_Clean_1 = [x.lower() for x in ChatGPT_Clean_1]
ChatGPT_Clean_2 = [x.lower() for x in ChatGPT_Clean_2]
ChatGPT_Clean_3 = [x.lower() for x in ChatGPT_Clean_3]

# add back to df
data.loc[:, 'Official_DX'] = Official_DX
data.loc[:, 'Rater_DX_Clean'] = Rater_DX_Clean
data.loc[:, 'ChatGPT_Clean_1'] = ChatGPT_Clean_1
data.loc[:, 'ChatGPT_Clean_2'] = ChatGPT_Clean_2
data.loc[:, 'ChatGPT_Clean_3'] = ChatGPT_Clean_3

data.head()

Unnamed: 0,Patient ID,Age,Sex,Official_DX,Rater_DX_Clean,ChatGPT_Clean_1,ChatGPT_Clean_2,ChatGPT_Clean_3
0,1,35,Female,cold,allergies,allergies,allergies,allergies
1,2,20,Male,allergies,allergies,allergies,allergies,allergies
2,3,31,Male,depression,depression,depression,depression,depression
3,4,61,Male,heart disease,heart failure,congestive heart failure,congestive heart failure,congestive heart failure
4,5,34,Female,depression,depression,depression,depression,depression


### Word Embedding

In [6]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [15]:
# get unique diagnosis from all columns
allList = []

allList.extend(Official_DX)
allList.extend(Rater_DX_Clean)
allList.extend(ChatGPT_Clean_1)
allList.extend(ChatGPT_Clean_2)
allList.extend(ChatGPT_Clean_3)

unique_list = list(set(allList))

In [22]:
# create a df
vector = pd.DataFrame(unique_list, columns = ['Diagnosis'])
vector

Unnamed: 0,Diagnosis
0,heart disease
1,congestive heart failure
2,prostate issues
3,asthma
4,epistaxis
5,anemia
6,cluster headache
7,arrhythmia
8,breast cancer
9,hypoglycemia


In [23]:
# create a function to generate vectors
def getVec(word):
    
    # Tokenize the text and convert to IDs
    tokens = tokenizer(word, return_tensors="pt")
    input_ids = tokens["input_ids"]
    attention_mask = tokens["attention_mask"]

    # Pass the input through the model to get embeddings
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Extract embeddings from the output
    word_embeddings = outputs.last_hidden_state

    # Get the embedding for the first token (CLS token) representing the entire sentence
    # this step is useful when the input is a sentence
    word_embeddings = word_embeddings[:, 0, :]

    # Convert to a numpy array
    word_embeddings = word_embeddings.numpy()

    return word_embeddings[0]

In [32]:
vec = vector.Diagnosis.apply(lambda x: getVec(x))

vecs = pd.DataFrame(vec.tolist())

### PCA

In [35]:
pca = PCA(n_components=2)
unique_pca = pd.DataFrame(pca.fit_transform(vecs))

In [42]:
vectors_pca = pd.concat([vector, unique_pca], axis=1)
vectors_pca.columns = ["Official_DX", "PC1", "PC2"]
vectors_pca

Unnamed: 0,Official_DX,PC1,PC2
0,heart disease,-0.790029,-2.599319
1,congestive heart failure,-1.55011,-2.174083
2,prostate issues,-0.298624,0.920636
3,asthma,-1.605635,0.445932
4,epistaxis,1.244438,-0.419498
5,anemia,-0.590427,1.329146
6,cluster headache,-0.034033,1.197486
7,arrhythmia,1.641334,-1.378182
8,breast cancer,-2.714422,-0.768046
9,hypoglycemia,2.074995,-1.417682


In [49]:
#vectors_pca.to_csv("Data/vectors_pca.csv")

### UMAP

In [46]:
reducer = umap.UMAP(n_components=2)
embedding = pd.DataFrame(reducer.fit_transform(vecs).tolist())

In [48]:
vectors_umap = pd.concat([vector, embedding], axis=1)
vectors_umap.columns = ["Official_DX", "D1", "D2"]
vectors_umap

Unnamed: 0,Official_DX,D1,D2
0,heart disease,-4.776361,6.927
1,congestive heart failure,-4.72383,7.444934
2,prostate issues,-5.675931,7.21198
3,asthma,-6.01196,8.436414
4,epistaxis,-6.077126,5.50581
5,anemia,-7.058075,7.254637
6,cluster headache,-6.708802,6.711996
7,arrhythmia,-5.489654,5.228036
8,breast cancer,-4.890868,8.036383
9,hypoglycemia,-5.257232,5.403402


In [50]:
#vectors_umap.to_csv("Data/vectors_umap.csv")