In [1]:
from transformers import AutoTokenizer, AutoModel , AutoModelForTokenClassification, pipeline , TFAutoModelForSequenceClassification ,TFAutoModelForSeq2SeqLM
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

In [2]:
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Input sentence
text = "I have sore-throat and fever"

# Run NER
entities = nlp(text)

Device set to use cpu


In [3]:
for entity in entities:
    print(f"Word: {entity['word']}, Entity: {entity['entity_group']}")

Word: sore, Entity: Sign_symptom
Word: throat, Entity: Biological_structure
Word: fever, Entity: Sign_symptom


In [4]:
symptoms_user = [ent['word'] for ent in entities if ent['entity_group'] == "Sign_symptom" or ent['entity_group'] == "Biological_structure"]

print("Extracted symptoms:", symptoms_user)

Extracted symptoms: ['sore', 'throat', 'fever']


In [5]:
import json
with open('symptoms.json', 'r') as file:
    data = json.load(file)

In [6]:
def get_word (words):
    new = []
    for word in words :
        for w in word.split('_'):
            new.append(w)
    return new

In [7]:
diseases = {}
for disease in data['diseases'] :
    symptom_list = get_word(disease['symptoms'])
    if all(item in symptom_list  for item in symptoms_user):
        diseases[disease['disease_name']] = disease['symptoms']
print (diseases)

{'influenza': ['fever', 'cough', 'sore_throat', 'runny_or_stuffy_nose', 'muscle_aches', 'headache', 'fatigue', 'chills', 'sweats'], 'common_cold': ['runny_or_stuffy_nose', 'sneezing', 'sore_throat', 'cough', 'mild_fatigue', 'low-grade_fever', 'headache'], 'covid_19': ['fever', 'cough', 'shortness_of_breath', 'fatigue', 'muscle_or_body_aches', 'headache', 'new_loss_of_taste_or_smell', 'sore_throat', 'congestion_or_runny_nose', 'nausea_or_vomiting', 'diarrhea']}


In [27]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = TFAutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")

def get_general_response(prompt):
    inputs = tokenizer(prompt, return_tensors="tf", max_length=512, truncation=True, padding=True)
    outputs = model.generate(inputs["input_ids"], max_length=150, num_beams=5, early_stopping=True)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [33]:
prompt = 'user says : Hi my name is Mehrdad what is your name ?'
get_general_response(prompt)

'Mehrdad'

In [34]:
symptom_to_diseases = {}
for entry in data["diseases"]:
    disease = entry["disease_name"]
    for symptom in entry["symptoms"]:
        symptom_to_diseases.setdefault(symptom, set()).add(disease)

In [35]:
symptom_to_diseases

{'itching': {'allergy',
  'chronic_cholestasis',
  'drug_reaction',
  'fungal_infection',
  'hepatitis_a',
  'hepatitis_b',
  'hepatitis_c',
  'impetigo',
  'jaundice',
  'liver_cirrhosis',
  'psoriasis'},
 'skin_rash': {'aids',
  'allergy',
  'drug_reaction',
  'fungal_infection',
  'impetigo',
  'lupus',
  'measles'},
 'nodular_lesions': {'fungal_infection'},
 'skin_eruptions': {'fungal_infection'},
 'discolored_patches': {'fungal_infection'},
 'scaling': {'fungal_infection'},
 'redness': {'fungal_infection', 'gout'},
 'cracked_skin': {'fungal_infection'},
 'peeling_skin': {'fungal_infection'},
 'sneezing': {'allergy', 'common_cold'},
 'runny_nose': {'allergy', 'measles'},
 'nasal_congestion': {'allergy'},
 'itchy_eyes': {'allergy'},
 'watery_eyes': {'allergy'},
 'hives': {'allergy', 'drug_reaction'},
 'swelling': {'allergy', 'drug_reaction', 'gout'},
 'shortness_of_breath': {'aids',
  'allergy',
  'anemia',
  'arrhythmia',
  'bronchial_asthma',
  'chronic_obstructive_pulmonary_disea

In [68]:
examples = []
for symptom, diseases in symptom_to_diseases.items():
        example = {
            "input": f"Diagnose the disease based on symptoms: \n Symptoms: {symptom.replace('_', ' ')}",
            "output": "Possible diseases: " + ", ".join(sorted(diseases))
        }
        examples.append(example)

In [69]:
examples

[{'input': 'Diagnose the disease based on symptoms: \n Symptoms: itching',
  'output': 'Possible diseases: allergy, chronic_cholestasis, drug_reaction, fungal_infection, hepatitis_a, hepatitis_b, hepatitis_c, impetigo, jaundice, liver_cirrhosis, psoriasis'},
 {'input': 'Diagnose the disease based on symptoms: \n Symptoms: skin rash',
  'output': 'Possible diseases: aids, allergy, drug_reaction, fungal_infection, impetigo, lupus, measles'},
 {'input': 'Diagnose the disease based on symptoms: \n Symptoms: nodular lesions',
  'output': 'Possible diseases: fungal_infection'},
 {'input': 'Diagnose the disease based on symptoms: \n Symptoms: skin eruptions',
  'output': 'Possible diseases: fungal_infection'},
 {'input': 'Diagnose the disease based on symptoms: \n Symptoms: discolored patches',
  'output': 'Possible diseases: fungal_infection'},
 {'input': 'Diagnose the disease based on symptoms: \n Symptoms: scaling',
  'output': 'Possible diseases: fungal_infection'},
 {'input': 'Diagnose t

In [57]:
def preprocess_data(examples):
    inputs = examples["input"]
    targets = examples["output"]

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]

    # For decoder inputs
    decoder_inputs = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    model_inputs["decoder_input_ids"] = decoder_inputs["input_ids"]

    return model_inputs

In [58]:
from sklearn.model_selection import train_test_split

# Split into train (80%) and temp (20%)
train_examples, temp_examples = train_test_split(examples, test_size=0.2, random_state=42)

# Split temp into validation (10%) and test (10%)
val_examples, test_examples = train_test_split(temp_examples, test_size=0.5, random_state=42)


In [60]:
import tensorflow as tf

from datasets import Dataset

train_dataset = Dataset.from_list(train_examples)
val_dataset = Dataset.from_list(val_examples)

train_dataset = train_dataset.map(preprocess_data, batched=True)
val_dataset = val_dataset.map(preprocess_data, batched=True)


train_dataset = train_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'decoder_input_ids'],
    label_cols=['labels'],
    shuffle=True,
    batch_size=8,
    collate_fn=None
)
val_dataset = val_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'decoder_input_ids'],
    label_cols=['labels'],
    shuffle=True,
    batch_size=8,
    collate_fn=None
)

Map:   0%|          | 0/251 [00:00<?, ? examples/s]

Map:   0%|          | 0/31 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [66]:
test_dataset = Dataset.from_list(test_examples)

test_dataset = test_dataset.map(preprocess_data, batched=True)

test_dataset = test_dataset.to_tf_dataset(
    columns=['input_ids', 'attention_mask', 'decoder_input_ids'],
    label_cols=['labels'],
    shuffle=True,
    batch_size=8,
    collate_fn=None
)

Map:   0%|          | 0/32 [00:00<?, ? examples/s]

In [62]:
model.summary()

Model: "tft5_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  32899072  
                                                                 
 encoder (TFT5MainLayer)     multiple                  341231104 
                                                                 
 decoder (TFT5MainLayer)     multiple                  441918976 
                                                                 
 lm_head (Dense)             multiple                  32899072  
                                                                 
Total params: 783150080 (-1162366976.00 Byte)
Trainable params: 783150080 (-1162366976.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [63]:
# Freeze the LLM layer
model.get_layer("shared").trainable = False
model.get_layer("encoder").trainable = False
model.get_layer("decoder").trainable = False

In [64]:
model.summary()

Model: "tft5_for_conditional_generation_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 shared (Embedding)          multiple                  32899072  
                                                                 
 encoder (TFT5MainLayer)     multiple                  341231104 
                                                                 
 decoder (TFT5MainLayer)     multiple                  441918976 
                                                                 
 lm_head (Dense)             multiple                  32899072  
                                                                 
Total params: 783150080 (-1162366976.00 Byte)
Trainable params: 32899072 (125.50 MB)
Non-trainable params: 750251008 (-1293963264.00 Byte)
_________________________________________________________________


In [65]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

# Train the model
model.fit(train_dataset, validation_data=val_dataset, epochs=3)


Epoch 1/3

Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x2036bf445f0>

In [67]:
def diagnose(inputs):
    outputs = model.generate(inputs[0]["input_ids"], max_length=128, num_beams=4, early_stopping=True)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


for batch in test_dataset.take(5):
    diagnosed = diagnose(batch)
    print(f"Input: {tokenizer.decode(batch[0]['input_ids'][0], skip_special_tokens=True)}")
    print(f"Reference: {tokenizer.decode(batch[1][0], skip_special_tokens=True)}")
    print(f"diagnosed: {diagnosed}")
    print()


Input: Symptoms: radiating pain in arms or hands
Reference: Possible diseases: cervical_spondylosis
diagnosed: swollen glands

Input: Symptoms: tremors
Reference: Possible diseases: hyperthyroidism, parkinsons_disease
diagnosed: tremors

Input: Symptoms: burning sensation during urination
Reference: Possible diseases: urinary_tract_infection
diagnosed: urinary tract infection

Input: Symptoms: muscle and joint pain
Reference: Possible diseases: dengue, hepatitis_a
diagnosed: spondylosis

