In [2]:
import numpy as np
import pandas as pd
df = pd.read_csv('/content/data_for_healthcare_chatbot.csv')
df

Unnamed: 0,query,response
0,What is depression?,Depression is a mental health disorder marked ...
1,Is depression serious or dangerous?,"Yes, depression can be serious and may lead to..."
2,What causes depression?,"Depression can be caused by genetics, life eve..."
3,What are common symptoms of depression?,"Symptoms include sadness, fatigue, sleep chang..."
4,How is depression diagnosed?,It is diagnosed by a mental health professiona...
...,...,...
1713,How can I prevent hypothermia?,"Dress in layers, stay dry, and seek shelter in..."
1714,How can I prevent sunstroke during summer?,"Stay hydrated, avoid direct sun, and wear a ha..."
1715,How can I prevent dehydration during illness?,"Drink fluids regularly, use oral rehydration s..."
1716,How can I prevent malnutrition?,"Eat a balanced diet with adequate calories, pr..."


In [3]:
df = df.sample(1718, random_state=42)

In [4]:
df.duplicated().sum()

np.int64(19)

In [5]:
df.drop_duplicates(inplace=True)

In [6]:
df

Unnamed: 0,query,response
599,Is it safe to be pregnant after a heart attack?,Pregnancy after a heart attack requires close ...
1348,I have nail fungus.,"Keep nails dry, use antifungal treatment, and ..."
115,What are emergency warning signs in PTSD?,Suicidal thoughts or self-harm are emergencies...
135,What are emergency warning signs in the common...,"High fever, trouble breathing, or chest pain n..."
339,Is high cholesterol safe during pregnancy?,Discuss with your doctor; some medicines may n...
...,...,...
1130,How can I control menstrual disorders at home?,"Track your cycle, manage stress, and use pain ..."
1294,I have shortness of breath.,"Rest, avoid exertion, and seek emergency care ..."
860,What is vitiligo?,Vitiligo is a condition where patches of skin ...
1459,I have a blister from new shoes.,"Keep area clean, avoid popping, and cover with..."


In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Data Preprocessing

In [8]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(df, test_size=0.15, random_state=42)

train_df.shape, val_df.shape

((1444, 2), (255, 2))

In [9]:
train_df = train_df.reset_index().drop(columns=['index'])
val_df = val_df.reset_index().drop(columns=['index'])

In [10]:
import re

def clean_text(text):
  text = re.sub(r'\r\n',' ', text)
  text = re.sub(r'\s',' ', text)
  text = re.sub(r'<.*?>','', text)
  text = text.strip().lower()

  return text

In [11]:
train_df['query'] = train_df['query'].apply(clean_text)
train_df['response'] = train_df['response'].apply(clean_text)

val_df['query'] = val_df['query'].apply(clean_text)
val_df['response'] = val_df['response'].apply(clean_text)

In [12]:
train_df.head()

Unnamed: 0,query,response
0,how can i control panic attacks at home?,"practice deep breathing, relaxation, and avoid..."
1,how can i prevent frostbite during winter?,"wear warm clothing, cover exposed skin, and li..."
2,how can i prevent cramps during exercise?,"warm up, stretch, and stay hydrated."
3,what is the cost of pneumonia treatment?,costs vary based on severity and treatment set...
4,what medicines are used for anxiety?,doctors may prescribe anti-anxiety medications...


# Tokenization

In [13]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [14]:
def preprocess_function(example):

  inputs = tokenizer(example['query'], padding='max_length', truncation=True, max_length=250)
  outputs = tokenizer(example['response'], padding='max_length', truncation=True, max_length=250)

  inputs['labels'] = outputs['input_ids']
  return inputs

train_dataset = train_df.apply(preprocess_function, axis=1)
val_dataset = val_df.apply(preprocess_function, axis=1)

# Fine Tuning Model

In [15]:
model = T5ForConditionalGeneration.from_pretrained('t5-base')

training_args = TrainingArguments(
    output_dir = './result_chatbot',
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=50,
    save_steps=500,
    eval_steps=50,
    eval_strategy = 'epoch'
)

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [16]:

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mayushkumarverma07120[0m ([33mayushkumarverma07120-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.5744,0.239114
2,0.1712,0.143026
3,0.1447,0.123338
4,0.1272,0.115716
5,0.1175,0.112354
6,0.1139,0.111678


TrainOutput(global_step=1086, training_loss=1.280284520670854, metrics={'train_runtime': 1403.9188, 'train_samples_per_second': 6.171, 'train_steps_per_second': 0.774, 'total_flos': 2576177326080000.0, 'train_loss': 1.280284520670854, 'epoch': 6.0})

# Save Model

In [17]:
trainer.save_model("./healthcare_chatbot")
trainer.save_state()
tokenizer.save_pretrained("./healthcare_chatbot")

('./healthcare_chatbot/tokenizer_config.json',
 './healthcare_chatbot/special_tokens_map.json',
 './healthcare_chatbot/spiece.model',
 './healthcare_chatbot/added_tokens.json')

In [18]:
model.save_pretrained('./model_healthcare_chatbot')
tokenizer.save_pretrained('./model_healthcare_chatbot')

('./model_healthcare_chatbot/tokenizer_config.json',
 './model_healthcare_chatbot/special_tokens_map.json',
 './model_healthcare_chatbot/spiece.model',
 './model_healthcare_chatbot/added_tokens.json')

In [19]:
!zip -r model_healthcare_chatbot.zip model_healthcare_chatbot/

  adding: model_healthcare_chatbot/ (stored 0%)
  adding: model_healthcare_chatbot/spiece.model (deflated 48%)
  adding: model_healthcare_chatbot/added_tokens.json (deflated 83%)
  adding: model_healthcare_chatbot/special_tokens_map.json (deflated 85%)
  adding: model_healthcare_chatbot/model.safetensors (deflated 9%)
  adding: model_healthcare_chatbot/tokenizer_config.json (deflated 94%)
  adding: model_healthcare_chatbot/config.json (deflated 63%)
  adding: model_healthcare_chatbot/generation_config.json (deflated 29%)


In [21]:
from google.colab import files
files.download("model_healthcare_chatbot.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Load model

In [22]:

# Load the model and tokenizer from the saved paths
model = T5ForConditionalGeneration.from_pretrained("./model_healthcare_chatbot")
tokenizer = T5Tokenizer.from_pretrained("./model_healthcare_chatbot")

In [24]:
device = model.device

def chatbot(query):

  query = clean_text(query)
  input_ids = tokenizer(query, return_tensors= 'pt', max_length=250, truncation=True)

  inputs = {key: value.to(device) for key, value in input_ids.items()}

  outputs = model.generate(
      input_ids['input_ids'],
      max_length=250,
      num_beams=5,
      early_stopping=True
  )

  return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [25]:
while True:
  user_input = input('You: ')

  if user_input.lower() == 'exit':
    break

  response = chatbot(user_input)
  print('Chatbot: ',response)

You: I have pain in my eye?
Chatbot:  rest, drink fluids, and see a doctor if pain is severe or persistent.
You: can i work in head pain?
Chatbot:  rest, rest, and see a doctor if pain is severe or persistent.
You: what to do for good health?
Chatbot:  eat a balanced diet, exercise, and avoid smoking.
You: exit
