In [1]:
import pandas as pd
df=pd.read_csv("symptoms_df.csv")

In [3]:
df['text'] = df[['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4']].fillna('').agg(', '.join, axis=1)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,text
0,0,Fungal infection,itching,skin_rash,nodal_skin_eruptions,dischromic _patches,"itching, skin_rash, nodal_skin_eruptions, d..."
1,1,Fungal infection,skin_rash,nodal_skin_eruptions,dischromic _patches,,"skin_rash, nodal_skin_eruptions, dischromic..."
2,2,Fungal infection,itching,nodal_skin_eruptions,dischromic _patches,,"itching, nodal_skin_eruptions, dischromic _p..."
3,3,Fungal infection,itching,skin_rash,dischromic _patches,,"itching, skin_rash, dischromic _patches,"
4,4,Fungal infection,itching,skin_rash,nodal_skin_eruptions,,"itching, skin_rash, nodal_skin_eruptions,"


In [6]:
df_clean = df[['text', 'Disease']].copy()


In [8]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df_clean['label'] = label_encoder.fit_transform(df_clean['Disease'])


In [13]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df_clean['text'].tolist(),
    df_clean['label'].tolist(),
    test_size=0.2,
    stratify=df_clean['label'],
    random_state=42
)

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [16]:
import torch

class SymptomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} | {
            "labels": torch.tensor(self.labels[idx])
        }
    def __len__(self):
        return len(self.labels)

train_dataset = SymptomDataset(train_encodings, train_labels)
val_dataset = SymptomDataset(val_encodings, val_labels)


In [18]:
from transformers import AutoModelForSequenceClassification

num_labels = len(df_clean['label'].unique())

model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=num_labels
)


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


In [22]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mapkoundinya[0m ([33mapkoundinya-ramaiah-institute-of-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.354669,0.988821,0.988799
2,1.735300,0.048947,0.996951,0.99695
3,0.188200,0.028418,0.996951,0.99695
4,0.057200,0.025274,0.996951,0.99695
5,0.041800,0.024016,0.996951,0.99695


TrainOutput(global_step=2460, training_loss=0.4189549128214518, metrics={'train_runtime': 287.1796, 'train_samples_per_second': 68.529, 'train_steps_per_second': 8.566, 'total_flos': 303506175830400.0, 'train_loss': 0.4189549128214518, 'epoch': 5.0})

In [23]:
model.save_pretrained("./symptom_diagnosis_model")
tokenizer.save_pretrained("./symptom_diagnosis_model")

('./symptom_diagnosis_model/tokenizer_config.json',
 './symptom_diagnosis_model/special_tokens_map.json',
 './symptom_diagnosis_model/vocab.txt',
 './symptom_diagnosis_model/added_tokens.json',
 './symptom_diagnosis_model/tokenizer.json')

In [25]:
from transformers import pipeline

diagnosis_pipeline = pipeline("text-classification", model="./symptom_diagnosis_model", tokenizer="./symptom_diagnosis_model")

sample = "headache, fever, cough"
result = diagnosis_pipeline(sample)

# Convert predicted label index to disease name
predicted_class = int(result[0]['label'].split('_')[-1])
predicted_disease = label_encoder.inverse_transform([predicted_class])[0]
print("Predicted Disease:", predicted_disease)


Device set to use cuda:0


Predicted Disease: Pneumonia


In [32]:
sample = "persistent fatigue, weight loss, restlessness"
result = diagnosis_pipeline(sample)
predicted_class = int(result[0]['label'].split('_')[-1])
predicted_disease = label_encoder.inverse_transform([predicted_class])[0]
print("🩺 Predicted Disease:", predicted_disease)


🩺 Predicted Disease: Diabetes 
