In [1]:
from datasets import load_dataset

dataset = load_dataset("QuyenAnhDE/Diseases_Symptoms")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset_org = load_dataset("gretelai/symptom_to_diagnosis")

In [3]:
dataset_org

DatasetDict({
    train: Dataset({
        features: ['output_text', 'input_text'],
        num_rows: 853
    })
    test: Dataset({
        features: ['output_text', 'input_text'],
        num_rows: 212
    })
})

In [4]:
#show the number of Name column which is not duplicated
#show that unique element of dataset["train"]['Name'] which is list type
labels= dataset_org["train"]['output_text']

unique_names = set(labels)


In [5]:
len(unique_names)

22

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments'],
        num_rows: 400
    })
})

In [7]:
#show the number of Name column which is not duplicated
#show that unique element of dataset["train"]['Name'] which is list type
labels= dataset["train"]['Name']

unique_names = set(labels)

len(unique_names)

392

In [8]:
dataset = dataset["train"]

In [9]:
dataset[2]

{'Code': 3,
 'Name': 'Turner syndrome',
 'Symptoms': 'Short stature, Gonadal dysgenesis, Webbed neck, Lymphedema',
 'Treatments': 'Growth hormone therapy, Estrogen replacement therapy, Cardiac and renal evaluations'}

In [11]:
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Zabihin/Symptom_to_Diagnosis")

def preprocess_function(examples):
    # Tokenizing the text
    result = tokenizer(examples['Symptoms'], padding='max_length', truncation=True, max_length=512)
    # Assuming "Code" is what you want to predict and it's categorical data
    if 'Code' in examples:
        result["labels"] = [label_to_id[label] for label in examples['Name']]  # Convert labels to numerical IDs
    return result

# Assuming `dataset` is your loaded dataset
# Convert label strings to integers (if they aren't already)
label_list = dataset['Name']  # Adjust based on your actual dataset structure
label_to_id = {label: i for i, label in enumerate(set(label_list))}
id_to_label = {i: label for label, i in label_to_id.items()}

# Map the preprocessing function over the whole dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)



Map: 100%|██████████| 400/400 [00:00<00:00, 3594.70 examples/s]


In [35]:
from transformers import AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from transformers import AutoModel
import torch

# Load the base model
model_base = AutoModelForSequenceClassification.from_pretrained("Zabihin/Symptom_to_Diagnosis",from_tf=True)

# Replace the classifier for PyTorch (assuming the base model is similar to BERT)
model_base.classifier = torch.nn.Linear(model_base.config.hidden_size, 392)


config = AutoConfig.from_pretrained("Zabihin/Symptom_to_Diagnosis")
config.num_labels = 392  # Update the number of labels in the configuration


All TF 2.0 model weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the TF 2.0 model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForSequenceClassification for predictions without further training.


In [36]:
# import torch.nn as nn

# # Assuming the model's classifier is a single linear layer
# # This replaces it with a new linear layer with 392 output features
# model.config.num_labels = len(set(label_list))
# model.classifier = nn.Linear(model.config.hidden_size, 392)


In [37]:
#model.config.num_labels = 392
model = model_base

In [38]:
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [39]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='weighted')
    acc = accuracy_score(labels, predictions)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}


In [40]:
from datasets import DatasetDict

# Split the dataset into training and testing
train_test_split = tokenized_dataset.train_test_split(test_size=0.15)  # 80% training, 20% testing

# Split the test set into validation and test
validation_test_split = train_test_split['test'].train_test_split(test_size=0.5)  # Splits the 20% into two parts of 10% each

# Now, assemble the final tokenized_datasetDict including all splits
split_dataset = DatasetDict({
    'train': train_test_split['train'],
    'validation': validation_test_split['train'],  # Using 'train' here because it's the training part of the split
    'test': validation_test_split['test']
})


In [41]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 340
    })
    validation: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 30
    })
    test: Dataset({
        features: ['Code', 'Name', 'Symptoms', 'Treatments', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 30
    })
})

In [42]:
tokenized_dataset

Dataset({
    features: ['Code', 'Name', 'Symptoms', 'Treatments', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 400
})

In [43]:

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',          # Directory for saving logs and model checkpoints
    evaluation_strategy='epoch',     # Evaluation is done at the end of each epoch
    learning_rate=2e-5,              # Fine-tuning learning rate
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    num_train_epochs=3,              # Total number of training epochs
    weight_decay=0.01,               # Strength of weight decay regularization
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Assuming your dataset has a train split
    eval_dataset=tokenized_dataset,  # Assuming your dataset has a validation split
    compute_metrics=compute_metrics,  # Define your own function to compute metrics
)


In [44]:
# Assuming 'labels' is the field in your dataset containing the target labels
unique_labels = set(tokenized_dataset['Name'])
num_labels = len(unique_labels)
print("Number of unique labels:", num_labels)


Number of unique labels: 392


In [45]:
trainer.train()


  0%|          | 0/150 [01:56<?, ?it/s]
  0%|          | 0/150 [00:00<?, ?it/s]

RuntimeError: shape '[-1, 22]' is invalid for input of size 3136

In [None]:
trainer.evaluate(tokenized_dataset['test'])  # Assuming your dataset has a test split
