In [None]:
import os
import subprocess
import sys

# Install the third-party packages this notebook imports.
# `sys.executable -m pip` targets the interpreter that is running this kernel,
# and `check_call` raises CalledProcessError when pip fails instead of letting
# the notebook continue into a confusing ImportError further down.
# scikit-learn is included because LabelEncoder is imported from sklearn below.
subprocess.check_call(
    [
        sys.executable, "-m", "pip", "install",
        "transformers", "accelerate", "datasets", "peft", "scikit-learn",
    ]
)
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from peft import get_peft_model, LoraConfig
from datasets import load_dataset
from sklearn.preprocessing import LabelEncoder


# --- Data ---------------------------------------------------------------
# Fetch the "en" configuration of the disease database.
dataset = load_dataset("FreedomIntelligence/Disease_Database", "en")

# --- Tokenizer ----------------------------------------------------------
# Checkpoint identifier; the tokenizer is loaded from it here.
model_name = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# --- Label encoding -----------------------------------------------------
# Fit the encoder on the "disease" column of the training split so that each
# distinct disease name is mapped to an integer class id.
labels = dataset["train"]["disease"]
label_encoder = LabelEncoder().fit(labels)

# The number of classes equals the number of distinct values the encoder saw.
num_labels = len(label_encoder.classes_)
print(f"Number of labels: {num_labels}")



In [None]:
# Load the pre-trained checkpoint with a classification head sized to the
# number of encoded labels.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

# Register a dedicated padding token. `add_special_tokens` also sets
# `tokenizer.pad_token`, so no separate assignment is needed.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# The new token enlarges the vocabulary, so the embedding matrix must grow too.
model.resize_token_embeddings(len(tokenizer))

# The sequence-classification head uses `config.pad_token_id` to locate the
# last non-padding token of each sequence; without it, batches larger than 1
# are rejected ("Cannot handle batch sizes > 1 if no padding token is defined").
model.config.pad_token_id = tokenizer.pad_token_id

actual_vocab_size = len(tokenizer.get_vocab())
print("Actual vocab size (with special tokens):", actual_vocab_size)

NameError: name 'AutoModelForSequenceClassification' is not defined

In [None]:
# Data preprocessing function
def preprocess_function(examples, max_length=None):
    """Tokenize a batch of symptom texts and attach encoded disease labels.

    Args:
        examples: Batch mapping with a 'common_symptom' entry (texts to
            tokenize) and a 'disease' entry (label names), as passed by
            `Dataset.map(..., batched=True)`.
        max_length: Optional length to truncate/pad every sequence to. The
            default ``None`` defers to the tokenizer's own maximum, which is
            the original behaviour; pass a smaller value (for example via
            ``map(..., fn_kwargs={"max_length": 128})``) to avoid padding
            short texts up to the model's full context length.

    Returns:
        The tokenizer output with an added "labels" entry holding the result
        of `label_encoder.transform` on the batch's disease names.
    """
    tokenized = tokenizer(
        examples['common_symptom'],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )
    tokenized["labels"] = label_encoder.transform(examples['disease'])
    return tokenized

# Split dataset into training and evaluation sets.
# A fixed seed makes the 90/10 split identical on every Restart & Run All, so
# evaluation numbers from different runs refer to the same held-out rows.
SPLIT_SEED = 42
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Tokenize both splits in batches; preprocess_function adds the "labels" entry.
train_dataset = split_dataset["train"].map(preprocess_function, batched=True)
eval_dataset = split_dataset["test"].map(preprocess_function, batched=True)

In [None]:
# Configure LoRA
import inspect

lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS"  # Task is sequence classification
)
# Wrap the base model with the LoRA adapters described above.
# NOTE(review): re-running this cell wraps the already-wrapped model again;
# restart the kernel (or reload the base model) before re-executing it.
model = get_peft_model(model, lora_config)

# Define training arguments.
# Newer transformers releases name the evaluation schedule `eval_strategy`,
# while older ones only know `evaluation_strategy`. The install above is not
# version-pinned, so use whichever keyword the installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # fp16 mixed precision whenever a CUDA device is visible
    logging_steps=10,
    **{_eval_strategy_key: "epoch"},  # evaluate once per epoch, matching save_strategy
)

In [None]:
# Initialize Trainer
import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; older releases only accept
# `tokenizer`. Pick whichever the installed Trainer signature exposes.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    **{_tokenizer_kwarg: tokenizer},
)

# Train the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be
# reloaded together from the same directory.
model.save_pretrained("./falcon_finetuned")
tokenizer.save_pretrained("./falcon_finetuned")

# Evaluate the model on the held-out split and show the returned metrics.
results = trainer.evaluate()
print("Evaluation Results:", results)
