In [None]:
!pip install -U transformers datasets peft trl accelerate bitsandbytes



In [None]:
from datasets import load_dataset

# Sampling knobs: fixed shuffle seed and the number of rows kept for this quick run.
SEED = 42
SAMPLE_SIZE = 300

# Load the dataset from the Hub, shuffle its train split, and keep the first rows.
dataset = load_dataset("ShivomH/Mental-Health-Conversations")
train_shuffled = dataset["train"].shuffle(seed=SEED)
dataset_small = train_shuffled.select(range(SAMPLE_SIZE))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/378 [00:00<?, ?B/s]

mental-health-dataset.jsonl:   0%|          | 0.00/566M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/797947 [00:00<?, ? examples/s]

In [None]:
# Prompt layout: system instruction, user message, assistant reply, one per tagged section.
_CHAT_TEMPLATE = "<|system|>\n{instruction}\n<|user|>\n{input}\n<|assistant|>\n{response}"


def format_chat(example):
    """Render one dataset row as a single chat-formatted training string.

    Args:
        example: mapping with the keys "instruction", "input" and "response".

    Returns:
        A dict with one key, "text", holding the three fields laid out under
        the <|system|>, <|user|> and <|assistant|> tags.
    """
    rendered = _CHAT_TEMPLATE.format(
        instruction=example["instruction"],
        input=example["input"],
        response=example["response"],
    )
    return {"text": rendered}

# Run format_chat over the sampled subset; the "text" field it returns is what the
# tokenization step reads later.
formatted_dataset = dataset_small.map(format_chat)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model # Import LoraConfig and get_peft_model

# Base checkpoint that will be fine-tuned.
model_name = "microsoft/phi-2"

# bitsandbytes settings: load weights in 4-bit and use float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype="float16")

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
)

# Load the quantized causal LM; device placement is delegated via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Configure LoRA.
# Phi-2 names its linear layers q_proj / k_proj / v_proj / dense (attention) and
# fc1 / fc2 (MLP). The LLaMA-style names o_proj / gate_proj / up_proj / down_proj do not
# exist in this architecture and were silently skipped, so only q/k/v and lm_head were
# being adapted. Target the layers Phi-2 actually has.
lora_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling factor
    target_modules=["q_proj", "k_proj", "v_proj", "dense", "fc1", "fc2", "lm_head"],
    lora_dropout=0.05,
    bias="none",  # leave bias terms frozen
    task_type="CAUSAL_LM",
)

# Wrap the quantized base model with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Report how many parameters are trainable; this count should rise now that the
# attention output projection and MLP layers are matched.
model.print_trainable_parameters()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

trainable params: 8,724,480 || all params: 2,788,408,320 || trainable%: 0.3129


In [None]:
def tokenize(example):
    """Tokenize one formatted example and build causal-LM labels.

    Expects a single (non-batched) example, as produced by ``Dataset.map`` without
    ``batched=True``.

    Args:
        example: mapping with a "text" string.

    Returns:
        The tokenizer output (padded/truncated to 1024 tokens) with an added
        "labels" list: a copy of ``input_ids`` where every padding position is
        replaced by -100 so it is excluded from the loss.
    """
    # Only fall back to EOS as the pad token when the tokenizer has none; this keeps
    # the call idempotent and never clobbers a real pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    tokens = tokenizer(
        # Append an explicit EOS so the model still learns where a response ends
        # once the padding positions are masked out of the loss below.
        example["text"] + tokenizer.eos_token,
        truncation=True,
        padding="max_length",
        max_length=1024,  # Phi-2 has max 2048, but we keep it smaller for Colab
    )

    # Pad and EOS share an id, so padding cannot be detected by token id; use the
    # attention mask to find it and mask those positions with -100.
    tokens["labels"] = [
        token_id if attended else -100
        for token_id, attended in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens

# Tokenize every formatted row; remove_columns is given all existing column names so
# the original columns are dropped and only what tokenize() returns is kept.
tokenized_dataset = formatted_dataset.map(tokenize, remove_columns=formatted_dataset.column_names)

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments

# Training hyperparameters for the short LoRA run.
training_args = TrainingArguments(
    output_dir = "phi2-mentalhealth",
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,  # effective batch size of 2 per device
    num_train_epochs = 1,
    learning_rate = 2e-5,
    logging_steps = 10,
    save_steps = 100,
    fp16 = True,
    report_to = "none",  # disable external experiment trackers
    # Trainer cannot infer label names for a PEFT-wrapped model (it warns and falls
    # back to an empty list), so name the label column explicitly.
    label_names = ["labels"],
)

In [None]:
from transformers import Trainer

# Build the Trainer around the LoRA-wrapped model and the tokenized subset.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer; this
# notebook installs the latest transformers, where the old name is deprecated and
# scheduled for removal.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset,
    processing_class = tokenizer,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
10,6.5874
20,6.1422
30,5.7044
40,5.0755
50,4.1905
60,3.3208
70,2.6059
80,1.8153
90,1.299
100,0.893




TrainOutput(global_step=150, training_loss=2.6769915040334067, metrics={'train_runtime': 470.8882, 'train_samples_per_second': 0.637, 'train_steps_per_second': 0.319, 'total_flos': 4898002305024000.0, 'train_loss': 2.6769915040334067, 'epoch': 1.0})

In [None]:
# Directory that receives both the trained adapter and the tokenizer files.
ADAPTER_DIR = "phi2-mentalhealth"

model.save_pretrained(ADAPTER_DIR)
# Kept as the last expression so the list of written tokenizer files is displayed.
tokenizer.save_pretrained(ADAPTER_DIR)



('phi2-mentalhealth/tokenizer_config.json',
 'phi2-mentalhealth/special_tokens_map.json',
 'phi2-mentalhealth/vocab.json',
 'phi2-mentalhealth/merges.txt',
 'phi2-mentalhealth/added_tokens.json',
 'phi2-mentalhealth/tokenizer.json')

In [None]:
# Zip the folder
!zip -r phi2-mentalhealth.zip phi2-mentalhealth/

# Download to your local system
from google.colab import files
files.download("phi2-mentalhealth.zip")


  adding: phi2-mentalhealth/ (stored 0%)
  adding: phi2-mentalhealth/tokenizer.json (deflated 82%)
  adding: phi2-mentalhealth/merges.txt (deflated 53%)
  adding: phi2-mentalhealth/adapter_config.json (deflated 57%)
  adding: phi2-mentalhealth/special_tokens_map.json (deflated 75%)
  adding: phi2-mentalhealth/vocab.json (deflated 59%)
  adding: phi2-mentalhealth/README.md (deflated 65%)
  adding: phi2-mentalhealth/tokenizer_config.json (deflated 94%)
  adding: phi2-mentalhealth/added_tokens.json (deflated 84%)
  adding: phi2-mentalhealth/adapter_model.safetensors (deflated 8%)
  adding: phi2-mentalhealth/checkpoint-100/ (stored 0%)
  adding: phi2-mentalhealth/checkpoint-100/training_args.bin (deflated 52%)
  adding: phi2-mentalhealth/checkpoint-100/tokenizer.json (deflated 82%)
  adding: phi2-mentalhealth/checkpoint-100/merges.txt (deflated 53%)
  adding: phi2-mentalhealth/checkpoint-100/adapter_config.json (deflated 57%)
  adding: phi2-mentalhealth/checkpoint-100/special_tokens_map.js

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>