### Installing and importing the required modules

In [None]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
import os
import torch
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
from trl import SFTTrainer
from datasets import Dataset
import matplotlib.pyplot as plt
from google.colab import userdata
from huggingface_hub import login
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from transformers import TrainingArguments, TextStreamer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

### Setting up the environment

In [None]:
# Look up the Hugging Face access token stored under the 'HF_TOKEN' key
HF_TOKEN = userdata.get('HF_TOKEN')

if HF_TOKEN:
    # A token is available: authenticate with Hugging Face and confirm it
    login(HF_TOKEN)
    print("Successfully logged in to Hugging Face!")
else:
    # Nothing usable came back (missing or empty), so stop the notebook here
    raise Exception("Token is not set. Please save the token first.")

In [None]:
# Silence every warning category for the remainder of the session
warnings.simplefilter("ignore")

### Constants, hyperparameters and model configurations

In [None]:
# Set the plot style: apply the 'ggplot' style sheet to the figures drawn below
plt.style.use('ggplot')
# Render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Seed for reproducibility
seed = 42

# Fraction of the data reserved for the test side of a train-test split
test_size = 0.2

# The device to run the model on: the GPU when CUDA is available, else the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The model ID of the Llama model
model_id = "unsloth/llama-3-8b-bnb-4bit"

# The path to the dataset
dataset_path = "hf://datasets/Amod/mental_health_counseling_conversations/combined_dataset.json"

In [None]:
# Report which device was selected in the configuration cell
device_message = f"Detected device: {device}"
print(device_message)

### Data loading

In [None]:
# Read the line-delimited JSON file at dataset_path into a pandas DataFrame
data = pd.read_json(path_or_buf=dataset_path, lines=True)

In [None]:
# Preview the first five rows of the loaded DataFrame
data.head(n=5)

### Exploratory data analysis

In [None]:
# Length of each context as returned by len() (for strings this counts
# characters, not words), stored as a new column on the DataFrame
data['Context_length'] = data['Context'].map(len)

# Histogram (50 bins) of the context lengths with a KDE curve overlaid
plt.figure(figsize=(10, 3))
ax = sns.histplot(data['Context_length'], bins=50, kde=True)
ax.set(
    title='Distribution of Context Lengths',
    xlabel='Length of Context',
    ylabel='Frequency',
)
plt.show()

In [None]:
# Length of each response as returned by len() (for strings this counts
# characters, not words), stored as a new column on the DataFrame
data['Response_length'] = data['Response'].map(len)

# Histogram (50 bins) of the response lengths with a KDE curve overlaid
plt.figure(figsize=(10, 3))
ax = sns.histplot(data['Response_length'], bins=50, kde=True)
ax.set(
    title='Distribution of Response Lengths',
    xlabel='Length of Response',
    ylabel='Frequency',
)
plt.show()

### Building the model

In [None]:
# Define the model's maximum sequence length.
# The same value is passed to FastLanguageModel.from_pretrained and to SFTTrainer below.
max_seq_length = 5020

In [None]:
# Load the pre-quantised base model identified by model_id along with its
# tokenizer, using 4-bit weights and the sequence length configured above
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=None,  # no explicit dtype requested; the choice is left to the library
    load_in_4bit=True,
)

In [None]:
# Wrap the already-loaded model with LoRA (low-rank adaptation) adapters.
# Only the adapter weights attached to the listed projection layers are trained.
model = FastLanguageModel.get_peft_model(
    model,
    # Rank of the low-rank update matrices and the matching scaling factor
    r = 16,
    lora_alpha = 16,
    # Dropout applied on the LoRA branch
    lora_dropout = 0.1,
    # Attention (q/k/v/o) and MLP (gate/up/down) projections receive adapters
    target_modules = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    # Use rank-stabilised LoRA scaling
    use_rslora = True,
    use_gradient_checkpointing = "unsloth",
    # Reuse the notebook-wide reproducibility seed instead of a separate
    # hard-coded value, so a single constant controls every random source
    random_state = seed,
    loftq_config = None
)

In [None]:
# Print trainable parameters of the LoRA-wrapped model
model.print_trainable_parameters()

### Data preparation

In [None]:
# Define the data prompt template.
# It holds two positional "{}" placeholders: formatting_prompt fills the first
# with the context (under "### Input:") and the second with the response
# (under "### Response:").
data_prompt = """
Analyze the provided text from a mental health perspective. Identify any indicators of emotional distress, coping mechanisms, or psychological well-being. Highlight any potential concerns or positive aspects related to mental health, and provide a brief explanation for each observation.

### Input:
{}

### Response:
{}"""

In [None]:
def formatting_prompt(examples) -> dict:
    """Turn a batch of context/response pairs into training prompts.

    Each pair is inserted into the module-level ``data_prompt`` template, the
    tokenizer's EOS token is appended, and surrounding whitespace is stripped.

    Args:
        examples: Batch mapping whose "Context" and "Response" entries are
            parallel sequences (one element per example).

    Returns:
        dict: ``{"text": [...]}`` containing one formatted prompt per pair,
        in the same order as the inputs.
    """
    pairs = zip(examples["Context"], examples["Response"])
    return {
        "text": [
            # Fill the template, terminate with EOS, then trim outer whitespace
            (data_prompt.format(ctx, reply) + tokenizer.eos_token).strip()
            for ctx, reply in pairs
        ],
    }

In [None]:
# Convert the DataFrame into a Dataset and, in the same expression, add the
# "text" column by running formatting_prompt over it in batches
training_data = Dataset.from_pandas(data).map(formatting_prompt, batched=True)

### Training the model

In [None]:
# Define the training arguments.
# No evaluation strategy is configured: the run only trains, saves and logs.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir = "./mental_health_chatbot",
    logging_dir = "./logs",
    # Save a checkpoint and emit a log entry once per epoch. A step-based
    # logging interval is not set because it is ignored under "epoch" logging.
    save_strategy = "epoch",
    logging_strategy = "epoch",
    report_to = "none",
    # Optimisation schedule
    learning_rate = 3e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 10,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    # 8 samples per device x 8 accumulation steps per optimizer update
    per_device_train_batch_size = 8,
    gradient_accumulation_steps = 8,
    num_train_epochs = 40,
    # Mixed precision: bf16 when the hardware supports it, otherwise fp16
    fp16 = not is_bfloat16_supported(),
    bf16 = is_bfloat16_supported(),
    # Reuse the notebook-wide reproducibility seed instead of a separate
    # hard-coded value, so a single constant controls every random source
    seed = seed,
)

In [None]:
# Build the supervised fine-tuning trainer around the LoRA model
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    # Training examples come from the "text" column produced by formatting_prompt
    train_dataset=training_data,
    dataset_text_field="text",
    dataset_num_proc=2,
    # Pack examples together into sequences of up to max_seq_length
    max_seq_length=max_seq_length,
    packing=True,
)

In [None]:
# Train the model using the trainer configured above
trainer.train()