In [3]:
# Part 1: Imports and Setup
import pandas as pd
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer, pipeline, DataCollatorForLanguageModeling
from torch.utils.data import Dataset
import os
import re

# Ensure the save directory exists for the model
model_save_dir = "./model"
os.makedirs(model_save_dir, exist_ok=True)

# Part 2: Data Preparation
# The CSV location can be overridden with the MENTAL_HEALTH_DATA_PATH environment
# variable so the notebook runs on other machines without editing this cell; when
# the variable is unset the original local path below is used unchanged.
data_path = os.environ.get(
    "MENTAL_HEALTH_DATA_PATH",
    r"C:\Users\aasth\OneDrive\Desktop\LLM_Mental_Health_Support_Chatbot\Dataset\mental_health_conversational_dataset_train.csv",  # Update this path to your dataset
)
data = pd.read_csv(data_path)

def preprocess_text(row):
    """Flatten one raw conversation string into a single lower-cased training line.

    The text before the first ``<<<ASSISTANT>>>:`` marker is treated as the
    question (with any ``<<<HUMAN>>>:`` markers removed); the text between the
    first and second assistant markers, if any, is treated as the answer.

    Args:
        row: Raw conversation string. A non-string value (for example a
            missing CSV cell) raises ``AttributeError`` rather than being
            silently turned into an empty training example.

    Returns:
        ``"<question> <answer>"`` in lower case. When either side is empty the
        separator space is dropped so the result has no leading or trailing
        whitespace.
    """
    parts = row.split("<<<ASSISTANT>>>:")
    question = parts[0].replace("<<<HUMAN>>>:", "").strip().lower()
    answer = parts[1].strip().lower() if len(parts) > 1 else ""
    # Strip so a missing question/answer does not leave a dangling separator space.
    return f"{question} {answer}".strip()

# Collapse every raw conversation into one lower-cased training string.
data['processed_text'] = data['text'].map(preprocess_text)

# Part 3: Tokenization and Dataset Preparation
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# Encode the whole corpus in one call; every example is truncated/padded to
# exactly 512 tokens and returned as PyTorch tensors.
inputs = tokenizer(
    data['processed_text'].tolist(),
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

class MentalHealthDataset(Dataset):
    """Map-style dataset over pre-tokenized encodings for language-model training.

    ``encodings`` is a mapping from field name to an indexable collection
    (it must contain ``'input_ids'``). Each item is a dict holding the
    ``idx``-th entry of every field plus a ``'labels'`` entry that is a
    clone of that item's ``'input_ids'``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is the number of encoded input sequences.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # Labels start as an independent copy of the inputs so later edits to
        # one do not affect the other.
        sample['labels'] = sample['input_ids'].clone()
        return sample

# Wrap the tokenized corpus so the Trainer can index individual examples.
dataset = MentalHealthDataset(inputs)

# Part 4: Model Training
model = GPT2LMHeadModel.from_pretrained('gpt2')
# Keep the embedding matrix the same size as the tokenizer's vocabulary.
model.resize_token_embeddings(len(tokenizer))

# No evaluation dataset is passed to the Trainer below, so step-based
# evaluation and `load_best_model_at_end` are intentionally not configured:
# requesting evaluation without an eval_dataset makes the Trainer raise
# (at construction or at the first evaluation step, depending on the
# installed transformers version). Evaluation therefore stays at its default
# of "no"; checkpoints are still written every `save_steps` steps.
# NOTE(review): warmup_steps=500 may exceed the total number of optimizer
# steps on a small dataset — confirm against the dataset size.
training_args = TrainingArguments(
    output_dir=model_save_dir,
    num_train_epochs=4,
    per_device_train_batch_size=2,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=1000,
)

# mlm=False: plain (non-masked) language-modelling batches.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

trainer.train()
# Persist both the fine-tuned weights and the tokenizer to the same directory
# so they can be reloaded together for inference.
trainer.save_model(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

# Function to Trim Generated Text to Last Complete Sentence
def trim_to_last_sentence(text):
    """Cut ``text`` back to its last complete sentence.

    Sentences are delimited by ``.``, ``!`` or ``?`` followed by one or more
    spaces. If the text does not end with one of those terminators, the
    trailing unfinished fragment is dropped.

    Args:
        text: The generated text to trim.

    Returns:
        The complete sentences joined by single spaces. Surrounding
        whitespace is ignored, so a trailing newline or space no longer
        hides a final terminator. If the text contains no complete sentence
        at all, the stripped text is returned unchanged instead of an empty
        string, so the caller never loses the whole response.
    """
    text = text.strip()
    sentences = re.split(r'(?<=[.!?]) +', text)
    if not text.endswith(('.', '?', '!')):
        # The last chunk is an unfinished sentence; discard it.
        sentences = sentences[:-1]
    if not sentences:
        # Nothing but a single unfinished fragment: keep it rather than return ''.
        return text
    return ' '.join(sentences)




Step,Training Loss,Validation Loss


In [4]:
# Part 5: Loading Model and Tokenizer for Interaction
def load_model_and_tokenizer(model_dir):
    """Build a text-generation pipeline from a saved model directory.

    Args:
        model_dir: Directory containing both the saved GPT-2 model and its
            tokenizer.

    Returns:
        A ``text-generation`` pipeline wrapping the loaded model and tokenizer.
    """
    # Keyword arguments are evaluated left to right, so the model is still
    # loaded before the tokenizer.
    return pipeline(
        'text-generation',
        model=GPT2LMHeadModel.from_pretrained(model_dir),
        tokenizer=GPT2Tokenizer.from_pretrained(model_dir),
    )

# Sentiment classifier used to label the user's feedback on each reply.
sentiment_model = pipeline(
    task='sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

# Interactive Chat Function with Sentence Completion Handling
def chat_with_sentiment_analysis(chatbot):
    """Run an interactive console chat and label the user's feedback sentiment.

    Each turn reads a prompt, generates one reply with ``chatbot``, trims it
    to the last complete sentence, prints it, then asks for free-text
    feedback and prints the sentiment label produced by the module-level
    ``sentiment_model``.

    The loop ends when the user types ``quit`` (case-insensitive, surrounding
    whitespace ignored) or when input is closed/interrupted.

    Args:
        chatbot: Callable text generator; called with the prompt and
            generation keyword arguments, and expected to return a list whose
            first element is a dict with a ``'generated_text'`` entry.
    """
    print("Welcome to the Mental Health Support Chatbot. Type 'quit' to exit.")
    try:
        while True:
            user_input = input("You: ").strip()
            if user_input.lower() == 'quit':
                break
            if not user_input:
                # Nothing to respond to; ask again instead of generating from an empty prompt.
                continue
            # The training text is lower-cased in preprocess_text, so lower-case
            # the prompt too to keep inference input consistent with training.
            prompt = user_input.lower()
            generated_responses = chatbot(prompt, max_length=150, num_return_sequences=1, no_repeat_ngram_size=2)
            response = generated_responses[0]['generated_text']
            # Fall back to the untrimmed text if trimming leaves nothing to show.
            response_trimmed = trim_to_last_sentence(response) or response
            print(f"Assistant: {response_trimmed}\n")
            feedback = input("Feedback on the advice: ")
            if not feedback.strip():
                # No feedback given; there is nothing to classify.
                continue
            sentiment_result = sentiment_model(feedback)
            sentiment = sentiment_result[0]['label']
            print(f"Feedback sentiment: {sentiment}")
    except (EOFError, KeyboardInterrupt):
        # Closed or interrupted input ends the session without a traceback.
        print()

# Start the interactive chat when this cell/script is executed directly.
if __name__ == "__main__":
    model_dir = "./model"  # Ensure this points to the directory where your model is saved
    # Load the saved model/tokenizer into a generation pipeline, then hand it to the chat loop.
    chatbot = load_model_and_tokenizer(model_dir)
    chat_with_sentiment_analysis(chatbot)

Welcome to the Mental Health Support Chatbot. Type 'quit' to exit.
You: what is mental illness


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Assistant: what is mental illness? mental health disorders are characterized by problems in the body and brain causing changes in function that may include:
1. disturbances in memory or thinking. 2. abnormalities in concentration and memory. 3. dysfunction in social behaviors or emotional patterns.
4. hyperactivity. 5. abnormal mood swings. 6. hypnotherapy. 7. alcohol abuse. 8. insomnia. 9. seizures
10. eating disorders 
11. obesity. 12. mental illnesses like schizophrenia, bipolar disorder, anxiety disorder that are complex by nature, or schizophrenia that involve mental symptoms that take time to develop, such as hallucinations of dreams (seeing or hearing voices), hallucinations or delusions of physical pain, suicidal thoughts or flashbacks.

Feedback on the advice: 5
Feedback sentiment: POSITIVE
You: quit
