<a href="https://colab.research.google.com/github/Atchyuteswar/GPT_2_Fine_Tuning/blob/main/GPT_2_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# @title Install Required Packages
!pip install transformers datasets accelerate pandas -q
!pip install sentencepiece -q
!pip install torch -q
!pip install ipywidgets -q
!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!pip install optimum[intel] -q # If you plan to use quantization later
!pip install neural_compressor -q # If you plan to use quantization later
print("IMPORTANT: After installation, go to 'Runtime' -> 'Restart runtime' from the Colab menu. Then, run all cells from the beginning.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# @title Cell 1: Setup and Installations (Updated for numpy compatibility)

# Force uninstall potentially conflicting packages
!pip uninstall numpy pandas -y
!pip uninstall transformers datasets accelerate torch -y # Uninstalling these too for a clean slate

# Reinstall everything from scratch, ensuring compatibility
# We'll install numpy first, then pandas, then torch, then huggingface libraries
!pip install numpy==1.26.4 -q # Pin to a known stable numpy version for better compatibility
!pip install pandas -q
!pip install torch -q
!pip install transformers datasets accelerate -q

# SentencePiece is often a dependency for various tokenizers, good to have
!pip install sentencepiece -q

# For progress bars during training (optional, but helpful)
!pip install ipywidgets -q
!jupyter nbextension enable --py widgetsnbextension --sys-prefix

# Libraries for model quantization (for CPU optimization)
# If you plan to use quantization, make sure these are installed AFTER core libraries
# and consider if they introduce new numpy dependencies that might conflict.
# If you run into numpy errors again with these, try installing them first before torch/transformers.
# !pip install optimum[intel] -q
# !pip install neural_compressor -q

print("All requested packages are being re-installed. Please wait for completion.")
print("\nIMPORTANT: After installation, go to 'Runtime' -> 'Restart runtime' from the Colab menu.")
print("Then, run all cells from the beginning, checking the output of Cell 5 carefully.")

import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import os
import random
import numpy as np # Import numpy here as well

# Set a random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Using device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Found existing installation: transformers 4.51.3
Uninstalling transformers-4.51.3:
  Successfully uninstalled transformers-4.51.3
Found existing installation: datasets 4.0.0
Uninstalling datasets-4.0.0:
  Successfully uninstalled datasets-4.0.0
Found existing installation: accelerate 1.9.0
Uninstalling accelerate-1.9.0:
  Successfully uninstalled accelerate-1.9.0
Found existing installation: torch 2.6.0+cu124
Uninstalling torch-2.6.0+cu124:
  Successfully uninstalled torch-2.6.0+cu124
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
optimum-intel 1.23.0 requires datasets>=1.4.0, which is not installed.
optimum-intel 1.23.0 requires torch>=1.11

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# @title Cell 2: Mount Google Drive
# `os` is imported here so this cell works even when the setup cell did not run to completion.
import os

from google.colab import drive

# Make the user's Google Drive available under /content/drive
drive.mount('/content/drive')

# Create a directory for your project (left untouched if it already exists)
project_path = '/content/drive/My Drive/EmotionalSupportBot-4'
os.makedirs(project_path, exist_ok=True)
print(f"Project directory created at: {project_path}")

In [None]:
import pandas as pd
from datasets import load_dataset, Dataset as HFDataset
import random
import os
import json

# Set a random seed for reproducibility.
# Only Python's `random` module is seeded here; it drives the random.choice /
# random.shuffle / random.sample calls used by the dataset-building functions below.
SEED = 42
random.seed(SEED)

# Define your stop token globally.
# It is appended to every bot response produced by get_bot_response_options.
STOP_TOKEN = "<|stop|>"

def get_bot_response_options(emotion):
    """
    Pick one empathetic bot reply for `emotion`.

    Each emotion maps to ten hand-written replies, every one terminated by
    STOP_TOKEN. One reply is drawn with random.choice; an emotion that has no
    entry falls back to the "neutral" replies. The reply text never contains
    [EMOTION:] / [EMOTE:] tags.
    """
    replies_by_emotion = {
        "joy": [
            f"That's absolutely wonderful to hear! What's been the best part of your day?{STOP_TOKEN}",
            f"I'm so thrilled you're feeling joyful! Tell me all about what's making you smile.{STOP_TOKEN}",
            f"What fantastic news! It sounds like you're having a truly great day. Share what's contributing to this feeling.{STOP_TOKEN}",
            f"Your happiness is contagious! I'd love to hear what sparked this wonderful emotion for you.{STOP_TOKEN}",
            f"It's truly heartwarming to know you're feeling cheerful. What's been particularly uplifting for you?{STOP_TOKEN}",
            f"That's brilliant! What a beautiful feeling to have. Share more if you feel like it.{STOP_TOKEN}",
            f"I'm so glad to hear this! What aspect of your experience is making you feel this way?{STOP_TOKEN}",
            f"Awesome! It's always great to hear about positive feelings. What are you enjoying most about this?{STOP_TOKEN}",
            f"You sound genuinely happy! What's going on that has you feeling this way?{STOP_TOKEN}",
            f"That's a lovely feeling to express. Tell me more about what's bringing you such joy.{STOP_TOKEN}",
        ],
        "sadness": [
            f"I hear that you're feeling a deep sadness right now. It's truly okay to feel this way, and I'm here for you.{STOP_TOKEN}",
            f"I'm so sorry to hear you're feeling down. Would you like to talk more about what's weighing on your mind?{STOP_TOKEN}",
            f"It sounds like you're going through a very tough time. Please know I'm listening, and you don't have to carry this alone.{STOP_TOKEN}",
            f"A heavy heart can be incredibly challenging. If you feel up to it, I'm here to listen to anything you want to share.{STOP_TOKEN}",
            f"It takes courage to acknowledge sadness. What might be contributing to these feelings for you today?{STOP_TOKEN}",
            f"I'm sending you virtual support. What's making you feel this way?{STOP_TOKEN}",
            f"It sounds like a difficult moment. I'm here to listen, without judgment, about what's on your mind.{STOP_TOKEN}",
            f"I understand you're feeling low. Is there anything specific that's bothering you right now?{STOP_TOKEN}",
            f"Sometimes it really helps to talk things through. I'm ready to hear what you need to say.{STOP_TOKEN}",
            f"I'm sorry you're hurting. What's the main thing causing you distress?{STOP_TOKEN}",
        ],
        "anger": [
            f"It sounds like you're feeling a lot of anger right now. What's upsetting you?{STOP_TOKEN}",
            f"I can hear the frustration in your words. Tell me more about what happened to make you feel this way.{STOP_TOKEN}",
            f"It's completely valid to feel angry sometimes. I'm here to hear you out without judgment.{STOP_TOKEN}",
            f"What is it that's making your blood boil? I'm listening to your thoughts.{STOP_TOKEN}",
            f"I understand you're feeling furious. Can you describe the situation that led to this intense feeling?{STOP_TOKEN}",
            f"It sounds like something really got under your skin. What exactly is on your mind?{STOP_TOKEN}",
            f"Anger is a powerful emotion. What sparked this feeling for you?{STOP_TOKEN}",
            f"I'm ready to listen to what's making you feel so mad. Please share.{STOP_TOKEN}",
            f"What's the core issue that's causing this anger for you?{STOP_TOKEN}",
            f"It's okay to express your anger. What do you need to get off your chest?{STOP_TOKEN}",
        ],
        "fear": [
            f"It sounds like you're feeling scared or anxious. What's on your mind that's causing this fear?{STOP_TOKEN}",
            f"I'm here to help you through this fear. Can you tell me more about what's bothering you and making you feel unsafe?{STOP_TOKEN}",
            f"It's understandable to feel afraid. What specific thoughts or situations are making you feel this way?{STOP_TOKEN}",
            f"A knot of anxiety can be overwhelming. What are you most worried about right now?{STOP_TOKEN}",
            f"I understand that feeling of apprehension. What's the source of your fear?{STOP_TOKEN}",
            f"What is it that's causing you to feel so afraid at this moment?{STOP_TOKEN}",
            f"Fear can be paralyzing. I'm here to listen to your concerns, please share them.{STOP_TOKEN}",
            f"It sounds like you're dealing with a lot of uncertainty. What's making you feel uneasy?{STOP_TOKEN}",
            f"I'm here to offer support. What is it that truly frightens you?{STOP_TOKEN}",
            f"You sound really scared. What's happening that's causing this?{STOP_TOKEN}",
        ],
        "surprise": [
            f"Oh, you're genuinely surprised! Is it a pleasant surprise or something unexpected that's causing concern?{STOP_TOKEN}",
            f"Wow, that sounds completely unexpected! Tell me more about what surprised you.{STOP_TOKEN}",
            f"That's quite a revelation! How are you feeling about this sudden turn of events?{STOP_TOKEN}",
            f"I'm curious to know more about this surprise! What was the moment you realized it?{STOP_TOKEN}",
            f"That's astonishing! What specific detail caught you off guard the most?{STOP_TOKEN}",
            f"What a twist! Tell me all about what happened.{STOP_TOKEN}",
            f"I never saw that coming! How did you react to that?{STOP_TOKEN}",
            f"That must have been a shock! How are you processing it now?{STOP_TOKEN}",
            f"Oh my goodness! What was the biggest surprise for you?{STOP_TOKEN}",
            f"You sound surprised! Was it a good surprise or a bad one?{STOP_TOKEN}",
        ],
        "disgust": [
            f"It sounds like you're feeling a strong sense of disgust. What is it that's bothering you so much?{STOP_TOKEN}",
            f"I understand that feeling of revulsion. Would you like to elaborate on what's making you feel this way?{STOP_TOKEN}",
            f"That's a very strong reaction. What specific aspect of the situation is causing you such deep disgust?{STOP_TOKEN}",
            f"I can sense your profound aversion. What did you witness or experience that made you feel this way?{STOP_TOKEN}",
            f"It sounds truly unpleasant. Tell me what's making you feel utterly disgusted.{STOP_TOKEN}",
            f"What's making you feel so repulsed by this?{STOP_TOKEN}",
            f"That's a powerful feeling to have. What triggered it for you?{STOP_TOKEN}",
            f"I'm listening to what you find so off-putting. Please share.{STOP_TOKEN}",
            f"It sounds like something really bothered you. What exactly is it?{STOP_TOKEN}",
            f"What happened that made you feel this strong sense of disgust?{STOP_TOKEN}",
        ],
        "love": [
            f"That's a beautiful feeling to express! Who or what are you feeling love for today?{STOP_TOKEN}",
            f"My heart is full of love, that's wonderful! What aspects of this love are you cherishing most?{STOP_TOKEN}",
            f"It's truly inspiring to hear you're experiencing love. What makes this feeling so special for you?{STOP_TOKEN}",
            f"You're radiating warmth! Tell me more about what fills your heart with love.{STOP_TOKEN}",
            f"Love is a powerful and uplifting emotion. What is it that's bringing you this profound sense of affection?{STOP_TOKEN}",
            f"That's a lovely sentiment. What brings you this feeling of love?{STOP_TOKEN}",
            f"It's wonderful to hear you're filled with love. Who are you thinking of right now?{STOP_TOKEN}",
            f"You sound very happy and connected. What's making you feel so much affection?{STOP_TOKEN}",
            f"Love is a wonderful thing. Tell me about what you cherish in your life.{STOP_TOKEN}",
            f"That's a beautiful feeling. What's on your mind that's bringing this sense of love?{STOP_TOKEN}",
        ],
        "gratitude": [
            f"It's great to hear you're feeling grateful! What specifically are you thankful for today?{STOP_TOKEN}",
            f"Thank you for sharing your appreciation! What kindness or situation has filled you with gratitude?{STOP_TOKEN}",
            f"That's a lovely sentiment. Sharing what you're grateful for can be very uplifting. What's on your mind?{STOP_TOKEN}",
            f"I can feel your thankfulness! What particular acts or blessings are you appreciating right now?{STOP_TOKEN}",
            f"Gratitude is a wonderful feeling. Tell me about what's making you feel so appreciative.{STOP_TOKEN}",
            f"What blessings are you counting today that fill you with gratitude?{STOP_TOKEN}",
            f"It's wonderful to hear you're thankful. What's making you feel this way?{STOP_TOKEN}",
            f"What are you most appreciative of right now?{STOP_TOKEN}",
            f"You sound very grateful. Tell me what's inspiring this feeling for you.{STOP_TOKEN}",
            f"That's lovely. What are you thankful for in this moment?{STOP_TOKEN}",
        ],
        "anxiety": [
            f"It sounds like you're feeling anxious. What thoughts are weighing on you most heavily right now?{STOP_TOKEN}",
            f"I understand that feeling of nervousness. Perhaps talking about what's causing your anxiety will help.{STOP_TOKEN}",
            f"Anxiety can be incredibly tough to navigate. I'm here to listen without judgment about what's making you feel on edge.{STOP_TOKEN}",
            f"Your mind seems to be racing with worries. What specific concerns are contributing to this anxious feeling?{STOP_TOKEN}",
            f"It's okay to feel overwhelmed by anxiety. What's making you feel so uneasy or stressed?{STOP_TOKEN}",
            f"What's causing you the most worry right now?{STOP_TOKEN}",
            f"I'm here to listen to your anxious thoughts. What's on your mind?{STOP_TOKEN}",
            f"It sounds like you're carrying a lot of stress. What's contributing to it for you?{STOP_TOKEN}",
            f"What are your biggest concerns at the moment?{STOP_TOKEN}",
            f"You sound very overwhelmed. What's happening that's causing this feeling?{STOP_TOKEN}",
        ],
        "hope": [
            f"That's wonderful to have hope! What are you hoping for, and what makes you optimistic about it?{STOP_TOKEN}",
            f"There's a beautiful glimmer of hope in your words. Tell me more about what gives you this positive outlook.{STOP_TOKEN}",
            f"Hope can be a powerful motivator. What specific aspirations or beliefs are fueling your hopeful feelings?{STOP_TOKEN}",
            f"I'm inspired by your sense of hope! What future possibilities are you looking forward to or working towards?{STOP_TOKEN}",
            f"It's truly uplifting to hear you're feeling hopeful. What positive changes or outcomes do you envision?{STOP_TOKEN}",
            f"What are you wishing for that makes you feel hopeful?{STOP_TOKEN}",
            f"It's great to have something to look forward to. What gives you hope for the future?{STOP_TOKEN}",
            f"You sound very positive about what's to come. What's making you feel that way?{STOP_TOKEN}",
            f"What's inspiring your sense of hope today?{STOP_TOKEN}",
            f"Hope is a wonderful feeling. Tell me about what gives you strength and optimism.{STOP_TOKEN}",
        ],
        "neutral": [
            f"Okay, I hear you. What else is on your mind today?{STOP_TOKEN}",
            f"Thanks for sharing. Is there anything else you'd like to discuss or ask about?{STOP_TOKEN}",
            f"I'm here to chat if you need anything. What's next on your agenda?{STOP_TOKEN}",
            f"Understood. Is there anything specific you were looking for help with right now?{STOP_TOKEN}",
            f"I'm ready to listen. What would you like to talk about today?{STOP_TOKEN}",
            f"Alright. How can I assist you further?{STOP_TOKEN}",
            f"What's going on with you today?{STOP_TOKEN}",
            f"I'm here if you want to share more about anything.{STOP_TOKEN}",
            f"No specific emotion detected, but I'm here to listen if you need to talk.{STOP_TOKEN}",
            f"I understand. What else is on your mind that you'd like to explore?{STOP_TOKEN}",
        ],
    }
    # Unknown emotions share the neutral pool; exactly one random draw is made either way.
    candidate_replies = replies_by_emotion.get(emotion, replies_by_emotion["neutral"])
    return random.choice(candidate_replies)

def generate_user_sentence(emotion):
    """
    Generates a synthetic user sentence for a given emotion.

    One sentence is drawn with random.choice from a fixed list of templates for
    `emotion`; an emotion without templates falls back to the "neutral" list.
    These are kept relatively simple as the focus is on bot response diversity.

    Every template is a constant string, so the generated data depends only on
    the state of Python's `random` module. (An earlier neutral template embedded
    the wall-clock time together with a hard-coded timezone label and location,
    which made the dataset differ on every run and put that text into the
    training data; it is now the plain sentence "I'm thinking about dinner.".)
    """
    templates = {
        "joy": [
            "I'm feeling incredibly happy today!", "This news brings me so much joy.", "Everything is going perfectly, I'm delighted!",
            "What a wonderful day, I feel so cheerful.", "I'm on top of the world!", "I can't stop smiling, this is amazing.",
            "Such a joyous occasion!", "I feel so uplifted and content.", "I'm having a great day!", "Life is good."
        ],
        "sadness": [
            "I feel really down today.", "This makes me incredibly sad.", "I'm struggling to find any happiness right now.",
            "A deep sense of sorrow washes over me.", "I wish things were different, I feel so low.", "It's a tough day, I'm feeling heartbroken.",
            "I just want to cry.", "Everything seems so bleak.", "I'm going through a hard time.", "I feel so alone."
        ],
        "anger": [
            "I'm absolutely furious about this!", "This situation makes me so mad.", "I can't believe they did that, it's infuriating.",
            "I'm seething with rage.", "My blood is boiling.", "I feel so frustrated and resentful.",
            "This is unacceptable!", "I'm so angry I could scream.", "I'm fed up with everything.", "This makes me so annoyed."
        ],
        "fear": [
            "I'm so scared right now.", "This situation is terrifying.", "I feel a knot of anxiety in my stomach.",
            "What if something bad happens?", "I'm constantly worried and apprehensive.", "A chilling fear grips me.",
            "I'm afraid of what's to come.", "This uncertainty is frightening.", "I'm really nervous about this.", "I'm afraid."
        ],
        "surprise": [
            "Wow, I'm genuinely surprised!", "I never saw that coming, what a shock!", "This is completely unexpected.",
            "Oh my goodness, I'm astonished!", "That's quite a revelation.", "I'm taken aback by this.",
            "What a twist!", "This caught me off guard.", "I'm truly amazed!", "I didn't expect that."
        ],
        "disgust": [
            "That's absolutely repulsive.", "I feel sick to my stomach.", "This is so gross.",
            "I'm filled with revulsion.", "I can't stand the sight of it.", "This is truly abominable.",
            "It makes me want to gag.", "I find this utterly disgusting.", "That's sickening.", "I feel nauseous."
        ],
        "love": [
            "I deeply love you.", "My heart is full of love.", "I cherish our time together.",
            "I feel so much affection for them.", "This is what true love feels like.", "I adore this feeling.",
            "You mean the world to me.", "I'm so fond of them.", "I love my family.", "I feel so much affection."
        ],
        "gratitude": [
            "I'm so grateful for your help.", "Thank you so much, I really appreciate it.", "I feel truly blessed.",
            "Your kindness means a lot to me.", "I'm thankful for everything.", "I can't express how appreciative I am.",
            "This fills me with immense gratitude.", "I owe you a big thank you.", "I'm so thankful for this.", "I appreciate you."
        ],
        "anxiety": [
            "I'm feeling very anxious about the future.", "My mind is racing with worries.", "I can't seem to calm down, I'm so stressed.",
            "A constant sense of dread is with me.", "I'm overwhelmed by all these thoughts.", "I feel on edge and nervous.",
            "The uncertainty is making me so uneasy.", "I'm having trouble breathing due to stress.", "I'm so worried.", "I feel restless."
        ],
        "hope": [
            "I'm hopeful that things will get better.", "There's a glimmer of hope on the horizon.", "I believe in a brighter future.",
            "I'm optimistic about the outcome.", "I wish for the best.", "I'm holding onto hope.",
            "May good things come our way.", "I have a positive outlook.", "I feel a sense of hope.", "I'm optimistic."
        ],
        "neutral": [
            "The sky is blue.", "I am going to the market.", "The cat sat on the mat.",
            "It's a normal day.", "I need to buy groceries.", "The book is on the table.",
            "I'm thinking about dinner.",
            "The weather is good.", "I'm doing okay.", "Just a regular day.", "What's up?"
        ]
    }
    return random.choice(templates[emotion]) if emotion in templates else random.choice(templates["neutral"])


def create_and_format_emotion_dataset(
    num_synthetic_per_emotion=250,
    total_dataset_size=5000,
    project_path='/content/drive/My Drive/EmotionalSupportBot-4' # New path for this version
):
    """
    Build the structured emotion dataset, write it to disk as JSONL and return it.

    Steps:
      1. Try to load the 'dair-ai/emotion' train split and pair each text with a
         generated bot reply; any failure is reported and the step is skipped.
      2. Add `num_synthetic_per_emotion` synthetic user/bot pairs per emotion label.
      3. Shuffle, then down-sample to `total_dataset_size` when more records exist
         (a warning is printed when fewer exist).
      4. Write one JSON object per line to
         `<project_path>/emotional_support_structured_dataset.jsonl`.

    Returns a Hugging Face Dataset built from the list of records, each with the
    keys 'user_input', 'emotion' and 'assistant_response'. Combining those fields
    into a single training string is left to the caller.
    """
    records = []
    emotion_labels = [
        "joy", "sadness", "anger", "fear", "surprise", "disgust",
        "love", "gratitude", "anxiety", "hope", "neutral"
    ]

    # --- Step 1: external data (best effort) ---
    print("Attempting to load existing 'dair-ai/emotion' dataset from Hugging Face...")
    try:
        hf_emotion_dataset = load_dataset("dair-ai/emotion", split="train")
        id_to_emotion = {
            0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"
        }
        for row in hf_emotion_dataset:
            row_text = row['text']
            row_emotion = id_to_emotion.get(row['label'], "neutral")
            records.append({
                "user_input": row_text,
                "emotion": row_emotion,
                # The generated reply already ends with the stop token.
                "assistant_response": get_bot_response_options(row_emotion),
            })
        print(f"Loaded {len(hf_emotion_dataset)} samples from 'dair-ai/emotion' and formatted them.")
    except Exception as e:
        print(f"Could not load 'dair-ai/emotion' dataset. Error: {e}")
        print("Continuing with only synthetic data generation.")

    # --- Step 2: synthetic pairs for every emotion label ---
    print(f"Generating {num_synthetic_per_emotion} enhanced synthetic conversational pairs per emotion...")
    for label in emotion_labels:
        # Dict entries are evaluated in order: user sentence first, then the bot reply.
        records.extend(
            {
                "user_input": generate_user_sentence(label),
                "emotion": label,
                "assistant_response": get_bot_response_options(label),
            }
            for _ in range(num_synthetic_per_emotion)
        )

    # --- Step 3: shuffle, then trim to the requested size ---
    random.shuffle(records)
    available = len(records)
    if available > total_dataset_size:
        records = random.sample(records, total_dataset_size)
    elif available < total_dataset_size:
        print(f"Warning: Could not reach {total_dataset_size} samples. Current size: {available}")
        print("Consider increasing 'num_synthetic_per_emotion' or finding more external datasets.")

    print(f"Final structured dataset size: {len(records)} samples.")
    print("\nExample structured data from the dataset (first 2 samples):")
    for sample in records[:2]:
        print(f"- {sample}")

    # --- Step 4: persist as JSONL (one record per line) ---
    os.makedirs(project_path, exist_ok=True)
    dataset_file_jsonl = os.path.join(project_path, "emotional_support_structured_dataset.jsonl")

    with open(dataset_file_jsonl, 'w', encoding='utf-8') as out_file:
        for record in records:
            out_file.write(json.dumps(record) + '\n')

    print(f"\nStructured dataset saved to: {dataset_file_jsonl}")

    # The raw structured records are returned; the single 'text' training string
    # is produced in a later step.
    return HFDataset.from_list(records)

# --- Execution ---
if __name__ == "__main__":
    # Pick the output directory: Google Drive when the Colab helper can be imported,
    # otherwise a folder relative to the current working directory.
    try:
        from google.colab import drive
        drive.mount('/content/drive')
        project_root = '/content/drive/My Drive/EmotionalSupportBot-4' # New directory for this version
    except ImportError:
        # Only a failed import selects the local path; other errors are not handled here.
        print("Not in Google Colab environment. Assuming local execution.")
        project_root = './EmotionalSupportBot-4' # Adjust if running locally and want to save elsewhere

    # Build the dataset and write the JSONL file under project_root.
    # With the target of 5000 samples, the combined pool is shuffled and down-sampled
    # whenever more than 5000 records are available.
    dataset_hf_structured = create_and_format_emotion_dataset(
        num_synthetic_per_emotion=250,
        total_dataset_size=5000,
        project_path=project_root
    )

    print("\nDataset generation complete. You can now use 'emotional_support_structured_dataset.jsonl' for fine-tuning.")
    print("The returned object 'dataset_hf_structured' is a Hugging Face Dataset with 'user_input', 'emotion', 'assistant_response' columns.")

In [None]:
# @title Cell 3: Load Enhanced Dataset (from JSONL and reformat for training)
import pandas as pd
from datasets import Dataset as HFDataset, load_dataset
from sklearn.model_selection import train_test_split
import os

# project_path is (re)defined here rather than taken from an earlier cell; it must point
# at the directory the dataset-generation step wrote its JSONL file to.
project_path = '/content/drive/My Drive/EmotionalSupportBot-4' # Update this if you saved the dataset elsewhere
dataset_file_jsonl = os.path.join(project_path, "emotional_support_structured_dataset.jsonl")

print(f"Loading structured dataset from: {dataset_file_jsonl}")

# Load the JSONL file as a Hugging Face Dataset (the 'json' loader exposes it as split 'train')
loaded_hf_dataset_structured = load_dataset('json', data_files=dataset_file_jsonl, split='train')
print(f"Loaded {len(loaded_hf_dataset_structured)} structured samples.")
print(loaded_hf_dataset_structured[0]) # First sample, to verify the user_input / emotion / assistant_response structure

# Now, map this structured dataset to create the 'text' column required for GPT-2 training
def format_for_gpt2_training(example):
    """
    Collapse one structured record into the single training string.

    Reads 'user_input', 'emotion' and 'assistant_response' from `example` and
    returns {"text": "User: <input> [EMOTION: <emotion>] Bot: <response>"}.
    """
    user_turn = example['user_input']
    emotion_tag = example['emotion']
    bot_turn = example['assistant_response']
    training_text = f"User: {user_turn} [EMOTION: {emotion_tag}] Bot: {bot_turn}"
    return {"text": training_text}

# Apply the formatting: every record becomes a single 'text' field and the three
# structured columns are dropped.
formatted_for_training_dataset = loaded_hf_dataset_structured.map(format_for_gpt2_training, remove_columns=['user_input', 'emotion', 'assistant_response'])

# Extract the 'text' column as a plain Python list, which is what train_test_split is given below
formatted_texts_list = list(formatted_for_training_dataset['text'])

# SEED must already be defined by an earlier cell; it fixes the train/validation split.

# Split into training (90%) and validation (10%) sets
train_texts, val_texts = train_test_split(formatted_texts_list, test_size=0.1, random_state=SEED)

# Convert to Hugging Face Dataset objects with a single 'text' column
train_dataset = HFDataset.from_pandas(pd.DataFrame({"text": train_texts}))
val_dataset = HFDataset.from_pandas(pd.DataFrame({"text": val_texts}))

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print("\nExample formatted text for fine-tuning (from the combined 'text' field):")
print(train_dataset[0]['text'])

In [None]:
# @title Cell 5: Load Tokenizer and Model (Crucial Modifications)
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define your stop token (must match what you added to the dataset)
STOP_TOKEN = "<|stop|>"

# Add the new stop token to the tokenizer's vocabulary so it is encoded as a single token.
num_added_tokens = tokenizer.add_special_tokens({'additional_special_tokens': [STOP_TOKEN]})
print(f"Added {num_added_tokens} new token(s): '{STOP_TOKEN}'")

# Get the ID of your newly added stop token
# This is crucial: it should be different from GPT-2's default EOS (50256)
stop_token_id = tokenizer.convert_tokens_to_ids(STOP_TOKEN)
print(f"New stop token '{STOP_TOKEN}' assigned ID: {stop_token_id}")

# GPT-2 ships without a padding token, so one has to be chosen.
# It must NOT be the stop token: DataCollatorForLanguageModeling(mlm=False) replaces every
# label equal to pad_token_id with -100 (ignored by the loss), so padding with STOP_TOKEN
# would also mask the real stop token at the end of each bot reply and the model would
# never be trained to emit it. GPT-2's own EOS token is used for padding instead.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
assert tokenizer.pad_token_id != stop_token_id, "pad token must differ from the stop token"

model = AutoModelForCausalLM.from_pretrained(model_name)

# IMPORTANT: Resize model embeddings to account for the new token.
# This makes sure the model can learn an embedding for your new token.
model.resize_token_embeddings(len(tokenizer))
# Keep the model config in sync with the tokenizer's padding choice.
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Tokenizer loaded: {model_name}. Added '{STOP_TOKEN}' token with ID {stop_token_id}.")
print(f"Model loaded: {model_name}. Embeddings resized to {len(tokenizer)} tokens.")
print(f"Number of model parameters: {model.num_parameters() / 1e6:.2f}M")

In [None]:
# @title Cell 6: Tokenize and Prepare Data Loaders

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples for causal language modeling.

    Args:
        examples: Batch mapping with a "text" entry holding the strings to tokenize
            (the form `Dataset.map(..., batched=True)` passes in).
        max_length: Maximum number of tokens kept per sample; longer samples are
            truncated. Adjust to the typical length of your samples.

    Returns:
        Whatever the module-level `tokenizer` returns for the batch.
    """
    # No static padding here: padding every sample to `max_length` wastes compute on
    # short samples. The data collator pads each batch to its own longest sequence.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Tokenize both splits in batched mode; the raw "text" column is dropped so only
# the tokenizer outputs remain.
tokenized_train_dataset, tokenized_val_dataset = (
    split.map(tokenize_function, batched=True, remove_columns=["text"])
    for split in (train_dataset, val_dataset)
)

# Collator for causal language modeling (mlm=False): it assembles the batches and
# creates the labels from the input ids.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

print("\nExample tokenized input (first sample):", tokenized_train_dataset[0], sep="\n")

In [None]:
# @title Cell 7: Configure Training Arguments and Trainer

# Checkpoints and the final model are written below the project folder.
output_dir = os.path.join(project_path, "gpt2_emotional_support_model")

# Training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3, # You might need more epochs depending on dataset size and complexity
    per_device_train_batch_size=4, # Smaller batch size for CPU/limited GPU
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8, # Effectively increases batch size to 4 * 8 = 32
    learning_rate=5e-5,
    weight_decay=0.01,
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    max_grad_norm=1.0,
    save_steps=500, # Save checkpoint every 500 steps
    save_total_limit=2, # Only keep the last 2 checkpoints
    eval_strategy="steps", # Evaluate on a step schedule (see eval_steps)
    eval_steps=500,
    logging_steps=100,
    log_level="info",
    seed=SEED,
    # Fall back to CPU only when no CUDA device is available. `use_cpu` is the
    # current name of this switch; the older `no_cuda` flag is deprecated.
    use_cpu=not torch.cuda.is_available(),
    report_to="none" # Disable reporting to W&B etc. if not needed
)

# Initialize Trainer with the tokenized splits and the language-modeling collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
)

print(f"Model will be saved to: {output_dir}")

In [None]:
# @title Cell 8: Train the Model
# Run the fine-tuning loop configured on `trainer`.
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning complete!")

# Store the trained model and its tokenizer together in one "final_model"
# folder under the training output directory.
final_model_path = os.path.join(output_dir, "final_model")
trainer.save_model(output_dir=final_model_path)
tokenizer.save_pretrained(save_directory=final_model_path)
print(f"Final fine-tuned model and tokenizer saved to: {final_model_path}")

In [None]:
# @title Cell 9: Test the Fine-tuned Model (CPU inference) (Update pipeline and Cleaning)
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import os

# Use the path produced by the training cell when it exists in this session.
# Only fall back to the saved Drive location when this cell is run on its own
# (e.g. after a runtime restart), so training and inference cannot silently
# point at different folders.
DEFAULT_FINAL_MODEL_PATH = '/content/drive/My Drive/EmotionalSupportBot-4/gpt2_emotional_support_model/final_model'
try:
    final_model_path
except NameError:
    final_model_path = DEFAULT_FINAL_MODEL_PATH

loaded_tokenizer = AutoTokenizer.from_pretrained(final_model_path)
loaded_model = AutoModelForCausalLM.from_pretrained(final_model_path)

# Inference only: switch to eval mode and keep the model on the CPU.
loaded_model.eval()
device = "cpu"
loaded_model.to(device)

print(f"Loaded model for inference on: {device}")

STOP_TOKEN = "<|stop|>"
# Take the ID from the loaded tokenizer so it matches the vocabulary saved with the model.
stop_token_id = loaded_tokenizer.convert_tokens_to_ids(STOP_TOKEN)
# A token missing from the vocabulary resolves to the unknown-token ID; warn instead of
# silently stopping on the wrong token.
if stop_token_id is None or stop_token_id == loaded_tokenizer.unk_token_id:
    print(f"WARNING: '{STOP_TOKEN}' is not in the loaded tokenizer's vocabulary; generation will not stop on it.")
# Fallback in case the saved tokenizer has no pad token.
if loaded_tokenizer.pad_token_id is None:
    loaded_tokenizer.pad_token_id = stop_token_id
print(f"Stop token '{STOP_TOKEN}' ID: {stop_token_id}")

# Pass eos_token_id and pad_token_id explicitly so generation ends at the custom
# STOP_TOKEN.
generator = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    device=-1, # -1 specifies CPU
    eos_token_id=stop_token_id, # Tell it to stop explicitly at this token
    pad_token_id=stop_token_id  # Use the same for padding
)
# --- END KEY CHANGE ---


def _clean_bot_reply(raw_reply, stop_token):
    """Reduce raw generated text to one tidy, punctuated bot sentence."""
    reply = raw_reply

    # Everything from the first stop marker or next-turn marker onwards does not belong
    # to this reply, so cut there instead of deleting the marker and keeping what follows.
    for marker in (stop_token, '</stop|>', '[EMOTION:', '[EMOTE:', 'User:', 'Bot:'):
        if marker:
            reply = reply.split(marker)[0]

    # Drop stray characters left over from a partially generated marker.
    for stray_char in '<>|':
        reply = reply.replace(stray_char, '')

    # Keep only the first line of what is left.
    reply = reply.strip().split('\n')[0].strip()

    # Close the sentence: an ellipsis when it looks cut off (more than three words),
    # otherwise a period. Text already ending in '.', '?' or '!' is left alone.
    if reply and not reply.endswith(('.', '?', '!')):
        reply += "..." if len(reply.split()) > 3 else "."
    return reply


def generate_response(prompt_text, max_new_tokens=60, num_return_sequences=1, assumed_emotion=None):
    """Generate bot replies for a user message and print them.

    Uses the module-level `generator` pipeline and `STOP_TOKEN`.

    Args:
        prompt_text: The user's message.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        num_return_sequences: How many sampled replies to request.
        assumed_emotion: Optional emotion label; when given it is inserted into the
            prompt as an `[EMOTION: ...]` tag.

    Returns:
        None. Each cleaned reply is printed; a sequence with no recognizable reply
        is printed raw.
    """
    if assumed_emotion:
        input_prompt = f"User: {prompt_text} [EMOTION: {assumed_emotion}] Bot:"
    else:
        input_prompt = f"User: {prompt_text} Bot:"

    print(f"\nGenerating response for: '{prompt_text}' (Assumed Emotion: {assumed_emotion if assumed_emotion else 'None'})")

    generated_sequences = generator(
        input_prompt,
        max_new_tokens=max_new_tokens,
        num_return_sequences=num_return_sequences,
        no_repeat_ngram_size=5, # Keep at 5 or slightly higher
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True,
        # eos_token_id and pad_token_id are set where the pipeline is constructed.
    )

    for i, sequence in enumerate(generated_sequences):
        generated_text = sequence['generated_text']

        if generated_text.startswith(input_prompt):
            # Strip the exact prompt: searching for the first "Bot:" would cut at the
            # wrong place whenever the user's own message contains "Bot:".
            raw_reply = generated_text[len(input_prompt):]
        else:
            bot_response_start = generated_text.find("Bot:")
            if bot_response_start == -1:
                print(f"Generated Raw Text {i+1}: {generated_text}")
                continue
            raw_reply = generated_text[bot_response_start + len("Bot:"):]

        clean_response = _clean_bot_reply(raw_reply, STOP_TOKEN)
        print(f"Generated Response {i+1}: {clean_response}")

# Smoke test: request one sampled reply (num_return_sequences=1, at most 40 new
# tokens) for each (user message, assumed emotion) pair below.
for user_message, emotion_label in (
    ("I'm feeling so happy today, it's amazing!", "joy"),
    ("I just got some really bad news and I feel utterly devastated.", "sadness"),
    ("I'm so angry at how things turned out!", "anger"),
    ("I'm quite worried about my exam results.", "fear"),
    ("This is so boring, I don't know what to do.", "neutral"),
    ("I feel incredibly grateful for your support.", "gratitude"),
):
    generate_response(user_message, max_new_tokens=40, assumed_emotion=emotion_label, num_return_sequences=1)