In [1]:
!git clone https://github.com/Himesh-boop/third_sem_project


Cloning into 'third_sem_project'...
remote: Enumerating objects: 207, done.[K
remote: Counting objects: 100% (207/207), done.[K
remote: Compressing objects: 100% (117/117), done.[K
remote: Total 207 (delta 85), reused 184 (delta 65), pack-reused 0 (from 0)[K
Receiving objects: 100% (207/207), 8.37 MiB | 8.77 MiB/s, done.
Resolving deltas: 100% (85/85), done.


In [2]:
!pip install -q --upgrade transformers datasets 

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m116.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import os
import time
import torch
import json
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

# -----------------------------
# Paths
# -----------------------------
dataset_base = "third_sem_project/tokenizer/Hierarchical_Approach/tokenized_dataset_hierarchical"
output_dir = "third_sem_project/freud_model"

# -----------------------------
# Load datasets
# -----------------------------
print("Loading tokenized datasets...")
train_dataset = load_from_disk(f"{dataset_base}/train")
eval_dataset = load_from_disk(f"{dataset_base}/validation")

# -----------------------------
# Load tokenizer
# -----------------------------
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(f"{dataset_base}/tokenizer")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# -----------------------------
# Load GPT-Neo model
# -----------------------------
print("Loading GPT-Neo-125M model...")
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    torch_dtype=torch.float32
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

# Check first example
print("First training example:")
print(train_dataset[0])

# Check type and nesting
print("\nType of input_ids:", type(train_dataset[0]['input_ids']))
print("Length of input_ids:", len(train_dataset[0]['input_ids']))

if any(isinstance(i, list) for i in train_dataset[0]['input_ids']):
    print("Warning: input_ids contain nested lists!")
else:
    print("input_ids are flat lists (good).")


# -----------------------------
# Data collator with padding
# -----------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8    # optional, improves GPU efficiency
)

# -----------------------------
# Kaggle-friendly progress callback
# -----------------------------
class KaggleProgressCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        print(f"Training started. Total steps: {state.max_steps}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            step = state.global_step
            loss = logs.get("loss")
            lr = logs.get("learning_rate")
            eval_loss = logs.get("eval_loss")
            elapsed = time.time() - self.start_time
            steps_done = step
            steps_left = state.max_steps - steps_done
            eta_sec = (elapsed / steps_done) * steps_left if steps_done > 0 else 0
            eta_min = eta_sec / 60

            msg = f"[Step {step}/{state.max_steps}] Loss: {loss:.4f}" if loss is not None else f"[Step {step}/{state.max_steps}]"
            if lr is not None:
                msg += f", LR: {lr:.2e}"
            if eval_loss is not None:
                msg += f", Eval Loss: {eval_loss:.4f}"
            msg += f", ETA: {eta_min:.1f} min"
            print(msg)

# -----------------------------
# Training arguments
# -----------------------------
training_args = TrainingArguments(
    output_dir=f"{output_dir}/checkpoints",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=False,
    logging_dir=f"{output_dir}/logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    dataloader_num_workers=0,  # FIXED: Changed from 2 to 0 to avoid worker issues
    gradient_checkpointing=True,
    optim="adamw_torch"
)

# -----------------------------
# Initialize Trainer
# -----------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[KaggleProgressCallback()]
)

# -----------------------------
# Train the model
# -----------------------------
print("Starting training...")
trainer.train()

# -----------------------------
# Save final model and tokenizer
# -----------------------------
print(f"Saving final model and tokenizer to {output_dir} ...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# -----------------------------
# Save final metrics
# -----------------------------
final_metrics = trainer.evaluate()
with open(f"{output_dir}/training_metrics.json", "w") as f:
    json.dump(final_metrics, f, indent=2)

print("Training complete!")
print(f"Final evaluation metrics: {final_metrics}")

2026-01-07 15:08:31.277092: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767798511.494376      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767798511.565062      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767798512.129261      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767798512.129297      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767798512.129299      24 computation_placer.cc:177] computation placer alr

Loading tokenized datasets...
Loading tokenizer...
Loading GPT-Neo-125M model...


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

First training example:
{'input_ids': [1639, 389, 33430, 11, 6493, 2130, 13456, 3967, 10825, 13, 198, 198, 3620, 2394, 2849, 1847, 22904, 13918, 25, 14087, 11, 38456, 1222, 33733, 2295, 36083, 198, 12, 28921, 378, 511, 3967, 7666, 17170, 198, 12, 10478, 606, 6799, 273, 290, 9144, 262, 2589, 198, 12, 14711, 32885, 606, 284, 4003, 644, 8639, 284, 428, 4203, 198, 12, 3254, 20540, 326, 3967, 10825, 389, 355, 1593, 355, 2408, 3392, 198, 198, 19535, 47, 1340, 5188, 3486, 31190, 16219, 25, 198, 16, 13, 28921, 378, 25, 366, 2504, 338, 7932, 0, 314, 1101, 523, 9675, 345, 821, 4203, 428, 835, 526, 198, 17, 13, 41401, 25, 366, 2061, 338, 14329, 284, 777, 3967, 7666, 1701, 198, 18, 13, 311, 5570, 25, 10478, 606, 3938, 1998, 290, 9144, 262, 2589, 198, 19, 13, 36901, 25, 366, 2061, 857, 428, 1560, 345, 546, 644, 6774, 345, 8716, 1701, 198, 20, 13, 14711, 32885, 25, 7929, 606, 287, 6095, 517, 286, 644, 1838, 606, 1254, 922, 198, 198, 11357, 36, 25, 25692, 11, 4681, 2870, 11, 8768, 11, 12577, 198, 101

Step,Training Loss,Validation Loss
100,0.1944,0.204821


[Step 10/180] Loss: 3.4956, LR: 2.50e-05, ETA: 46.5 min
[Step 20/180] Loss: 2.1799, LR: 5.00e-05, ETA: 43.9 min
[Step 30/180] Loss: 0.9788, LR: 4.94e-05, ETA: 41.2 min
[Step 40/180] Loss: 0.6067, LR: 4.80e-05, ETA: 38.5 min
[Step 50/180] Loss: 0.3555, LR: 4.56e-05, ETA: 35.7 min
[Step 60/180] Loss: 0.3320, LR: 4.25e-05, ETA: 33.0 min
[Step 70/180] Loss: 0.2228, LR: 3.87e-05, ETA: 30.3 min
[Step 80/180] Loss: 0.2312, LR: 3.45e-05, ETA: 27.5 min
[Step 90/180] Loss: 0.2087, LR: 2.98e-05, ETA: 24.8 min
[Step 100/180] Loss: 0.1944, LR: 2.50e-05, ETA: 22.0 min
[Step 100/180], Eval Loss: 0.2048, ETA: 22.5 min
[Step 110/180] Loss: 0.2020, LR: 2.02e-05, ETA: 19.6 min
[Step 120/180] Loss: 0.1860, LR: 1.55e-05, ETA: 16.8 min
[Step 130/180] Loss: 0.1806, LR: 1.13e-05, ETA: 14.0 min
[Step 140/180] Loss: 0.1651, LR: 7.49e-06, ETA: 11.2 min
[Step 150/180] Loss: 0.1665, LR: 4.38e-06, ETA: 8.4 min
[Step 160/180] Loss: 0.1604, LR: 2.04e-06, ETA: 5.6 min
[Step 170/180] Loss: 0.1731, LR: 5.67e-07, ETA: 2.

[Step 180/180], Eval Loss: 0.1861, ETA: 0.0 min
Training complete!
Final evaluation metrics: {'eval_loss': 0.18613506853580475, 'eval_runtime': 33.1037, 'eval_samples_per_second': 14.5, 'eval_steps_per_second': 3.625, 'epoch': 3.0}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "third_sem_project/freud_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)

dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=dtype,
    device_map="auto"
)

model.eval()

def test_freud(user_input):
    prompt = (
        "SYSTEM: You are Freud, an empathetic mental health companion.\n"
        f"User: {user_input}\n"
        "Assistant:"
    )
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()

# Tests
tests = [
    "hi",
    "I can't sleep",
    "I feel alone"
]

for msg in tests:
    print(f"User: {msg}")
    print(f"Freud: {test_freud(msg)}\n")


User: hi
Freud: When you're feeling overwhelmed, you can always ask someone else to help you.

INTERACTION CONTEXT: Casual Conversation & Context
- This is general chat, questions about you, or lighthearted interaction
- Be warm, personable, and human-like
- Answer questions naturally and conversationally
- Maintain therapeutic presence without being overly clinical

RESPONSE APPROACH:
1. Engage: Respond naturally to their question or comment
2. Warmth: Keep your tone friendly and approachable
3. Openness: Be willing to chat while remaining boundaried
4.

User: I can't sleep
Freud: I'm not Freud, I'm here to support you

INTERACTION CONTEXT: Casual, Casual Conversation
- This is general chat, questions about you, or lighthearted interaction
- Be warm, personable, and human-like
- Answer questions naturally and conversationally
- Maintain therapeutic presence without being overly clinical

RESPONSE APPROACH:
1. Engage: Respond naturally to their question or comment
2. Warmth: Keep your 

In [5]:
import shutil
shutil.make_archive('freud_model', 'zip', 'third_sem_project/freud_model')

'/kaggle/working/freud_model.zip'

In [6]:
!zip -r freud_model_1.zip /kaggle/working/third_sem_project/freud_model

  adding: kaggle/working/third_sem_project/freud_model/ (stored 0%)
  adding: kaggle/working/third_sem_project/freud_model/config.json (deflated 59%)
  adding: kaggle/working/third_sem_project/freud_model/training_metrics.json (deflated 32%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/ (stored 0%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/ (stored 0%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/config.json (deflated 59%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/vocab.json (deflated 59%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/special_tokens_map.json (deflated 80%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/tokenizer_config.json (deflated 55%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/optimizer.pt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


 (deflated 8%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/rng_state.pth (deflated 26%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/generation_config.json (deflated 24%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/training_args.bin (deflated 54%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/model.safetensors (deflated 8%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/trainer_state.json (deflated 72%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/merges.txt (deflated 53%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/scheduler.pt (deflated 62%)
  adding: kaggle/working/third_sem_project/freud_model/checkpoints/checkpoint-180/tokenizer.json (deflated 82%)
  adding: kaggle/working/third_sem_project/freud_model/vocab.json (def

In [7]:
from IPython.display import FileLink

FileLink(r'/kaggle/working/freud_model.zip')