In [1]:
!pip install -q protobuf==4.25.3

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.26.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
grain 0.2.15 requires protobuf>=5.28.3, but you have protobuf 4.25.3 which is incompatible.
ray 2.52.1 requires click!=8.3.*,>=7.0, but you have click 8.3.1 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 4.25.3 which is incompatible.
ydf 0.13.0 requires protobuf<7.0.0,>=5.29.1, but you have protobuf 4.25.3 which is incompatible.
bigframes 2.26.0 requires rich<14,>=12.4.4, but you have rich 14.2.0 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 4.25.3 which is incompatible.
gcsf

In [2]:
!git clone -b test --single-branch https://github.com/Himesh-boop/Freud.git

Cloning into 'Freud'...
remote: Enumerating objects: 244, done.[K
remote: Counting objects: 100% (244/244), done.[K
remote: Compressing objects: 100% (113/113), done.[K
remote: Total 244 (delta 109), reused 240 (delta 106), pack-reused 0 (from 0)[K
Receiving objects: 100% (244/244), 8.86 MiB | 13.34 MiB/s, done.
Resolving deltas: 100% (109/109), done.


In [None]:
import os
import time
import torch
import json
from datasets import load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback
)

dataset_base = "Freud/tokenizer/Hierarchical_Approach/tokenized_dataset_hierarchical"
output_dir = "Freud/freud_model"

print("Loading tokenized datasets...")
train_dataset = load_from_disk(f"{dataset_base}/train")
eval_dataset = load_from_disk(f"{dataset_base}/validation")

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(f"{dataset_base}/tokenizer")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Loading GPT-Neo-125M model...")
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125M",
    torch_dtype=torch.float32
)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
model.config.use_cache = False

print("First training example:")
print(train_dataset[0])

print("\nType of input_ids:", type(train_dataset[0]['input_ids']))
print("Length of input_ids:", len(train_dataset[0]['input_ids']))

if any(isinstance(i, list) for i in train_dataset[0]['input_ids']):
    print("Warning: input_ids contain nested lists!")
else:
    print("input_ids are flat lists (good).")


data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8   
)

class KaggleProgressCallback(TrainerCallback):
    def __init__(self):
        self.start_time = None

    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        print(f"Training started. Total steps: {state.max_steps}")

    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs:
            step = state.global_step
            loss = logs.get("loss")
            lr = logs.get("learning_rate")
            eval_loss = logs.get("eval_loss")
            elapsed = time.time() - self.start_time
            steps_done = step
            steps_left = state.max_steps - steps_done
            eta_sec = (elapsed / steps_done) * steps_left if steps_done > 0 else 0
            eta_min = eta_sec / 60

            msg = f"[Step {step}/{state.max_steps}] Loss: {loss:.4f}" if loss is not None else f"[Step {step}/{state.max_steps}]"
            if lr is not None:
                msg += f", LR: {lr:.2e}"
            if eval_loss is not None:
                msg += f", Eval Loss: {eval_loss:.4f}"
            msg += f", ETA: {eta_min:.1f} min"
            print(msg)

training_args = TrainingArguments(
    output_dir=f"{output_dir}/checkpoints",
    overwrite_output_dir=True,
    num_train_epochs=3,

    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,

    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    weight_decay=0.01,

    fp16=True,                    
    logging_steps=10,

    eval_strategy="steps",
    eval_steps=100,

    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,

    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    report_to="none",
    dataloader_num_workers=2,       
    gradient_checkpointing=False,    
    optim="adamw_torch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=[KaggleProgressCallback()]
)

print("Starting training...")
trainer.train()

print(f"Saving final model and tokenizer to {output_dir} ...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

final_metrics = trainer.evaluate()
with open(f"{output_dir}/training_metrics.json", "w") as f:
    json.dump(final_metrics, f, indent=2)

print("Training complete!")
print(f"Final evaluation metrics: {final_metrics}")

2026-01-09 15:28:04.488446: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767972484.700074      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767972484.758586      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767972485.268710      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767972485.268758      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767972485.268761      23 computation_placer.cc:177] computation placer alr

Loading tokenized datasets...
Loading tokenizer...
Loading GPT-Neo-125M model...


config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

First training example:
{'input_ids': [1639, 389, 33430, 11, 6493, 2130, 13456, 3967, 10825, 13, 198, 198, 3620, 2394, 2849, 1847, 22904, 13918, 25, 14087, 11, 38456, 1222, 33733, 2295, 36083, 198, 12, 28921, 378, 511, 3967, 7666, 17170, 198, 12, 10478, 606, 6799, 273, 290, 9144, 262, 2589, 198, 12, 14711, 32885, 606, 284, 4003, 644, 8639, 284, 428, 4203, 198, 12, 3254, 20540, 326, 3967, 10825, 389, 355, 1593, 355, 2408, 3392, 198, 198, 19535, 47, 1340, 5188, 3486, 31190, 16219, 25, 198, 16, 13, 28921, 378, 25, 366, 2504, 338, 7932, 0, 314, 1101, 523, 9675, 345, 821, 4203, 428, 835, 526, 198, 17, 13, 41401, 25, 366, 2061, 338, 14329, 284, 777, 3967, 7666, 1701, 198, 18, 13, 311, 5570, 25, 10478, 606, 3938, 1998, 290, 9144, 262, 2589, 198, 19, 13, 36901, 25, 366, 2061, 857, 428, 1560, 345, 546, 644, 6774, 345, 8716, 1701, 198, 20, 13, 14711, 32885, 25, 7929, 606, 287, 6095, 517, 286, 644, 1838, 606, 1254, 922, 198, 198, 11357, 36, 25, 25692, 11, 4681, 2870, 11, 8768, 11, 12577, 198, 101

Step,Training Loss,Validation Loss
100,0.1857,0.212402
200,0.1378,0.162611
300,0.1237,0.152923


[Step 10/360] Loss: 3.5748, LR: 1.25e-05, ETA: 12.4 min
[Step 20/360] Loss: 2.7899, LR: 2.64e-05, ETA: 11.7 min
[Step 30/360] Loss: 1.6382, LR: 4.03e-05, ETA: 11.3 min
[Step 40/360] Loss: 0.8238, LR: 5.00e-05, ETA: 10.9 min
[Step 50/360] Loss: 0.5415, LR: 4.98e-05, ETA: 10.5 min
[Step 60/360] Loss: 0.4018, LR: 4.94e-05, ETA: 10.2 min
[Step 70/360] Loss: 0.3298, LR: 4.87e-05, ETA: 9.8 min
[Step 80/360] Loss: 0.2519, LR: 4.79e-05, ETA: 9.5 min
[Step 90/360] Loss: 0.2413, LR: 4.68e-05, ETA: 9.1 min
[Step 100/360] Loss: 0.1857, LR: 4.55e-05, ETA: 8.8 min
[Step 100/360], Eval Loss: 0.2124, ETA: 9.8 min
[Step 110/360] Loss: 0.2121, LR: 4.40e-05, ETA: 9.3 min
[Step 120/360] Loss: 0.2332, LR: 4.23e-05, ETA: 8.9 min
[Step 130/360] Loss: 0.1611, LR: 4.05e-05, ETA: 8.4 min
[Step 140/360] Loss: 0.1542, LR: 3.85e-05, ETA: 8.0 min
[Step 150/360] Loss: 0.1682, LR: 3.64e-05, ETA: 7.6 min
[Step 160/360] Loss: 0.1666, LR: 3.42e-05, ETA: 7.2 min
[Step 170/360] Loss: 0.1451, LR: 3.19e-05, ETA: 6.8 min
[St

There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


[Step 360/360], ETA: 0.0 min
Saving final model and tokenizer to Freud/freud_model ...


[Step 360/360], Eval Loss: 0.1626, ETA: 0.0 min
Training complete!
Final evaluation metrics: {'eval_loss': 0.16261079907417297, 'eval_runtime': 22.6486, 'eval_samples_per_second': 21.193, 'eval_steps_per_second': 2.649, 'epoch': 3.0}


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "Freud/freud_model"

tokenizer = AutoTokenizer.from_pretrained(model_path)

dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=dtype,
    device_map="auto"
)

model.eval()

def test_freud(user_input):
    prompt = (
        "SYSTEM: You are Freud, an empathetic mental health companion.\n"
        f"User: {user_input}\n"
        "Assistant:\n<ANSWER>\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response.split("Assistant:")[-1].strip()

# Tests
tests = [
    "hi",
    "I can't sleep",
    "I feel alone"
]

for msg in tests:
    print(f"User: {msg}")
    print(f"Freud: {test_freud(msg)}\n")

User: hi
Freud: <ANSWER>
I'm Freud, supporting you in any difficult area.
</ANSWER>
</docx>
</sect2>
</sect1>
</sect2>
</sect3>
</sect4>
</sect5>
</sect6>
</sect7>
</sect8>
</sect9>
</sect10>
</sect11>
</sect12>
</sect12>
</sect13>
</sect14>
</sect15>
</sect16>
</sect17>
</sect18>
</sect19>
</sect20>
</sect21>
</sect22>
</sect22a>
</sect

User: I can't sleep
Freud: <ANSWER>
It's okay to feel anxious. You're not alone.
</ANSWER>
</BR>

<BR>
<HR>
</HR>
</DIV>
</EM>
<EM>
</EM>
</GENERAL>
<CRITICAL>
</CRITICAL>
<CREDENTAL>
<IMMEDIMATORY>
<TABLE>
<TR>
<TD>ANSWER
</TD>
</TR>
</WMOL10>
</TABLE>
</EM>
</EM>
</GENERAL>
</BLOCKEDRESPONSE>
</LEFTBRIGHT>
</EM>
</

User: I feel alone
Freud: <ANSWER>
I'm here to help you.
</ANSWER>
</docbook>
</div>
<div class="detail">
<p>
<a href="/overview/overview-detail" target="_blank">
<img alt="overview" src="overview.png"></a>
</p>
</p>
</div>
</div>
</div>
</body>
</html>

I'm using JSP to save information.
<!-- JAVABILITY CONTEXT: This attribute is disabl

In [5]:
import shutil
shutil.make_archive('freud_model_2', 'zip', 'Freud/freud_model')

'/kaggle/working/freud_model_2.zip'

In [6]:
from IPython.display import FileLink

FileLink(r'freud_model_2.zip')