In [1]:
# Keep session alive
import time
from datetime import datetime

def keep_alive():
    """Print timestamp every 30 minutes to keep session active"""
    while True:
        time.sleep(1800)  # 30 minutes
        print(f"‚è∞ Keep-alive: {datetime.now().strftime('%H:%M:%S')}")

# Start keep-alive in background
import threading
thread = threading.Thread(target=keep_alive, daemon=True)
thread.start()
print("‚úÖ Keep-alive started!")

‚úÖ Keep-alive started!


## Step 1: Install Dependencies

In [2]:
%%capture
!pip uninstall -y diffusers bitsandbytes
!pip install -q torch==2.1.2
!pip install -q transformers==4.37.2
!pip install -q datasets==2.16.1
!pip install -q accelerate==0.27.0
!pip install -q peft==0.9.0
!pip install -q trl==0.8.1

print("‚úÖ Packages installed!")

## Step 2: Import Libraries

In [3]:
import torch
import json
import os
import gc
from pathlib import Path
from datetime import datetime

from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import (
    LoraConfig,
    get_peft_model,
    PeftModel,
)
from trl import SFTTrainer

print(f"üî• PyTorch: {torch.__version__}")
print(f"üéÆ CUDA: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"üéØ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

2026-02-04 16:55:01.479884: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770224101.662532      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770224101.713657      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770224102.137630      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770224102.137674      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770224102.137676      23 computation_placer.cc:177] computation placer alr

üî• PyTorch: 2.8.0+cu126
üéÆ CUDA: True
üéØ GPU: Tesla P100-PCIE-16GB
üíæ VRAM: 15.9 GB


## Step 3: Configuration

In [4]:
print("‚öôÔ∏è Configuration\n")

# Paths
TRAIN_DATA_PATH = "/kaggle/input/freud-2-0/freud_training_data/train.json"
VAL_DATA_PATH = "/kaggle/input/freud-2-0/freud_training_data/validation.json"
OUTPUT_DIR = "./freud_phi2_finetuned"
HF_MODEL_NAME = "Dalton-Khatri/freud-phi2"

# Model
BASE_MODEL = "microsoft/phi-2"

# Training (optimized for P100 + FP32)
NUM_EPOCHS = 3
BATCH_SIZE = 1  # Small due to FP32
GRADIENT_ACCUMULATION_STEPS = 8
LEARNING_RATE = 2e-4
WARMUP_RATIO = 0.03
MAX_SEQ_LENGTH = 512

# LoRA
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

# Checkpointing
LOGGING_STEPS = 50
SAVE_STEPS = 500

print(f"‚úÖ Config loaded")
print(f"üìä Effective batch: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")
print(f"üíæ Max sequence: {MAX_SEQ_LENGTH}")

‚öôÔ∏è Configuration

‚úÖ Config loaded
üìä Effective batch: 8
üíæ Max sequence: 512


## Step 4: Load Training Data

In [5]:
print("üìÇ Loading data...\n")

with open(TRAIN_DATA_PATH, 'r') as f:
    train_data = json.load(f)

with open(VAL_DATA_PATH, 'r') as f:
    val_data = json.load(f)

train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)

print(f"‚úÖ Train: {len(train_dataset):,} samples")
print(f"‚úÖ Val: {len(val_dataset):,} samples\n")

print("üîç Sample:")
print("="*80)
print(train_dataset[0]['text'][:400])
print("="*80)

üìÇ Loading data...

‚úÖ Train: 10,025 samples
‚úÖ Val: 1,114 samples

üîç Sample:
<|system|>: You are Freud, a calm, empathetic therapeutic AI assistant. You respond thoughtfully, kindly, and supportively. You ask gentle follow-up questions and never judge the user.
<|user|>:
[emotion: done]
That's the end of my input.
<|assistant|>:
Alright, take care and goodbye.
<|user|>:
[emotion: done]
I've given my perspective.
<|assistant|>:
Got it. Have a good one.


## Step 5: Load Model in FP32 (NO FP16!)

**üîß KEY FIX:** Loading in `torch.float32` to avoid gradient scaling errors.

In [6]:
print(f"ü§ñ Loading {BASE_MODEL} in FP32...\n")

# Load in FULL PRECISION (FP32)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float32,  # ‚Üê KEY FIX: FP32 not FP16
    device_map="auto",
    trust_remote_code=True,
)

print("‚úÖ Model loaded!")
print(f"üìä Parameters: {model.num_parameters():,}")
print(f"üíæ Data type: {next(model.parameters()).dtype}")
print(f"üéÆ Device: {next(model.parameters()).device}")

ü§ñ Loading microsoft/phi-2 in FP32...





config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

‚úÖ Model loaded!
üìä Parameters: 2,779,683,840
üíæ Data type: torch.float32
üéÆ Device: cuda:0


## Step 6: Load Tokenizer

In [7]:
print("üî§ Loading tokenizer...\n")

tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL,
    trust_remote_code=True,
)

# Set padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

print("‚úÖ Tokenizer ready!")
print(f"üìù Vocab: {len(tokenizer):,}")
print(f"üîë EOS: {tokenizer.eos_token} ({tokenizer.eos_token_id})")
print(f"üîë PAD: {tokenizer.pad_token} ({tokenizer.pad_token_id})")

üî§ Loading tokenizer...



tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


‚úÖ Tokenizer ready!
üìù Vocab: 50,295
üîë EOS: <|endoftext|> (50256)
üîë PAD: <|endoftext|> (50256)


## Step 7: Apply LoRA

In [8]:
print("üéØ Applying LoRA...\n")

# Enable gradient checkpointing
model.gradient_checkpointing_enable()

lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["Wqkv", "fc1", "fc2"],  # Phi-2 specific
)

model = get_peft_model(model, lora_config)

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())

print("‚úÖ LoRA applied!")
print(f"üìä Trainable: {trainable:,} ({100*trainable/total:.2f}%)")
print(f"üìä Total: {total:,}")

üéØ Applying LoRA...

‚úÖ LoRA applied!
üìä Trainable: 6,553,600 (0.24%)
üìä Total: 2,786,237,440


## Step 8: Configure Training Arguments

**üîß KEY FIX:** `fp16=False` and `bf16=False` to prevent gradient errors.

In [9]:
print("üìù Setting up training...\n")

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_ratio=WARMUP_RATIO,
    
    # üîß CRITICAL: Disable mixed precision
    fp16=False,  # ‚Üê NO FP16!
    bf16=False,  # ‚Üê NO BF16!
    
    # Optimizer
    optim="adamw_torch",
    weight_decay=0.01,
    max_grad_norm=1.0,
    
    # Logging
    logging_steps=LOGGING_STEPS,
    logging_dir=f"{OUTPUT_DIR}/logs",
    
    # Saving
    save_strategy="steps",
    save_steps=SAVE_STEPS,
    save_total_limit=2,
    
    # Evaluation
    evaluation_strategy="steps",
    eval_steps=SAVE_STEPS,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    greater_is_better=False,
    
    # Misc
    report_to="none",
    dataloader_num_workers=0,
    remove_unused_columns=False,
    seed=42,
)

print("‚úÖ Training args set!")
print(f"\nüéØ Settings:")
print(f"   - Precision: FP32 (no mixed precision)")
print(f"   - Batch: {BATCH_SIZE} x {GRADIENT_ACCUMULATION_STEPS} = {BATCH_SIZE*GRADIENT_ACCUMULATION_STEPS}")
print(f"   - Learning rate: {LEARNING_RATE}")

üìù Setting up training...

‚úÖ Training args set!

üéØ Settings:
   - Precision: FP32 (no mixed precision)
   - Batch: 1 x 8 = 8
   - Learning rate: 0.0002


## Step 9: Create SFTTrainer

**üîß KEY FIX:** Using `formatting_func` instead of relying on dataset_text_field alone.

In [10]:
print("üèãÔ∏è Creating trainer...\n")

# Set environment variable
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Clear GPU cache
gc.collect()
torch.cuda.empty_cache()

print(f"üíæ GPU allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
print(f"üíæ GPU reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

# üîß KEY FIX: Tokenize the datasets BEFORE creating trainer
def tokenize_function(examples):
    """Tokenize the text field"""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
        padding=False,  # Will be done by data collator
    )

print("üî§ Tokenizing datasets...")
tokenized_train = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=train_dataset.column_names,
    desc="Tokenizing train",
)
tokenized_val = val_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=val_dataset.column_names,
    desc="Tokenizing validation",
)
print("‚úÖ Tokenization complete!\n")

# Create data collator
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

# Create trainer with tokenized data
from transformers import Trainer  # Use regular Trainer, not SFTTrainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
)

print("‚úÖ Trainer ready!\n")

üèãÔ∏è Creating trainer...

üíæ GPU allocated: 10.52 GB
üíæ GPU reserved: 10.52 GB
üî§ Tokenizing datasets...


Tokenizing train:   0%|          | 0/10025 [00:00<?, ? examples/s]

Tokenizing validation:   0%|          | 0/1114 [00:00<?, ? examples/s]

‚úÖ Tokenization complete!

‚úÖ Trainer ready!



## Step 10: Start Training üöÄ

**This will take ~4-5 hours on P100 GPU.**

In [11]:
print("="*80)
print("üöÄ STARTING TRAINING")
print("="*80)
print(f"‚è∞ Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("\nüí° Expected time: ~4-5 hours")
print("üíæ Checkpoints every 500 steps\n")

# Train!
train_result = trainer.train()

print("\n" + "="*80)
print("‚úÖ TRAINING COMPLETE!")
print("="*80)
print(f"‚è∞ Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(f"\nüìä Final loss: {train_result.training_loss:.4f}")

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


üöÄ STARTING TRAINING
‚è∞ Started: 2026-02-04 16:55:59

üí° Expected time: ~4-5 hours
üíæ Checkpoints every 500 steps



  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
500,0.2704,0.249055
1000,0.2164,0.209081
1500,0.2049,0.193492
2000,0.194,0.180349
2500,0.1787,0.171568
3000,0.1723,0.165991
3500,0.1619,0.161966


‚è∞ Keep-alive: 17:24:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 17:54:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 18:24:11
‚è∞ Keep-alive: 18:54:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 19:24:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 19:54:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 20:24:11
‚è∞ Keep-alive: 20:54:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 21:24:11


  return fn(*args, **kwargs)


‚è∞ Keep-alive: 21:54:11

‚úÖ TRAINING COMPLETE!
‚è∞ Finished: 2026-02-04 21:57:19

üìä Final loss: 0.2314


## Step 11: Save Model

In [12]:
print("\nüíæ Saving model...\n")

trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

print(f"‚úÖ Saved to: {OUTPUT_DIR}/")
print("\nüìÅ Files:")
for f in Path(OUTPUT_DIR).glob("*"):
    print(f"   - {f.name}")


üíæ Saving model...





‚úÖ Saved to: ./freud_phi2_finetuned/

üìÅ Files:
   - adapter_config.json
   - merges.txt
   - training_args.bin
   - tokenizer_config.json
   - special_tokens_map.json
   - added_tokens.json
   - vocab.json
   - README.md
   - adapter_model.safetensors
   - checkpoint-3000
   - tokenizer.json
   - checkpoint-3500


## Step 12: Test the Model üß™

In [13]:
print("üß™ Testing model...\n")

def test_model(user_input, emotion="neutral"):
    prompt = (
        "<|system|>: You are Freud, a calm, empathetic therapeutic AI assistant. "
        "You respond thoughtfully, kindly, and supportively. "
        "You ask gentle follow-up questions and never judge the user.\n"
        f"<|user|>:\n"
        f"[emotion: {emotion}]\n"
        f"{user_input}\n"
        f"<|assistant|>:\n"
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    
    full = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if "<|assistant|>:" in full:
        response = full.split("<|assistant|>:")[-1].strip()
        if "<|user|>" in response:
            response = response.split("<|user|>")[0].strip()
    else:
        response = full.strip()
    
    return response

# Test cases
tests = [
    ("Hi", "greeting"),
    ("I feel sad today", "sad"),
    ("I'm anxious about my exam", "anxious"),
    ("I had a great day!", "happy"),
]

print("="*80)
for user, emotion in tests:
    print(f"\nüë§ User ({emotion}): {user}")
    resp = test_model(user, emotion)
    print(f"ü§ñ Freud: {resp}")
    print("-"*80)

print("\n‚úÖ Testing complete!")

üß™ Testing model...


üë§ User (greeting): Hi
ü§ñ Freud: Hola! ¬øQu√© est√°s h
--------------------------------------------------------------------------------

üë§ User (sad): I feel sad today
ü§ñ Freud: Oh no! What's troubling you? Please tell me more.
<|user
--------------------------------------------------------------------------------

üë§ User (anxious): I'm anxious about my exam
ü§ñ Freud: I understand how exhausting that must be. Please
--------------------------------------------------------------------------------

üë§ User (happy): I had a great day!
ü§ñ Freud: That's wonderful news! Anything special behind your happiness?
--------------------------------------------------------------------------------

‚úÖ Testing complete!


## Step 13: Merge LoRA Adapter (Optional)

Merges adapter with base model for easier deployment.

In [14]:
print("üîÑ Merging LoRA adapter...\n")

# Reload base
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    trust_remote_code=True,
)

# Load and merge
merged = PeftModel.from_pretrained(base, OUTPUT_DIR)
merged = merged.merge_and_unload()

# Save
MERGED_DIR = f"{OUTPUT_DIR}_merged"
merged.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

print(f"‚úÖ Merged model: {MERGED_DIR}/")

üîÑ Merging LoRA adapter...



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 26.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 27.12 MiB is free. Process 3323 has 15.86 GiB memory in use. Of the allocated memory 15.55 GiB is allocated by PyTorch, and 5.74 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
import shutil
shutil.make_archive('freud_model_new_phi', 'zip', '/kaggle/working/freud_phi2_finetuned')

In [None]:
!zip -r freud_model_1.zip /kaggle/working/third_sem_project/freud_model

In [None]:
from IPython.display import FileLink

FileLink(r'freud_model_new_phi.zip')

## Step 14: Upload to HuggingFace

In [None]:
print(f"üì§ Uploading to: {HF_MODEL_NAME}...\n")

merged.push_to_hub(HF_MODEL_NAME, use_temp_dir=False)
tokenizer.push_to_hub(HF_MODEL_NAME, use_temp_dir=False)

print("‚úÖ Upload complete!")
print(f"\nüîó https://huggingface.co/{HF_MODEL_NAME}")