In [6]:
!nvidia-smi

Sun Nov 23 08:29:29 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   35C    P8              9W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pyarrow>=21.0.0 (from datasets>=2.0.0->evaluate)
  Downloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp311-cp311-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m47.7/47.7 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: pyarrow, evaluate
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 19.0.1
    Uninstalling pyarrow-19.0.1:
      Successfully uninstalled pyarrow-19.0.1
[31mERR

In [None]:
import os

print("Input directory:", os.listdir('/kaggle/input'))

for root, dirs, files in os.walk('/kaggle/input', topdown=True):
    print(root, dirs, files)

Input directory: ['.virtual_documents']
/kaggle/working ['.virtual_documents'] []
/kaggle/working/.virtual_documents [] []


In [None]:
import torch
import pandas as pd
from datasets import Dataset, load_from_disk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import TrainerCallback
import numpy as np
import evaluate
import subprocess
import psutil
import os

os.environ["WANDB_DISABLED"] = "true"


# ====================== Function to check the memory usage
def print_memory_usage(tag=""):
    print(f"\n===== Memory Usage {tag} =====")
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"]
        )
        gpu_info = gpu_info.decode("utf-8").strip().split("\n")
        for i, gpu in enumerate(gpu_info):
            used, total = map(int, gpu.split(','))
            print(f"GPU {i}: {used}MB / {total}MB")
    except:
        print("No GPU found")

    ram = psutil.virtual_memory()
    print(f"CPU RAM: {ram.used/1024**2:.2f}MB / {ram.total/1024**2:.2f}MB")
    print("====================================\n")




class MemoryCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0 and state.global_step > 0:
            print_memory_usage(f"Training step {state.global_step}")


cache_path = "/tokenized_enron_spam"

if os.path.exists(cache_path):
    tokenized_datasets = load_from_disk(cache_path)

else:
    df = pd.read_csv("/kaggle/input/classification-data/enron_spam_data.csv")
    dataset = Dataset.from_pandas(df)
    dataset = dataset.train_test_split(test_size=0.2)

    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def prepare_dataset(example):
        example["label"] = 1 if example["Spam/Ham"].lower() == "spam" else 0
        return example

    dataset = dataset.map(prepare_dataset)

    def preprocess_function(examples):
        messages = [msg if msg is not None else "" for msg in examples["Message"]]
        return tokenizer(messages, truncation=True, padding="max_length")

    dataset = dataset.filter(lambda x: x["Message"] is not None and len(str(x["Message"]).strip()) > 0)

    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    print("üíæ Saving tokenized dataset...")
    tokenized_datasets.save_to_disk(cache_path)
    print(f"‚úî Tokenized dataset saved to {cache_path}")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Label map
id2label = {0: "HAM", 1: "SPAM"}
label2id = {"HAM": 0, "SPAM": 1}

# ====================== Load model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="kagglespam_classification_results",
    learning_rate=2e-5,
    dataloader_num_workers=0,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[MemoryCallback()],
)

print_memory_usage("Before training")
print("Starting training...")
trainer.train()
print_memory_usage("After training")

print("Evaluating...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

trainer.save_model("spam_classifier_model__full")
print("Model saved to spam_classifier_model__full")


2025-11-23 08:43:59.951971: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763887440.336696      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763887440.478062      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

üÜï No cache found ‚Äî processing raw dataset...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/26972 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26972 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/26671 [00:00<?, ? examples/s]

Map:   0%|          | 0/6674 [00:00<?, ? examples/s]

üíæ Saving tokenized dataset...


Saving the dataset (0/1 shards):   0%|          | 0/26671 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6674 [00:00<?, ? examples/s]

‚úî Tokenized dataset saved to /tokenized_enron_spam


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script: 0.00B [00:00, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(



===== Memory Usage Before training =====
GPU 0: 395MB / 15360MB
GPU 1: 3MB / 15360MB
CPU RAM: 2728.25MB / 32102.90MB

Starting training...




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.034197,0.987863
2,No log,0.030358,0.990261



===== Memory Usage Training step 100 =====
GPU 0: 11411MB / 15360MB
GPU 1: 10621MB / 15360MB
CPU RAM: 3498.99MB / 32102.90MB






===== Memory Usage Training step 200 =====
GPU 0: 11411MB / 15360MB
GPU 1: 10621MB / 15360MB
CPU RAM: 3475.33MB / 32102.90MB






===== Memory Usage Training step 300 =====
GPU 0: 11411MB / 15360MB
GPU 1: 10621MB / 15360MB
CPU RAM: 3513.51MB / 32102.90MB






===== Memory Usage Training step 400 =====
GPU 0: 11411MB / 15360MB
GPU 1: 10621MB / 15360MB
CPU RAM: 3506.30MB / 32102.90MB






===== Memory Usage After training =====
GPU 0: 11411MB / 15360MB
GPU 1: 10621MB / 15360MB
CPU RAM: 3494.85MB / 32102.90MB

Evaluating...




Evaluation results: {'eval_loss': 0.030358023941516876, 'eval_accuracy': 0.9902607132154629, 'eval_runtime': 58.0805, 'eval_samples_per_second': 114.91, 'eval_steps_per_second': 0.913, 'epoch': 2.0}
Model saved to spam_classifier_model__full


In [None]:
# We also have to import those librarys again in each new cell, as the code running on kaggle would not release the memory automatically, we have to reset the environment each time
import os
import subprocess
import psutil
import numpy as np
import pandas as pd

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    TrainerCallback
)
from datasets import Dataset, load_from_disk
from peft import LoraConfig, get_peft_model
import evaluate  
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Same , we need to reset the environment, function in a new cell
def print_memory_usage(tag=""):
    print(f"\n===== Memory Usage {tag} =====")
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,noheader,nounits"]
        )
        gpu_info = gpu_info.decode("utf-8").strip().split("\n")
        for i, gpu in enumerate(gpu_info):
            used, total = map(int, gpu.split(','))
            print(f"GPU {i}: {used}MB / {total}MB")
    except Exception as e:
        print("No GPU found or nvidia-smi not available")

    ram = psutil.virtual_memory()
    print(f"CPU RAM: {ram.used/1024**2:.2f}MB / {ram.total/1024**2:.2f}MB")
    print("====================================\n")


class MemoryCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0 and state.global_step > 0:
            print_memory_usage(f"Training step {state.global_step}")

cache_path = "/tokenized_enron_spam"

if os.path.exists(cache_path):
    tokenized_datasets = load_from_disk(cache_path)
else:
    df = pd.read_csv("/kaggle/input/classification-data/enron_spam_data.csv")
    dataset = Dataset.from_pandas(df)
    dataset = dataset.train_test_split(test_size=0.2)

    model_name = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    def prepare_dataset(example):
        example["label"] = 1 if example["Spam/Ham"].lower() == "spam" else 0
        return example

    dataset = dataset.map(prepare_dataset)

    def preprocess_function(examples):
        messages = [msg if msg is not None else "" for msg in examples["Message"]]
        return tokenizer(messages, truncation=True, padding="max_length", max_length=512)

    dataset = dataset.filter(lambda x: x["Message"] is not None and len(str(x["Message"]).strip()) > 0)

    tokenized_datasets = dataset.map(preprocess_function, batched=True, batch_size=1000)

    print("Saving tokenized dataset...")
    tokenized_datasets.save_to_disk(cache_path)
    print(f"‚úî Tokenized dataset saved to {cache_path}")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

id2label = {0: "HAM", 1: "SPAM"}
label2id = {"HAM": 0, "SPAM": 1}


# =================================== Load Model with LoRA
print("Loading DistilBERT for LoRA...")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
    torch_dtype=torch.float16,  # FP16 for memory efficiency
)

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_lin", "k_lin", "v_lin", "out_lin"],  # DistilBERT specific
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


training_args = TrainingArguments(
    output_dir="/kaggle/working/spam_classification_LoRA_result",
    learning_rate=1e-4,
    dataloader_num_workers=0,  # Avoid tokenizers fork warning
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    max_grad_norm=1.0,
    report_to="none",  
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[MemoryCallback()],
)

print_memory_usage("Before LoRA training")
print("üöÄ Starting LoRA training...")
trainer.train()
print_memory_usage("After LoRA training")

print("Evaluating...")
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

trainer.save_model("/kaggle/working/spam_classifier_LoRA_model")
print("LoRA model saved.")

2025-11-23 06:07:55.601300: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763878076.023848      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763878076.145067      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

üÜï No cache found ‚Äî processing raw dataset...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/26972 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Filter:   0%|          | 0/26972 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/26684 [00:00<?, ? examples/s]

Map:   0%|          | 0/6661 [00:00<?, ? examples/s]

üíæ Saving tokenized dataset...


Saving the dataset (0/1 shards):   0%|          | 0/26684 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6661 [00:00<?, ? examples/s]

‚úî Tokenized dataset saved to /tokenized_enron_spam
Loading DistilBERT for LoRA...


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,181,954 || all params: 68,136,964 || trainable%: 1.7347


Downloading builder script: 0.00B [00:00, ?B/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



===== Memory Usage Before LoRA training =====
GPU 0: 253MB / 15360MB
GPU 1: 3MB / 15360MB
CPU RAM: 2687.07MB / 32102.89MB

üöÄ Starting LoRA training...




Epoch,Training Loss,Validation Loss,Accuracy
1,No log,,0.487614
2,No log,,0.487614



===== Memory Usage Training step 100 =====
GPU 0: 6991MB / 15360MB
GPU 1: 6989MB / 15360MB
CPU RAM: 3518.64MB / 32102.89MB






===== Memory Usage Training step 200 =====
GPU 0: 6991MB / 15360MB
GPU 1: 6989MB / 15360MB
CPU RAM: 3517.01MB / 32102.89MB



  if operator(metric_value, self.state.best_metric):



===== Memory Usage Training step 300 =====
GPU 0: 7017MB / 15360MB
GPU 1: 7015MB / 15360MB
CPU RAM: 3506.80MB / 32102.89MB






===== Memory Usage Training step 400 =====
GPU 0: 7017MB / 15360MB
GPU 1: 7015MB / 15360MB
CPU RAM: 3511.20MB / 32102.89MB



  if operator(metric_value, self.state.best_metric):



===== Memory Usage After LoRA training =====
GPU 0: 7017MB / 15360MB
GPU 1: 7015MB / 15360MB
CPU RAM: 3501.22MB / 32102.89MB

Evaluating...




Evaluation results: {'eval_loss': nan, 'eval_accuracy': 0.4876144723014562, 'eval_runtime': 18.5303, 'eval_samples_per_second': 359.464, 'eval_steps_per_second': 2.86, 'epoch': 2.0}
LoRA model saved.


In [None]:
import os
import subprocess
import psutil
import pandas as pd
import numpy as np
import torch

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, load_from_disk

os.environ["WANDB_DISABLED"] = "true"


def print_memory_usage(tag=""):
    print(f"\n===== Memory Usage {tag} =====")
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"]
        ).decode("utf-8").strip().split("\n")

        for i, line in enumerate(gpu_info):
            used, total = map(int, line.split(","))
            print(f"GPU {i}: {used}MB / {total}MB")
    except:
        print("No GPU detected")

    ram = psutil.virtual_memory()
    print(f"CPU RAM: {ram.used/1024**2:.2f}MB / {ram.total/1024**2:.2f}MB")
    print("======================================\n")

class MemoryCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0 and state.global_step > 0:
            print_memory_usage(f"Training step {state.global_step}")

cache_dir = "/content/drive/MyDrive/gpt2_reply_tokenized_cache"


if os.path.exists(cache_dir):
    tokenized_datasets = load_from_disk(cache_dir)

else:
    df = pd.read_csv("/kaggle/input/generation-data/synthetic_reply_dataset.csv")
    dataset = Dataset.from_pandas(df)
    dataset = dataset.train_test_split(test_size=0.1)

    model_name = "gpt2"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token


    def preprocess_function(examples):
        inputs = examples["Message"]
        targets = examples["reply"]

        texts = []
        for i, t in zip(inputs, targets):
            i = str(i) if i is not None else ""
            t = str(t) if t is not None else ""
            text = f"Email: {i[:512]}\n\nReply: {t}{tokenizer.eos_token}"
            texts.append(text)

        return tokenizer(texts, truncation=True, padding="max_length", max_length=256)

    print("Tokenizing dataset (first time, this is slow)...")
    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    print(f"Saving tokenized dataset to cache: {cache_dir}")
    tokenized_datasets.save_to_disk(cache_dir)

    print("‚úÖ Cache created!")

model_name = "gpt2"
print(f"Loading model: {model_name}...")

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

output_dir = "/kaggle/working/reply_generation_model_full"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    dataloader_num_workers=0,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=f"{output_dir}/logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[MemoryCallback()],
)

print_memory_usage("Before training")
print("üöÄ Starting training...")
trainer.train()
print_memory_usage("After training")

print(f"Saving final model and tokenizer to {output_dir}...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print(" Model saved.")


2025-11-23 07:53:15.986000: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763884396.171969      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763884396.226954      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

‚ö† No cache found! Creating tokenized dataset...
Loading synthetic dataset...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Tokenizing dataset (first time, this is slow)...


Map:   0%|          | 0/14843 [00:00<?, ? examples/s]

Map:   0%|          | 0/1650 [00:00<?, ? examples/s]

Saving tokenized dataset to cache: /content/drive/MyDrive/gpt2_reply_tokenized_cache


Saving the dataset (0/1 shards):   0%|          | 0/14843 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1650 [00:00<?, ? examples/s]

‚úÖ Cache created!
Loading model: gpt2...




model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).



===== Memory Usage Before training =====
GPU 0: 645MB / 15360MB
GPU 1: 3MB / 15360MB
CPU RAM: 2661.81MB / 32102.90MB

üöÄ Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,2.375518
2,1.340400,2.282806
3,1.186000,2.25983



===== Memory Usage Training step 100 =====
GPU 0: 9797MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3374.21MB / 32102.90MB


===== Memory Usage Training step 200 =====
GPU 0: 9797MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3371.68MB / 32102.90MB


===== Memory Usage Training step 300 =====
GPU 0: 9797MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3352.39MB / 32102.90MB


===== Memory Usage Training step 400 =====
GPU 0: 9797MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3357.43MB / 32102.90MB






===== Memory Usage Training step 500 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3557.25MB / 32102.90MB


===== Memory Usage Training step 600 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3572.18MB / 32102.90MB


===== Memory Usage Training step 700 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3575.00MB / 32102.90MB


===== Memory Usage Training step 800 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3563.14MB / 32102.90MB


===== Memory Usage Training step 900 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3554.83MB / 32102.90MB






===== Memory Usage Training step 1000 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3583.08MB / 32102.90MB


===== Memory Usage Training step 1100 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3582.09MB / 32102.90MB


===== Memory Usage Training step 1200 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3593.32MB / 32102.90MB


===== Memory Usage Training step 1300 =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3571.47MB / 32102.90MB






===== Memory Usage After training =====
GPU 0: 11395MB / 15360MB
GPU 1: 3149MB / 15360MB
CPU RAM: 3572.68MB / 32102.90MB

Saving final model and tokenizer to /kaggle/working/reply_generation_model_full...
‚úÖ Done! Model saved to Google Drive.


In [None]:
import os
import subprocess
import psutil
import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    TrainerCallback,
    DataCollatorForLanguageModeling
)
from datasets import Dataset, load_from_disk
from peft import LoraConfig, get_peft_model

os.environ["WANDB_DISABLED"] = "true"

def print_memory_usage(tag=""):
    print(f"\n===== Memory Usage {tag} =====")
    try:
        gpu_info = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=memory.used,memory.total",
             "--format=csv,noheader,nounits"]
        ).decode("utf-8").strip().split("\n")

        for i, line in enumerate(gpu_info):
            used, total = map(int, line.split(","))
            print(f"GPU {i}: {used}MB / {total}MB")
    except:
        print("No GPU detected")

    ram = psutil.virtual_memory()
    print(f"CPU RAM: {ram.used/1024**2:.2f}MB / {ram.total/1024**2:.2f}MB")
    print("======================================\n")

class MemoryCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 100 == 0 and state.global_step > 0:
            print_memory_usage(f"Training step {state.global_step}")

cache_dir = "/content/drive/MyDrive/gpt2_reply_tokenized_cache_lora"

if os.path.exists(cache_dir):
    tokenized_datasets = load_from_disk(cache_dir)
else:
    print("Loading synthetic dataset...")
    df = pd.read_csv("/kaggle/input/generation-data/synthetic_reply_dataset.csv")
    dataset = Dataset.from_pandas(df)
    dataset = dataset.train_test_split(test_size=0.1)

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    def preprocess_function(examples):
        inputs = examples["Message"]
        targets = examples["reply"]

        texts = []
        for i, t in zip(inputs, targets):
            i = str(i) if i else ""
            t = str(t) if t else ""
            text = f"Email: {i[:512]}\n\nReply: {t}{tokenizer.eos_token}"
            texts.append(text)

        return tokenizer(texts, truncation=True, padding="max_length", max_length=256)

    print("Tokenizing dataset (this may take time)...")
    tokenized_datasets = dataset.map(preprocess_function, batched=True)

    print(f"Saving tokenized dataset to cache: {cache_dir}")
    tokenized_datasets.save_to_disk(cache_dir)
    print("Cache created!")

# Loding GPT-2 with LoRA
print("Loading GPT-2 + LoRA...")

base_model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    torch_dtype=torch.float16
)

target_modules = ["c_attn", "c_proj"]

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=target_modules,
    task_type="CAUSAL_LM",
)

model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()


output_dir = "/kaggle/working/reply_generation_model_LoRA"

training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    dataloader_num_workers=0,                 
    per_device_train_batch_size=16,           
    per_device_eval_batch_size=16,            
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-4, # 5e-5 -> 2e-4
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir=f"{output_dir}/logs",
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    callbacks=[MemoryCallback()],
)

print_memory_usage("Before LoRA Training")
print("Starting LoRA training...")
trainer.train()
print_memory_usage("After LoRA Training")

print(f"Saving LoRA model to {output_dir}...")
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

print("‚úÖ Done! LoRA model saved to Google Drive.")

üîÑ Loading cached tokenized dataset...
Loading GPT-2 + LoRA...


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475


No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



===== Memory Usage Before LoRA Training =====
GPU 0: 373MB / 15360MB
GPU 1: 3MB / 15360MB
CPU RAM: 2947.89MB / 32102.90MB

üöÄ Starting LoRA training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.334808
2,1.956900,3.187673
3,1.714800,3.151971



===== Memory Usage Training step 100 =====
GPU 0: 6561MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3494.79MB / 32102.90MB


===== Memory Usage Training step 200 =====
GPU 0: 6561MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3488.26MB / 32102.90MB


===== Memory Usage Training step 300 =====
GPU 0: 6561MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3479.69MB / 32102.90MB


===== Memory Usage Training step 400 =====
GPU 0: 6561MB / 15360MB
GPU 1: 105MB / 15360MB
CPU RAM: 3485.19MB / 32102.90MB






===== Memory Usage Training step 500 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3710.10MB / 32102.90MB


===== Memory Usage Training step 600 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3723.44MB / 32102.90MB


===== Memory Usage Training step 700 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3720.54MB / 32102.90MB


===== Memory Usage Training step 800 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3704.08MB / 32102.90MB


===== Memory Usage Training step 900 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3724.56MB / 32102.90MB






===== Memory Usage Training step 1000 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3718.64MB / 32102.90MB


===== Memory Usage Training step 1100 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3703.11MB / 32102.90MB


===== Memory Usage Training step 1200 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3700.98MB / 32102.90MB


===== Memory Usage Training step 1300 =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3699.83MB / 32102.90MB






===== Memory Usage After LoRA Training =====
GPU 0: 6587MB / 15360MB
GPU 1: 2821MB / 15360MB
CPU RAM: 3700.64MB / 32102.90MB

Saving LoRA model to /kaggle/working/reply_generation_model_LoRA...
‚úÖ Done! LoRA model saved to Google Drive.
