In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# AGI1_05b_text_pretraining.ipynb
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset

# ✅ Device Check
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"🔧 Using device: {device}")

# ✅ Step 1: Load Alpaca or ShareGPT data
csv_path = '/kaggle/input/alpaca-dataset-v1/alpaca_data_cleaned.csv'  # update path as needed
df = pd.read_csv(csv_path)

# ✅ Step 2: Prepare dataset in HuggingFace format
def format_alpaca(example):
    return {
        "text": f"### Instruction: {example['instruction']}\n### Input: {example.get('input', '')}\n### Output: {example['output']}"
    }

dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_alpaca)
dataset = dataset.train_test_split(test_size=0.1)

print("✅ Dataset prepared")

# ✅ Step 3: Load tokenizer & model (e.g., LLaMA-2 if access, else distilgpt2)
model_name = 'distilgpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "input", "output", "text"])
print("✅ Tokenization complete")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# ✅ Step 4: Training Arguments
training_args = TrainingArguments(
    output_dir="./text_pretrain_output",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    push_to_hub=False
)

# ✅ Step 5: Trainer Setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Step 6: Start Training
trainer.train()

# ✅ Step 7: Save Model
trainer.save_model("./text_pretrain_output")
tokenizer.save_pretrained("./text_pretrain_output")
print("✅ LLM Fine-tuning complete & saved to /text_pretrain_output")

2025-07-14 09:38:49.499030: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752485929.715794      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752485929.779541      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🔧 Using device: cpu


FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/alpaca-dataset-v1/alpaca_data_cleaned.csv'

In [None]:
# ✅ AGI1_05b_text_pretraining.ipynb
import os
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

# ✅ Force CPU
device = torch.device('cpu')
print(f"🔧 Using device: {device}")

# ✅ Step 1: Find your CSV path
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# ✅ Step 2: Load CSV (update this path based on previous output)
csv_path = '/kaggle/input/alpaca-dataset-v1/alpaca_data_cleaned.csv'  # change if different
if not os.path.isfile(csv_path):
    raise FileNotFoundError(f"CSV file not found at {csv_path}")

df = pd.read_csv(csv_path)
print(f"✅ Loaded CSV with {len(df)} rows")

# ✅ Step 3: Prepare dataset format for HuggingFace
def format_example(example):
    instruction = str(example['instruction']).strip()
    input_text = str(example.get('input', '')).strip()
    output = str(example['output']).strip()
    return {"text": f"### Instruction: {instruction}\n### Input: {input_text}\n### Output: {output}"}

dataset = Dataset.from_pandas(df)
dataset = dataset.map(format_example)
dataset = dataset.train_test_split(test_size=0.1)
print("✅ Dataset formatted and split into train/test")

# ✅ Step 4: Load tokenizer and model (distilgpt2 for lightweight CPU training)
model_name = 'distilgpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=dataset['train'].column_names)
print("✅ Tokenization complete")

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# ✅ Step 5: Training arguments
training_args = TrainingArguments(
    output_dir='/kaggle/working/text_pretrain_output',
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=2,  # smaller batch for CPU
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    save_strategy="epoch",
    logging_steps=5,
    push_to_hub=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# ✅ Step 6: Start Training
trainer.train()

# ✅ Step 7: Save model and tokenizer
output_dir = '/kaggle/working/text_pretrain_output'
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✅ Fine-tuned model saved to {output_dir}")