In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file beneath the Kaggle input
# directory, then print them one per line in walk order.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ✅ AGI1_05c_text_pretraining.ipynb — Advanced Version
import inspect
import os

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# ✅ Detect Device
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"🔧 Device detected: {device}")

# ✅ Step 1: Confirm CSV Path
# Preferred dataset location; edit this if your dataset is mounted elsewhere.
csv_path = '/kaggle/input/alpaca-dataset-v1/alpaca_data_cleaned.csv'  # update as per your input

# Walk the input directory once: print every file (so the mounted datasets are
# visible in the cell output) and remember the CSVs for the fallback below.
available_csvs = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)
        if filename.lower().endswith('.csv'):
            available_csvs.append(file_path)

if not os.path.isfile(csv_path):
    # The dataset may be mounted under a different folder than the one
    # hard-coded above. Prefer a file with the same name; otherwise accept a
    # CSV only when it is the single candidate, so we never guess silently.
    wanted_name = os.path.basename(csv_path)
    same_name = [p for p in available_csvs if os.path.basename(p) == wanted_name]
    candidates = same_name or available_csvs
    if len(candidates) != 1:
        listing = ', '.join(available_csvs) if available_csvs else 'none'
        raise FileNotFoundError(
            f"CSV not found: {csv_path} (CSV files under /kaggle/input: {listing})"
        )
    print(f"⚠️ {csv_path} not found; using {candidates[0]} instead")
    csv_path = candidates[0]

df = pd.read_csv(csv_path)

# The formatting step reads these columns by name, so fail early with a clear
# message if the chosen CSV does not have them.
missing_columns = [col for col in ('instruction', 'output') if col not in df.columns]
if missing_columns:
    raise ValueError(f"{csv_path} is missing required column(s): {missing_columns}")
print(f"✅ Loaded dataset with {len(df)} rows")

# ✅ Step 2: Format Data for Pretraining
def format_instruction(row):
    """Render one dataset row as a single instruction-style training string.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) holding
            'instruction' and 'output' entries and an optional 'input' entry.

    Returns:
        str: "### Instruction: ...\\n### Input: ...\\n### Output: ..." with each
        field stripped of surrounding whitespace. Missing values (None/NaN)
        are rendered as an empty string.

    Raises:
        KeyError: if 'instruction' or 'output' is absent from ``row``.
    """
    def _clean(value):
        # pandas represents empty CSV cells as NaN, and str(NaN) would inject
        # the literal text "nan" into the prompt. Treat missing as empty.
        if value is None or (pd.api.types.is_scalar(value) and pd.isna(value)):
            return ''
        return str(value).strip()

    instruction = _clean(row['instruction'])
    input_txt = _clean(row.get('input', ''))
    output = _clean(row['output'])
    return f"### Instruction: {instruction}\n### Input: {input_txt}\n### Output: {output}"

# Build the training text column, wrap it in a Dataset, and hold out 10% of
# the rows for evaluation.
df['text'] = df.apply(format_instruction, axis=1)
dataset = Dataset.from_pandas(df[['text']])
# A fixed seed keeps the train/test partition identical across runs, so
# evaluation numbers from different runs are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=42)
print("✅ Dataset formatted")

# ✅ Step 3: Choose Model
# Checkpoint to fine-tune. Swap in 'gpt2-large', 'meta-llama/Llama-2-7b-hf'
# etc. if you have access + GPU.
model_name = 'gpt2-medium'

# Load the matching tokenizer and reuse its end-of-sequence token for padding.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Load the causal-LM weights, then move them onto the detected device.
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
print(f"✅ Loaded model: {model_name}")

# ✅ Step 4: Tokenization
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Args:
        examples: Batch mapping whose 'text' entry holds the strings to encode.

    Returns:
        The tokenizer's output, truncated and padded to exactly 512 tokens
        per example.
    """
    encode_options = {
        'truncation': True,
        'padding': 'max_length',
        'max_length': 512,
    }
    texts = examples['text']
    return tokenizer(texts, **encode_options)

# Tokenize both splits in batches; the raw 'text' column is dropped afterwards.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
)
print("✅ Tokenization completed")

# Language-modeling collator with masked-LM disabled (mlm=False).
collator_options = {'tokenizer': tokenizer, 'mlm': False}
data_collator = DataCollatorForLanguageModeling(**collator_options)

# ✅ Step 5: Training Arguments (adjust batch size for CPU/GPU)
use_gpu = device.type == 'cuda'
batch_size = 4 if use_gpu else 2  # smaller batches when running on CPU

# Newer transformers releases spell the evaluation-schedule argument
# `eval_strategy`, while older ones only accept `evaluation_strategy`.
# Use whichever keyword the installed TrainingArguments actually accepts so
# this cell runs on both.
accepted_args = inspect.signature(TrainingArguments.__init__).parameters
eval_key = 'eval_strategy' if 'eval_strategy' in accepted_args else 'evaluation_strategy'

training_args = TrainingArguments(
    output_dir='/kaggle/working/agi1_05c_output',
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,
    logging_steps=5,
    save_strategy="epoch",
    fp16=use_gpu,  # mixed precision is only enabled when training on CUDA
    logging_dir='/kaggle/working/agi1_05c_logs',
    **{eval_key: "epoch"},  # evaluate at the end of every epoch
)

# Gather everything the Trainer needs: the model, its arguments, the two
# tokenized splits, the tokenizer and the batch collator.
trainer_options = {
    'model': model,
    'args': training_args,
    'train_dataset': tokenized_dataset['train'],
    'eval_dataset': tokenized_dataset['test'],
    'tokenizer': tokenizer,
    'data_collator': data_collator,
}
trainer = Trainer(**trainer_options)

# ✅ Step 6: Start Training
trainer.train()

# ✅ Step 7: Save Model
# Write the trained model and its tokenizer into the same output directory.
save_dir = '/kaggle/working/agi1_05c_output'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)
print("✅ Advanced Pretraining Complete. Model saved to /kaggle/working/agi1_05c_output")

2025-07-14 09:59:53.464177: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752487193.689066      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752487193.757231      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🔧 Device detected: cpu


FileNotFoundError: CSV not found: /kaggle/input/alpaca-dataset-v1/alpaca_data_cleaned.csv