In [1]:
import os
import pandas as pd
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import psutil
import gc
from tqdm.auto import tqdm

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Memory management functions
def get_gpu_memory():
    """Return GPU memory usage in MB"""
    torch.cuda.empty_cache()
    return round(torch.cuda.memory_allocated() / 1024 / 1024, 2)

def free_memory():
    """Free up memory"""
    gc.collect()
    torch.cuda.empty_cache()

# Check GPU availability
if torch.cuda.is_available():
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")
else:
    print("No GPU available, using CPU.")

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Display system info
print(f"CPU cores: {psutil.cpu_count()}")
print(f"Available RAM: {round(psutil.virtual_memory().available / 1024 / 1024 / 1024, 2)} GB")

# Dataset path
dataset_path = '/kaggle/input/dataset3/dataset_2016.csv'

# Load and preprocess dataset
def load_data_from_csv(file_path, max_stories=None):
    """Load stories from CSV"""
    df = pd.read_csv(file_path, 
                    usecols=['storyid', 'storytitle', 'sentence1', 'sentence2', 'sentence3', 'sentence4', 'sentence5'],
                    low_memory=True)
    
    if max_stories:
        df = df.iloc[:max_stories]
    
    stories = []
    batch_size = 1000
    num_batches = (len(df) + batch_size - 1) // batch_size

    for i in tqdm(range(num_batches), desc="Processing stories"):
        batch_df = df.iloc[i*batch_size:(i+1)*batch_size]
        
        for _, row in batch_df.iterrows():
            story_sentences = [row[f'sentence{j}'] for j in range(1, 6) if pd.notna(row[f'sentence{j}'])]
            title = row['storytitle'] if pd.notna(row['storytitle']) else ""
            full_story = f"{title}: {' '.join(story_sentences)}" if title else ' '.join(story_sentences)
            stories.append(full_story)

        del batch_df
        gc.collect()

    del df
    gc.collect()
    
    return stories

# Create output directory
os.makedirs('/kaggle/working/processed', exist_ok=True)

# Load dataset
print("Loading and processing dataset...")
try:
    stories = load_data_from_csv(dataset_path)
except Exception as e:
    print(f"Error loading dataset: {e}, trying smaller sample...")
    stories = load_data_from_csv(dataset_path, max_stories=10000)

processed_file_path = '/kaggle/working/processed/processed_stories.txt'
with open(processed_file_path, 'w', encoding='utf-8') as f:
    for story in stories:
        f.write(story + '\n')

print(f"Saved processed stories to {processed_file_path}")

# Free memory
free_memory()

# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Add padding token
tokenizer.pad_token = tokenizer.eos_token

# Load dataset function
def load_dataset(file_path, tokenizer, block_size=256):
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

print("Loading dataset for training...")
train_dataset = load_dataset(processed_file_path, tokenizer, block_size=256)
print(f"Dataset length: {len(train_dataset)}")

# Move model to GPU
model.to(device)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Output directory
model_output_dir = '/kaggle/working/gpt2_finetuned'
os.makedirs(model_output_dir, exist_ok=True)

# Training arguments (optimized for P100)
training_args = TrainingArguments(
    output_dir=model_output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=8,  # Reduced batch size
    gradient_accumulation_steps=4,  # Simulate larger batch size
    save_steps=5000,
    save_total_limit=2,
    fp16=True,  # Keep mixed precision
    fp16_opt_level="O1",  # More stable than "O2"
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    dataloader_num_workers=4,  # Increase workers
    dataloader_pin_memory=False,  # Avoid memory issues
    run_name="gpt2_finetune_kaggle"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Free memory
free_memory()

# Training progress bar
from transformers import TrainerCallback

class ProgressBarCallback(TrainerCallback):
    def __init__(self, total_steps):
        self.progress_bar = tqdm(total=total_steps, desc="Training Progress", dynamic_ncols=True)

    def on_step_end(self, args, state, control, **kwargs):
        self.progress_bar.update(1)

    def on_train_end(self, args, state, control, **kwargs):
        self.progress_bar.close()

# Add progress bar
total_steps = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
trainer.add_callback(ProgressBarCallback(total_steps))

# Start training
print("Starting training...")
trainer.train()

# Save model
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(model_output_dir)
print(f"Model saved to {model_output_dir}")

# Final cleanup
free_memory()

def generate_text(prompt, max_length=100):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # Important fix
        max_length=max_length,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id  # Set explicitly
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

sample_prompt = "The young boy wanted to"
generated_text = generate_text(sample_prompt)
print(f"\nGenerated text:\n{generated_text}")

Using GPU: Tesla P100-PCIE-16GB
CPU cores: 4
Available RAM: 29.74 GB
Loading and processing dataset...


Processing stories:   0%|          | 0/46 [00:00<?, ?it/s]

Saved processed stories to /kaggle/working/processed/processed_stories.txt


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading dataset for training...




Dataset length: 10210


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Training Progress:   0%|          | 0/319 [00:00<?, ?it/s]

Starting training...


Step,Training Loss
100,3.3556
200,3.0926
300,3.0149
400,2.9343
500,2.9259
600,2.9054
700,2.8345
800,2.828
900,2.8215


Model saved to /kaggle/working/gpt2_finetuned

Generated text:
The young boy wanted to play with the other girls. So he got out his toy, then gave her a new one.
A Friend's Friend: Lucy wanted a dog. She knew that one would help her. She went to the shelter and offered it. But her dog couldn't talk, so she gave her one. Lucy was so happy she cried the whole time.
Too big: My brother was in the hospital with a huge heart. He was talking to a baby in the room
