In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, PretrainedConfig

# Checkpoint identifier handed to `from_pretrained`; swap it to try other sizes.
# NOTE(review): this names a GPT-Neo checkpoint while the script otherwise uses
# GPT-2 classes -- confirm that pairing is intended.
model_name = 'EleutherAI/gpt-neo-1.3B'

# Only the tokenizer is loaded at this point.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Text file holding the protein sequences.
# Replace the placeholder 'your_dataset.txt' with the real file.
dataset_path = 'your_dataset.txt'

class ProteinGPT2Config(GPT2Config):
    """GPT2Config subclass used to build the model configuration in this script.

    After the parent constructor has run, two attributes are assigned on the
    instance: ``embed_dim`` and ``num_hidden_layers``. Each is taken from the
    keyword arguments when present and otherwise falls back to 768 and 12
    respectively.
    """
    model_type = "gpt2"
    # Maps ``num_heads`` -> ``num_attention_heads`` and ``hidden_size`` -> ``d_model``
    # (presumably consumed by the transformers config base class as attribute
    # aliases -- confirm against the library docs).
    # NOTE(review): assigning ``attribute_map`` here shadows any map inherited
    # from GPT2Config instead of extending it -- confirm that is intended.
    attribute_map = {"num_heads": "num_attention_heads", "hidden_size": "d_model"}

    def __init__(self, **kwargs):
        """Initialise the parent config, then set ``embed_dim`` and ``num_hidden_layers``.

        Args:
            **kwargs: Configuration values; all of them are forwarded to the
                parent constructor.
        """
        # The parent receives every keyword, including "embed_dim" and
        # "num_hidden_layers": the pops below run afterwards and only touch
        # this method's local ``kwargs`` dict.
        super().__init__(**kwargs)
        self.embed_dim = kwargs.pop("embed_dim", 768)
        # NOTE(review): when the key is absent this assigns 12 after the parent
        # has already run -- verify that this fallback is the intended value.
        self.num_hidden_layers = kwargs.pop("num_hidden_layers", 12)  # Falls back to 12 layers

# Read the configuration for `model_name` through the custom config class,
# then floor-divide its `embed_dim` by its `num_heads`.
config = ProteinGPT2Config.from_pretrained(model_name)
config.embed_dim //= config.num_heads

# NOTE(review): the model is built directly from the config object rather than
# via `from_pretrained` -- presumably no checkpoint weights are loaded; confirm.
model = GPT2LMHeadModel(config=config)

# Tokenize the dataset
def tokenize_function(examples):
    """Run the module-level ``tokenizer`` on the ``'text'`` entry of *examples*.

    Args:
        examples: Object indexable with the key ``'text'``.

    Returns:
        Whatever ``tokenizer`` returns for that entry.
    """
    texts = examples['text']
    encoded = tokenizer(texts)
    return encoded

# Build the training set from the text file at `dataset_path`,
# processed with the tokenizer loaded above.
train_dataset = TextDataset(
    file_path=dataset_path,
    tokenizer=tokenizer,
    overwrite_cache=True,
    block_size=128,  # Tune this for your data
)

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    per_device_train_batch_size=4,
    num_train_epochs=3,  # Raise or lower as needed
    save_steps=10_000,
    save_total_limit=2,
    overwrite_output_dir=True,
    output_dir='./protein_generation_model',
)

# Set up the Trainer that will fine-tune the model.
# The collator is created with mlm=False (causal language modelling). Its
# default, mlm=True, requires the tokenizer to define a mask token, which
# GPT-2 style tokenizers do not, so the default raises a ValueError here.
# NOTE(review): this only constructs the Trainer; no `trainer.train()` call is
# visible in this part of the file.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
    train_dataset=train_dataset,
)
