In [None]:
import torch
from tokenizers import ByteLevelBPETokenizer
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, DataCollatorForLanguageModeling
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
import os

paths = ["Python_Data.txt"]
NotTrained = False

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,4"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

if NotTrained:
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths, vocab_size=52000, min_frequency=2, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer.save_model("WordTokens")

tokenizer = GPT2Tokenizer.from_pretrained('WordTokens')
tokenizer.add_special_tokens({"eos_token": "</s>", "bos_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>"})

inp = 'print("Hello World!")'
t = tokenizer(inp)
decoded_input = tokenizer.decode(t['input_ids'])
print(t['input_ids'])  # Debugging line

config = GPT2Config(
    vocab_size=len(tokenizer),
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id
)
model = GPT2LMHeadModel(config)

dataset = load_dataset("text", data_files=paths)

def encode(lines):
    return tokenizer(lines['text'], add_special_tokens=True, truncation=True, max_length=512)

encoded_dataset = dataset.map(encode, batched=True, remove_columns=["text"])
print(encoded_dataset['train'][0])  # Debugging line

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir="GPyT",
    overwrite_output_dir=True,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    fp16=True,
    save_steps=100,
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=encoded_dataset['train'],
)

trainer.train()
trainer.save_model("GPyT")


[763, 462, 4006, 7979, 29418]
{'input_ids': [290, 31968, 14076, 18, 3810, 296, 10931, 12404, 32, 50, 267, 50, 34, 280, 4480, 32, 50, 34, 290, 44040, 296, 33559, 32, 50, 267, 50, 34, 280, 900, 32, 50, 34, 290, 743, 296, 873, 16, 1387, 32, 50, 267, 50, 34, 290, 743, 296, 914, 32, 50, 267, 50, 267, 50, 34, 318, 2263, 67, 3327, 67, 8470, 67, 7023, 12, 3327, 30, 33559, 16, 5291, 67, 1622, 30, 677, 13, 586, 914, 63, 7964, 6523, 1412, 50, 34, 263, 9752, 278, 1111, 50, 34, 263, 5133, 67, 1622, 278, 925, 12, 3327, 283, 50, 267, 50, 34], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


Step,Training Loss
500,5.5961
1000,4.8294
1500,4.5888
2000,4.4468
2500,4.3284
3000,4.1691
3500,4.131
4000,4.0464
4500,3.9656
5000,3.8894


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [2]:
print("Hello")

Hello
