In [None]:
#import os
#os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [None]:
#!pip install datasets
#!pip install transformers

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import GPT2Config, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
import numpy as np


In [2]:
# Replace with your own dataset
dataset = load_dataset("Diplomkaazvposlednimsemestru/MUNIAI")

# Carve a small validation split (test_size=0.0015 of the rows) out of the
# training data. The fixed seed keeps the split identical across kernel
# restarts, so evaluation numbers from different runs stay comparable.
dataset = dataset['train'].train_test_split(test_size=0.0015, seed=42)

In [3]:
# Fetch the pretrained GPT-2 tokenizer and pad with its end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [4]:
# Tokenize every split; the raw "text" column is dropped afterwards.
def tokenize_function(example):
    """Run the notebook-level ``tokenizer`` over the ``text`` field of ``example``."""
    encoded = tokenizer(text=example["text"])
    return encoded


tokenized_ds = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns="text",
)
tokenized_ds

Map:   0%|          | 0/3349 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['author', 'input_ids', 'attention_mask'],
        num_rows: 3349
    })
    test: Dataset({
        features: ['author', 'input_ids', 'attention_mask'],
        num_rows: 6
    })
})

In [5]:
from itertools import chain
from datasets import Dataset, DatasetDict

def concatenate_and_chunk(dataset, chunk_size=512):
    """Pack all token ids of ``dataset`` into rows of exactly ``chunk_size`` ids.

    The ``input_ids`` of every row are joined into one long sequence, which is
    then cut into consecutive pieces of ``chunk_size`` tokens. A trailing
    piece shorter than ``chunk_size`` is discarded.

    Args:
        dataset: object whose ``["input_ids"]`` yields one token-id sequence per row.
        chunk_size: number of token ids per output row.

    Returns:
        A new ``Dataset`` with a single ``input_ids`` column holding the chunks.
    """
    # One flat stream of token ids across all rows
    token_stream = list(chain.from_iterable(dataset["input_ids"]))

    # A chunk starting at `start` is full only if it ends inside the stream
    last_full_start = len(token_stream) - chunk_size

    blocks = []
    for start in range(0, len(token_stream), chunk_size):
        if start <= last_full_start:
            blocks.append(token_stream[start:start + chunk_size])

    return Dataset.from_dict({"input_ids": blocks})

# Re-pack every split of the tokenized DatasetDict into 512-token rows
chunked_splits = {}
for split_name, split_data in tokenized_ds.items():
    chunked_splits[split_name] = concatenate_and_chunk(split_data, chunk_size=512)
chunked_ds = DatasetDict(chunked_splits)

chunked_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids'],
        num_rows: 1172
    })
    test: Dataset({
        features: ['input_ids'],
        num_rows: 2
    })
})

In [6]:
# Collator that groups chunks into batches, with masked-LM masking turned off
# see https://huggingface.co/docs/transformers/en/main_classes/data_collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [30]:
# (I modified this because of the CPU.)

# Hyperparameters of the GPT-2 architecture used in this notebook
gpt2_hparams = {
    "vocab_size": len(tokenizer),  # tokenizer vocabulary size (50257 for standard GPT-2)
    "n_positions": 512,            # context size; same as the 512-token chunks
    "n_embd": 768,                 # embedding size
    "n_layer": 12,                 # number of transformer layers
    "n_head": 12,                  # number of attention heads
}
config = GPT2Config(**gpt2_hparams)

# Instantiate the model from the configuration (no pretrained checkpoint is loaded here)
model = GPT2LMHeadModel(config)

In [31]:
import torch
import math

# Define the perplexity metric
def compute_metrics(eval_pred):
    """Compute next-token perplexity from evaluation logits and labels.

    Args:
        eval_pred: a pair ``(logits, labels)``. ``logits`` is indexed as
            ``[batch, position, vocab]`` and ``labels`` as ``[batch, position]``;
            each may be a NumPy array or a PyTorch tensor.

    Returns:
        ``{"perplexity": value}`` where ``value`` is ``exp`` of the mean
        cross-entropy over all positions whose label is not ``-100``. If the
        loss is too large for ``exp`` to be representable, ``float("inf")``
        is reported instead of raising ``OverflowError``.
    """
    logits, labels = eval_pred

    # as_tensor reuses the NumPy buffer instead of copying the (potentially
    # very large) logits array, and leaves existing tensors untouched.
    logits = torch.as_tensor(logits)
    labels = torch.as_tensor(labels)

    # The logits at position t score the token at position t + 1, so drop the
    # last logit and the first label to line the two up.
    shift_logits = logits[:, :-1, :].reshape(-1, logits.shape[-1]).float()
    shift_labels = labels[:, 1:].reshape(-1).long()

    # Positions labelled -100 are excluded from the loss
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100)
    loss = loss_fct(shift_logits, shift_labels)

    # A diverged model can produce a loss beyond what math.exp can represent;
    # report infinity rather than aborting the whole evaluation.
    try:
        perplexity = math.exp(loss.item())
    except OverflowError:
        perplexity = float("inf")
    return {"perplexity": perplexity}

In [32]:
# (I modified this because of the CPU.)

# Tune this to the size of your dataset.
# Train for at least 15 minutes on an A10 GPU to get something reasonable.
TRAIN_EPOCHS = 15

# A checkpoint is written every SAVE_STEPS steps; evaluation and logging
# happen twice as often.
SAVE_STEPS = 300
EVAL_STEPS = SAVE_STEPS // 2

training_args = TrainingArguments(
    # --- output and checkpoints ---
    output_dir="./gpt2-training",  # checkpoints and other outputs go here
    save_steps=SAVE_STEPS,         # save a checkpoint every SAVE_STEPS steps
    save_total_limit=10,           # maximum number of checkpoints to keep
    # --- evaluation and logging cadence ---
    eval_strategy="steps",         # evaluate on a step schedule
    eval_steps=EVAL_STEPS,         # evaluate every EVAL_STEPS steps
    logging_strategy="steps",      # log on a step schedule
    logging_steps=EVAL_STEPS,      # log training metrics every EVAL_STEPS steps
    # --- run length and batch sizes ---
    num_train_epochs=TRAIN_EPOCHS,   # total number of training epochs
    per_device_train_batch_size=16,  # training batch size per device
    per_device_eval_batch_size=16,   # evaluation batch size per device
    # --- optimizer and learning-rate schedule ---
    learning_rate=2.5e-4,          # initial learning rate
    lr_scheduler_type="cosine",    # cosine learning-rate schedule
    warmup_ratio=0.05,             # share of training used for warmup
    adam_beta1=0.9,                # Adam beta1
    adam_beta2=0.999,              # Adam beta2
    weight_decay=0.01,             # weight decay
    # report_to='wandb',  # Uncomment to report metrics to Weights and Biases (optional)
)



In [33]:
from transformers import Trainer, DataCollatorWithPadding  # Zkontrolujte, zda máte správný import pro data_collator

# Wire the model, data, collator and metric into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=chunked_ds["train"],
    eval_dataset=chunked_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    processing_class=tokenizer,  # used instead of `tokenizer`
)


In [None]:
# Disabled variant of the Trainer construction that passes `tokenizer=`
# rather than `processing_class=`. It is wrapped in a string literal, so
# none of it is executed.
"""
trainer = Trainer(model=model,
                 args = training_args,
                 tokenizer=tokenizer,
                 train_dataset=chunked_ds["train"],
                 eval_dataset=chunked_ds["test"],
                 compute_metrics=compute_metrics,
                 data_collator = data_collator)

"""

  trainer = Trainer(model=model,


In [34]:
# Start training with the Trainer configured above
trainer.train()


Step,Training Loss,Validation Loss,Perplexity
150,5.2239,3.882962,48.567674
300,3.6743,3.717967,41.180109
450,3.438,3.629918,37.709278
600,3.1885,3.590772,36.261607
750,2.8589,3.512029,33.515793
900,2.5601,3.527914,34.052406
1050,2.38,3.528779,34.081874


TrainOutput(global_step=1110, training_loss=3.277919844893722, metrics={'train_runtime': 652.1319, 'train_samples_per_second': 26.958, 'train_steps_per_second': 1.702, 'total_flos': 4593513922560000.0, 'train_loss': 3.277919844893722, 'epoch': 15.0})

In [35]:
# Save the model via the trainer into ./gpt2-small-final (reloaded from there later)
trainer.save_model("./gpt2-small-final") 


In [36]:
import os
from getpass import getpass

YOUR_MODEL_NAME = "my_small_gpt2_csknihy" # change this

# Never store an access token in the notebook: anyone who can read the file
# (or its git history) can use it. Read it from the HF_TOKEN environment
# variable, or ask for it interactively without echoing it.
# NOTE: any token that was ever saved in a notebook should be revoked on the
# Hugging Face Hub and replaced with a new one.
HF_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")

# Upload the model weights and the tokenizer files to the same Hub repository
model.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)
tokenizer.push_to_hub(YOUR_MODEL_NAME, token=HF_TOKEN)

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/Diplomkaazvposlednimsemestru/my_small_gpt2_csknihy/commit/5837da59e23052e4ec6720a18a617970bbb2a3d4', commit_message='Upload tokenizer', commit_description='', oid='5837da59e23052e4ec6720a18a617970bbb2a3d4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Diplomkaazvposlednimsemestru/my_small_gpt2_csknihy', endpoint='https://huggingface.co', repo_type='model', repo_id='Diplomkaazvposlednimsemestru/my_small_gpt2_csknihy'), pr_revision=None, pr_num=None)

In [37]:
from transformers import  GPT2LMHeadModel, AutoTokenizer, pipeline

# Load the GPT-2 tokenizer again for generation; pad with the end-of-sequence token
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [38]:
# Reload the saved final model from disk and wrap it in a text-generation pipeline
final_model_dir = "./gpt2-small-final"
model = GPT2LMHeadModel.from_pretrained(final_model_dir)
generator = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [204]:
# Starting prompt; pick something specific to your dataset
PROMPT = "On je"

# Generation settings for this sample
sampling_options = dict(
    max_length=50,           # maximum length of the generated text
    do_sample=True,
    temperature=0.6,         # experiment with this
    repetition_penalty=1.4,  # experiment with this
)

generator(PROMPT, **sampling_options)

[{'generated_text': 'On jeho vyšej k tomu se něm podporovat jako by bávali. Na druhé zasní tak, životem o představ'}]

TOHLE MI NEFUNGUJE: VLEVO SE VYTVORIL ADRESAR gpt2-training, ALE NEVYTVORILY SE POZADOVANE ADRESARE

In [215]:
def get_sample_after_N_steps(N, prompt, **kwargs):
    """Generate text with the checkpoint saved after ``N`` training steps.

    Args:
        N: step number; the model is read from ``./gpt2-training/checkpoint-{N}/``.
        prompt: text handed to the text-generation pipeline.
        **kwargs: forwarded unchanged to the pipeline call
            (e.g. ``do_sample``, ``temperature``).

    Returns:
        Whatever the text-generation pipeline returns for ``prompt``.

    Raises:
        FileNotFoundError: if the checkpoint directory does not exist. The
            message lists the checkpoint directories that are available.
    """
    import os  # local import so this cell does not depend on another cell's imports

    checkpoint_root = "./gpt2-training"
    checkpoint_dir = f"./gpt2-training/checkpoint-{N}/"

    # Fail early with an actionable message: a missing local directory would
    # otherwise surface as a confusing model-loading error.
    if not os.path.isdir(checkpoint_dir):
        entries = os.listdir(checkpoint_root) if os.path.isdir(checkpoint_root) else []
        available = sorted(name for name in entries if name.startswith("checkpoint-"))
        raise FileNotFoundError(
            f"No checkpoint directory {checkpoint_dir!r}; "
            f"available checkpoints: {available or 'none'}"
        )

    model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
    generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

    output = generator(prompt, **kwargs)
    return output

In [216]:
# Disabled helper that checks whether ./gpt2-training/checkpoint-1000/ exists.
# It is wrapped in a string literal, so the cell only evaluates to that string.
"""
import os

# Cesta k adresáři, který chcete zkontrolovat
directory = "./gpt2-training/checkpoint-1000/"

# Zkontroluje existenci adresáře
if os.path.isdir(directory):
    print("Adresář existuje.")
else:
    print("Adresář neexistuje.")
"""

'\nimport os\n\n# Cesta k adresáři, který chcete zkontrolovat\ndirectory = "./gpt2-training/checkpoint-1000/"\n\n# Zkontroluje existenci adresáře\nif os.path.isdir(directory):\n    print("Adresář existuje.")\nelse:\n    print("Adresář neexistuje.")\n'

In [217]:
# Sample from the checkpoint saved after 300 steps
get_sample_after_N_steps(300, "Pokus", do_sample=True, temperature=0.5)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Pokusští počítačře které které'}]

In [218]:
# Sample from the checkpoint saved after 600 steps
get_sample_after_N_steps(600, "Pokus", do_sample=True, temperature=0.5)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Pokusít a nás nám nedím přík'}]

In [219]:
# Sample from the checkpoint saved after 900 steps
get_sample_after_N_steps(900, "Pokus", do_sample=True, temperature=0.5)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Pokusu přesně jenom měl jed'}]

In [220]:
# Sample from the checkpoint at step 1110 (the final global_step reported by training)
get_sample_after_N_steps(1110, "Pokus", do_sample=True, temperature=0.5)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'Pokusním těně vypadá, že'}]

In [244]:
import os
from getpass import getpass

from huggingface_hub import login

# Do not paste the access token into the notebook: it would be saved with the
# file and exposed to everyone who can read it. Take it from the HF_TOKEN
# environment variable, or prompt for it without echoing the input.
# NOTE: any token that was ever saved in a notebook should be revoked on the
# Hugging Face Hub and replaced with a new one.
token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token)

In [224]:
from transformers import GPT2LMHeadModel, AutoTokenizer
from huggingface_hub import HfApi

import os

# Resolve the saved model directory to an absolute path
model_path = os.path.abspath("./gpt2-small-final")

# Show the path and the directory contents as a sanity check
print("Model path:", model_path)
print("Files in directory:", os.listdir(model_path))

# Load model and tokenizer from that directory, restricted to local files
load_kwargs = {"local_files_only": True}
model = GPT2LMHeadModel.from_pretrained(model_path, **load_kwargs)
tokenizer = AutoTokenizer.from_pretrained(model_path, **load_kwargs)

# Upload both to the same Hugging Face Hub repository
hub_repo_id = "Diplomkaazvposlednimsemestru/your-model-name"
model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)

Model path: /teamspace/studios/this_studio/gpt2-small-final
Files in directory: ['config.json', 'generation_config.json', 'merges.txt', 'model.safetensors', 'special_tokens_map.json', 'tokenizer.json', 'tokenizer_config.json', 'training_args.bin', 'vocab.json']


model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Diplomkaazvposlednimsemestru/your-model-name/commit/a0baa1f6c5ab2c378593bbbf03723d64b40afd95', commit_message='Upload tokenizer', commit_description='', oid='a0baa1f6c5ab2c378593bbbf03723d64b40afd95', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Diplomkaazvposlednimsemestru/your-model-name', endpoint='https://huggingface.co', repo_type='model', repo_id='Diplomkaazvposlednimsemestru/your-model-name'), pr_revision=None, pr_num=None)

In [237]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Text-classification pipeline backed by a multilingual BERT emotion checkpoint
pipe = pipeline(
    task="text-classification",
    model="Toshifumi/bert-base-multilingual-cased-finetuned-emotion",
)

config.json:   0%|          | 0.00/1.16k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/712M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [246]:
from datasets import load_dataset
from transformers import pipeline

# Load the model used for emotion analysis
pipe = pipeline("text-classification", model="Toshifumi/bert-base-multilingual-cased-finetuned-emotion")

# Load the dataset from the Hugging Face Hub
dataset = load_dataset("Diplomkaazvposlednimsemestru/MUNIAI")

# Take the texts from the 'text' column of the training split
texts = dataset['train']['text']

# Classify only a small sample. Slice once so that the inputs given to the
# model and the texts printed next to the results cannot drift apart.
NUM_SAMPLES = 10
sample_texts = texts[:NUM_SAMPLES]

# truncation=True clips inputs that exceed the model's maximum sequence
# length instead of letting the pipeline fail on a long text.
results = pipe(sample_texts, truncation=True)

# Print each text with its predicted label and score
for text, result in zip(sample_texts, results):
    print(f"Text: {text}\nEmotion: {result['label']}, Score: {result['score']}\n")


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Text: Bruce Sterling Z·tah na hacker y Napsal: Bruce Sterling, Přeložil: Václav Bárta, 2:423/59.1 Upravil: Martin Hinner, Toto je výtah překladu knihy Bruce Sterlinga "The Hacker Crackdown" z časopisu Natura (12/1995 - 07/1996). Originál překladu najdete na adrese: http://www.penguin.cz/~mhi/crackdown/ Vydáno jako knižní příloha ZX Magazínu. Ke stažení na adrese: http://zxm.speccy.cz Literární
Emotion: LABEL_1, Score: 0.6464129090309143

Text: freeware Bruce Sterling Z·tah na hacker y 5 Z·tah na hackery Hackers manifesto Another one got caught today, it's all over the papers. "Teenager Arrested in Computer Crime Scandal", "Hacker Arrested after Bank Tampering"... Damn kids. They're all alike. But did you, in your three-piece psychology and 1950's technobrain, ever
Emotion: LABEL_0, Score: 0.5150907039642334

Text: take a look behind the eyes of the hacker? Did you ever wonder what made him tick, what forces shaped him, what may have molded him? I am a hacker, enter my world... Mine is 