In [1]:
!pip install transformers bitsandbytes accelerate

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.0->bitsandbytes)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-

In [5]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
os.environ["WANDB_DISABLED"] = "true"
# Load GPT-3
model_name = "gpt2-medium"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# 🛠 Merge all .txt files in a folder into one big temporary file
def merge_texts_from_folder(folder_path, output_path="merged_dataset.txt"):
    with open(output_path, 'w', encoding='utf-8') as outfile:
        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r', encoding='utf-8') as infile:
                    outfile.write(infile.read() + "\n\n")  # extra spacing between files
    return output_path

# 📂 Your folder containing Einstein .txt files
folder_path = "/content/drive/MyDrive/Albert"

# Merge them
merged_file = merge_texts_from_folder(folder_path)

# Load merged dataset
def load_dataset(file_path, tokenizer, block_size=512):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )

train_dataset = load_dataset(merged_file, tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training settings
training_args = TrainingArguments(
    output_dir="./einstein-gpt2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=True,  # if using GPU
    logging_steps=100
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=data_collator
)

# Fine-tune!
trainer.train()

# Save model
trainer.save_model("./einstein-gpt3")
tokenizer.save_pretrained("./einstein-gpt3")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,2.5319
200,2.317
300,2.2243
400,2.0937
500,2.0249
600,1.8523
700,1.8978
800,1.7343
900,1.7871
1000,1.7283


('./einstein-gpt3/tokenizer_config.json',
 './einstein-gpt3/special_tokens_map.json',
 './einstein-gpt3/vocab.json',
 './einstein-gpt3/merges.txt',
 './einstein-gpt3/added_tokens.json',
 './einstein-gpt3/tokenizer.json')

In [9]:

# Input (prompt) per la predizione
input_text = "Minkowski’s notion"

# Tokenizza il prompt
input_ids = tokenizer.encode(input_text, return_tensors="pt").to('cuda')

# Genera il testo
output = model.generate(input_ids, max_length=200, num_return_sequences=1, no_repeat_ngram_size=2, top_p=0.95, top_k=50)

# Decodifica e stampa il risultato
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Minkowski’s notion of space and time is not only a fundamental one, but also a very important one.

The fundamental idea of relativity is that of the relative motion of bodies. The laws of motion are not absolute, and the laws are relative to the system of reference. In the case of a body, the motion is relative, because the body is moving with the velocity of light. But the law of relative movement is also absolute. It is the same with all other laws. If we put the equations of Newtonian mechanics in the form
(1)
   (2)

 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
