In [9]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from trl import SFTTrainer
from datasets import load_dataset

In [10]:

# 1. LOAD THE BASE MODEL
# We pick a "pre-trained" model that already knows English basics.
model_id = "gpt2" # You can replace this with bigger models like Llama-3
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
# 2. PREPARE THE DATASET
# This is a sample dataset. For real fine-tuning, you would load your own
# custom .json or .csv files here.
dataset = load_dataset("imdb", split="train[:1%]") # Using 1% for a quick demo

In [12]:
# 3. CONFIGURE THE TRAINING SETTINGS
# These are the "knobs" we turn to control how the model learns.
training_args = TrainingArguments(
    output_dir="./my_special_model", # Where to save the model
    per_device_train_batch_size=4,   # How many examples to show the AI at once
    num_train_epochs=3,              # How many times to loop through the data
    learning_rate=2e-5,              # How "fast" the model should update its brain
    logging_steps=10,                # Print progress every 10 steps
    save_strategy="epoch",           # Save the model after every loop
)

In [13]:
# 4. START THE FINE‑TUNING
# The Trainer acts like a "teacher" that shows the data to the model.

# 1. tokenize the text column
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized = dataset.map(tokenize_fn, batched=True)

# causal‑LM training: labels = input_ids
tokenized = tokenized.map(lambda ex: {"labels": ex["input_ids"]}, batched=True)

# we don't need the original text/label columns any more
tokenized = tokenized.remove_columns(["text", "label"])

# 2. create the trainer
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized,
)

Truncating train dataset:   0%|          | 0/250 [00:00<?, ? examples/s]

In [14]:
# This command starts the actual retraining process
trainer.train()

# 5. SAVE YOUR NEW BRAIN
# Now you have a version of GPT-2 that understands your specific data better.
trainer.save_model("./my_special_model")

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.8613
20,3.8892
30,3.9078
40,3.7532
50,3.696
60,3.6151
70,3.7235
80,3.6707
90,3.6168
100,3.5748




In [15]:
# 6. USE YOUR FINE-TUNED MODEL
# Load the fine-tuned model and tokenizer
from transformers import pipeline

# Load your fine-tuned model
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./my_special_model")
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./my_special_model")

# Method 1: Using the pipeline (easier)
generator = pipeline("text-generation", model=fine_tuned_model, tokenizer=fine_tuned_tokenizer)

# Generate text
prompt = "This movie is"
output = generator(prompt, max_length=50, num_return_sequences=1)
print(output[0]['generated_text'])

# Method 2: Manual generation (more control)
input_ids = fine_tuned_tokenizer.encode(prompt, return_tensors="pt")
output_ids = fine_tuned_model.generate(input_ids, max_length=50, num_return_sequences=1)
generated_text = fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and

This movie is a satire of the American dream. It is a satire of a fantasy movie that has been made by the very real people who made it. It is a satire of the dream of the American dream. It has been made by the very real people who made it. Now, the Hollywood elite have taken control of the movie industry and they have turned it into a propaganda film. There is nothing to be said about that. It is a propaganda film. It is a propaganda film that has been made by the very real people who made it. The film is a propaganda film. It has been made by the very real people who made it. The film is a propaganda film. It has been made by the very real people who made it. This is an interesting movie. It is a fascinating movie. It is a fascinating movie that has a lot of plot holes. There are very few plot holes. The movie is a comedy. It is a comedy with a lot of plot holes. The movie is a comedy with a lot of plot holes. It is a comedy. It is a comedy with a lot of plot holes. There are very fe