###### Credits to hugging face documentation

In [None]:
import logging
logging.getLogger("transformers").setLevel(logging.ERROR) # This blocks all warnings

# Pipeline

The pipeline() is the easiest and fastest way to use a pretrained model for inference.
Start by creating an instance of pipeline()
The pipeline() downloads and caches a default pretrained model and tokenizer for sentiment analysis.
The pipeline() can accommodate any model from the Hub, making it easy to adapt the pipeline() for other use-cases.  

In [None]:
from transformers import pipeline
classifier = pipeline("sentiment-analysis","distilbert-base-uncased-finetuned-sst-2-english")
classifier(["I don't think anybody hates cows.","I don't think anybody hate cows."]) # huh!

In [None]:
model = pipeline('text-generation', "gpt2")
print(type(model))
print(model("Last night, I saw a cow")[0])

# AutomodelForCausalLM and AutoTokenizer

While pipeline() is an awesome way to use pre-trained models, it encapsulates all the working, like tokenising of the input, and back. To get all fine-grained control of the whole process, we use:

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Change the model_name to GPT-2
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Example text
text = "I saw a cow in the office, which"

# Tokenize the text
inputs = tokenizer(text, return_tensors="pt")

# Generate text using GPT-2
generated_ids = model.generate(input_ids=inputs["input_ids"], max_length=50, num_return_sequences=1)
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Print the generated text
print("Generated Text:", generated_text)


# Using GPU

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checking if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# initialising model and tokenizer for gpt2
gpt_model = AutoModelForCausalLM.from_pretrained('gpt2').to(device)
gpt_tokenizer = AutoTokenizer.from_pretrained('gpt2')

input_data = 'There was a time when I saw a cow in the office'

# Encoding the input and moving tensors to GPU
encoding = gpt_tokenizer(input_data, return_tensors="pt").to(device)

# Generate text using GPT-2
generated_ids = gpt_model.generate(input_ids=encoding['input_ids'], max_length=50, num_return_sequences=1)

# Move generated IDs to CPU before decoding
generated_ids = generated_ids[0].cpu()

# Decoding on CPU
generated_text = gpt_tokenizer.decode(generated_ids, skip_special_tokens=True)
print("Generated Text:", generated_text)


In [None]:
my_loc ="/home/arjun/Desktop/./pt_save" 

# Save the model

In [None]:
gpt_model.save_pretrained(my_loc)

# Load the model

In [None]:
loaded = AutoModelForCausalLM.from_pretrained(my_loc)
# print(loaded)

# Customising models

In [None]:
from transformers import AutoConfig, AutoModel

# Create a GPT-2 configuration with attention heads of 10. instead of the default 12
gpt2_config = AutoConfig.from_pretrained("gpt2", n_heads=10)

# Create a new GPT-2 model with the modified configuration
gpt2_model = AutoModel.from_config(gpt2_config)


# Trainer

All models are a standard torch.nn.Module so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a Trainer class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more.

In [20]:
from transformers import AutoModelForCausalLM, TrainingArguments, AutoTokenizer, DataCollatorForLanguageModeling, Trainer
from datasets import load_dataset

# Load GPT-2 model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Training arguments for fine-tuning GPT-2
training_args = TrainingArguments(
    output_dir="path/to/save/folder/",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=2,
)

# Load GPT-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Add a new padding token to the tokenizer
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Load and tokenize the dataset
dataset = ''
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)
dataset = dataset.map(tokenize_function, batched=True)

# Data collator for language modeling (GPT-2)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

# Create Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    data_collator=data_collator,
)

# Fine-tune GPT-2
trainer.train()


AttributeError: 'str' object has no attribute 'map'

In [25]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Step 1: Create a custom dataset in a text file named custom_dataset.txt

# Step 2: Load the custom dataset
dataset_path = "custom_dataset.txt"
with open(dataset_path, "r", encoding="utf-8") as f:
    custom_dataset = f.readlines()

# Step 3: Tokenize the dataset using the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenized_custom_dataset = tokenizer.batch_encode_plus(
    custom_dataset,
    max_length=1024,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)

# Step 4: Create DataLoader and define training arguments
train_dataset = TextDataset(tokenized_custom_dataset, tokenizer=tokenizer)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=2,
    shuffle=True,
)

training_args = TrainingArguments(
    output_dir="./gpt2-custom-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
)

# Step 5: Initialize the GPT-2 model and the Trainer, and start training
model = GPT2LMHeadModel.from_pretrained("gpt2")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()


Using pad_token, but it is not set yet.


ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.