<a href="https://colab.research.google.com/github/AbhijeetKumarThakur2198/Python_Projects/blob/main/Projects_List/Fine-Tune_Hugging_Face_GPT2_Model/FineTuneHuggingFaceGPT2Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this project, we will fine-tune the Hugging Face GPT-2 model using the `Trainer` class from the transformers library.

In [None]:
#@title Install dependencies
# IPython shell escape: installs transformers, torch and accelerate with pip.
!pip install transformers torch accelerate

In [None]:

#@title Fine-Tune GPT2 Model
#@markdown Fill Hyperparameters
# Colab form fields. Each `#@param` annotation has to stay on the same line as
# the assignment it describes. All values are handed to fine_tune() at the end
# of this cell.
model_name = "gpt2"  #@param ["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl"]
file_path = "data.txt"  #@param  {type:"string"}
output_path = "Fine_Tuned_Model"  #@param  {type:"string"}
batch_size = 16  #@param  {type:"integer"}
num_epochs = 50  #@param  {type:"integer"}
# The annotation was previously spelled `#@ param`, which does not match the
# `#@param` marker used by every other field in this form.
block_size = 124  #@param  {type:"integer"}
learning_rate = 5e-5  #@param  {type:"number"}
save_steps = 10000  #@param  {type:"integer"}
overwrite_output_path = True  #@param  {type:"boolean"}

# IMPORT NECESSARY MODULES
# NOTE(review): every handler below only prints a message and lets the cell
# keep running, so a failed import resurfaces later as a NameError at the
# first use of the missing name.
try:
    import re
    import os
except Exception as e:
    print(f"Error: {e}")

try:
    import torch
except ModuleNotFoundError:
    print("Pytorch module not found in your environment please download it!")
except Exception as e:
    print(f"Error: {e}")

try:
    from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
except ModuleNotFoundError:
    print("Transformers module not found in your environment please download it!")
except Exception as e:
    print(f"Error: {e}")

# DEFINE FUNCTIONS
def read_txt(file_path, encoding="utf-8"):
    """Return the full contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded contents of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file's bytes are not valid in ``encoding``.
    """
    with open(file_path, "r", encoding=encoding) as file:
        return file.read()

def load_dataset(file_path, tokenizer, block_size):
    """Build the training dataset for a text file.

    Args:
        file_path: Path of the text file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size handed to ``TextDataset``.

    Returns:
        The constructed ``TextDataset`` instance.
    """
    # NOTE(review): TextDataset is reported as deprecated in recent
    # transformers releases -- confirm against the installed version.
    dataset_kwargs = {
        "tokenizer": tokenizer,
        "file_path": file_path,
        "block_size": block_size,
    }
    return TextDataset(**dataset_kwargs)

def fine_tune(
    train_file_path,
    model_name,
    output_path,
    overwrite_output_path,
    per_device_train_batch_size,
    num_train_epochs,
    save_steps,
    learning_rate,
    block_size,
):
    """Fine-tune a pretrained GPT-2 checkpoint on a plain-text file.

    Loads the model and tokenizer named by ``model_name``, builds the training
    dataset from ``train_file_path`` and trains with ``Trainer``. On success
    the trained model is written to ``output_path`` next to the tokenizer
    files.

    Args:
        train_file_path: Path of the text file used as training data.
        model_name: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        output_path: Directory that receives the tokenizer, checkpoints and
            the final model. Created if it does not exist.
        overwrite_output_path: Forwarded as ``overwrite_output_dir``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        save_steps: Forwarded to ``TrainingArguments``.
        learning_rate: Forwarded to ``TrainingArguments``.
        block_size: Block size used when building the dataset.

    Returns:
        None. An exception raised while training or saving is reported with a
        printed message instead of being propagated.
    """
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # The dataset is built straight from the file on disk. An earlier version
    # also read the file and collapsed blank lines into a local string that
    # was never used; that dead work has been removed, so the training data
    # is exactly what it was before.
    train_dataset = load_dataset(train_file_path, tokenizer, block_size)

    os.makedirs(output_path, exist_ok=True)

    # Only the tokenizer is written up front. The untrained base weights are
    # deliberately not saved here: if training fails, output_path must not
    # contain a loadable model that could be mistaken for a fine-tuned one.
    tokenizer.save_pretrained(output_path)

    training_args = TrainingArguments(
        output_dir=output_path,
        overwrite_output_dir=overwrite_output_path,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=save_steps,
        learning_rate=learning_rate,
        do_train=True
    )

    try:
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
            train_dataset=train_dataset
        )
        trainer.train()
        # Writes the trained weights into training_args.output_dir.
        trainer.save_model()

        print("Your fine-tuning process is completed. Now you can use your fine-tuned model.")
    except Exception as e:
        print(f"Oh! It seems like something went wrong: {str(e)}. Please check all information again or open GitHub issue!")

# FINE TUNE MODEL
# Runs the training with the values chosen in the form at the top of this cell.
fine_tune(
    train_file_path=file_path,
    model_name=model_name,
    output_path=output_path,
    overwrite_output_path=overwrite_output_path,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    save_steps=save_steps,
    learning_rate=learning_rate,
    block_size=block_size
)

In [None]:

#@title Inference Fine-Tuned Model
#@markdown Fill Configuration
# Colab form fields for the generation code further down in this cell.
import torch
# max_length, temperature and num_return_sequences are passed to
# model.generate(); skip_special_tokens is passed to tokenizer.decode().
max_length = 50  #@param {type:"integer"}
temperature = 0.7  #@param {type:"number"}
# NOTE(review): the attention mask below is built with torch.ones() from the
# encoded prompt's shape, so values other than "pt" presumably do not work --
# confirm before exposing this as a free-form field.
return_tensors = "pt"  #@param {type:"string"}
num_return_sequences = 1  #@param {type:"integer"}
skip_special_tokens = True  #@param {type:"boolean"}
# Prompt that generation continues from.
seed_text = "Once upon a time"  #@param {type:"string"}
# Directory the tokenizer and model are loaded from; presumably the
# output_path used by the training cell -- verify they match.
Fine_Tuned_Model_Path = "Fine_Tuned_Model"  #@param {type:"string"}

# IMPORT NECESSARY MODULES
# NOTE(review): the handlers below only print a message and let the cell keep
# running, so a failed import resurfaces later as a NameError at the first use
# of the missing name.
try:
    import torch
except ModuleNotFoundError:
    print("Pytorch module not found in your environment please download it!")
except Exception as e:
    print(f"Error: {e}")

try:
    from transformers import GPT2Tokenizer, GPT2LMHeadModel
except ModuleNotFoundError:
    print("transformers module not found in your environment please download it!")
except Exception as e:
    print(f"Error: {e}")

# LOAD MODEL
tokenizer = GPT2Tokenizer.from_pretrained(Fine_Tuned_Model_Path)
model = GPT2LMHeadModel.from_pretrained(Fine_Tuned_Model_Path)

# ENCODE SEED TEXT INTO INTEGERS
encoded_text = tokenizer.encode(seed_text, return_tensors=return_tensors)

# CREATE ATTENTION MASK
# The prompt is a single unpadded sequence, so every position is marked 1.
attention_mask = torch.ones(encoded_text.shape, dtype=torch.long)

# GENERATE OUTPUT
output = model.generate(
    encoded_text,
    max_length=max_length,
    num_return_sequences=num_return_sequences,
    temperature=temperature,
    do_sample=True,
    attention_mask=attention_mask,  # Pass attention mask
    pad_token_id=tokenizer.eos_token_id  # Set pad token ID
)

# DECODE INTEGERS TO STRINGS AND PRINT
# Decode every returned row rather than only output[0], so that setting
# num_return_sequences above 1 no longer silently drops the extra sequences.
decoded_texts = [
    tokenizer.decode(sequence, skip_special_tokens=skip_special_tokens)
    for sequence in output
]
# Kept under the original name for any later cell that reads it.
decoded_text = decoded_texts[0]
for generated in decoded_texts:
    print(generated)