<a href="https://colab.research.google.com/github/Aastha031295/FineTuning/blob/main/FineTune_GPT2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the necessary libraries
!pip install transformers datasets bitsandbytes peft

# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

# Load dataset
import pandas as pd

# Provide the correct path to your file (check that this path is accurate after mounting)
dataset_path = '/content/drive/MyDrive/combined_dataset.csv'  # Adjust 'MyDrive' if necessary
df = pd.read_csv(dataset_path)

# Check the data types of the columns
print(df.dtypes)

# Check for missing or unusual values in the label column
print(df['label'].unique())  # Replace 'label' with the actual label column name

# Prepare the dataset for Hugging Face's transformers
from datasets import Dataset

# Convert the dataframe to a Hugging Face Dataset (adjust column names as needed)
train_data = Dataset.from_pandas(df[['text', 'label']])
train_data = train_data.train_test_split(test_size=0.1)

# Load GPT-2 tokenizer and model
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'gpt2'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# GPT-2 doesn't have a padding token by default, so we add it
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Preprocess the data by tokenizing inputs
def preprocess_function(examples):
    inputs = examples['text']
    model_inputs = tokenizer(inputs, max_length=512, padding='max_length', truncation=True)

    # Labels are the same as the inputs for causal LM
    model_inputs['labels'] = model_inputs['input_ids'].copy()
    return model_inputs

# Tokenize the dataset
tokenized_dataset = train_data.map(preprocess_function, batched=True, remove_columns=['text', 'label'])

# Define the training arguments
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    report_to="none",  # Disable logging to external services like WandB or Huggingface Hub
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    save_strategy = "epoch" # Add this line
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
)

# Fine-tune the model
trainer.train()


Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/23499 [00:00<?, ? examples/s]

Map:   0%|          | 0/2611 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,1.7905,1.752018


Epoch,Training Loss,Validation Loss
1,1.7905,1.752018
2,1.733,1.740775
3,1.751,1.737953


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=8814, training_loss=1.7652923907718612, metrics={'train_runtime': 11168.6831, 'train_samples_per_second': 6.312, 'train_steps_per_second': 0.789, 'total_flos': 1.8420304379904e+16, 'train_loss': 1.7652923907718612, 'epoch': 3.0})

In [2]:
# Evaluate the fine-tuned model
results = trainer.evaluate()

# Print the evaluation results
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 1.7379528284072876, 'eval_runtime': 119.6555, 'eval_samples_per_second': 21.821, 'eval_steps_per_second': 2.733, 'epoch': 3.0}


In [3]:
model.save_pretrained("/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm")
tokenizer.save_pretrained("/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm")

('/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/tokenizer_config.json',
 '/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/special_tokens_map.json',
 '/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/vocab.json',
 '/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/merges.txt',
 '/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/added_tokens.json',
 '/content/drive/MyDrive/GPT2/finetuned_GPT2_sarcasm/tokenizer.json')