In [1]:
pip install transformers datasets accelerate evaluate





[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip





In [2]:
import pandas as pd
from datasets import Dataset

# Load dataset
df = pd.read_csv("instruction_dataset.csv")

# Combine instruction + input as a prompt
def create_prompt(example):
    instruction = example['instruction']
    input_text = example.get('input', '')
    prompt = f"{instruction}: {input_text}" if input_text else instruction
    return prompt

df['prompt'] = df.apply(create_prompt, axis=1)
df['labels'] = df['output']

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df[['prompt', 'labels']])

In [3]:
df.head()

Unnamed: 0,instruction,output,prompt,labels
0,What is the capital of France?,Paris,What is the capital of France?,Paris
1,What is 2 + 2?,4,What is 2 + 2?,4
2,Who wrote Hamlet?,William Shakespeare,Who wrote Hamlet?,William Shakespeare
3,What's the boiling point of water?,100 degrees Celsius,What's the boiling point of water?,100 degrees Celsius
4,Translate 'Hello' to Spanish.,Hola,Translate 'Hello' to Spanish.,Hola


In [4]:
dataset

Dataset({
    features: ['prompt', 'labels'],
    num_rows: 103
})

In [5]:
df.shape

(103, 4)

In [6]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")

def preprocess(example):
    input_enc = tokenizer(example['prompt'], truncation=True, padding='max_length', max_length=512)
    target_enc = tokenizer(example['labels'], truncation=True, padding='max_length', max_length=128)

    input_enc['labels'] = target_enc['input_ids']
    return input_enc

tokenized_dataset = dataset.map(preprocess, batched=True)


Map:   0%|          | 0/103 [00:00<?, ? examples/s]

In [7]:
tokenizer

T5TokenizerFast(name_or_path='google/flan-t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>

In [8]:
tokenized_dataset

Dataset({
    features: ['prompt', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 103
})

In [9]:
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

In [10]:
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [17]:
pip install huggingface_hub 

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from huggingface_hub import login

In [27]:
login(token="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")

In [28]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [29]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./flan-t5-small-finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./logs",
    push_to_hub=True,  # Optional
    hub_model_id="ayushparwal2004/text-gen-v1-small",
)


In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)


  trainer = Trainer(


In [31]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


TrainOutput(global_step=39, training_loss=35.235789763621796, metrics={'train_runtime': 1002.5741, 'train_samples_per_second': 0.308, 'train_steps_per_second': 0.039, 'total_flos': 57440165953536.0, 'train_loss': 35.235789763621796, 'epoch': 3.0})

In [32]:
model.push_to_hub("ayushparwal2004/text-gen-v1-small")
tokenizer.push_to_hub("ayushparwal2004/text-gen-v1-small")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/ayushparwal2004/text-gen-v1-small/commit/d33f2af49832b6c45b74d1b67db5c464c07377c4', commit_message='Upload tokenizer', commit_description='', oid='d33f2af49832b6c45b74d1b67db5c464c07377c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/ayushparwal2004/text-gen-v1-small', endpoint='https://huggingface.co', repo_type='model', repo_id='ayushparwal2004/text-gen-v1-small'), pr_revision=None, pr_num=None)