## Loading & splitting the Dataset

In [1]:
from datasets import load_dataset

dataset=load_dataset('json',data_files='smart_home_dataset.json')
dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 2000
    })
})

In [2]:
split_1=dataset['train'].train_test_split(test_size=0.2,seed=42)
split_2=split_1['test'].train_test_split(test_size=0.5,seed=42)

dataset={
    'train':split_1['train'],
    'validation':split_2['train'],
    'test':split_2['test']
}

In [3]:
dataset

{'train': Dataset({
     features: ['input', 'output'],
     num_rows: 1600
 }),
 'validation': Dataset({
     features: ['input', 'output'],
     num_rows: 200
 }),
 'test': Dataset({
     features: ['input', 'output'],
     num_rows: 200
 })}

In [4]:
dataset['test'][:3]

{'input': ['Increase the thermostat in the kitchen to 100.',
  'Decrease the alarm in the bedroom to 100.',
  'Stop the fan in the garage.'],
 'output': ['Increase(thermostat, 100)', 'Decrease(alarm, 100)', 'Stop(fan)']}

## Toknizing

In [5]:
from transformers import AutoTokenizer

tokenizer=AutoTokenizer.from_pretrained('t5-small')

In [6]:
def preprocess_function(dataset):
  model_inputs=tokenizer(dataset['input'],truncation=False,max_length=64,padding='max_length')
  labels=tokenizer(dataset['output'],truncation=False,max_length=64,padding='max_length')
  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

In [7]:
dataset_train=dataset['train'].map(preprocess_function,batched=True)
dataset_validation=dataset['validation'].map(preprocess_function,batched=True)
dataset_test=dataset['test'].map(preprocess_function,batched=True)

Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [8]:
dataset_train

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1600
})

In [9]:
dataset_train[0]

{'input': 'Pause the door in the bedroom.',
 'output': 'Pause(door)',
 'input_ids': [22631,
  8,
  1365,
  16,
  8,
  2923,
  5,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [22631,
  599,
  11968,
  61,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


## Training

In [10]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,       # Evaluate at the end of each epoch
    eval_strategy="epoch",         # Evaluate at the end of each epoch
    save_strategy="epoch", 
    num_train_epochs=3,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
)



In [11]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model,                       # The pre-trained model you want to fine-tune
    args=training_args,                # Your training arguments
    train_dataset=dataset_train,       # Your training set
    eval_dataset=dataset_validation,   # Your validation set
    tokenizer=tokenizer                # The tokenizer used
)

  trainer = Trainer(


In [13]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.0533,0.007989
2,0.0105,0.000969
3,0.0065,0.00066


Epoch,Training Loss,Validation Loss
1,0.0533,0.007989
2,0.0105,0.000969
3,0.0065,0.00066


TrainOutput(global_step=600, training_loss=0.42256366528570655, metrics={'train_runtime': 53.1235, 'train_samples_per_second': 90.355, 'train_steps_per_second': 11.294, 'total_flos': 81205080883200.0, 'train_loss': 0.42256366528570655, 'epoch': 3.0})

## Evaluation metric 1 ( Exact matching )

In [14]:
import torch
predictions = []
references = []

model.eval()
for example in dataset_test:
    input_ids = torch.tensor(example['input_ids']).unsqueeze(0).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=128)

    pred = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    label = tokenizer.decode(example['labels'], skip_special_tokens=True).strip()

    predictions.append(pred)
    references.append(label)


In [15]:
from evaluate import load

exact_match = load("exact_match")
em_result = exact_match.compute(predictions=predictions, references=references)

In [16]:
em_result

{'exact_match': np.float64(1.0)}

## Evaluation metric 2 ( Levenshtein similarity )

In [17]:
import editdistance

def compute_levenshtein_similarity(predictions, references):
    similarities = []
    for pred, ref in zip(predictions, references):
        pred = pred.strip()
        ref = ref.strip()
        dist = editdistance.eval(pred, ref)
        max_len = max(len(pred), len(ref))
        similarity = 1.0 - (dist / max_len) if max_len > 0 else 1.0
        similarities.append(similarity)
    avg_similarity = sum(similarities) / len(similarities)
    return avg_similarity

In [18]:
ls_result=compute_levenshtein_similarity(predictions,references)
ls_result

1.0

# Both Evaluations Side by Side

In [19]:
import pandas as pd
results={
    "Model": "t5-small-finetuned",
    "Exact Match": em_result,
    "Levenshtein Similarity": ls_result
}
results=pd.DataFrame(results)
results.reset_index(drop=True, inplace=True)
results

Unnamed: 0,Model,Exact Match,Levenshtein Similarity
0,t5-small-finetuned,1.0,1.0
