# Importing necessary dependencies

In [96]:
from datasets import load_dataset, Dataset,  DatasetDict
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, EarlyStoppingCallback, AutoTokenizer, TrainingArguments, Trainer
import gc
import numpy as np

# Data loading and preprocessing

In [97]:
data_files = {"train": "/kaggle/input/intent-classifier-dataset/train.csv", "test": "/kaggle/input/intent-classifier-dataset/val.csv"}
from datasets import load_dataset
dataset = load_dataset("csv", data_files={"train": "/kaggle/input/intent-classifier-dataset/train.csv", "validation": "/kaggle/input/intent-classifier-dataset/val.csv"})

In [98]:
dataset = dataset.remove_columns("Unnamed: 0")

In [99]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 6153
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 684
    })
})

In [100]:
df = dataset['train'].to_pandas()
df.head()

Unnamed: 0,text,intent
0,"Thank you, goodbye",goodbye
1,Can you turn the volume down?,volume control
2,play games of xyz 64 bit,play games
3,covid cases,covid cases
4,good day,goodbye


In [101]:
# Specify the class label to downsample and the desired number of datapoints
class_to_downsample = 'greet and hello hi kind of things, general check in'
desired_num_datapoints = 350

# Get the indices of the rows belonging to the class to be downsampled
class_rows = df[df['intent'] == class_to_downsample]

# Downsample the class by randomly selecting the desired number of datapoints
downsampled_class_rows = class_rows.sample(n=desired_num_datapoints, random_state=42)

# Create a boolean mask to select the downsampled rows and the rows from other classes
mask = df.index.isin(downsampled_class_rows.index) | ~df['intent'].isin([class_to_downsample])

# Create a new DataFrame with the downsampled data
df = df[mask].reset_index(drop=True)

In [102]:
df['intent'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 5785 entries, 0 to 5784
Series name: intent
Non-Null Count  Dtype 
--------------  ----- 
5785 non-null   object
dtypes: object(1)
memory usage: 45.3+ KB


In [103]:
dataset['train'] = Dataset.from_pandas(df)

In [104]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 5785
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 684
    })
})

In [105]:
# Encode the 'Label' column
dataset = dataset.class_encode_column("intent")

Casting to class labels:   0%|          | 0/5785 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/684 [00:00<?, ? examples/s]

In [106]:
dataset['train'][0]

{'text': 'Thank you, goodbye', 'intent': 6}

In [107]:
label_mappings = dataset['train'].features['intent']
label_mappings

ClassLabel(names=['asking date', 'asking time', 'asking weather', 'click photo', 'covid cases', 'download youtube video', 'goodbye', 'greet and hello hi kind of things, general check in', 'i am bored', 'open website', 'places near me', 'play games', 'play on youtube', 'send email', 'send whatsapp message', 'take screenshot', 'tell me about', 'tell me joke', 'tell me news', 'volume control', 'what can you do'], id=None)

In [108]:
id2label = {}
for i in range(len(label_mappings.names)):
    id2label[i]=label_mappings.int2str(i)
id2label

{0: 'asking date',
 1: 'asking time',
 2: 'asking weather',
 3: 'click photo',
 4: 'covid cases',
 5: 'download youtube video',
 6: 'goodbye',
 7: 'greet and hello hi kind of things, general check in',
 8: 'i am bored',
 9: 'open website',
 10: 'places near me',
 11: 'play games',
 12: 'play on youtube',
 13: 'send email',
 14: 'send whatsapp message',
 15: 'take screenshot',
 16: 'tell me about',
 17: 'tell me joke',
 18: 'tell me news',
 19: 'volume control',
 20: 'what can you do'}

In [109]:
label2id = {}
for k,v in id2label.items():
    label2id[v] = k
label2id

{'asking date': 0,
 'asking time': 1,
 'asking weather': 2,
 'click photo': 3,
 'covid cases': 4,
 'download youtube video': 5,
 'goodbye': 6,
 'greet and hello hi kind of things, general check in': 7,
 'i am bored': 8,
 'open website': 9,
 'places near me': 10,
 'play games': 11,
 'play on youtube': 12,
 'send email': 13,
 'send whatsapp message': 14,
 'take screenshot': 15,
 'tell me about': 16,
 'tell me joke': 17,
 'tell me news': 18,
 'volume control': 19,
 'what can you do': 20}

In [110]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'intent'],
        num_rows: 5785
    })
    validation: Dataset({
        features: ['text', 'intent'],
        num_rows: 684
    })
})

# Train test and validation split

In [111]:
# Split the validation set into validation and test sets
validation_test_split = dataset['validation'].train_test_split(test_size=0.2)  # Adjust test_size as needed

# Add the new splits back to the DatasetDict
dataset['validation'] = validation_test_split['train']
dataset['test'] = validation_test_split['test']

In [112]:
dataset['train'][0]

{'text': 'Thank you, goodbye', 'intent': 6}

# Tokenizing text data

In [113]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [114]:
dataset = dataset.rename_column('intent', 'label')

In [116]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [117]:
print('The vocabulary size is:', tokenizer.vocab_size)

The vocabulary size is: 30522


In [132]:
print('Maximum context size:', tokenizer.model_max_length)

Maximum context size: 512


In [119]:
print('Name of the fields, model need in the forward pass:', tokenizer.model_input_names)

Name of the fields, model need in the forward pass: ['input_ids', 'attention_mask']


In [120]:
def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

In [121]:
data_encoded = dataset.map(tokenize, batched=True, batch_size=None)

Map:   0%|          | 0/5785 [00:00<?, ? examples/s]

Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

In [122]:
data_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5785
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 547
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 137
    })
})

In [123]:
len(label2id)

21

# Loading pretrained distilbert-base-uncased model

In [124]:
num_labels = len(label2id)
model_ckpt = "distilbert-base-uncased"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = (AutoModelForSequenceClassification
        .from_pretrained(model_ckpt, num_labels=num_labels, id2label=id2label, label2id=label2id)
        .to(device))

In [125]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [126]:
len(data_encoded['train'])

5785

In [127]:
data_encoded

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5785
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 547
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 137
    })
})

# Defining hyperparameters

In [128]:
batch_size = 8
logging_steps = len(data_encoded["train"]) // batch_size
model_name = f"/kaggle/working/{model_ckpt}-finetuned-intent"

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=100,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    log_level="error",
    report_to="none",
    load_best_model_at_end=True  # Ensure the best model is loaded at the end
)



# Setting up data and eval metrics for finetuning model

In [129]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_encoded["train"],
    compute_metrics=compute_metrics,
    eval_dataset=data_encoded["validation"],
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  
)

In [130]:
gc.collect()

81

In [131]:
torch.cuda.empty_cache()

# Model Training

In [42]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.153592,0.974406,0.974138
2,0.613100,0.112139,0.974406,0.974407
3,0.613100,0.097406,0.974406,0.974138
4,0.027400,0.125127,0.972578,0.972258
5,0.027400,0.103973,0.978062,0.978006
6,0.015200,0.131805,0.978062,0.978074




TrainOutput(global_step=2172, training_loss=0.2182417267399221, metrics={'train_runtime': 263.4276, 'train_samples_per_second': 2196.049, 'train_steps_per_second': 137.419, 'total_flos': 269502033130200.0, 'train_loss': 0.2182417267399221, 'epoch': 6.0})

# Model evaluation on test dataset

In [43]:
preds_output = trainer.predict(data_encoded["test"])



In [44]:
preds_output.metrics

{'test_loss': 0.06746798753738403,
 'test_accuracy': 0.9854014598540146,
 'test_f1': 0.9851955243347138,
 'test_runtime': 0.3629,
 'test_samples_per_second': 377.54,
 'test_steps_per_second': 24.802}

## Saving finetuned model

In [51]:
tokenizer.save_pretrained("/kaggle/working/my_model_with_labels")

('/kaggle/working/my_model_with_labels/tokenizer_config.json',
 '/kaggle/working/my_model_with_labels/special_tokens_map.json',
 '/kaggle/working/my_model_with_labels/vocab.txt',
 '/kaggle/working/my_model_with_labels/added_tokens.json',
 '/kaggle/working/my_model_with_labels/tokenizer.json')

In [46]:
trainer.save_model("/kaggle/working/my_model_with_labels")

# Inferencing fine tuned model

In [52]:
loaded_model = AutoModelForSequenceClassification.from_pretrained("/kaggle/working/my_model_with_labels")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/my_model_with_labels")

In [53]:
from transformers import pipeline

classifier = pipeline("text-classification", model=loaded_model, tokenizer=tokenizer)

# Example inference
result = classifier("I am getting bored, I wanna hear some jokes")
print(result)

[{'label': 'tell me joke', 'score': 0.962701678276062}]


In [54]:
# Tokenize input text
text = "I am getting bored, I wanna hear some jokes"
inputs = tokenizer(text, return_tensors="pt")

# Ensure model and inputs are on the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model = loaded_model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Forward pass
with torch.no_grad():
    outputs = loaded_model(**inputs)

# Get predicted class probabilities
probs = torch.softmax(outputs.logits, dim=-1).squeeze().tolist()

# Get index of highest probability
predicted_index = torch.argmax(outputs.logits, dim=-1).item()

print("Predicted Intent:- ", id2label[predicted_index])

Predicted Intent:-  tell me joke


# Downloading finetuned model folder

In [94]:
!zip -r file.zip /kaggle/working/my_model_with_labels

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/my_model_with_labels/ (stored 0%)
  adding: kaggle/working/my_model_with_labels/training_args.bin (deflated 51%)
  adding: kaggle/working/my_model_with_labels/tokenizer.json (deflated 71%)
  adding: kaggle/working/my_model_with_labels/tokenizer_config.json (deflated 76%)
  adding: kaggle/working/my_model_with_labels/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/my_model_with_labels/config.json (deflated 59%)
  adding: kaggle/working/my_model_with_labels/vocab.txt (deflated 53%)
  adding: kaggle/working/my_model_with_labels/model.safetensors (deflated 8%)


In [95]:
from IPython.display import FileLink
FileLink(r'file.zip')