# Importing lib's & Dataset

In [None]:
import pandas as pd
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.metrics import classification_report, confusion_matrix
from datasets import Dataset

In [None]:
df = pd.read_csv('/content/train.csv')
df = df.sample(n= 4000)
df

Unnamed: 0,text,intent
353,rate this series chronicle 0 points,RateBook
9253,i want to hear the album suites & sweets,PlayMusic
6592,book me a table for five at the top-rated moro...,BookRestaurant
10625,what s the weather in springside nature reserv...,GetWeather
3617,book a table in hallwood for one for supper,BookRestaurant
...,...,...
7573,what is the current place forecast,GetWeather
2514,book a spot for 8 at a tavern on jun the 11th...,BookRestaurant
11717,open vimeo and play music,PlayMusic
8495,find movie schedules,SearchScreeningEvent


In [None]:
df['intent'].value_counts()

Unnamed: 0_level_0,count
intent,Unnamed: 1_level_1
SearchScreeningEvent,606
PlayMusic,577
GetWeather,570
RateBook,569
BookRestaurant,567
SearchCreativeWork,565
AddToPlaylist,546


In [None]:
df.isnull().sum()

Unnamed: 0,0
text,0
intent,0


# Text Preprocessing

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

text = 'I am learning LLMs'
preprocessed_text = preprocess_text(text)
print(preprocessed_text)

learning llms


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
df['text'] = df['text'].apply(preprocess_text)
df['text']

Unnamed: 0,text
353,rate series chronicle 0 points
9253,want hear album suites & sweets
6592,book table five top-rated morocco restaurant
10625,weather springside nature reserve four pm
3617,book table hallwood one supper
...,...
7573,current place forecast
2514,book spot 8 tavern jun 11th 2034
11717,open vimeo play music
8495,find movie schedules


# Assign No to Unique Class/Labels

In [None]:
unique_classes = df['intent'].unique()
classes_id = {labels : i for i, labels in enumerate(unique_classes)}
df['classes'] = df['intent'].map(classes_id)
df

Unnamed: 0,text,intent,classes
353,rate series chronicle 0 points,RateBook,0
9253,want hear album suites & sweets,PlayMusic,1
6592,book table five top-rated morocco restaurant,BookRestaurant,2
10625,weather springside nature reserve four pm,GetWeather,3
3617,book table hallwood one supper,BookRestaurant,2
...,...,...,...
7573,current place forecast,GetWeather,3
2514,book spot 8 tavern jun 11th 2034,BookRestaurant,2
11717,open vimeo play music,PlayMusic,1
8495,find movie schedules,SearchScreeningEvent,4


# Loading Max-length & Tokenizer

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
max_leng = max([len(tokenizer.encode(text)) for text in df['text']])
print(f'Max Length: {max_leng}')

Max Length: 23


# Applying Tokenizer

In [None]:
def tokenization(dummy_text):
    tokenize_inps = tokenizer(dummy_text['text'], padding= 'max_length', truncation= True, max_length= max_leng)
    tokenize_inps['labels'] = dummy_text['classes'] # Changed 'classes' to 'labels'
    return tokenize_inps

dataset = Dataset.from_pandas(df[['text', 'classes']])
dataset = dataset.map(tokenization)
dataset[0]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

{'text': 'rate series chronicle 0 points',
 'classes': 0,
 '__index_level_0__': 353,
 'input_ids': [101,
  3446,
  2186,
  9519,
  1014,
  2685,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': 0}

# Fine-Tuning Model

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels= len(unique_classes))
device = 0 if torch.cuda.is_available() else -1

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
traning_args = TrainingArguments(
    output_dir = './results',
    logging_dir = './logs',
    eval_strategy = 'epoch',
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 64,
    num_train_epochs = 5,
    save_strategy = 'epoch',
    weight_decay = 0.01,
    learning_rate = 2e-5,
    load_best_model_at_end = True
)

In [None]:
trainer = Trainer(
    model = model,
    args = traning_args,
    train_dataset = dataset,
    eval_dataset = dataset
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.073062
2,0.335400,0.033642
3,0.335400,0.019759
4,0.030600,0.014414
5,0.030600,0.010476


TrainOutput(global_step=1250, training_loss=0.14982971992492675, metrics={'train_runtime': 107.3558, 'train_samples_per_second': 186.296, 'train_steps_per_second': 11.644, 'total_flos': 119024290680000.0, 'train_loss': 0.14982971992492675, 'epoch': 5.0})

# Getting Performance

In [None]:
predictions, true_labels, _ = trainer.predict(dataset)
predicted_labels = predictions.argmax(axis= 1)

print('**Classification Report:**')
print(classification_report(true_labels, predicted_labels))
print('**Confusion Matrix:**')
print(confusion_matrix(true_labels, predicted_labels))

**Classification Report:**
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       569
           1       1.00      1.00      1.00       577
           2       1.00      1.00      1.00       567
           3       1.00      1.00      1.00       570
           4       1.00      1.00      1.00       606
           5       1.00      1.00      1.00       546
           6       1.00      0.99      0.99       565

    accuracy                           1.00      4000
   macro avg       1.00      1.00      1.00      4000
weighted avg       1.00      1.00      1.00      4000

**Confusion Matrix:**
[[569   0   0   0   0   0   0]
 [  0 577   0   0   0   0   0]
 [  0   0 567   0   0   0   0]
 [  0   0   2 568   0   0   0]
 [  0   0   0   0 604   0   2]
 [  0   0   0   0   0 546   0]
 [  0   2   0   0   2   0 561]]


# Saving Model

In [None]:
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json')

# Inference Pred System

In [None]:
model = DistilBertForSequenceClassification.from_pretrained('./saved_model')
tokenizer = DistilBertTokenizer.from_pretrained('./saved_model')

In [None]:
id_to_label = {i : label for label, i in classes_id.items()}
id_to_label

{0: 'RateBook',
 1: 'PlayMusic',
 2: 'BookRestaurant',
 3: 'GetWeather',
 4: 'SearchScreeningEvent',
 5: 'AddToPlaylist',
 6: 'SearchCreativeWork'}

# Predictions

In [None]:
def predictions(text, model, tokenizer, max_length= 25):
    text = preprocess_text(text)
    inps = tokenizer(text, padding= 'max_length', truncation= True, max_length= max_length, return_tensors= 'pt')

    with torch.no_grad():
        outputs = model(**inps)
        logits = outputs.logits

    pred_class_id = torch.argmax(logits).item()
    pred_class = id_to_label[pred_class_id]

    return pred_class

# Test Examples

In [None]:
test_msg = 'Rate the book i just finish reading!'
predictions(test_msg, model, tokenizer)

'RateBook'

In [None]:
test_msg2 = "Play some relaxing music."
predictions(test_msg2, model, tokenizer)

'PlayMusic'

# Downloading Zip

In [None]:
import shutil
model_dir = './saved_model'
model.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
shutil.make_archive('distilBert_model', 'zip', model_dir)

'/content/distilBert_model.zip'