# Fine-tuning the DistilBERT Model from HuggingFace

---

## Import data to fine-tune model

In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('../../assets/spam-data.csv')

In [3]:
dataset.head(), dataset.shape

(   label                                               text
 0      0  Go until jurong point, crazy.. Available only ...
 1      0                      Ok lar... Joking wif u oni...
 2      1  Free entry in 2 a wkly comp to win FA Cup fina...
 3      0  U dun say so early hor... U c already then say...
 4      0  Nah I don't think he goes to usf, he lives aro...,
 (5572, 2))

## Extract dependent and independent features

In [4]:
X = list(dataset['text'])
y = list(dataset['label'])

## Train-test Split

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=14)

## Use a HuggingFace Model

Generally, the steps involved in using a model from HuggingFace involves,
1. Calling the pre-trained model
2. Calling the model's tokenizer - since each model has it's own tokenizer
3. Use the tokenizer to encode the train and test datasets
   1. `truncation` - remove whitespace from each data point
   2. `padding` - conform all data points to the same length

In [6]:
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
trainEncoded = tokenizer(X_train, truncation=True, padding=True)
testEncoded = tokenizer(X_test, truncation=True, padding=True)

In [8]:
print(testEncoded)

{'input_ids': [[101, 2440, 3684, 6643, 1024, 1011, 1007, 1045, 2031, 6611, 2098, 3514, 6643, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 14935, 1997, 2251, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

## Create Dataset Objects with Tensorflow

In tensorflow, the dataset objects are tensors. We do this so data flows through our pipeline in the expected format.

In [9]:
import tensorflow as tf

trainDataset = tf.data.Dataset.from_tensor_slices((
    dict(trainEncoded),
    y_train
))

testDataset = tf.data.Dataset.from_tensor_slices((
    dict(testEncoded),
    y_test
))

In [10]:
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

RuntimeError: Failed to import transformers.models.distilbert.modeling_tf_distilbert because of the following error (look up to see its traceback):
[WinError 127] The specified procedure could not be found

In [None]:
trainingArguments - TFTrainingArguments(
    output_dir = '../models/results/distilbert',
    num_training_epochs = 2,
    per_device_train_batch_size = 4,
    per_device_eval_batch_size = 8,
    warmup_steps = 100,
    weight_decay = 0.01,
    logging_dir = '../models/logs/distilbert',
    logging_steps = 10
)