## Fine tuning Huggingface model

In [23]:
# Dependencies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
import numpy as np
import tensorflow as tf
import nltk
from collections import Counter
import string
import itertools
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import tensorflow_text as text
import tensorflow_hub as hub
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import GlobalMaxPool1D, BatchNormalization, Dense, RNN, GRU, LSTM, TimeDistributed, Bidirectional, Activation, Embedding, Input, Conv1D, Dropout
import tensorflow as tf
import keras.backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from transformers import AutoTokenizer
from transformers import TFAutoModelForSequenceClassification
from scipy.special import softmax
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

plt.style.use('ggplot')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Data imports and preprocessing
with open('reviews.txt', 'r') as f:
  reviews = f.read()
with open('labels.txt', 'r') as f:
  labels = f.read()
  
# Remove punctuation
# reviews = "".join([char for char in reviews if char not in string.punctuation])

reviews_num = 100

reviews = reviews.split('\n')[:reviews_num]
labels = labels.split('\n')

labels = [1 if label == "positive" else 0 for label in labels][:reviews_num]

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size = 0.2, random_state = 1)

checkpoint = 'distilbert-base-uncased'

# Define tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint)

# Apply tokenizer to create encodings
X_train_encodings = tokenizer(X_train, truncation = True, padding = True)
X_test_encodings = tokenizer(X_test, truncation = True, padding = True)

In [9]:
# Create tensorflow dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
  dict(X_train_encodings),
  y_train
))

test_dataset = tf.data.Dataset.from_tensor_slices((
  dict(X_test_encodings),
  y_test
))

In [11]:
# Define training arguments
training_args = TFTrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10
)

In [13]:
# Define Tensorflow Trainer object
with training_args.strategy.scope():
  model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint)
  
trainer = TFTrainer(
  model = model,
  args = training_args,
  train_dataset = train_dataset,
  eval_dataset = test_dataset
)

trainer.train()

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_layer_norm', 'activation_13', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_39', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

In [14]:
# Model evaluation dataset
eval_encodings = tokenizer(reviews[:3], truncation = True, padding = True)
eval_dataset = tf.data.Dataset.from_tensor_slices((
  dict(eval_encodings),
  [0] * len(reviews[:3])
))

In [20]:
# Sample predictions
predictions = trainer.predict(eval_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)
predictions

PredictionOutput(predictions=array([[-0.10553046, -0.00838621],
       [-0.08096523, -0.03045585],
       [-0.12836005,  0.01417688]], dtype=float32), label_ids=array([0, 0, 0]), metrics={'eval_loss': 0.7428574562072754})

In [22]:
predicted_labels

array([1, 1, 1], dtype=int64)

### Model fine tuning using native Tensorflow

In [28]:
# Define tensorflow model object
model = TFDistilBertForSequenceClassification.from_pretrained(checkpoint)

optimizer = tf.keras.optimizers.Adam(learning_rate = 5e-5)
model.compile(optimizer = optimizer, loss = model.compute_loss)
model.fit(train_dataset, epochs = 3, batch_size = 4)