In [7]:
import tensorflow as tf

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m91.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [10]:
from transformers import pipeline


# directly do sentiment-analysis without choosing any transformer. It will choose some default.
classifier = pipeline("sentiment-analysis")
classifier(
    [
        "I've been waiting for a HuggingFace course my whole life.",
        "I hate this so much!",
    ]
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


[{'label': 'POSITIVE', 'score': 0.9598048329353333},
 {'label': 'NEGATIVE', 'score': 0.9994558691978455}]

# Chosing specific transformer for tokenizing the input words. This tokenized input words should only be fed into same transformer and not to the other transformer. 

In [17]:
from transformers import AutoTokenizer


# preparing the tokenizer by picking the model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


# sample input
raw_inputs = [
    "I've been waiting for a HuggingFace course my whole life.",
    "I hate this so much!",
]


# feeding those inputs to the tokenizer for tokenizing those input words and saving those tokens in the form of tensorflow tensor. 
inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="tf")
print(inputs)


# after running the code you will see two multidimensional numpy array. The first array contains tokenized words of both of the sentences and another numpy array represents the attention mask which contains 1 and zero. it is important in transformer so that while in encoder or decoder, the model is look only to the words that are tokenized. Those zero means there are no words. they are just there because of padding 

{'input_ids': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662,
        12172,  2607,  2026,  2878,  2166,  1012,   102],
       [  101,  1045,  5223,  2023,  2061,  2172,   999,   102,     0,
            0,     0,     0,     0,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2, 16), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}


# After Tokenizing, feeding them to the same model

In [18]:
from transformers import TFAutoModel


# preparing the model
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModel.from_pretrained(checkpoint)   # it will download the model and save it in cache


# The inputs variable is from previous cell which contains numeric tokens
outputs = model(inputs)
print(outputs.last_hidden_state.shape)


# That (2,16,768) shape coming in output means that there are two sentences which is processed. 16 tokens/unique words. And for each of those 16 words, there is 768 length vector which represent each token. meaning if input is "Dilip is ML engineer" the length of vector representing "Dilip" token would be 768 and other token will be like the same

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['dropout_19', 'classifier', 'pre_classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


(2, 16, 768)


In [21]:
# lets see how it looks like
outputs

TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(2, 16, 768), dtype=float32, numpy=
array([[[-0.17978022,  0.23332833,  0.6320985 , ..., -0.3016663 ,
          0.50082004,  0.14814392],
        [ 0.2757767 ,  0.6497122 ,  0.3199771 , ..., -0.07599561,
          0.5136171 ,  0.13292241],
        [ 0.904585  ,  0.09851379,  0.29497236, ...,  0.33519453,
         -0.14074168, -0.6464028 ],
        ...,
        [ 0.1465893 ,  0.5660602 ,  0.32352817, ..., -0.33757487,
          0.5099777 , -0.05610804],
        [ 0.75000465,  0.04872592,  0.17379996, ...,  0.4684146 ,
          0.00296628, -0.6083754 ],
        [ 0.05194408,  0.3729484 ,  0.5223324 , ...,  0.35840553,
          0.65004265, -0.38829806]],

       [[-0.29370636,  0.7282561 , -0.14972661, ..., -0.11868094,
         -1.0226722 , -0.04215677],
        [-0.220636  ,  0.93838435, -0.09512489, ..., -0.36431676,
         -0.6605218 ,  0.2406973 ],
        [-0.15360779,  0.8987497 , -0.07276388, ..., -0.21891758,
         -0.8

# Custom Sentiment Analysis using numeric representation made by transformer as input

Now using above transformer model, i have converted my custom texts to the high dimensional numeric representation and i am going to use that numeric representation (just like the values coming in outputs variable in above code cell) to pass them to my custom feed forward neural network followed by the sigmoid layer for binary classification of the text

In [42]:
import tensorflow as tf
from transformers import TFAutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split

# Define the text input and corresponding labels
text_input = ["you are good", "you are bad", "I love you","life is useless"]
labels = [1, 0, 1,0]

# Split the data into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(text_input, labels, test_size=0.2, random_state=42)

# Load the DistilBERT model and tokenizer
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint)

# Tokenize the input texts and convert them to model inputs
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors="tf")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors="tf")

# Obtain the last-layer hidden states from the model
train_outputs = model(train_encodings.input_ids)[0]
test_outputs = model(test_encodings.input_ids)[0]



sentiment_model = tf.keras.Sequential([
    tf.keras.layers.GlobalMaxPooling1D(),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# Compile the sentiment analysis model
sentiment_model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Convert labels to numpy arrays
train_labels = tf.convert_to_tensor(train_labels)
test_labels = tf.convert_to_tensor(test_labels)

# Train the sentiment analysis model
sentiment_model.fit(train_outputs, train_labels, validation_data=(test_outputs, test_labels), epochs=10, batch_size=32)


Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['dropout_19', 'classifier', 'pre_classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f1d0b7728f0>

In [43]:
# Testing with our test dataset splitted in above code cell
print(test_texts)
sentiment_model.predict(test_outputs)



['you are bad']


array([[0.00133191]], dtype=float32)

Now Testing with our custom input. For testing with below texts i have to first convert them into the meaninful representation with the same transformer model i have used above. As my actual model "sentiment_model" used above takes input from the numeric representation made by the transformer model.

So first we will tokenize the input sentences, pass them to the transformer model for numeric representation and then pass that as input to the sentiment_model.predict() function

In [44]:
# Tokenize the input texts and convert them to model inputs
custom_test_texts=[
    "I absolutely loved the movie. It was captivating from start to finish.",
    "The customer service was terrible. I had a horrible experience.",
    "The book was quite intriguing and kept me hooked until the last page.",
    "The restaurant had an amazing ambiance and the food was delicious.",
    "I was disappointed with the quality of the product. It did not meet my expectations.",
    "The performance was outstanding. The actors delivered exceptional performances.",
    "I found the storyline confusing and hard to follow. It lacked coherence.",
    "The hotel room was dirty and poorly maintained. I wouldn't recommend staying there.",
    "The concert was electrifying. The energy in the venue was incredible.",
    "The software crashed multiple times. It was frustrating and caused a lot of inconvenience.",
    "I thought the experience will be worst, bad, and horrible but it ended up really amazing and good"
]
validation_encodings = tokenizer(custom_test_texts, truncation=True, padding=True, return_tensors="tf")

# Obtain the last-layer hidden states from the model
validation_outputs = model(validation_encodings.input_ids)[0]


predictions=sentiment_model.predict(validation_outputs)



for sentence, prediction in zip(custom_test_texts, predictions):
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    print(f"Sentence: {sentence}")
    print(f"Sentiment: {sentiment}")



Sentence: I absolutely loved the movie. It was captivating from start to finish.
Sentiment: Positive
Sentence: The customer service was terrible. I had a horrible experience.
Sentiment: Negative
Sentence: The book was quite intriguing and kept me hooked until the last page.
Sentiment: Positive
Sentence: The restaurant had an amazing ambiance and the food was delicious.
Sentiment: Positive
Sentence: I was disappointed with the quality of the product. It did not meet my expectations.
Sentiment: Negative
Sentence: The performance was outstanding. The actors delivered exceptional performances.
Sentiment: Positive
Sentence: I found the storyline confusing and hard to follow. It lacked coherence.
Sentiment: Negative
Sentence: The hotel room was dirty and poorly maintained. I wouldn't recommend staying there.
Sentiment: Negative
Sentence: The concert was electrifying. The energy in the venue was incredible.
Sentiment: Positive
Sentence: The software crashed multiple times. It was frustrating 

# Doing with hugging face sequence classification model

In [36]:
from transformers import TFAutoModelForSequenceClassification

checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint)

#this inputs variable is from very above code cells where we have done tokenization to our input sentences
outputs = model(inputs)
print(outputs.logits.shape)



# the logit scores will come as output from this model. for each of the sentence there will be two scores. (for our two input sentences there will be 2*2 dimensional matrix)

print(outputs.logits)

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized: ['dropout_291']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(2, 2)
tf.Tensor(
[[-1.5606955  1.6122805]
 [ 4.169232  -3.3464477]], shape=(2, 2), dtype=float32)


Postprocessing: now let's convert this logits score to the probability score and see the respective label associated with those probability sores.

In [40]:
probability_scores = tf.math.softmax(outputs.logits, axis=-1)
print(probability_scores)
# To get the labels corresponding to each position, we can inspect the id2label attribute of the model config:
print(model.config.id2label)

# so for first sentence there are two labels, negative and positive and for second sentence also

tf.Tensor(
[[4.0195443e-02 9.5980453e-01]
 [9.9945587e-01 5.4418371e-04]], shape=(2, 2), dtype=float32)
{0: 'NEGATIVE', 1: 'POSITIVE'}
