In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, Input
from keras import Model

!pip install transformers

In [2]:
from transformers import AutoTokenizer
from transformers import TFAutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer= AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFAutoModel.from_pretrained("bert-base-uncased")

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

## feeding tokenizer outputs to bert model example

In [4]:
max_len= 40

In [5]:
encoded_input= tokenizer(['I am Groot', 'wakanda forever'], padding= True, truncation= True, return_tensors= 'tf', max_length= max_len, return_token_type_ids= True)
encoded_input

{'input_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[  101,  1045,  2572, 24665, 17206,   102],
       [  101, 11333,  9126,  2850,  5091,   102]])>, 'token_type_ids': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(2, 6), dtype=int32, numpy=
array([[1, 1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1, 1]])>}

In [7]:
bert_model(encoded_input)

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 6, 768), dtype=float32, numpy=
array([[[-0.3327677 ,  0.42149922, -0.16797335, ..., -0.09722349,
          0.2623883 ,  0.37818265],
        [ 0.08121498, -0.11514756, -0.21918866, ..., -0.30390704,
         -0.07442201,  0.3168953 ],
        [-0.31906444,  0.25485864,  0.02986773, ..., -0.20651984,
          0.48025885,  0.07582793],
        [-0.65378463, -0.31161553,  0.5614989 , ..., -0.14978798,
          0.27655056,  0.17065088],
        [-1.1158617 , -0.24015881, -0.0070679 , ...,  0.23682293,
          0.75628185,  0.02180522],
        [ 0.8939981 ,  0.04288428, -0.38189763, ...,  0.05125692,
         -0.6297489 , -0.28878093]],

       [[-0.31611997, -0.2772767 ,  0.0022681 , ..., -0.41819984,
          0.4085982 ,  0.37561494],
        [ 0.28752905, -0.9181662 , -0.31021798, ...,  0.20761995,
         -0.89254284, -0.1056675 ],
        [-0.01256803, -0.6021403 , -0.7832769 , ..., -0.48230404

### Data Reading

In [8]:
import numpy as np
import pandas as pd

In [9]:
data= pd.read_csv(r"F:\AI-ML\NLP\resources\twitter_training.csv", names= ['id', 'area', 'sentiment', 'review'])
data.head()

Unnamed: 0,id,area,sentiment,review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [36]:
test_data= pd.read_csv(r'F:\AI-ML\NLP\resources\twitter_validation.csv',  names= ['id', 'area', 'sentiment', 'review'])
test_data.head()

Unnamed: 0,id,area,sentiment,review
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [None]:
list(data['review'].values)

### Preprocessing

In [11]:
import re
from string import punctuation

def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in punctuation])
    text = re.sub(r'\d+', '', text)
    return text

In [12]:
data['review']= data['review'].astype('str')
data['sentiment']= data['sentiment'].astype('str')

In [13]:
data['review']= data['review'].apply(clean_text)

In [14]:
num_classes= len(data['sentiment'].unique())

### Tokenization

In [15]:
encoded= tokenizer(list(data['review'].values), padding= True, truncation= True, return_tensors='tf', max_length= max_len, return_token_type_ids= True)
encoded

{'input_ids': <tf.Tensor: shape=(74682, 40), dtype=int32, numpy=
array([[  101, 10047,  2893, ...,     0,     0,     0],
       [  101,  1045,  2572, ...,     0,     0,     0],
       [  101, 10047,  2893, ...,     0,     0,     0],
       ...,
       [  101,  2074,  3651, ...,     0,     0,     0],
       [  101,  2074,  3651, ...,     0,     0,     0],
       [  101,  2074,  2066, ...,     0,     0,     0]])>, 'token_type_ids': <tf.Tensor: shape=(74682, 40), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])>, 'attention_mask': <tf.Tensor: shape=(74682, 40), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [16]:
# inputs
input_ids= encoded['input_ids']
attention_mask= encoded['attention_mask']
token_type_ids= encoded['token_type_ids']

In [None]:
data.iloc[:50,:]['sentiment']

In [51]:
from sklearn.preprocessing import LabelEncoder

le= LabelEncoder()
le.fit(data['sentiment'])

In [None]:
data['sentiment']= data['sentiment'].apply(lambda x: le.transform([x])[0])

In [22]:
from keras.utils import to_categorical

labels= to_categorical(data['sentiment'])
labels

array([[0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       ...,
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

### Building Model

#### Input > Tokenizer > bert

In [60]:
def create_model():
    input_ids= Input(shape=(max_len,), dtype= tf.int32, name= 'input_ids')
    attention_mask = Input(shape=(max_len,),name='attention_mask',dtype=tf.int32)
    embeddings= bert_model(input_ids, attention_mask)['pooler_output']
    outputs= Dense(num_classes, activation= 'sigmoid')(embeddings)

    model= Model(inputs= [input_ids, attention_mask], outputs= outputs)

    return model


In [24]:
model= create_model()

In [25]:
optimizer= keras.optimizers.Adam(learning_rate=2e-5)
loss= keras.losses.BinaryFocalCrossentropy()

model.compile(optimizer= optimizer, loss= loss, metrics= ['accuracy'])

In [26]:
with tf.device("CPU"):
    data_dict = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,  # This is optional
        
    }, labels

    dataset = tf.data.Dataset.from_tensor_slices(data_dict)

    dataset = dataset.shuffle(buffer_size=len(labels)).batch(16)

In [27]:
!nvidia-smi

Sat Mar 23 18:45:55 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.40       Driver Version: 516.40       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   57C    P8     3W /  N/A |   2484MiB /  4096MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
with tf.device("GPU"):
    model.fit(dataset, epochs= 3)

Epoch 1/3
f:\anaconda\envs\py310\lib\site-packages\keras\engine\functional.py:637: UserWarning: Input dict contained keys ['token_type_ids'] which did not match any model input. They will be ignored by the model.

  inputs = self._flatten_to_reference_inputs(inputs)

4668/4668 [==============================] - 1307s 276ms/step - loss: 0.0873 - accuracy: 0.6892

Epoch 2/3

1133/4668 [======>.......................] - ETA: 16:10 - loss: 0.0396 - accuracy: 0.8794

In [40]:
test_data['review']= test_data['review'].astype('str')
test_data['sentiment']= test_data['sentiment'].astype('str')
test_data['review']= test_data['review'].apply(clean_text)
test_encoded= tokenizer(list(test_data['review'].values), padding= True, truncation= True, return_tensors='tf', max_length= max_len, return_token_type_ids= True)


input_ids= encoded['input_ids']
attention_mask= encoded['attention_mask']
token_type_ids= encoded['token_type_ids']

test_data['sentiment']
le.transform(['Positive'])

test_data['sentiment']= test_data['sentiment'].apply(lambda x: le.transform([x])[0])

test_labels= to_categorical(test_data['sentiment'])

In [58]:
with tf.device("CPU"):
    test_data_dict = {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'token_type_ids': token_type_ids,  # This is optional
        
    }, test_labels

    test_dataset = tf.data.Dataset.from_tensor_slices(data_dict)

    test_dataset = test_dataset.shuffle(buffer_size=len(labels)).batch(16)

### Evaluation

In [59]:
model.evaluate(test_dataset)

  inputs = self._flatten_to_reference_inputs(inputs)




[0.025201456621289253, 0.922966718673706]

### Prediction

In [66]:
def predict(model, tokenizer, input_text, max_len):
    # Tokenize input text
    input_ids = tokenizer.encode(input_text, max_length=max_len, padding='max_length', truncation=True, return_tensors='tf')
    attention_mask = tf.ones_like(input_ids)

    # Perform inference
    predictions = model.predict([input_ids, attention_mask])
    labeled_data= {0:'Irrelevant', 1: 'Negative', 2: 'Neutral', 3: 'Positive'}

    predicted_label= np.argmax(predictions)
    
    return labeled_data[predicted_label]

In [71]:
predict(model, tokenizer, 'I hate this algorithm', 40)



'Negative'

In [72]:
predict(model, tokenizer, 'Fortnite is running like ass.. fps drops everywhere wtf?', 40)



'Negative'

In [74]:
predict(model, tokenizer, 'this is the absolute FUNNIEST interaction I have ever seen on League of Legends', 40)



'Positive'

In [75]:
!pip freeze > resources/requirements.txt