In [44]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 50.2 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 45.7 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [45]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [30]:
twt_emotions =  pd.read_csv("tweet_emotions.csv")

In [31]:
twt_emotions.head(5)

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


### Number of words per tweet

In [90]:
twt_emotions['length'] = twt_emotions['content'].apply(
    lambda row: min(len(row.split(" ")), len(row)) if isinstance(row, str) else None
)

twt_emotions['length']

0        19
1        11
2         3
3         7
4        15
         ..
39995     1
39996     7
39997    25
39998    20
39999    25
Name: length, Length: 40000, dtype: int64

### Average number of words in all tweets

In [91]:
twt_emotions['length'].mean()

13.648925

### Handling the `sentiment` column

In [32]:
twt_emotions['sentiment'].value_counts()

neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: sentiment, dtype: int64

In [33]:
twt_emotions.sentiment.unique()

array(['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger'],
      dtype=object)

In [34]:
cat_codes = {"sentiment": {"empty": 0, "sadness": 1, "enthusiasm":2, "neutral":3
                           ,"worry":4, "surprise":5, "love":6, "fun":7,
                           "hate":8, "happiness":9, "boredom":10, "relief":11,
                           "anger":12}}

In [36]:
twt_emotions = twt_emotions.replace(cat_codes)

In [42]:
twt_emotions.drop("tweet_id", axis=1, inplace=True)

In [43]:
twt_emotions.head(5)

Unnamed: 0,sentiment,content
0,0,@tiffanylue i know i was listenin to bad habi...
1,1,Layin n bed with a headache ughhhh...waitin o...
2,1,Funeral ceremony...gloomy friday...
3,2,wants to hang out with friends SOON!
4,3,@dannycastillo We want to trade with someone w...


In [46]:
twt_emotions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  40000 non-null  int64 
 1   content    40000 non-null  object
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [47]:
twt_emotions['sentiment'] = twt_emotions['sentiment'].astype(int)

In [48]:
twt_emotions['sentiment'].value_counts()

3     8638
4     8459
9     5209
1     5165
6     3842
5     2187
7     1776
11    1526
8     1323
0      827
2      759
10     179
12     110
Name: sentiment, dtype: int64

### Tokenization

In [49]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [50]:
token = tokenizer.encode_plus(
    twt_emotions['content'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [51]:
token.input_ids

<tf.Tensor: shape=(1, 256), dtype=int32, numpy=
array([[  101,   137,   189, 11093, 18266, 19224,   178,  1221,   178,
         1108,  5113,  1394,  1106,  2213, 10671,  2206,  1105,   178,
         1408, 14406,  1394,  1120,  1117,  1226,   134,   164,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0, 

### Data Generators

In [99]:
X_input_ids = np.zeros((len(twt_emotions), 256))
X_attn_masks = np.zeros((len(twt_emotions), 256))

In [100]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate(twt_emotions['content'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [101]:
X_input_ids, X_attn_masks = generate_training_data(twt_emotions, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [102]:
labels = np.zeros((len(twt_emotions), 13))
labels.shape

(40000, 13)

In [103]:
labels[np.arange(len(twt_emotions)), twt_emotions['sentiment'].values] = 1 # one-hot encoded target tensor

In [104]:
labels

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [105]:
# Creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(13,), dtype=tf.float64, name=None))>

In [106]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [107]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [108]:
dataset.take(1)

<TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(13,), dtype=tf.float64, name=None))>

In [109]:
len(dataset)

40000

In [114]:
dataset = dataset.shuffle(40000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [112]:
len(dataset)

2500

**Every unit in the `dataset` is a batch consisting of 16 tweets, as we had 40000 total tweets, the length of our dataset will be 40000/16=2500 now.**

In [68]:
train_dataset = dataset.take(2000)
val_dataset = dataset.skip(2000)

### BERT Model

In [69]:
from transformers import TFBertModel

In [70]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [75]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
output_layer = tf.keras.layers.Dense(13, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 bert (TFBertMainLayer)         TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                         

In [76]:
optim = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [77]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

In [78]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=1
)



In [79]:
sentiment_model.save('sentiment_model')



### Predictions

In [80]:
sentiment_model = tf.keras.models.load_model('sentiment_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['empty', 'sadness', 'enthusiasm', 'neutral', 'worry', 'surprise',
       'love', 'fun', 'hate', 'happiness', 'boredom', 'relief', 'anger']):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [84]:
input_text = input('Enter tweet here: ')
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data=processed_data)
print(f"Predicted Sentiment: {result}")

Enter tweet here: are fucking serious?
Predicted Sentiment: worry


### Functionize this :

In [85]:
def predict():
    input_text = input('Enter tweet here: ')
    processed_data = prepare_data(input_text, tokenizer)
    result = make_prediction(sentiment_model, processed_data=processed_data)
    print(f"Predicted Sentiment: {result}")

In [86]:
predict()

Enter tweet here: thank god she is ok
Predicted Sentiment: worry


In [87]:
predict()

Enter tweet here: i adore her
Predicted Sentiment: love


In [88]:
predict()

Enter tweet here: i don't like black people
Predicted Sentiment: worry


In [89]:
predict()

Enter tweet here: i hate men
Predicted Sentiment: hate


In [98]:
predict()

Enter tweet here: go fuck yourself
Predicted Sentiment: hate


<img style="width:80px; height:80px; float:left;" src="https://i.kym-cdn.com/photos/images/original/000/715/783/7bf.gif"><img style="width:80px; height:80px; float:right;" src="https://i.kym-cdn.com/photos/images/original/000/715/783/7bf.gif"> 