## Emotion Detection Using Distilbert

In [1]:
import warnings
warnings.filterwarnings('ignore')
%config Completer.use_jedi = False

In [2]:
import pandas as pd
import numpy as np
import regex as re
import string
import alphabet_detector
import tensorflow as tf
import sklearn
import nltk.corpus
import seaborn as sn
import matplotlib.pylab as plt
import cleantext

from cleantext import clean
from tensorflow import keras
from nltk.corpus import stopwords
from alphabet_detector import AlphabetDetector
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [3]:
tf.get_logger().setLevel('ERROR')

In [4]:
df_train = pd.read_csv('train.txt', names = ['Sentence', 'Sentiment'], delimiter = ';', encoding='utf-8')
df_val = pd.read_csv('val.txt', names = ['Sentence', 'Sentiment'], delimiter = ';', encoding='utf-8')
df_test = pd.read_csv('test.txt', names = ['Sentence', 'Sentiment'], delimiter = ';', encoding='utf-8')

In [5]:
## 20K Data ==> 60% Train, 20% Val, 20% Test
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

(16000, 2)
(2000, 2)
(2000, 2)


In [6]:
df = pd.concat([df_train, df_val, df_test])
df.shape

(20000, 2)

## Text Preprocessing

In [7]:
words = set(nltk.corpus.words.words())
stop = stopwords.words('english')
def remove_symbols(text):
    pattern = r'[' + string.punctuation + ']'
    return re.sub(pattern, '', text)

def lower_case(text):
    return text.lower()

def remove_extra_spaces(text):
    text = re.sub(' +', ' ', text)
    if text[-1] == ' ':
        text = text[:-1]
    if text[0] == ' ':
        text = text[1:]
    return text

def remove_numbers(text):
    return re.sub('[0-9]', '',text)

def remove_links(text):
    return re.sub(r'http\S+', '', text)

def remove_non_ASCII(text):
    return re.sub(r'[^\x00-\x7f]',r' ',text)

def clean_text(text):
    #text = re.sub('\\n', '', text)
    text = remove_numbers(text)
    text = remove_symbols(text)
    text = lower_case(text)
    text = remove_non_ASCII(text)
    text = remove_extra_spaces(text)
    text = remove_links(text)
    return text

In [8]:
df.Sentence=df.Sentence.apply(lambda x:clean_text(x))

In [9]:
df

Unnamed: 0,Sentence,Sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,i just keep feeling like someone is being unki...,anger
1996,im feeling a little cranky negative after this...,anger
1997,i feel that i am useful to my people and that ...,joy
1998,im feeling more comfortable with derby i feel ...,joy


In [10]:
df['Number_of_Words'] = df.Sentence.apply(lambda x:len(x.split()))

In [11]:
df.head()

Unnamed: 0,Sentence,Sentiment,Number_of_Words
0,i didnt feel humiliated,sadness,4
1,i can go from feeling so hopeless to so damned...,sadness,21
2,im grabbing a minute to post i feel greedy wrong,anger,10
3,i am ever feeling nostalgic about the fireplac...,love,18
4,i am feeling grouchy,anger,4


In [12]:
df.info()
# Sentinment column is of type object, we will change it to categorical

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Sentence         20000 non-null  object
 1   Sentiment        20000 non-null  object
 2   Number_of_Words  20000 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 625.0+ KB


In [13]:
df.Sentiment = df.Sentiment.astype('category')

In [14]:
df.info()
# Now the sentiment column has been changed to category

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Sentence         20000 non-null  object  
 1   Sentiment        20000 non-null  category
 2   Number_of_Words  20000 non-null  int64   
dtypes: category(1), int64(1), object(1)
memory usage: 488.5+ KB


In [15]:
df.Sentiment.unique()

['sadness', 'anger', 'love', 'surprise', 'fear', 'joy']
Categories (6, object): ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise']

In [16]:
# Easy way to make them numbers without using dictionary
df.Sentiment.cat.codes

0       4
1       4
2       0
3       3
4       0
       ..
1995    0
1996    0
1997    2
1998    2
1999    1
Length: 20000, dtype: int8

In [17]:
df.Sentiment = df.Sentiment.cat.codes

In [18]:
# Use this dictionary to convert them back into code at the end
dictionary_classes = {'anger':0, 'fear':1, 'joy':2, 'love':3, 'sadness':4, 'surprise':5}

In [19]:
# We will use this for seq_length later on
df.Number_of_Words.max()

66

In [20]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size = 0.3, random_state=42, stratify=df.Sentiment)

In [21]:
print(df_train.shape)
print(df_test.shape)

(14000, 3)
(6000, 3)


In [22]:
df

Unnamed: 0,Sentence,Sentiment,Number_of_Words
0,i didnt feel humiliated,4,4
1,i can go from feeling so hopeless to so damned...,4,21
2,im grabbing a minute to post i feel greedy wrong,0,10
3,i am ever feeling nostalgic about the fireplac...,3,18
4,i am feeling grouchy,0,4
...,...,...,...
1995,i just keep feeling like someone is being unki...,0,36
1996,im feeling a little cranky negative after this...,0,10
1997,i feel that i am useful to my people and that ...,2,18
1998,im feeling more comfortable with derby i feel ...,2,18


In [23]:
from tensorflow.keras.utils import to_categorical

## now we use this to one hot encode them
#to_categorical(df_train.Sentiment)

In [24]:
from transformers import AutoTokenizer, TFAutoModel
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')
bert = TFAutoModel.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english')

Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertModel: ['classifier', 'dropout_19', 'pre_classifier']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [25]:
## To save the tokenizer & bert use:
#tokenizer.save_pretrained('BERT_Tokenizer')
#bert.save_pretrained('BERT_Model')

In [26]:
X_train = tokenizer(text = df_train.Sentence.tolist(),
                    add_special_tokens=True,
                    max_length=70,
                    truncation=True,
                    padding=True,
                    return_tensors='tf',
                    return_token_type_ids=False, # Not very important
                    return_attention_mask=True,  # Usually good attention mask
                    verbose=True)

X_test = tokenizer(text = df_test.Sentence.tolist(),
                    add_special_tokens=True,
                    max_length=70,
                    truncation=True,
                    padding=True,
                    return_tensors='tf',
                    return_token_type_ids=False, # Not very important
                    return_attention_mask=True,  # Usually good attention mask
                    verbose=True)

In [27]:
X_train

{'input_ids': <tf.Tensor: shape=(14000, 70), dtype=int32, numpy=
array([[  101,  1045,  2514, ...,     0,     0,     0],
       [  101,  1045,  2514, ...,     0,     0,     0],
       [  101,  1045,  2514, ...,     0,     0,     0],
       ...,
       [  101,  1045,  3294, ...,     0,     0,     0],
       [  101, 10047,  2025, ...,     0,     0,     0],
       [  101,  1045,  2134, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(14000, 70), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [28]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

In [29]:
#tf.config.experimental.list_physical_devices('GPU')

In [30]:
max_len = 70
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense

input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
# embeddings = dbert_model(input_ids,attention_mask = input_mask)[0]


embeddings = bert(input_ids,attention_mask = input_mask)[0] #(0 is the last hidden states,1 means pooler_output)
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)

y = Dense(6,activation = 'sigmoid')(out)
    
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True
# for training bert our lr must be so small

In [31]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 70)]         0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 70)]         0                                            
__________________________________________________________________________________________________
tf_distil_bert_model (TFDistilB TFBaseModelOutput(la 66362880    input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
global_max_pooling1d (GlobalMax (None, 768)          0           tf_distil_bert_model[0][0]   

In [32]:
## tf.keras.utils.plot_model(model, show_shapes=True)

In [33]:
optimizer = tf.keras.optimizers.Adam(1e-5)
loss = tf.keras.losses.CategoricalCrossentropy()  # categorical = one-hot
# acc = tf.keras.metrics.CategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [34]:
train_history = model.fit(
    x ={'input_ids':X_train['input_ids'],'attention_mask':X_train['attention_mask']} ,
    y = to_categorical(df_train.Sentiment),
    validation_data = (
    {'input_ids':X_test['input_ids'],'attention_mask':X_test['attention_mask']}, to_categorical(df_test.Sentiment)
    ),
  epochs=3,
    batch_size=36
)

Epoch 1/3
Epoch 2/3
Epoch 3/3
