# Transformer model (with word2vec transfer learning)

## Data loading (from data)

In [1]:
import pandas as pd

# Read both splits and discard every row that contains a missing value.
# Chaining `.dropna()` onto the read yields the same frames as dropping in place.
Test_df = pd.read_csv(
    '../Demo_Project_NLP_sentiment_analysis_benbhk/data/X_test.csv'
).dropna()
Train_df = pd.read_csv(
    '../Demo_Project_NLP_sentiment_analysis_benbhk/data/X_train.csv'
).dropna()

In [2]:
# Remove the index column that was written into the CSV files.
# `errors='ignore'` plus rebinding (instead of `inplace=True`) keeps this cell
# idempotent: re-running it after the column is already gone no longer raises KeyError.
Test_df = Test_df.drop(columns=['Unnamed: 0'], errors='ignore')
Train_df = Train_df.drop(columns=['Unnamed: 0'], errors='ignore')

## Data Processing

In [3]:
# Integer class id for each sentiment label, ordered from most negative to most positive.
SENTIMENT_TO_NUM = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}


def _encode_sentiment(labels):
    """Map a Series of sentiment labels to integer class ids.

    Labels that are not keys of SENTIMENT_TO_NUM are encoded as -1, so both
    splits share the same sentinel and the resulting column is always integer.
    """
    return labels.map(SENTIMENT_TO_NUM).fillna(-1).astype(int)


# Previously only Test_df was initialised to -1; Train_df got NaN (and a float
# column) for any unmapped label. Both frames now go through the same encoder.
Test_df['Sentiment_num'] = _encode_sentiment(Test_df['Sentiment'])
Train_df['Sentiment_num'] = _encode_sentiment(Train_df['Sentiment'])

# Test_df[Test_df['Sentiment']=='Extremely Negative']['Sentiment_num'] = 0

In [4]:
# Features are the raw tweet texts; targets are the integer class ids.
X_train, y_train = (
    Train_df['OriginalTweet'].to_numpy(),
    Train_df['Sentiment_num'].to_numpy(dtype=int),
)
X_test, y_test = (
    Test_df['OriginalTweet'].to_numpy(),
    Test_df['Sentiment_num'].to_numpy(dtype=int),
)

In [5]:
# Encode every tweet to UTF-8 bytes (the default of `str.encode`).
# New lists are built instead of assigning into the arrays item by item:
#  - the arrays come from `Series.to_numpy()`, which may share memory with the
#    DataFrame column, so in-place writes could silently alter Train_df / Test_df;
#  - items that are already bytes are passed through, so re-running this cell
#    no longer raises TypeError.
X_train = [
    tweet if isinstance(tweet, bytes) else str.encode(tweet) for tweet in X_train
]
X_test = [
    tweet if isinstance(tweet, bytes) else str.encode(tweet) for tweet in X_test
]

In [6]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# Decode each UTF-8 encoded tweet and split it into a list of word tokens.
X_train = [
    text_to_word_sequence(raw_tweet.decode("utf-8"))
    for raw_tweet in X_train
]
X_test = [
    text_to_word_sequence(raw_tweet.decode("utf-8"))
    for raw_tweet in X_test
]

2022-04-26 13:57:40.965594: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-04-26 13:57:40.965783: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Word2vec loading

In [7]:
import gensim.downloader as api
# Print the names of all pretrained models listed by gensim's downloader
# (iterating the 'models' mapping yields its keys).
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [8]:
%%time
# Load the pretrained 'glove-wiki-gigaword-100' vectors through gensim's downloader API.
# NOTE(review): despite the `word2vec_` prefix these are GloVe vectors; presumably
# 100-dimensional, which is what `embed_dim = 100` in the model cell assumes — confirm.
word2vec_transfer = api.load('glove-wiki-gigaword-100')

CPU times: user 1min 22s, sys: 357 ms, total: 1min 22s
Wall time: 1min 22s


In [9]:
# word2vec_transfer

In [10]:
# word2vec_transfer.save_word2vec_format('../Demo_Project_NLP_sentiment_analysis_benbhk/models/glove-wiki-gigaword-50.txt', binary=False)

In [11]:
# %%time
# from gensim.models import KeyedVectors

# vectors_reloaded = KeyedVectors.load_word2vec_format('../Demo_Project_NLP_sentiment_analysis_benbhk/models/glove-wiki-gigaword-50.txt', binary=False)

## Word2vec processing

In [12]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def embed_sentence_with_TF(word2vec, sentence):
    """Convert a sentence (list of words) into a matrix of word vectors.

    Args:
        word2vec: Lookup supporting `word in word2vec` and `word2vec[word]`
            (e.g. the vectors loaded with gensim).
        sentence: Iterable of word tokens.

    Returns:
        A 2-D array with one row per word of `sentence` found in `word2vec`,
        in sentence order. Words missing from the lookup are skipped.
    """
    vectors = [word2vec[word] for word in sentence if word in word2vec]

    if not vectors:
        # `np.array([])` would be 1-D with shape (0,). Return an empty 2-D
        # matrix instead so every sentence has the same rank; the width comes
        # from `vector_size` when the lookup exposes it.
        dim = getattr(word2vec, "vector_size", 0)
        return np.empty((0, dim), dtype=np.float32)

    return np.array(vectors)

def embedding(word2vec, sentences):
    """Convert a list of sentences into a list of word-vector matrices.

    Each sentence is embedded independently with `embed_sentence_with_TF`;
    the returned list keeps the order of `sentences`.
    """
    return [embed_sentence_with_TF(word2vec, tokens) for tokens in sentences]

# Embed the tokenised training sentences first, then the test sentences,
# using the pretrained vectors.
X_train_embed_2, X_test_embed_2 = (
    embedding(word2vec_transfer, tokenised_tweets)
    for tokenised_tweets in (X_train, X_test)
)

# X_train_embed_2 = embedding(vectors_reloaded, X_train)
# X_test_embed_2 = embedding(vectors_reloaded, X_test)

## Padding (post)

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Pad (at the end) or cut every embedded tweet to exactly 100 time steps so the
# splits become dense float32 tensors. 100 matches `maxlen` in the model cell.
# NOTE(review): `truncating` is left at its default — confirm which end of an
# over-long tweet should be dropped.
X_train_pad_2, X_test_pad_2 = (
    pad_sequences(embedded_tweets, dtype='float32', padding='post', maxlen=100)
    for embedded_tweets in (X_train_embed_2, X_test_embed_2)
)

## Model creation 

In [14]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, Model
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [15]:
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention + feed-forward, each followed by
    dropout, a residual connection and layer normalisation.

    Args:
        embed_dim: Size of the last axis of the input; also used as `key_dim`
            of the attention layer and as the output width of the feed-forward net.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward net.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard Keras `Layer` arguments (e.g. `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so `get_config` can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [Dense(ff_dim, activation="relu"),
             Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        `training` now defaults to None so the layer can also be called
        directly without the flag; it is forwarded to both dropout layers.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # residual + norm
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # residual + norm

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [16]:
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position embedding table holds.
        vocab_size: Number of token ids the token embedding table holds.
        embed_dim: Output width of both embeddings.
        **kwargs: Standard Keras `Layer` arguments (e.g. `name`).
    """

    def __init__(self, maxlen, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so `get_config` can report them.
        self.maxlen = maxlen
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in `x` and add the embedding of each position."""
        # Length of the last axis of the incoming batch, read at run time.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [17]:
# Hyper-parameters of the classifier. The network consumes pre-embedded tweets
# directly, so no trainable embedding layer is used (see the commented-out lines).
embed_dim = 100  # Embedding size for each token
num_heads = 3  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

maxlen = 100  # Time steps per tweet; same value as the `maxlen=100` used when padding
# vocab_size = 200

# One sample is a (maxlen, embed_dim) matrix of word vectors.
inputs = Input(shape=(maxlen,embed_dim))
# embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
# x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(inputs)
# Average over the time axis, then a small dense head with dropout.
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
# Five outputs: one per sentiment class id (0-4) assigned in the data-processing cell.
outputs = Dense(5, activation="softmax")(x)

model = Model(inputs=inputs, outputs=outputs)

2022-04-26 13:59:40.225491: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-04-26 13:59:40.226019: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-04-26 13:59:40.227457: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (BenLaptop-V8N6C5JR): /proc/driver/nvidia/version does not exist
2022-04-26 13:59:40.231350: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Integer class ids as targets -> sparse categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

# Stop once the validation loss has not improved for 10 epochs and roll the
# model back to the best weights seen ("val_loss" is the callback's default monitor).
es = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# 20% of the training data is held out for validation; the test split is not
# used during training.
history = model.fit(
    X_train_pad_2,
    y_train,
    batch_size=32,
    epochs=500,
    validation_split=0.2,
    callbacks=[es],
)

2022-04-26 14:01:57.728668: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 1316960000 exceeds 10% of free system memory.


Epoch 1/500

In [20]:
# Write the model's weights to an HDF5 file.
# NOTE(review): the file name contains "W2V_50", but this notebook loads
# 'glove-wiki-gigaword-100' and builds the model with embed_dim = 100 — confirm
# the name is intended before relying on it to identify the checkpoint.
model.save_weights("../Demo_Project_NLP_sentiment_analysis_benbhk/sformer_weights_W2V_50_3attention_head_accuracy_.h5")

In [21]:
# Score the trained model on the held-out test split.
results = model.evaluate(X_test_pad_2, y_test, verbose=1)

# Print each reported metric next to its name, rounded to three decimals.
for metric_name, metric_value in zip(model.metrics_names, results):
    print(f"{metric_name}: {metric_value:.3f}")

loss: 1.030
accuracy: 0.573
