# Model Adam (with word2vec transfer learning)

## Data loading (from data)

In [13]:
import pandas as pd

# Read the test and train splits from CSV and discard every row that
# contains at least one missing value.
Test_df = pd.read_csv(
    '../Demo_Project_NLP_sentiment_analysis_benbhk/data/X_test.csv'
).dropna()
Train_df = pd.read_csv(
    '../Demo_Project_NLP_sentiment_analysis_benbhk/data/X_train.csv'
).dropna()

In [14]:
# Remove the 'Unnamed: 0' column from both splits.
Test_df = Test_df.drop(columns=['Unnamed: 0'])
Train_df = Train_df.drop(columns=['Unnamed: 0'])

## Data Processing

In [15]:
# Ordinal encoding of the five sentiment labels (0 = most negative, 4 = most positive).
SENTIMENT_TO_NUM = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4,
}


def encode_sentiment(labels, unknown=-1):
    """Convert a Series of sentiment label strings into integer class ids.

    Parameters
    ----------
    labels : pandas.Series
        Sentiment labels; the strings that are keys of ``SENTIMENT_TO_NUM``
        are translated to their integer code.
    unknown : int, default -1
        Value written for any label that is not a key of ``SENTIMENT_TO_NUM``.

    Returns
    -------
    pandas.Series
        Integer-typed Series aligned with ``labels``.
    """
    # `map` yields NaN for labels missing from the dict; replace them with the
    # sentinel so the column can be cast to int.
    return labels.map(SENTIMENT_TO_NUM).fillna(unknown).astype(int)


# Apply the same encoding (and the same -1 sentinel) to both splits, so the
# train column is built exactly like the test column.
# NOTE(review): -1 is not one of the five classes; rows carrying it should be
# inspected before training/evaluation.
Test_df['Sentiment_num'] = encode_sentiment(Test_df['Sentiment'])
Train_df['Sentiment_num'] = encode_sentiment(Train_df['Sentiment'])

# Test_df[Test_df['Sentiment']=='Extremely Negative']['Sentiment_num'] = 0

In [16]:
# Pull the raw tweets (features) and the integer sentiment codes (targets)
# out of each split as NumPy arrays.
X_test, X_train = (
    frame['OriginalTweet'].to_numpy() for frame in (Test_df, Train_df)
)
y_test, y_train = (
    frame['Sentiment_num'].to_numpy(dtype=int) for frame in (Test_df, Train_df)
)

In [17]:
# Overwrite every tweet, in place, with its bytes encoding
# (train split first, then test split).
for tweets in (X_train, X_test):
    for position, tweet in enumerate(tweets):
        tweets[position] = str.encode(tweet)

In [18]:
from tensorflow.keras.preprocessing.text import text_to_word_sequence
# Decode each bytes tweet back to text and split it into a list of word tokens.
X_train = [
    text_to_word_sequence(raw_tweet.decode("utf-8"))
    for raw_tweet in X_train
]
X_test = [
    text_to_word_sequence(raw_tweet.decode("utf-8"))
    for raw_tweet in X_test
]

## Word2vec loading

In [19]:
import gensim.downloader as api
# Show the names of the pre-trained models listed by the gensim downloader.
print(list(api.info()['models']))

['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis']


In [20]:
# word2vec_transfer = api.load('glove-wiki-gigaword-200')

In [22]:
# word2vec_transfer.save_word2vec_format('vectors.txt', binary=False)

In [23]:
from gensim.models import KeyedVectors

# Load the locally stored embedding vectors (text, word2vec format).
vectors_reloaded = KeyedVectors.load_word2vec_format(
    '../Demo_Project_NLP_sentiment_analysis_benbhk/models/glove-wiki-gigaword-200.txt',
    binary=False,
)

## Word2vec processing

In [28]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

def embed_sentence_with_TF(word2vec, sentence):
    """Turn a sentence (list of words) into a matrix of word vectors.

    Parameters
    ----------
    word2vec : mapping-like
        Object supporting ``word in word2vec`` and ``word2vec[word]``.
    sentence : iterable of str
        The words of one sentence.

    Returns
    -------
    numpy.ndarray
        One row per word found in ``word2vec``, in sentence order. Words that
        are not in ``word2vec`` are skipped; if none is found the result is
        an empty array of shape ``(0,)``.
    """
    known_vectors = [word2vec[token] for token in sentence if token in word2vec]
    return np.array(known_vectors)

def embedding(word2vec, sentences):
    """Embed a collection of sentences.

    Parameters
    ----------
    word2vec : mapping-like
        Passed unchanged to ``embed_sentence_with_TF``.
    sentences : iterable
        Sentences, each one a list of words.

    Returns
    -------
    list
        One embedded matrix per sentence, in input order.
    """
    return [embed_sentence_with_TF(word2vec, words) for words in sentences]

# Embed the training and test sentences
# X_train_embed_2 = embedding(word2vec_transfer, X_train)
# X_test_embed_2 = embedding(word2vec_transfer, X_test)

# Embed the tokenised train and test sentences with the reloaded vectors.
X_train_embed_2, X_test_embed_2 = (
    embedding(vectors_reloaded, tokenised_split)
    for tokenised_split in (X_train, X_test)
)

## Padding (post)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Pad (at the end) or cut every embedded sentence to 100 time steps, as float32.
X_train_pad_2, X_test_pad_2 = (
    pad_sequences(embedded_split, dtype='float32', padding='post', maxlen=100)
    for embedded_split in (X_train_embed_2, X_test_embed_2)
)

## Model creation 

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import Normalization
# from tensorflow.keras.layers import Normalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, SimpleRNN, Flatten
from tensorflow.keras import layers

# Recurrent classifier: masked (100, 200) input -> LSTM -> stack of dense
# layers -> 5-way softmax (one output per sentiment class).
model = Sequential([
    # Time steps whose features are all 0 (the padding value) are masked.
    layers.Masking(mask_value=0, input_shape=(100,200)),
    layers.LSTM(units=32, activation="tanh"),
    layers.Dense(32, activation="tanh"),
    layers.Dense(16, activation="relu"),
    layers.Dense(8, activation="relu"),
    layers.Dense(2, activation="relu"),
    layers.Dense(5, activation="relu"),
    layers.Dense(5, activation="softmax"),
])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam


# Early stopping with a patience of 10 epochs; the best weights seen are restored.
es = EarlyStopping(patience=10,restore_best_weights=True)

# Integer targets -> sparse categorical cross-entropy; Adam optimizer
# (an earlier alternative was optimizer='rmsprop').
model.compile(
    optimizer=Adam(learning_rate=0.005),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the padded embeddings, holding out 20% of the data for validation.
history = model.fit(
    X_train_pad_2,
    y_train,
    batch_size=32,
    epochs=500,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

In [None]:
# Score the model on the padded test set; `res[1]` is the value reported
# below as the accuracy.
res = model.evaluate(
    X_test_pad_2,
    y_test,
    verbose=1,
)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

### save the model

In [None]:
import pickle as pkl

# Serialise the trained model with pickle. The file is opened in a `with`
# block so the handle is flushed and closed even if `dump` raises.
# NOTE(review): Keras provides its own `model.save(...)` format; consider it
# if the pickle file must be read across library versions.
with open('../Demo_Project_NLP_sentiment_analysis_benbhk/models/model_Adam_5cat', 'wb') as model_file:
    pkl.dump(model, model_file)

### load the model

In [25]:
import pickle as pkl

# Restore the pickled model. The file is opened in a `with` block so the
# handle is always closed.
# SECURITY: unpickling can execute arbitrary code, so only load a file that
# was produced by this project / a trusted source.
with open('../Demo_Project_NLP_sentiment_analysis_benbhk/models/model_Adam_5cat', 'rb') as model_file:
    model = pkl.load(model_file)

2022-04-25 18:26:18.800779: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-04-25 18:26:19.579927: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.




2022-04-25 18:26:23.694307: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-04-25 18:26:23.774880: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-04-25 18:26:24.172546: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond/while' has 13 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.
2022-04-25 18:26:24.275384: W tensorflow/core/common_runtime/graph_constructor.cc:803] Node 'cond' has 5 outputs but the _output_shapes attribute specifies shapes for 46 outputs. Output shapes may be inaccurate.


In [26]:
# Evaluate the reloaded model on the test set.
# Depends on `X_test_pad_2` and `y_test` from the processing and padding
# cells: those must have been run in the current kernel first, otherwise
# this cell fails with a NameError.
res = model.evaluate(
    X_test_pad_2,
    y_test,
    verbose=1,
)

print(f'The accuracy evaluated on the test set is of {res[1]*100:.3f}%')

NameError: name 'X_test_pad_2' is not defined