# Sentiment Analysis Model

In [1]:
import os
import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import spacy
from spacy_transformers import TransformersLanguage, TransformersWordPiecer, TransformersTok2Vec
from transformers import BertTokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, BatchNormalization, Dropout, Flatten, Embedding, Dense
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import model_from_json
import tensorflow.keras.utils

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights





### Reading the data 

In [2]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)

In [3]:
X = df['text']
y = df['sentiment']

In [4]:
y.replace({'neutral':0, 'negative':-1, 'positive':1}, inplace=True)
y = tensorflow.keras.utils.to_categorical(y, num_classes=3, dtype='float32')

In [5]:
df

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",0
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,-1
2,088c60f138,my boss is bullying me...,bullying me,-1
3,9642c003ef,what interview! leave me alone,leave me alone,-1
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",-1
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,-1
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",-1
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,1
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,1


### Pre-proccessing with SpaCy-transformer pipeline

In [6]:
name = "bert-base-uncased"
nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))

In [7]:
def preprocess(text_data):
    clean_X = []
    for text in tqdm.tqdm(text_data):
        doc = nlp(text)
        word_id = doc._.trf_word_pieces
        clean_X.append(word_id)
    max_lenght = len(max(clean_X, key=len))    
    word_vec_X = sequence.pad_sequences(clean_X, maxlen = max_lenght, padding='pre')
    pd.DataFrame(word_vec_X).to_csv("word_vec_X.csv", index=None)
    return word_vec_X
    
    
        

In [None]:
word_vec_X = preprocess(text_data)

In [8]:
word_vec_X = pd.read_csv('word_id.csv')
word_vec_X = word_vec_X.to_numpy()

In [9]:
Xtrain, Xtest, ytrain, ytest = train_test_split(word_vec_X, y, random_state=42)

### BERT Model

In [23]:
!wget https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
    

--2020-05-25 11:28:28--  https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.5.112, 2607:f8b0:4005:80b::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.5.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 408102251 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-05-25 11:28:36 (79.7 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [408102251/408102251]



In [28]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
caution: filename not matched:  y


In [48]:
model_dir = "./uncased_L-12_H-768_A-12/"
bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

In [49]:
max_seq_len = 112
l_input_ids = tensorflow.keras.layers.Input(shape=(max_seq_len,), dtype='int32')

output = l_bert(l_input_ids)

cls_out = tensorflow.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
cls_out = Dropout(0.5)(cls_out)

logits = Dense(768, activation="tanh")(cls_out)
logits = Dropout(0.5)(logits)

logits = Dense(units=3,activation="softmax")(logits)

model = tensorflow.keras.Model(inputs=l_input_ids, outputs=logits)
model.build(input_shape=(None, max_seq_len))

In [50]:
bert.load_stock_weights(l_bert, './uncased_L-12_H-768_A-12/bert_model.ckpt')

Done loading 196 BERT weights from: ./uncased_L-12_H-768_A-12/bert_model.ckpt into <bert.model.BertModelLayer object at 0x7fa8b7219940> (prefix:bert_5). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


[]

In [51]:
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 112)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 112, 768)          108890112 
_________________________________________________________________
lambda_4 (Lambda)            (None, 768)               0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 768)               590592    
_________________________________________________________________
dropout_9 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 3)                 2307

In [52]:
model.compile(optimizer=tensorflow.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics= ['accuracy'])

In [53]:
model.fit(Xtrain, ytrain, epochs=2, batch_size=8, validation_split=0.2)

Train on 16488 samples, validate on 4122 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7faccd4cba20>

### Saving the model

In [54]:
# serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model.to_json())

# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [56]:
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json, custom_objects={"BertModelLayer": bert.BertModelLayer})
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


### Model evaluation

In [12]:
y_pred = loaded_model.predict(Xtest, batch_size=64, verbose=0)
y_pred_bool = np.argmax(y_pred, axis=1)
ytest_bool = np.argmax(ytest, axis=1)
print(classification_report(ytest_bool, y_pred_bool))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82      2800
           1       0.89      0.86      0.87      2146
           2       0.86      0.83      0.85      1924

    accuracy                           0.85      6870
   macro avg       0.85      0.84      0.85      6870
weighted avg       0.85      0.85      0.85      6870



### Predicting sentiment of a new text

In [85]:
def sentiment_prediction(text):
    doc = nlp(text)
    word_id = doc._.trf_word_pieces
    word_id = sequence.pad_sequences([word_id], maxlen = 112, padding='pre')
    y_pred = loaded_model.predict(word_id, verbose=0)
    y_pred_bool = np.argmax(y_pred, axis=1)[0]
    
    if y_pred_bool == 0:
        prediction = "neutral"
    if y_pred_bool == 1:
        prediction = "positive"
    else:
        prediction = "negative"
        
    return prediction

In [86]:
text = 'I love spiced academy'
sentiment_prediction(text)


'positive'