In [1]:
import os
import tqdm
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

import spacy
from spacy_transformers import TransformersLanguage, TransformersWordPiecer, TransformersTok2Vec
from transformers import BertTokenizer

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, BatchNormalization, Dropout, Flatten, Embedding, Dense
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import model_from_json
import tensorflow.keras.utils

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import bert
from bert import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights


### Reading the data 

In [2]:
df = pd.read_csv('train.csv')
df.dropna(inplace=True)

In [3]:
X = df['text']
y = df['sentiment']

In [4]:
y.replace({'neutral':0, 'negative':-1, 'positive':1}, inplace=True)
y = tensorflow.keras.utils.to_categorical(y, num_classes=3, dtype='float32')

### Importing SpaCy-transformer pipline

In [9]:
name = "bert-base-uncased"
nlp = TransformersLanguage(trf_name=name, meta={"lang": "en"})
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp.add_pipe(TransformersWordPiecer.from_pretrained(nlp.vocab, name))
nlp.add_pipe(TransformersTok2Vec.from_pretrained(nlp.vocab, name))

['sentencizer', 'trf_wordpiecer', 'trf_tok2vec']


In [10]:
def preprocess(text_data):
    clean_X = []
    for text in tqdm.tqdm(text_data):
        doc = nlp(text)
        word_id = doc._.trf_word_pieces
        clean_X.append(word_id)
    max_lenght = len(max(clean_X, key=len))    
    word_vec_X = sequence.pad_sequences(clean_X, maxlen = max_lenght, padding='pre')
    pd.DataFrame(word_vec_X).to_csv("word_vec_X.csv", index=None)
    return word_vec_X
    
    
        

In [23]:
word_vec_X = pd.read_csv('word_id.csv')
word_vec_X = word_vec_X.to_numpy()

In [24]:
Xtrain, Xtest, ytrain, ytest = train_test_split(word_vec_X, y)

### BERT Model

In [7]:
!wget https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
    

--2020-05-01 08:18:09--  https://storage.googleapis.com/bert_models/2020_02_20/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 216.58.194.208, 2607:f8b0:4005:802::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|216.58.194.208|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 408102251 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2020-05-01 08:18:30 (20.5 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [408102251/408102251]



In [8]:
!unzip uncased_L-12_H-768_A-12.zip

Archive:  uncased_L-12_H-768_A-12.zip
  inflating: bert_model.ckpt.data-00000-of-00001  
  inflating: bert_config.json        
  inflating: vocab.txt               
  inflating: bert_model.ckpt.index   


In [39]:
model_dir = "./uncased_L-12_H-768_A-12"
bert_params = bert.params_from_pretrained_ckpt(model_dir)
l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

In [29]:
max_seq_len = 112
l_input_ids = tensorflow.keras.layers.Input(shape=(max_seq_len,), dtype='int32')

output = l_bert(l_input_ids)

cls_out = tensorflow.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
cls_out = Dropout(0.5)(cls_out)

logits = Dense(768, activation="tanh")(cls_out)
logits = Dropout(0.5)(logits)

logits = Dense(units=3,activation="softmax")(logits)

model = tensorflow.keras.Model(inputs=l_input_ids, outputs=logits)
model.build(input_shape=(None, max_seq_len))

In [11]:
bert_ckpt_file   = os.path.join(model_dir, "bert_model.ckpt")
bert.load_stock_weights(l_bert, bert_ckpt_file)

Done loading 196 BERT weights from: ./bert_model.ckpt into <bert.model.BertModelLayer object at 0x7f232856c518> (prefix:bert). Count of weights not found in the checkpoint was: [0]. Count of weights with mismatched shape: [0]
Unused weights from checkpoint: 
	bert/embeddings/token_type_embeddings
	bert/pooler/dense/bias
	bert/pooler/dense/kernel
	cls/predictions/output_bias
	cls/predictions/transform/LayerNorm/beta
	cls/predictions/transform/LayerNorm/gamma
	cls/predictions/transform/dense/bias
	cls/predictions/transform/dense/kernel
	cls/seq_relationship/output_bias
	cls/seq_relationship/output_weights


[]

In [34]:
model.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 112)]             0         
_________________________________________________________________
bert (BertModelLayer)        (None, 112, 768)          108890112 
_________________________________________________________________
lambda_2 (Lambda)            (None, 768)               0         
_________________________________________________________________
dropout_4 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 768)               590592    
_________________________________________________________________
dropout_5 (Dropout)          (None, 768)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 3)                 2307

In [35]:
model.compile(optimizer=tensorflow.keras.optimizers.Adam(1e-5), loss='binary_crossentropy', metrics= ['accuracy'])

In [36]:
model.fit(Xtrain, ytrain, epochs=2, batch_size=8, validation_split=0.2)

Train on 16488 samples, validate on 4122 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f1d1e11eba8>

### Model evaluation

In [37]:
y_pred = model.predict(Xtrain, batch_size=64, verbose=0)
y_pred_bool = np.argmax(y_pred, axis=1)
ytrain_bool = np.argmax(ytrain, axis=1)
print(classification_report(ytrain_bool, y_pred_bool))

              precision    recall  f1-score   support

           0       0.78      0.58      0.67      8374
           1       0.77      0.80      0.79      6462
           2       0.63      0.82      0.71      5774

    accuracy                           0.72     20610
   macro avg       0.72      0.74      0.72     20610
weighted avg       0.73      0.72      0.72     20610



In [38]:
y_pred = model.predict(Xtest, batch_size=64, verbose=0)
y_pred_bool = np.argmax(y_pred, axis=1)
ytest_bool = np.argmax(ytest, axis=1)
print(classification_report(ytest_bool, y_pred_bool))

NameError: name 'loaded_model' is not defined

### Saving the model

In [54]:
# serialize model to JSON
with open("model.json", "w") as json_file:
    json_file.write(model.to_json())

# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [70]:
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json, custom_objects={"BertModelLayer": bert.BertModelLayer})
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk
