# NLP with BERT for Sentiment Analysis

### Importing the libraries

In [1]:
import os.path
import numpy as np
import tensorflow as tf
import ktrain
from ktrain import text

## Data Preprocessing

### Loading the IMDB dataset

In [2]:
"""
Download the IMDB movie-review dataset (positive and negative reviews)
from the Stanford AI Lab website.
"""

# Fetch the archive over HTTPS rather than plain HTTP so the download cannot be
# tampered with in transit; extract=True asks get_file to unpack it as well.
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin="https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
    extract=True,
)

# The reviews are expected in an 'aclImdb' folder that sits next to the
# path returned by get_file.
IMDB_DATADIR = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
# Show the folder that holds the downloaded file, then the dataset root folder,
# one path per line.
print(os.path.dirname(dataset), IMDB_DATADIR, sep="\n")

/home/kris/.keras/datasets
/home/kris/.keras/datasets/aclImdb


### Creating the training and test sets

In [4]:
"""
Read the reviews from disk and preprocess them, producing the training
split, the test split and the preprocessor object.

Arguments given to text.texts_from_folder:

datadir          : root folder of the dataset (IMDB_DATADIR)

classes          : the classes in the dataset; they are named after the
                   label folders on disk ('pos' and 'neg')

maxlen           : maximum number of tokens kept per document

train_test_names : list of strings naming the subfolders that hold the
                   training and test sets

preprocess_mode  : preprocessing method applied to the text ('bert' here)
"""

(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    datadir=IMDB_DATADIR,
    classes=['pos', 'neg'],
    maxlen=500,
    train_test_names=['train', 'test'],
    preprocess_mode='bert',
)

detected encoding: utf-8
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


## Building the BERT model

In [6]:
"""
Create the text classification model.

Arguments given to text.text_classifier:

name (string) : which model to build, one of:
                  - 'fasttext'   : FastText model
                  - 'nbsvm'      : NBSVM model
                  - 'logreg'     : logistic regression using embedding layers
                  - 'bigru'      : Bidirectional GRU with pretrained word vectors
                  - 'bert'       : BERT text classification
                  - 'distilbert' : Hugging Face DistilBert model
                'bert' is chosen because this notebook builds a BERT model.

train_data    : the (x_train, y_train) pair produced when the dataset was loaded

preproc       : the preprocessor object returned together with the data
"""

model = text.text_classifier(
    name='bert',
    train_data=(x_train, y_train),
    preproc=preproc,
)

Is Multi-Label? False
maxlen is 500
done.


## Training the BERT model

In [7]:
"""
Wrap the model and the data in a ktrain learner.

The training split is used as train_data and the test split is passed as
validation data.

Batch size is open to experimentation (6, 28, 32, 64, ...); 6 is used here.

NOTE(review): the original note says the train and test sets should be the
same size - nothing in this notebook shows that to be a requirement; confirm.
"""

BERT_Train = ktrain.get_learner(
    model=model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6,
)

In [None]:
"""
Train using the one-cycle policy with a maximum learning rate of 2e-5.

Only a single epoch is run because this is a demonstration, not a serious model.
"""

BERT_Train.fit_onecycle(lr=2e-5, epochs=1)



begin training using onecycle policy with max lr of 2e-05...
Train on 25000 samples, validate on 25000 samples
 1092/25000 [>.............................] - ETA: 93:31:31 - loss: 0.6157 - accuracy: 0.6740