# NLP with BERT for Sentiment Analysis

### Importing the libraries

In [None]:
!pip3 install ktrain

Collecting ktrain
[?25l  Downloading https://files.pythonhosted.org/packages/00/9b/1992a4744074ccc0d9dc471db1acacb81453a5447688beba24488f412d37/ktrain-0.18.5.tar.gz (25.2MB)
[K     |████████████████████████████████| 25.2MB 125kB/s 
[?25hCollecting tensorflow==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 28kB/s 
Collecting scikit-learn==0.21.3
[?25l  Downloading https://files.pythonhosted.org/packages/a0/c5/d2238762d780dde84a20b8c761f563fe882b88c5a5fb03c056547c442a19/scikit_learn-0.21.3-cp36-cp36m-manylinux1_x86_64.whl (6.7MB)
[K     |████████████████████████████████| 6.7MB 38.0MB/s 
Collecting keras_bert>=0.81.0
  Downloading https://files.pythonhosted.org/packages/e2/7f/95fabd29f4502924fa3f09ff6538c5a7d290dfef2c2fe076d3d1a16e08f0/keras-bert-0.86.0.tar.gz
Collecting langdetect
[?25

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import os.path
import ktrain
from ktrain import text

## Part 1: Data Preprocessing

### Loading the IMDB dataset

In [None]:
# Download the IMDB archive through the Keras file cache (extract=True unpacks
# it), then point Imdb_data at the 'aclImdb' folder next to the archive.
dataset = tf.keras.utils.get_file(
    origin='https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz',
    fname='aclImdb_v1.tar.gz',
    extract=True,
)
Imdb_data = os.path.join(os.path.split(dataset)[0], 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [None]:
# Show the cache directory and the dataset root, one per line.
print(os.path.dirname(dataset), Imdb_data, sep='\n')

/root/.keras/datasets
/root/.keras/datasets/aclImdb


In [None]:
!ls /root/.keras/datasets/aclImdb/test

labeledBow.feat  neg  pos  urls_neg.txt  urls_pos.txt



### Creating the training and test sets

In [None]:
# Read the 'pos'/'neg' review folders under Imdb_data and run ktrain's BERT
# preprocessing; `preproc` is kept because the classifier is built from it.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    datadir=Imdb_data,
    classes=['pos', 'neg'],
    maxlen=500,
    ngram_range=1,
    val_pct=0.11,
    preprocess_mode='bert',
)

detected encoding: utf-8
downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


## Part 2: Building the BERT model

In [None]:
# Build ktrain's 'bert' text classifier from the preprocessed training data,
# tracking accuracy during training.
model = text.text_classifier(
    name='bert',
    train_data=(x_train, y_train),
    preproc=preproc,
    metrics=['accuracy'],
)

Is Multi-Label? False
maxlen is 500
done.


In [None]:
# Print the layer-by-layer summary of the classifier.
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        [(None, 500)]        0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      [(None, 500)]        0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 500, 768), ( 23440896    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 500, 768)     1536        Input-Segment[0][0]              
____________________________________________________________________________________________

## Part 3: Training the BERT model

In [None]:
leraner = ktrain.get_learner(model = model, 
                   train_data = (x_train, y_train),
                   val_data = (x_test,y_test),
                   multigpu = True,
                   batch_size=6)

In [None]:
leraner.lr_find(show_plot=True)

In [None]:
leraner.fit_onecycle(lr=2e-5,epochs=1)



begin training using onecycle policy with max lr of 2e-05...
Train on 25000 samples, validate on 25000 samples


<tensorflow.python.keras.callbacks.History at 0x7f58800c6128>

In [None]:
# Download and unpack the tweet training/test archive through the Keras file
# cache; dirn names a 'Twitters' folder next to the downloaded zip.
dataset_2 = tf.keras.utils.get_file(
    origin='http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip',
    fname='trainingandtestdata.zip',
    extract=True,
)
dirn = os.path.join(os.path.split(dataset_2)[0], 'Twitters')

Downloading data from http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip


In [None]:
!mkdir /root/.keras/datasets/Twitters
!ls /root/.keras/datasets/Twitters

In [None]:
!mkdir /root/.keras/datasets/Twitters/train
# !mkdir /root/.keras/datasets/Twitters/test

In [None]:
# List the working folder to confirm the split directories are in place.
!ls /root/.keras/datasets/Twitters

test  train


In [None]:
# Read the tweet training CSV as latin-1. header=None because the file is read
# without a header row, so the columns come back numbered 0-5.
df = pd.read_csv(
    "/root/.keras/datasets/training.1600000.processed.noemoticon.csv",
    header=None,
    encoding='latin-1',
)

In [None]:
# Peek at the first rows of the raw frame (columns are still numbered).
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Give the six numbered columns descriptive names.
df.columns = [
    'Emotion',
    'Id',
    'Dated',
    'Queried',
    'User',
    'Tweet',
]

In [None]:
# Same preview, now with the named columns.
df.head()

Unnamed: 0,Emotion,Id,Dated,Queried,User,Tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
# Count distinct values per column within each Emotion label.
df.groupby('Emotion').nunique()

Unnamed: 0_level_0,Emotion,Id,Dated,Queried,User,Tweet
Emotion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1,800000,593879,1,415671,790185
4,1,800000,286578,1,376569,793506


In [None]:
!mkdir /root/.keras/datasets/Twitters/train/mean
!mkdir /root/.keras/datasets/Twitters/train/pos
!touch /root/.keras/datasets/Twitters/train/pos/hello.txt
!touch /root/.keras/datasets/Twitters/train/mean/hello.txt

In [None]:
# !rm -rf /root/.keras/datasets/Twitters/train

In [None]:
# Export a balanced 18,000-tweet training sample, one text file per tweet, under
# train/mean (Emotion == 0) or train/pos (Emotion == 4).
#
# Why sample per label instead of taking the first 18,000 rows: df.head() shows
# the frame starting with label 0 and each label has 800,000 rows, so the head
# of the file appears to hold a single class (NOTE(review): confirm the CSV is
# grouped by label). Taking leading rows would leave train/pos empty.
#
# Why a plain file write instead of np.savetxt(list(tweet)): list() splits the
# tweet into characters and savetxt writes one element per line, so every file
# ended up with one character per line rather than the tweet text.
TRAIN_DIR = '/root/.keras/datasets/Twitters/train'
N_PER_CLASS = 9000  # 2 classes x 9,000 keeps the original 18,000-file total
SAMPLE_SEED = 42    # fixed so a re-run exports the same tweets

for label, class_name in ((0, 'mean'), (4, 'pos')):
    class_dir = os.path.join(TRAIN_DIR, class_name)
    os.makedirs(class_dir, exist_ok=True)
    class_tweets = df.loc[df['Emotion'] == label, 'Tweet'].sample(
        n=N_PER_CLASS, random_state=SAMPLE_SEED
    )
    for j, tweet in enumerate(class_tweets):
        file_path = os.path.join(class_dir, str(j) + "th_" + class_name + ".txt")
        with open(file_path, 'w', encoding='utf-8') as fh:
            fh.write(str(tweet) + "\n")

In [None]:
!tail /root/.keras/datasets/Twitters/train/pos/

In [None]:
# Load the test CSV the same way as the training file: latin-1 text, read
# without a header row.
df_test = pd.read_csv(
    "/root/.keras/datasets/testdata.manual.2009.06.14.csv",
    header=None,
    encoding='latin-1',
)

In [None]:
# (rows, columns) of the test frame.
df_test.shape

(498, 6)

In [None]:
# Apply the same six column names used for the training frame.
df_test.columns = [
    'Emotion',
    'Id',
    'Dated',
    'Queried',
    'User',
    'Tweet',
]

In [None]:
!rm -f /root/.keras/datasets/Twitters/train/mean/hello.txt
!rm -f /root/.keras/datasets/Twitters/train/pos/hello.txt
# !mkdir /root/.keras/datasets/Twitters/test/mean
# !mkdir /root/.keras/datasets/Twitters/test/pos
# !touch /root/.keras/datasets/Twitters/test/pos/hello.txt
# !touch /root/.keras/datasets/Twitters/test/mean/hello.txt

In [None]:
# Export the test tweets, one text file each, under test/mean (Emotion == 0) or
# test/pos (Emotion == 4).
#
# Rows come from df_test: iterating `df` here would export training rows as the
# "test" set and leave df_test unused.
# The class folders are created here because no active cell creates them.
# The tweet is written as-is: np.savetxt(list(tweet)) would split it into
# characters and write one per line.
# NOTE(review): rows whose Emotion is neither 0 nor 4 are skipped rather than
# filed as positive by a catch-all else. Confirm which labels
# df_test['Emotion'] actually contains.
TEST_DIR = '/root/.keras/datasets/Twitters/test'

for label, class_name in ((0, 'mean'), (4, 'pos')):
    class_dir = os.path.join(TEST_DIR, class_name)
    os.makedirs(class_dir, exist_ok=True)
    class_tweets = df_test.loc[df_test['Emotion'] == label, 'Tweet']
    for j, tweet in enumerate(class_tweets):
        file_path = os.path.join(class_dir, str(j) + "th_" + class_name + ".txt")
        with open(file_path, 'w', encoding='utf-8') as fh:
            fh.write(str(tweet) + "\n")

In [None]:
!rm -f /root/.keras/datasets/Twitters/test/mean/hello.txt
!rm -f /root/.keras/datasets/Twitters/test/pos/hello.txt

In [None]:
# Root folder of the exported tweet files; it contains the train/ and test/
# sub-folders written by the cells above.
new_data_dir = '/root/.keras/datasets/Twitters'

In [None]:
# Run ktrain's BERT preprocessing over the exported tweet folders, using the
# 'pos' and 'mean' class folders. This rebinds x_train/y_train/x_test/y_test
# and preproc from the IMDB section.
(x_train, y_train), (x_test, y_test), preproc = text.texts_from_folder(
    datadir=new_data_dir,
    classes=['pos', 'mean'],
    maxlen=500,
    ngram_range=2,
    val_pct=0.11,
    preprocess_mode='bert',
)

detected encoding: utf-8
preprocessing train...
language: cy


Is Multi-Label? False
preprocessing test...
language: cy


In [None]:
def _as_one_hot(labels, num_classes=2):
    """Return `labels` as an (n_samples, num_classes) one-hot array.

    Arrays that already have `num_classes` columns are returned unchanged, so
    labels are never one-hot encoded a second time. Anything else is encoded
    with an explicit `num_classes`, so the output width does not depend on
    which label values happen to be present in the split.
    """
    labels = np.asarray(labels)
    if labels.ndim == 2 and labels.shape[1] == num_classes:
        return labels
    return tf.compat.v1.keras.utils.to_categorical(labels, num_classes=num_classes)


# NOTE(review): the IMDB section passes texts_from_folder's labels straight to
# text_classifier without this step, which suggests they may already be one-hot
# when both classes are present. Confirm against the ktrain version in use.
y_train = _as_one_hot(y_train)
y_test = _as_one_hot(y_test)

In [None]:
# Build a fresh 'bert' classifier for the tweet data; this rebinds `model`.
model = text.text_classifier(
    name='bert',
    train_data=(x_train, y_train),
    preproc=preproc,
    metrics=['accuracy'],
)

Is Multi-Label? False
maxlen is 500
done.


In [None]:
# Learner for the tweet model, with the same batch size and multigpu flag as
# the IMDB learner, validating on the tweet test split.
learner2 = ktrain.get_learner(
    model=model,
    train_data=(x_train, y_train),
    val_data=(x_test, y_test),
    batch_size=6,
    multigpu=True,
)

In [None]:
# learner2.lr_find(start_lr=2e-3,show_plot=True,max_epochs=4)

In [None]:
# One epoch with the onecycle policy at a max learning rate of 2e-5, the same
# rate used for the IMDB BERT model earlier in this notebook. The previous
# value, 2e-3, was 100x larger.
# NOTE(review): confirm the rate with learner2.lr_find before a long run.
learner2.fit_onecycle(lr=2e-5, epochs=1)



begin training using onecycle policy with max lr of 0.002...
Train on 18000 samples, validate on 498 samples


<tensorflow.python.keras.callbacks.History at 0x7ff4cd44ce80>

In [None]:
!pip install tensorflow



In [None]:
# Show the TensorFlow version the kernel has imported.
tf.__version__

'2.1.0'