In [None]:
import numpy as np

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_datasets as tfds

import matplotlib.pyplot as plt

# Report the library versions and execution mode this notebook runs with.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Hub version: ", hub.__version__)
# tf.test.is_gpu_available() is deprecated; TensorFlow's own warning recommends
# tf.config.list_physical_devices('GPU'), which returns a (possibly empty) list.
print("GPU is", "available" if tf.config.list_physical_devices('GPU') else "NOT AVAILABLE")

Version:  2.2.0
Eager mode:  True
Hub version:  0.8.0
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU is available


In [None]:
# Mount Google Drive at /gdrive (Colab only); the dataset CSV is read from
# this mount point in a later cell.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords

# Load the IMDB reviews CSV from the mounted Drive into a DataFrame.
imdb_data= pd.read_csv("/gdrive/My Drive/datasets/IMDB Dataset.csv")

# Preview the first rows (the frame exposes `review` and `sentiment` columns).
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Matches any HTML/XML-style tag such as <br /> or <b>.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(document, replacement=''):
    """Strip HTML/XML-style tags from ``document``.

    Args:
        document: Text that may contain markup such as ``<br />``.
        replacement: String substituted for every tag (passed to ``re.sub``
            as the replacement). Defaults to ``''`` so existing callers keep
            the original "delete the tag" behaviour.

    Returns:
        The text with every ``<...>`` span replaced by ``replacement``.
    """
    return TAG_RE.sub(replacement, document)


def clean_text(doc):
    """Normalise a raw review into space-separated alphabetic words.

    Steps, in order: replace HTML tags with a space, replace every
    non-letter character with a space, drop isolated single letters that are
    surrounded by whitespace, and collapse whitespace runs to one space.

    Args:
        doc: Raw review text.

    Returns:
        The cleaned text. A single letter at the very start or end of the
        text is kept, and a leading/trailing space may remain.
    """
    # Substitute a space (not '') for tags so that words separated only by a
    # tag, e.g. "great<br />movie", are not glued into "greatmovie".
    document = remove_tags(doc, replacement=' ')

    # Keep letters only; digits and punctuation become spaces.
    document = re.sub('[^a-zA-Z]', ' ', document)

    # Remove single letters surrounded by whitespace (e.g. the "s" left over
    # from "there's"). The trailing whitespace is only looked at, not
    # consumed, so consecutive single letters ("there s a family") are all
    # removed instead of every other one.
    document = re.sub(r"\s+[a-zA-Z](?=\s)", '', document)

    # Collapse any run of whitespace into a single space.
    document = re.sub(r'\s+', ' ', document)

    return document

In [None]:
# Clean every review text in place (tags, non-letters and extra whitespace removed).
imdb_data["review"] = imdb_data["review"].apply(clean_text)
# Encode the labels as integers: positive -> 1, negative -> 0.
# NOTE(review): the column is overwritten, so re-running this cell on already
# encoded labels would presumably turn them into NaN (no matching dict key);
# re-load the CSV before re-running.
imdb_data['sentiment'] = imdb_data['sentiment'].map({'positive': 1,'negative': 0})
imdb_data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production The filming tech...,1
2,I thought this was wonderful way to spend time...,1
3,Basically there a family where little boy Jake...,0
4,Petter Mattei Love in the Time of Money is vis...,1


In [None]:
! pip install transformers



In [None]:


from sklearn import preprocessing
from sklearn.model_selection import train_test_split

 
from transformers import (TFBertForSequenceClassification, 
                          BertTokenizer)
 
from tqdm import tqdm
	


In [None]:
# Review texts and their encoded labels as NumPy arrays.
X = imdb_data['review'].values
y = imdb_data['sentiment'].values
 
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
print("Shape of training data: {0}, \nShape of test data: {1}".format(X_train.shape, X_test.shape))


Shape of training data: (40000,), 
Shape of test data: (10000,)


In [None]:
# Load the tokenizer that belongs to the `bert-base-cased` checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

In [None]:

# Token id appended to `input_ids` when a sequence is padded.
pad_token=0
# Segment id appended to `token_type_ids` for the padded positions.
pad_token_segment_id=0
# Maximum tokens per review: passed to the tokenizer for truncation and used
# as the length every sequence is padded to.
max_length= 128
 
def text_to_bert_input(reviews):
  """Encode texts with the module-level `bert_tokenizer` and pad them to `max_length`.

  Each review is tokenized with special tokens added and truncated to
  `max_length`, then right-padded: ids with `pad_token`, the attention mask
  with 0 and the segment ids with `pad_token_segment_id`.

  Args:
    reviews: Iterable of review strings.

  Returns:
    A list of three NumPy arrays, in this order: input ids, attention masks
    and token type ids, one row per review.
  """
  all_ids, all_masks, all_segments = [], [], []

  for text in tqdm(reviews, position=0, leave=True):
    encoded = bert_tokenizer.encode_plus(text, add_special_tokens=True, max_length=max_length, truncation=True)
    ids = encoded["input_ids"]
    segments = encoded["token_type_ids"]

    # Number of positions still missing to reach max_length.
    n_pad = max_length - len(ids)

    all_ids.append(ids + [pad_token] * n_pad)
    # 1 marks a real token, 0 marks padding.
    all_masks.append([1] * len(ids) + [0] * n_pad)
    all_segments.append(segments + [pad_token_segment_id] * n_pad)

  return [np.asarray(column) for column in (all_ids, all_masks, all_segments)]

In [None]:
# Tokenize and pad both splits; each result is a list of three arrays:
# [input_ids, attention_masks, token_type_ids].
X_test_input=text_to_bert_input(X_test)
X_train_input=text_to_bert_input(X_train)

100%|██████████| 10000/10000 [00:21<00:00, 472.41it/s]
100%|██████████| 40000/40000 [01:23<00:00, 476.43it/s]


In [None]:
def convert_to_tensors(input_ids,attention_masks,token_type_ids,y):
  """Pack one example into the `(features, label)` pair used by the datasets.

  Despite the name, no conversion happens here: the three inputs are placed
  unchanged in a dict keyed `input_ids`, `attention_mask` and
  `token_type_ids`, and `y` is returned alongside it as the label.
  """
  features = dict(
      input_ids=input_ids,
      attention_mask=attention_masks,
      token_type_ids=token_type_ids,
  )
  return features, y


# Training pipeline: slice the encoded arrays per example, pack each example
# into (features, label), shuffle with a 100-element buffer, batch by 32.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_train_input[0], X_train_input[1], X_train_input[2], y_train))
    .map(convert_to_tensors)
    .shuffle(100)
    .batch(32)
)

# Evaluation pipeline: same packing, no shuffling, batches of 64.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices((X_test_input[0], X_test_input[1], X_test_input[2], y_test))
    .map(convert_to_tensors)
    .batch(64)
)
 

In [None]:
# Load the pretrained `bert-base-cased` weights into a sequence-classification model.
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased")


# Adam with a small learning rate for fine-tuning.
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
# from_logits=True: the loss is given raw logits; labels are integer class ids.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
# Print the layer and parameter overview.
model.summary()

Some weights of the model checkpoint at bert-base-cased were not used when initializing TFBertForSequenceClassification: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier', 'dropout_37']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fine-tune for a single epoch.
# NOTE(review): the held-out test split is also used as validation data here,
# so no separate untouched evaluation set remains.
history = model.fit(train_dataset, epochs=1, validation_data=test_dataset)

