In [167]:
!pip install transformers
!pip install wget



In [168]:

from os import path

# Archive name of the Stanford IMDB sentiment dataset as it lands in the
# current working directory after download.
inputfile="aclImdb_v1.tar.gz"

# Download and unpack only when the archive is not already present, so that
# re-running this cell does not fetch the data again.
if not path.exists(inputfile) :
  !wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
  !tar xvfz aclImdb_v1.tar.gz

In [169]:
#!ls aclImdb/test/neg/*

In [170]:
# Mount Google Drive at /content/gdrive; MODEL_PATH and OUTPUT_LOG point into
# this mount, so it must be available before training or prediction.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [171]:
import os
import torch
import torch.nn as nn
import transformers
from transformers import BertForSequenceClassification,AdamW,BertTokenizer
from transformers import get_linear_schedule_with_warmup
from torchsummary import summary


from tqdm.auto import tqdm 
import time
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn import model_selection
from sklearn.preprocessing import OneHotEncoder


In [172]:
# Token length every input is padded/truncated to by the tokenizer calls.
MAX_LEN=512
# Batch sizes handed to the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
# Number of passes over the training data in train().
EPOCHS = 3
# Where train() saves the best state_dict and predict() loads it from.
MODEL_PATH="/content/gdrive/My Drive/bert_stanford_sent_anal_model.bin"
# Text file that log() appends training progress lines to.
OUTPUT_LOG="/content/gdrive/My Drive/bert_stanford_sent_anal_train.log"

In [173]:
# Select the compute device once: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
  print('We will use the GPU:',torch.cuda.get_device_name(0))
else:
  print('No GPU available, using the CPU instead')

We will use the GPU: Tesla T4


In [174]:
# Load the BERT tokenizer (uncased vocabulary, input text is lower-cased).
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
# One-hot encoder for the class labels: fitted in read_data() and used by
# sentence_prediction() to turn a one-hot row back into a label.
enc = OneHotEncoder(handle_unknown='ignore')

Loading BERT tokenizer...


In [175]:
def log(value):
  """Append one line of text to the log file at OUTPUT_LOG.

  Args:
    value: String to record; a newline is appended to it.
  """
  # Mode "a" creates the file when it does not exist yet, so no separate
  # existence check is needed. The context manager closes the handle even
  # if the write raises.
  with open(OUTPUT_LOG, "a") as f:
    f.write(value + "\n")


In [176]:
#Model
class BERTBaseUncased(nn.Module):
    """Thin nn.Module wrapper around a pretrained BertForSequenceClassification."""

    def __init__(self):
        super().__init__()
        # 12-layer uncased BERT; num_labels, output_attentions and
        # output_hidden_states are left at the library defaults.
        self.bert = BertForSequenceClassification.from_pretrained("bert-base-uncased")

    def forward(self, ids, mask, token_type_ids):
        """Run the wrapped model and return the first element of its output
        (presumably the classification logits, since no labels are passed)."""
        model_output = self.bert(
            ids, attention_mask=mask, token_type_ids=token_type_ids
        )
        return model_output[0]

In [177]:
class BERTDataset:
    """Map-style dataset that tokenizes raw text lazily, one item per access.

    Args:
        input_text: Indexable collection of raw text samples.
        target: Indexable collection of targets, aligned with `input_text`.

    The module-level `tokenizer` and `MAX_LEN` are captured at construction.
    """

    def __init__(self, input_text, target):
        self.input_text = input_text
        self.target = target
        self.tokenizer = tokenizer
        self.max_len = MAX_LEN

    def __len__(self):
        """Number of samples."""
        return len(self.input_text)

    def __getitem__(self, item):
        """Tokenize sample `item` and return it as a dict of tensors.

        Returns:
            dict with keys "ids", "mask", "token_type_ids" (long tensors built
            from the tokenizer output) and "targets" (float tensor).
        """
        # Collapse any run of whitespace (newlines, tabs, ...) to one space.
        input_text = " ".join(str(self.input_text[item]).split())

        # padding="max_length" replaces the deprecated pad_to_max_length=True;
        # together with truncation every sample ends up with max_len tokens.
        inputs = self.tokenizer.encode_plus(
            input_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding="max_length",
            truncation='longest_first'
        )

        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]
        token_type_ids = inputs["token_type_ids"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.target[item], dtype=torch.float),
        }


In [178]:
#Engine

def loss_fn(outputs, targets):
    """Binary cross-entropy-with-logits loss between `outputs` and `targets`."""
    criterion = nn.BCEWithLogitsLoss()
    return criterion(outputs, targets)


def train_fn(data_loader, model, optimizer, device, scheduler):
    """Run one training pass over `data_loader`.

    For each batch the tensors are moved to `device`, gradients are reset,
    the model is run, the loss from loss_fn is back-propagated, and both the
    optimizer and the scheduler are stepped.
    """
    model.train()

    for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = batch["ids"].to(device, dtype=torch.long)
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        mask = batch["mask"].to(device, dtype=torch.long)
        targets = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        predictions = model(ids, mask, token_type_ids=token_type_ids)

        batch_loss = loss_fn(predictions, targets)
        batch_loss.backward()
        # The scheduler advances once per batch, not once per epoch.
        optimizer.step()
        scheduler.step()
        


def eval_fn(data_loader, model, device):
    """Run `model` over `data_loader` without gradient tracking.

    Returns:
        (outputs, targets): two Python lists with one entry per sample;
        outputs are the sigmoid of the model output, targets are the batch
        targets, both converted via numpy to nested lists.
    """
    model.eval()
    all_targets, all_probs = [], []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = batch["ids"].to(device, dtype=torch.long)
            token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
            mask = batch["mask"].to(device, dtype=torch.long)
            targets = batch["targets"].to(device, dtype=torch.float)

            logits = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
            all_targets += targets.cpu().detach().numpy().tolist()
            all_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return all_probs, all_targets


In [179]:
def read_data(dir_path):
  """Read every review file under `dir_path`/pos and `dir_path`/neg.

  Args:
    dir_path: Directory containing the 'pos' and 'neg' sub-directories.

  Returns:
    (values, targets): `values` is a list with the text of each file;
    `targets` is a dense array with one one-hot row per file, produced by
    (re)fitting the module-level encoder `enc` on the class names.
  """
  classes=['pos','neg']
  values=[]
  targets=[]
  for cls in classes:
    base_path=os.path.join(dir_path, cls)
    # os.listdir returns entries in arbitrary order; sorting makes the
    # sample order reproducible across runs and machines.
    for entry in sorted(os.listdir(base_path)):
      filename=os.path.join(base_path, entry)
      if os.path.isfile(filename):
        # Explicit encoding: the default depends on the platform locale.
        with open(filename, 'r', encoding='utf-8') as file:
          values.append(file.read())
          targets.append(cls)

  targets=enc.fit_transform(np.array(targets).reshape(-1,1)).toarray()
  print(targets.shape)
  return values,targets


In [180]:
def show_tokens(sentence):
    """Debug helper: print the token ids, attention mask and token type ids
    that the module-level tokenizer produces for `sentence`.

    The sentence is whitespace-normalized and padded/truncated to MAX_LEN,
    the same way BERTDataset prepares its inputs.
    """
    max_len = MAX_LEN
    # Collapse any run of whitespace to a single space.
    input_text = " ".join(str(sentence).split())

    # padding="max_length" replaces the deprecated pad_to_max_length=True.
    inputs = tokenizer.encode_plus(
            input_text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",
            truncation='longest_first'
        )

    print(inputs["input_ids"])
    print(inputs["attention_mask"])
    print(inputs["token_type_ids"])

#show_tokens("Nice  picturization of the song on the beaches of Tahiti.[SEP]Very good acting by Abhishek and Aishwarya. ")
#show_tokens("Good Movie")

In [181]:
#Training

import random

def train():
    """Fine-tune the BERT classifier on aclImdb/train and evaluate each epoch
    on aclImdb/test.

    After every epoch the accuracy on the test split is logged and printed;
    whenever it improves on the best value so far, the model's state_dict is
    written to MODEL_PATH.

    Returns:
        The nn.DataParallel-wrapped model as it is after the final epoch
        (not necessarily the best checkpoint saved to MODEL_PATH).
    """
    train_path='aclImdb/train'
    test_path='aclImdb/test'

    train_values,train_targets=read_data(train_path)
    test_values,test_targets=read_data(test_path)

    print(len(train_targets))
    print(len(train_values))

    print(len(test_targets))
    print(len(test_values))

    # read_data returns all 'pos' samples followed by all 'neg' samples, so
    # shuffle once before batching. Each shuffled index is used directly;
    # the previous code indexed `choices` a second time with its own values.
    choices = list(range(len(train_values)))
    random.shuffle(choices)

    new_train_values = [train_values[i] for i in choices]
    new_train_targets = [train_targets[i] for i in choices]

    train_dataset = BERTDataset(
        input_text=new_train_values, target=new_train_targets
    )

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, num_workers=4,
    )

    valid_dataset = BERTDataset(
        input_text=test_values, target=test_targets
    )

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, num_workers=1,
    )

    log("Train Size="+str(len(train_dataset)))
    log("Valid size="+str(len(valid_dataset)))

    model = BERTBaseUncased()
    model.to(device)

    # Two parameter groups: biases and LayerNorm parameters are excluded from
    # weight decay, everything else gets a small decay.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # train_fn steps the scheduler once per batch, so the schedule length is
    # the real number of batches per epoch times EPOCHS. This also counts a
    # final partial batch, which len(targets) / batch_size would drop.
    num_train_steps = len(train_data_loader) * EPOCHS
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps
    )

    model = nn.DataParallel(model)

    best_accuracy = 0
    for epoch in range(EPOCHS):
        train_fn(train_data_loader, model, optimizer, device, scheduler)
        print("Calling eval")
        outputs, targets = eval_fn(valid_data_loader, model, device)
        # Threshold each sigmoid output at 0.5 before scoring.
        outputs = np.array(outputs) >= 0.5
        accuracy = metrics.accuracy_score(targets, outputs)
        log(" Epoch="+str(epoch)+"Accuracy="+str(accuracy))
        print(f"Accuracy Score = {accuracy}")
        if accuracy > best_accuracy:
            # Unwrap DataParallel so the saved keys carry no "module." prefix
            # and can be loaded into a plain BERTBaseUncased.
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), MODEL_PATH)
            best_accuracy = accuracy
    return model

In [185]:
#Predict
# Memo of sentence -> prediction, filled by predict_from_cache().
PREDICTION_CACHE = dict()

def predict_from_cache(model,sentence):
    """Return the prediction for `sentence`, computing it at most once.

    Args:
        model: Model handed to sentence_prediction on a cache miss.
        sentence: Text to classify; also the cache key.

    Returns:
        The value sentence_prediction returned for this sentence.
    """
    # The lookup must use PREDICTION_CACHE, the dict that is written below;
    # the previous name PREDICTION_DICT was never defined.
    if sentence in PREDICTION_CACHE:
        return PREDICTION_CACHE[sentence]
    result = sentence_prediction(model,sentence)
    PREDICTION_CACHE[sentence] = result
    return result

def sentence_prediction(model,sentence):
    """Classify a single sentence with `model`.

    Args:
        model: Callable taking `ids`, `mask` and `token_type_ids` keyword
            tensors (e.g. a BERTBaseUncased instance).
        sentence: Text to classify; converted with str() and
            whitespace-normalized.

    Returns:
        The label array produced by `enc.inverse_transform` for the
        highest-scoring class.
    """
    max_len = MAX_LEN
    # Collapse any run of whitespace to a single space.
    input_text = " ".join(str(sentence).split())

    inputs = tokenizer.encode_plus(
            input_text,
            None,
            add_special_tokens=True,
            max_length=max_len,
            truncation='longest_first'
        )

    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    token_type_ids = inputs["token_type_ids"]

    # Right-pad all three sequences with zeros up to max_len.
    padding_length = max_len - len(ids)
    ids = ids + ([0] * padding_length)
    mask = mask + ([0] * padding_length)
    token_type_ids = token_type_ids + ([0] * padding_length)

    # Add a batch dimension of 1 and move the inputs to the model's device.
    ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0).to(device, dtype=torch.long)
    mask = torch.tensor(mask, dtype=torch.long).unsqueeze(0).to(device, dtype=torch.long)
    token_type_ids = torch.tensor(token_type_ids, dtype=torch.long).unsqueeze(0).to(device, dtype=torch.long)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(ids=ids, mask=mask, token_type_ids=token_type_ids)
    outputs = torch.sigmoid(outputs[0]).cpu().detach().numpy()
    print(outputs)

    # `enc` is normally fitted by read_data() during training. When only
    # prediction runs in a fresh kernel it is still unfitted, so fit it on
    # the same two class names read_data() uses.
    if not hasattr(enc, "categories_"):
        enc.fit(np.array(['pos','neg']).reshape(-1,1))

    # Build the one-hot row from the arg-max rather than by rounding each
    # sigmoid independently: rounding can yield [0, 0] (decoded as None) or
    # [1, 1] when the two outputs fall on the same side of 0.5.
    probs = outputs.reshape(-1,2)
    cls = np.zeros_like(probs)
    cls[np.arange(len(probs)), probs.argmax(axis=1)] = 1
    return enc.inverse_transform(cls)

In [186]:
def predict() :
  """Load the checkpoint at MODEL_PATH and print predictions for a few
  sample sentences."""
  model=BERTBaseUncased()
  # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
  # runtime (and vice versa).
  model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
  model.to(device)
  # Put the whole module tree in evaluation mode for inference.
  model.eval()

  sample_sentences = [
    "Sholay is a good movie, I like Thanku's acting very much",
    "Good Movie",
    "Uski Roti was a boring movie, there is no drama , just the routine visits to home by husband etc.",
    "bad movie",
  ]
  for sentence in sample_sentences:
    out = sentence_prediction(model,sentence)
    print(out)

In [187]:
# Entry point. Training is disabled here; predict() loads the checkpoint at
# MODEL_PATH, which train() writes, so train() must have been run at least
# once (uncomment the next line) before predict() can succeed.
#train()
predict()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[0.04918659 0.9535181 ]
[['pos']]
[0.06349492 0.94567746]
[['pos']]
[0.945852   0.05560359]
[['neg']]
[0.9330367  0.07159735]
[['neg']]
