In [None]:
import os

In [None]:
# Work from the project folder so the relative data and checkpoint paths used later resolve.
# NOTE(review): this absolute path looks like a mounted Google Drive in Colab — confirm before running elsewhere.
os.chdir('/content/drive/MyDrive/BERT Classification')

In [None]:
import torch
import torchtext

In [None]:
! pip install transformers

In [None]:
import pandas as pd

# Load the training split; the path is relative to the current working directory.
train_data=pd.read_csv("IMDB/train.csv")

In [None]:
# Preview the first rows of the training frame.
train_data.head()

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",neg
1,"The saddest thing about this ""tribute"" is that...",neg
2,Last night I decided to watch the prequel or s...,neg
3,I have to admit that i liked the first half of...,neg
4,I was not impressed about this film especially...,neg


In [None]:
# List the distinct values of the sentiment column.
train_data.sentiment.unique()

array(['neg', 'pos'], dtype=object)

In [None]:
# Encode the string labels as integers (neg -> 0, pos -> 1).
# The guard keeps this cell idempotent: mapping an already-encoded column again
# would turn every value into NaN because 0/1 are not keys of the mapping.
if not pd.api.types.is_numeric_dtype(train_data["sentiment"]):
    train_data["sentiment"] = train_data["sentiment"].map({'neg': 0, 'pos': 1})

# Labels outside the mapping become NaN; fail loudly instead of training on them.
assert train_data["sentiment"].notna().all(), "unexpected sentiment label in training data"

In [None]:
# Preview the frame again to inspect the sentiment column.
train_data.head()

Unnamed: 0,text,sentiment
0,"Now, I won't deny that when I purchased this o...",0
1,"The saddest thing about this ""tribute"" is that...",0
2,Last night I decided to watch the prequel or s...,0
3,I have to admit that i liked the first half of...,0
4,I was not impressed about this film especially...,0


In [None]:
# Materialise the review texts and their labels as plain lists.
X = [*train_data["text"]]
y = [*train_data["sentiment"]]

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 train/validation split. A fixed random_state makes the split
# (and therefore every downstream number) reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [None]:
from transformers import BertTokenizer
# Checkpoint identifier used to load the tokenizer.
model_name="bert-base-uncased"

# Fetch the pretrained tokenizer for that checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [None]:
# Tokenize both splits with identical settings (padding on, truncation on, max_length 512).
tokenize_kwargs = dict(padding=True, truncation=True, max_length=512)
X_train_tokenized = tokenizer(X_train, **tokenize_kwargs)
X_val_tokenized = tokenizer(X_val, **tokenize_kwargs)

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from field name to a per-example indexable
            sequence. It must contain ``"input_ids"``, which defines the
            dataset length. Every field is converted to a tensor on access.
        labels: Optional per-example label sequence. When ``None``, items
            carry no ``"labels"`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx`` (plus ``"labels"`` if available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test against None instead of truthiness: a multi-element numpy array
        # or tensor of labels raises on `if labels:`.
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the ``"input_ids"`` field."""
        return len(self.encodings["input_ids"])

# Wrap the tokenized train and validation splits together with their labels.
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
  

In [None]:
!pip install tqdm



In [None]:
from tqdm import tqdm

In [None]:
def train(loader,model,optim,BS):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose first output is the loss.
        optim: Optimizer stepping the model parameters.
        BS: Batch size. Kept for backward compatibility with existing callers;
            it is no longer used for the average (see below).

    Returns:
        The loss averaged over the number of batches, rounded to 2 decimals
        (``0.0`` when the loader yields no batches).

    Note:
        Relies on the module-level ``device`` being defined before the call.
    """
    model.train()
    training_loss = 0.0
    num_batches = 0

    for batch in tqdm(loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optim.step()

        training_loss += loss.item()
        num_batches += 1

    # Each loss.item() is one batch's loss, so the epoch average is the sum
    # divided by the number of batches, not by the batch size.
    if num_batches == 0:
        return 0.0
    return round(training_loss / num_batches, 2)


def val(loader,model,BS):
    """Evaluate the model on a validation loader and return the mean per-batch loss.

    Args:
        loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose first output is the loss.
        BS: Batch size. Kept for backward compatibility with existing callers;
            it is no longer used for the average (see below).

    Returns:
        The loss averaged over the number of batches, rounded to 2 decimals
        (``0.0`` when the loader yields no batches).

    Note:
        Relies on the module-level ``device`` being defined before the call.
    """
    model.eval()
    val_loss = 0.0
    num_batches = 0

    # No gradients are needed for validation.
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs[0]
            val_loss += loss.item()
            num_batches += 1

    # Average over batches, not over the batch size, so the value is a real
    # mean loss and is comparable across loaders of different lengths.
    if num_batches == 0:
        return 0.0
    return round(val_loss / num_batches, 2)



In [None]:
def save_checkpoint(save_path, model, valid_loss):
    """Persist the model weights together with the validation loss.

    Args:
        save_path: Destination file path. When ``None`` nothing is written.
        model: Module whose ``state_dict()`` is stored under
            ``'model_state_dict'``.
        valid_loss: Value stored under ``'valid_loss'``.

    Returns:
        None.
    """
    # Identity check is the idiomatic (and __eq__-proof) way to test for None.
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)

def load_checkpoint(load_path, model):
    """Load weights saved by ``save_checkpoint`` into ``model``.

    Args:
        load_path: Checkpoint file path. When ``None`` nothing is loaded.
        model: Module that receives the stored ``'model_state_dict'``.

    Returns:
        The ``'valid_loss'`` value stored in the checkpoint, or ``None`` when
        ``load_path`` is ``None``.

    Note:
        Tensors are mapped onto the module-level ``device``, which must be
        defined before the call.
    """
    # Identity check is the idiomatic (and __eq__-proof) way to test for None.
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    return state_dict['valid_loss']


In [None]:
from torch.utils.data import DataLoader
from transformers import BertForSequenceClassification, AdamW

best_valid_loss = float("Inf")
BS=16
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Load the same checkpoint the tokenizer was built from (`model_name`).
# Loading a DistilBERT checkpoint into a BERT architecture mismatches both the
# weights and the tokenizer used to encode the inputs.
model = BertForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Shuffle only the training data; validation order does not matter.
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(3):
    train_loss=train(train_loader,model,optim,BS)
    val_loss=val(val_loader,model,BS)

    # The format string needs a placeholder for every value it is given.
    print("Epoch [{}], Train Loss: {}, Valid Loss: {}".format(epoch+1,train_loss,val_loss))

    # Keep only the weights with the lowest validation loss seen so far.
    if best_valid_loss > val_loss:
      best_valid_loss = val_loss
      save_checkpoint('model.pt', model, best_valid_loss)

# Restore the best checkpoint so later evaluation does not silently use the
# last-epoch weights. Skipped if no checkpoint was ever written.
if best_valid_loss != float("Inf"):
    load_checkpoint('model.pt', model)

# Trailing semicolon suppresses the very long model repr in the cell output.
model.eval();

In [None]:
# Read the test split and encode its labels in one chained expression (neg -> 0, pos -> 1).
test_data = pd.read_csv("IMDB/test.csv").assign(
    sentiment=lambda frame: frame["sentiment"].map({'neg': 0, 'pos': 1})
)
# Plain lists of review texts and labels.
X_test = [*test_data["text"]]
y_test = [*test_data["sentiment"]]

In [None]:
# Tokenize the test texts with the same settings as the other splits, then wrap them with their labels.
X_test_tokenized = tokenizer(
    X_test,
    max_length=512,
    truncation=True,
    padding=True,
)
test_dataset = Dataset(X_test_tokenized, labels=y_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

def evaluate(model, loader):
    """Print a classification report and plot a confusion matrix for ``model``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose second output holds the per-class scores.
        loader: Iterable of batches; each batch is a dict with
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        None. The report is printed and the heatmap is drawn on a new figure.

    Note:
        Relies on the module-level ``device`` being defined before the call.
    """
    # Local import: pyplot is used below but no visible cell imports it.
    import matplotlib.pyplot as plt

    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for batch in tqdm(loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            output = outputs[1]
            # Predicted class = index of the highest score per example.
            y_pred.extend(torch.argmax(output, 1).tolist())
            y_true.extend(labels.tolist())

    # Single source of truth for the class order used by the report, the
    # confusion matrix and the tick labels (1 = pos, 0 = neg).
    class_order = [1, 0]
    class_names = ['POS', 'NEG']

    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=class_order, digits=4))

    cm = confusion_matrix(y_true, y_pred, labels=class_order)
    fig, ax = plt.subplots()
    sns.heatmap(cm, annot=True, ax=ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    # Tick labels must follow class_order: row/column 0 is class 1 (POS).
    ax.xaxis.set_ticklabels(class_names)
    ax.yaxis.set_ticklabels(class_names)

In [None]:
BS=16
# Evaluation gains nothing from shuffling; a fixed order keeps runs deterministic.
test_loader= DataLoader(test_dataset, batch_size=BS, shuffle=False)
evaluate(model, test_loader)