In [None]:
import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = [os.path.join(root, name) for name in files]
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:
# pytorch_pretained_bert already available in kaggle conda env.
# !pip install pytorch-nlp

**Note:** uncomment the install line in the cell above if you are running this notebook locally and need the pytorch-nlp library. On Kaggle it is pre-installed.

### Importing the necessary libraries...

In [None]:
import sys
import numpy as np
import random as rn
import pandas as pd
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
# from torchnlp.datasets import imdb_dataset      # --> We are using our own uploaded dataset.
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline

### Initializing seed values to stabilize the outcomes.

In [None]:
# One shared seed for every random number generator the notebook relies on.
SEED = 321

rn.seed(SEED)                 # Python's built-in `random`
np.random.seed(SEED)          # NumPy global generator
torch.manual_seed(SEED)       # PyTorch generator
torch.cuda.manual_seed(SEED)  # PyTorch CUDA generator

## Prepare the data

In [None]:
path = '../input/imdb-50k-movie-reviews-test-your-bert/'

# Read both splits from the dataset directory.
train_data, test_data = (pd.read_csv(f'{path}{split}.csv') for split in ('train', 'test'))

In [None]:
# Work on a small sample of each split (first 2000 / 500 rows) to avoid memory overflow,
# and turn every row into a plain dict keyed by column name.
train_data = train_data.iloc[:2000].to_dict(orient='records')
test_data = test_data.iloc[:500].to_dict(orient='records')
type(train_data)

### Mapping sentences with their Labels...

In [None]:
# Split each list of records into two parallel tuples: review texts and sentiment labels.
train_texts, train_labels = zip(*((row['text'], row['sentiment']) for row in train_data))
test_texts, test_labels = zip(*((row['text'], row['sentiment']) for row in test_data))

len(train_texts), len(train_labels), len(test_texts), len(test_labels)

In [None]:
# Collect every whitespace-delimited token that occurs in the training reviews.
# set.update() grows the vocabulary in place; the previous `vocab = vocab.union(a)`
# copied the whole (growing) set once per review.
vocab = set()

for text in train_texts:
    vocab.update(text.split())

In [None]:
# Extend the vocabulary in place with the tokens of the test reviews
# (set.update() avoids copying the whole set for every review).
for text in test_texts:
    vocab.update(text.split())

In [None]:
word_to_ix = {word: i for i, word in enumerate(vocab)}

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import en_core_web_md

In [None]:
# Load pre-trained GloVe model
nlp = en_core_web_md.load()
# nlp = spacy.load('en_core_web_md')

In [None]:
# Map every vocabulary entry to the vector spaCy assigns to it.
word_embeddings = {token: nlp(token).vector for token in vocab}

In [None]:
type(word_embeddings)

In [None]:
df = pd.DataFrame.from_dict(word_embeddings)

In [None]:
df.to_csv('/kaggle/working/embedding.csv', index=False)

In [None]:
# For each training review, look up the vector of every whitespace-separated word.
train_embeddings = [
    [word_embeddings[token] for token in review.split()]
    for review in train_texts
]

In [None]:
# For each test review, look up the vector of every whitespace-separated word.
test_embeddings = [
    [word_embeddings[token] for token in review.split()]
    for review in test_texts
]

In [None]:
# Pad / truncate every training review to exactly 512 word vectors.
# The nested lists are ragged (reviews differ in length), so they are passed to
# pad_sequences directly: wrapping them in np.array() first raises a ValueError on
# recent NumPy releases and only built a slow object array on older ones.
# float32 halves the memory of the padded block; torch.Tensor(...) converts the data
# to float32 later anyway.
train_embeddings = pad_sequences(train_embeddings, maxlen=512, truncating="post", padding="post", dtype="float32")

In [None]:
# Pad / truncate every test review to exactly 512 word vectors.
# The nested lists are ragged (reviews differ in length), so they are passed to
# pad_sequences directly: wrapping them in np.array() first raises a ValueError on
# recent NumPy releases and only built a slow object array on older ones.
# float32 halves the memory of the padded block; torch.Tensor(...) converts the data
# to float32 later anyway.
test_embeddings = pad_sequences(test_embeddings, maxlen=512, truncating="post", padding="post", dtype="float32")

#### visualizing one of the sentences from train set

In [None]:
train_texts[0]

## visualizing sentences lengths

In [None]:
# Character count of every training review.
sentences = [len(sent) for sent in train_texts]

plt.rcParams.update({'figure.figsize':(7,5), 'figure.dpi':100})
# The x positions are derived from the data, so the plot keeps working when the
# sample size changes (a hard-coded range(1, 2001) fails for any other size).
plt.bar(range(1, len(sentences) + 1), sentences, color = ['red'])
plt.gca().set(title='No. of characters in each sentence', xlabel='Number of sentence', ylabel='Number of Characters in each sentence');

#### We can see that most of the sentences are around 700 - 1000 characters long, which is to be expected. However, a few sentences are shorter and a few are even as long as 6000 characters. So, this is a good, very varied review dataset.

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
tokenizer.tokenize('Hi my name is Atul')

### Sample of how BERT Tokenizer works and Embeddings prepared to be fed into BERT Model.

![BERT TOKENS](https://miro.medium.com/max/619/1*iJqlhZz-g6ZQJ53-rE9VvA.png)

## Preparing Token embeddings...

In [None]:
# Tokenize each review, keep at most 510 word pieces and wrap them in [CLS] ... [SEP].
train_tokens = [['[CLS]', *tokenizer.tokenize(text)[:510], '[SEP]'] for text in train_texts]
test_tokens = [['[CLS]', *tokenizer.tokenize(text)[:510], '[SEP]'] for text in test_texts]

len(train_tokens), len(test_tokens)

## Preparing Token Ids...


![token ids](https://jalammar.github.io/images/distilBERT/sst2-text-to-tokenized-ids-bert-example.png)

In [None]:
# Convert every token list to vocabulary ids, then pad / truncate each row to 512 ids.
train_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens],
    maxlen=512, truncating="post", padding="post", dtype="int",
)
test_tokens_ids = pad_sequences(
    [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens],
    maxlen=512, truncating="post", padding="post", dtype="int",
)

train_tokens_ids.shape, test_tokens_ids.shape

### Your kernel may appear to freeze at times; this is OK, so let it run. This is a heavy computing task, so it is a common thing to happen. I have also added monitoring code snippets to track your CPU/GPU usage, as well as garbage-collection steps to free up space.


It is quite common to see your CPU floating above 100% and/or GPU over 100% like these screens below:
![SNAP-1](https://i.ibb.co/3cFD5Hs/cut-1.png)
![SNAP-2](https://i.ibb.co/G5qFRxj/cut-2.png)

In [None]:
# Boolean targets: True where the review is labelled 'pos'.
train_y, test_y = (np.array(split_labels) == 'pos' for split_labels in (train_labels, test_labels))
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

### Now building attention masks: 1.0 for every real token and 0.0 for padding (token id 0), so that the model can ignore the padded positions.

In [None]:
# 1.0 where the token id is positive, 0.0 elsewhere, as nested lists of Python floats.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()

# Baseline

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [None]:
baseline_model = make_pipeline(CountVectorizer(ngram_range=(1,3)), LogisticRegression()).fit(train_texts, train_labels)

In [None]:
baseline_predicted = baseline_model.predict(test_texts)

In [None]:
print(classification_report(test_labels, baseline_predicted))

#### Our baseline model is working just fine and yielding a fair enough score. 

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
class TransformerModel(nn.Module):
    """Two stacked Transformer encoders followed by an MLP head that emits 2 class logits.

    Args:
        input_dim: Size of each input feature vector (the encoders' ``d_model``).
        hidden_dim1: Width of the first fully connected layer.
        hidden_dim2: Width of the second fully connected layer.
        num_layers: Number of layers in each of the two encoders.
        num_heads: Number of attention heads per encoder layer.
    """

    def __init__(self, input_dim, hidden_dim1, hidden_dim2, num_layers, num_heads):
        super(TransformerModel, self).__init__()

        # Define the positional encoding layer
        self.pos_enc = PositionalEncoding(input_dim)
        self.dropout = nn.Dropout(p=0.3)

        # Define the Transformer encoder
        encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads)
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers, norm=nn.LayerNorm(input_dim))

        temporal_encoder_layer = nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads)
        self.temporal_encoder = nn.TransformerEncoder(temporal_encoder_layer, num_layers=num_layers, norm=nn.LayerNorm(input_dim))

        # Not used by forward(); kept so the module's attributes stay unchanged.
        self.layer_norm = nn.LayerNorm(input_dim)
        # Define the fully connected layers
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, 2)

        # Move the whole module (every sub-layer and buffer) to the target device at once.
        self.to(device)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape [batch_size, seq_len, input_dim].

        Returns:
            Tensor of shape [batch_size, 2] holding unnormalized class scores (logits),
            as expected by nn.CrossEntropyLoss.
        """
        b = x.shape[0]

        # The encoders are built with the default sequence-first layout and
        # PositionalEncoding expects [seq_len, batch_size, embedding_dim] as well, so
        # switch layout here. Feeding batch-first data directly would make attention
        # run across the reviews of a batch instead of across the words of a review.
        x = x.transpose(0, 1)

        # Apply Transformer encoder to input
        x = self.encoder(x)

        # Apply positional encoding to input
        x = self.pos_enc(x)

        # Apply the second (temporal) Transformer encoder to the encoded features
        x = self.temporal_encoder(x)
        x = self.dropout(x)

        # Mean-pool over the sequence axis so each review yields one feature vector.
        # Without pooling, the head output would be flattened to seq_len * 2 scores per review.
        # NOTE(review): padded positions are included in the mean; no padding mask is available here.
        x = x.mean(dim=0)

        # nn.functional is used directly so this class does not rely on an `F` alias
        # that is only imported in a later cell.
        x = nn.functional.relu(self.fc1(x))
        x = nn.functional.relu(self.fc2(x))
        # No activation on the last layer: the loss expects raw logits.
        x = self.fc3(x)
        x = x.view(b, -1)
        return x

#         # Flatten the output from the Transformer
#         x = x.flatten(start_dim=1)
#         # Dimension is now input_dim x 124 after being flattened
#         x = self.layer_norm(x)

#         # Apply fully connected layers
#         x = self.fc1(x)
#         x = nn.functional.gelu(x)
#         x = self.fc2(x)
#         x = nn.functional.gelu(x)
#         x = self.fc3(x)
        
#         x = x.view(b, -1)

#         return x

# Define the positional encoding layer
class PositionalEncoding(nn.Module):
    """Adds fixed sinusoidal position information to a sequence-first tensor, then applies dropout.

    Args:
        d_model: Embedding size of the inputs; even and odd sizes are both supported.
        dropout: Dropout probability applied after the encoding has been added.
        max_len: Number of positions for which encodings are precomputed.
    """

    def __init__(self, d_model, dropout = 0.1, max_len = 5000):
        super().__init__()
        # Local import: keeps the class usable even if `math` is only imported in a later cell.
        import math

        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        # With an odd d_model there is one cosine column fewer than sine columns, so the
        # frequency vector is trimmed to fit (for even d_model the slice is a no-op).
        pe[:, 0, 1::2] = torch.cos(position * div_term[: d_model // 2])
        # Registered as a buffer: stored with the module but not trained.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]

        Returns:
            Tensor of the same shape as ``x``: ``x`` plus the encodings of the first
            ``seq_len`` positions, passed through dropout.
        """
        # Follow the input's device instead of depending on a global `device` variable.
        x = x + self.pe[:x.size(0)].to(x.device)
        return self.dropout(x)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math

# class PolicyNetwork(nn.Module):
#     def __init__(self, input_size, hidden_size):
#         super(PolicyNetwork, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size).to(device)
#         self.fc2 = nn.Linear(hidden_size, 16).to(device)
#         self.output = nn.Linear(16, 1).to(device)

#     def forward(self, state):
#         x = F.relu(self.fc1(state))
#         x = F.relu(self.fc2(x))
#         output = torch.sigmoid(self.output(x))
#         return output
    
class PolicyNetwork(nn.Module):
    """LSTM, two 1-D convolutions and a linear layer, giving one sigmoid-activated value per time step.

    Args:
        input_size: Feature size of each input time step.
        hidden_size: Hidden size of the LSTM.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, num_layers = 2):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True).to(device)
        self.conv1d_1 = nn.Conv1d(hidden_size, 64, kernel_size=3, stride=1, padding=1).to(device)
        self.conv1d_2 = nn.Conv1d(64, 32, kernel_size=3, stride=1, padding=1).to(device)
        self.fc = nn.Linear(32, 1).to(device)

    def forward(self, x):
        """Map a batch-first sequence tensor to one sigmoid output per time step."""
        sequence_features, _state = self.lstm(x)
        # Conv1d treats axis 1 as channels, so the last two axes are swapped before the
        # convolutions and swapped back afterwards.
        channels_first = sequence_features.transpose(1, 2)
        convolved = self.conv1d_2(self.conv1d_1(channels_first))
        per_step = convolved.transpose(1, 2)
        return torch.sigmoid(self.fc(per_step))


# class ClassificationNetwork(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size):
#         super(ClassificationNetwork, self).__init__()
#         self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True).to(device)
#         self.fc = nn.Linear(hidden_size, output_size).to(device)

#     def forward(self, x):
#         b, _, _ = x.shape
#         _, (hidden, _) = self.lstm(x)
#         output = self.fc(hidden.squeeze(0))
#         return output.view(b, -1)

class ClassificationNetwork(nn.Module):
    """One attention block: self-attention and a feed-forward net, each followed by
    dropout, a residual connection and layer normalization.

    Args:
        hidden_size: Feature size of the inputs and outputs.
        num_heads: Number of attention heads.
        ff_dim: Inner width of the feed-forward network.
        dropout: Dropout probability used after both sub-layers.
    """

    def __init__(self, hidden_size, num_heads, ff_dim, dropout):
        super().__init__()
        self.self_attention = nn.MultiheadAttention(hidden_size, num_heads).to(device)
        feed_forward_layers = [
            nn.Linear(hidden_size, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, hidden_size),
        ]
        self.feed_forward = nn.Sequential(*feed_forward_layers).to(device)
        self.layer_norm1 = nn.LayerNorm(hidden_size).to(device)
        self.layer_norm2 = nn.LayerNorm(hidden_size).to(device)
        self.dropout = nn.Dropout(dropout).to(device)

    def forward(self, x):
        """Return the block output; it has the same shape as ``x``."""
        # Self-attention sub-layer with residual connection
        attended, _weights = self.self_attention(x, x, x)
        x1 = self.layer_norm1(x + self.dropout(attended))

        # Feed-forward sub-layer with residual connection
        x2 = self.layer_norm2(x1 + self.dropout(self.feed_forward(x1)))
        return x2


# Example usage
num_heads = 4  # Number of attention heads
ff_dim = 512  # Dimension of the feed-forward network
dropout = 0.1  # Dropout rate

num_classes = 2  # Number of output classes (binary sentiment)
num_layers = 2  # Number of transformer layers
def calculate_delayed_reward(probabilities, labels, eps=1e-7):
    """Element-wise log-likelihood of binary labels under the given probabilities.

    Computes ``labels * log(p) + (1 - labels) * log(1 - p)``.

    Args:
        probabilities: Tensor of probabilities in [0, 1].
        labels: Tensor of 0/1 targets, broadcastable against ``probabilities``.
        eps: Probabilities are clamped to ``[eps, 1 - eps]`` before the logarithm.

    Returns:
        Tensor with the broadcast shape of the inputs; all values are finite.
    """
    # Without clamping, a saturated probability gives log(0) = -inf, and the
    # "inactive" term becomes 0 * -inf = NaN, which poisons the loss.
    probabilities = probabilities.clamp(min=eps, max=1 - eps)
    positive_reward = torch.log(probabilities) * labels
    negative_reward = torch.log(1 - probabilities) * (1 - labels)
    delayed_reward = positive_reward + negative_reward
    return delayed_reward


# Define the input sizes and hyperparameters
input_size = 300  # Size of each word vector fed to the policy network (spaCy vectors built earlier in the notebook, not BERT embeddings)
hidden_size = 256  # Size of hidden units in LSTM
output_size = 2  # Number of classes for sentence sentiment (binary classification)

# Instantiate the policy network and classification network
policy_net = PolicyNetwork(input_size, hidden_size)
# classification_net = ClassificationNetwork(input_size, hidden_size, output_size, num_layers, num_heads)
# classification_net = ClassificationNetwork(hidden_size, num_heads, ff_dim, dropout)

# Initialize the model
classification_net = TransformerModel(input_dim = 300, hidden_dim1 = 256, hidden_dim2 = 64, num_layers = 4, num_heads = 4)

In [None]:
X_train = torch.Tensor(train_embeddings).to(device)
y_train = torch.tensor(train_y, dtype=torch.long).to(device)

In [None]:
policy_loss_fn = nn.BCELoss()
policy_optimizer = torch.optim.RMSprop(policy_net.parameters(), lr=0.2)

classification_loss_fn = nn.CrossEntropyLoss()
classification_optimizer = torch.optim.RMSprop(classification_net.parameters(), lr=0.2)

In [None]:
batch_size = 100

In [None]:
from torch.autograd import Variable

In [None]:
num_epochs = 1
# Probabilities are kept inside [prob_eps, 1 - prob_eps] so that their logarithms stay finite.
prob_eps = 1e-7

for epoch in range(num_epochs):

    for i in range(0, X_train.shape[0], batch_size):
        # Get a batch of data
        x_batch = X_train[i:i+batch_size]
        labels = y_train[i:i+batch_size]

        # Zero the gradients for both networks
        policy_optimizer.zero_grad()
        classification_optimizer.zero_grad()

        # Forward pass through the policy network: one gate value per word
        probabilities = policy_net(x_batch)
        # Forward pass through the classification network on the gated word vectors
        selected_words = x_batch * probabilities
        prediction = classification_net(selected_words)

        # Compute the classification loss
        classification_loss = classification_loss_fn(prediction, labels)

        # Calculate the delayed reward.
        # There is one label per review, so it is reshaped to [batch, 1, 1] to broadcast
        # over that review's per-word probabilities ([batch, seq_len, 1]). Passing the flat
        # [batch] labels would broadcast to [batch, seq_len, batch] and pair every word
        # with the labels of all other reviews in the batch.
        word_labels = labels.view(-1, 1, 1).to(probabilities.dtype)
        delayed_reward = calculate_delayed_reward(
            probabilities.clamp(prob_eps, 1 - prob_eps), word_labels
        )

        # Compute the policy loss on the *attached* probabilities. Detaching them and
        # wrapping the loss in a fresh Variable cuts the graph, so backward() would
        # never reach the policy network's parameters.
        policy_loss = torch.sum(-delayed_reward)

        # A single backward pass over the shared graph: the classifier parameters only
        # receive classification-loss gradients, the policy parameters receive both.
        (classification_loss + policy_loss).backward()
        classification_optimizer.step()
        policy_optimizer.step()


    # Print the loss for monitoring
    print(f"Epoch {epoch + 1}: Policy Loss: {policy_loss.item()}, "
          f"Classification Loss: {classification_loss.item()}")

# After training, you can use the networks for inference.

# After training, you can use the networks for inference.

In [None]:
X_test = torch.Tensor(test_embeddings).to(device)
y_test = torch.tensor(test_y, dtype=torch.long).cpu()

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
# Switch both networks to inference mode so that dropout is disabled while predicting.
policy_net.eval()
classification_net.eval()

with torch.no_grad():
    probabilities = policy_net(X_test)
    # Forward pass through the classification network
    selected_words = X_test * probabilities
    prediction = classification_net(selected_words).cpu()
    # Predicted class = index of the highest score
    prediction = prediction.argmax(dim=1)

    # Accuracy on the test sample (ground truth first, predictions second)
    acc = accuracy_score(y_test.numpy(), prediction.numpy())
print(acc)

# BERT Model


### Bidirectional Encoder Representations from Transformers. Each word here has a meaning to it and we will encounter that one by one in this article. For now, the key takeaway from this line is – **BERT is based on the Transformer architecture**.

In [None]:
class BertBinaryClassifier(nn.Module):
    """Pre-trained BERT encoder with a dropout -> linear -> sigmoid head.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
    """

    def __init__(self, dropout=0.1):
        super().__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return one sigmoid-activated score per input sequence.

        Args:
            tokens: Token id tensor passed to BERT.
            masks: Optional attention mask passed to BERT.
        """
        # BERT returns a pair; only its second element (the pooled output) is used.
        pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)[1]
        return self.sigmoid(self.linear(self.dropout(pooled_output)))

In [None]:
# ensuring that the model runs on GPU, not on CPU

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
bert_clf = BertBinaryClassifier()
# Move the model to the selected device; a hard-coded .cuda() fails on machines
# without a GPU even though `device` already falls back to the CPU.
bert_clf = bert_clf.to(device)

In [None]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

In [None]:
y = bert_clf(x)
y.cpu().detach().numpy()        # kinda Garbage Collector to free up used and cache space

In [None]:
# Cross- checking CUDA GPU Memory to ensure GPU memory is not overflowing.
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()     # Clearing Cache space for fresh Model run
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

# Fine Tune BERT

In [None]:
# Setting hyper-parameters

BATCH_SIZE = 4
EPOCHS = 10

In [None]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

In [None]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

In [None]:
# NOTE(review): bert_clf.sigmoid is an nn.Sigmoid, which has no learnable parameters, so
# this list is presumably empty and the parameter group built from it selects nothing.
# The optimizer created in the next cell is built from bert_clf.parameters() and uses
# neither variable — confirm whether this cell can be removed.
param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [None]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [None]:
torch.cuda.empty_cache()   # Clearing Cache space for a fresh Model run

In [None]:
# The loss object holds no state, so it is created once instead of on every step.
loss_func = nn.BCELoss()
# Number of batches per epoch, used for the progress display.
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        # GPU memory monitoring only makes sense when CUDA is in use.
        if torch.cuda.is_available():
            print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        # The classifier ends in a sigmoid, so these are probabilities (hence BCELoss).
        logits = bert_clf(token_ids, masks)

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()

        bert_clf.zero_grad()
        batch_loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()

        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        # 1-based step against the integer batch count, so the display ends at N/N
        # (previously it showed a 0-based step against a float such as 500.0).
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, num_batches, train_loss / (step_num + 1)))

In [None]:
# Inference over the test set: collect the sigmoid outputs and their 0.5-thresholded labels.
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        # No loss is computed here: it was never used during evaluation.
        logits = bert_clf(token_ids, masks)
        numpy_logits = logits.cpu().detach().numpy()

        # Column 0 holds the single output of the classifier head.
        bert_predicted.extend(numpy_logits[:, 0] > 0.5)
        all_logits.extend(numpy_logits[:, 0])


In [None]:
np.mean(bert_predicted)

In [None]:
print(classification_report(test_y, bert_predicted))