<a href="https://colab.research.google.com/github/BGDNick/not_nlp/blob/main/Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m78.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import gc

from copy import deepcopy
import json

from tqdm import tqdm_notebook as tqdm
from IPython.display import clear_output

from transformers import BertModel, BertTokenizer
from nltk.corpus import movie_reviews
import nltk

import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics import accuracy_score, f1_score

# Some utils functions

In [38]:
def loss_plot(losses, names=None):
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    for ind, loss in enumerate(losses):
        if names is None:
            ax.plot(loss, label=f'loss {ind}')
            ax.legend()
        else:
            ax.plot(loss, label=names[ind])
            ax.legend()
    ax.set_xlabel('Iteration')
    ax.set_ylabel('BCELoss')
    return fig, ax

In [5]:
# Load the Snowball stemmer and NLTK stopwords
nltk.download('stopwords')
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

def preprocess(text):
    # Lowercase the text
    text = text.lower()
    # Remove punctuation
    text = "".join([char for char in text if char not in string.punctuation])
    # Remove stopwords and stem each each word
    text = [stemmer.stem(word) for word in text.split() if word not in stop_words]
    # Join the words back into a string
    text = ' '.join(text)

    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
def freeze(model):
    for p in model.parameters():
        p.requires_grad_(False)
    model.eval()    
    
def unfreeze(model):
    for p in model.parameters():
        p.requires_grad_(True)
    model.train(True)

In [7]:
def masked_lm_loss(input_ids, outputs):
    # compute the masked language modeling loss
    loss = F.cross_entropy(outputs.logits.view(-1, outputs.logits.size(-1)), input_ids.view(-1), reduction='none')
    
    # create a mask to ignore padding tokens
    mask = (input_ids != 0).float()
    
    # ignore loss for padding tokens
    masked_loss = loss * mask.view(-1)
    
    # compute the mean loss
    mean_loss = masked_loss.sum() / mask.sum()
    
    return mean_loss


# Load of dataset & models, preprocess dataset

In [8]:
nltk.download('movie_reviews')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', device='cuda')
bert_gen = BertModel.from_pretrained('bert-base-uncased').to('cuda')
bert_discr = BertModel.from_pretrained('bert-base-uncased', num_labels=2).to('cuda')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.tr

In [9]:
# Load the positive and negative reviews
reviews = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]
df = pd.DataFrame(reviews, columns=['review', 'sentiment'])
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'pos' else 0)

In [10]:
tokenized = []
labels = df['label'].tolist()
reviews = df['review'].tolist()
reviews = [preprocess(review) for review in tqdm(reviews)]
train_reviews, test_reviews, train_labels, test_labels = train_test_split(reviews, labels, test_size=0.3, random_state=42)
train_reviews = tokenizer(train_reviews, 
                    add_special_tokens=True,
                    max_length=512,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt')

test_reviews = tokenizer(test_reviews, 
                    add_special_tokens=True,
                    max_length=512,
                    truncation=True,
                    padding='max_length',
                    return_attention_mask=True,
                    return_tensors='pt')
# for review in tqdm(reviews):
#     review = preprocess(review)

#     tokenized_review = tokenizer.encode(
#         review,
#         add_special_tokens=True,
#         max_length=512,
#         truncation=True,
#         padding='max_length',
#         return_attention_mask=True,
#         return_tensors='pt'
#     )
#     tokenized.append(tokenized_review)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  reviews = [preprocess(review) for review in tqdm(reviews)]


  0%|          | 0/2000 [00:00<?, ?it/s]

In [11]:
train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_reviews['input_ids']),
                                               torch.tensor(train_reviews['attention_mask']),
                                               torch.tensor(train_labels))

test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_reviews['input_ids']),
                                              torch.tensor(test_reviews['attention_mask']),
                                              torch.tensor(test_labels))

  train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_reviews['input_ids']),
  torch.tensor(train_reviews['attention_mask']),
  test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_reviews['input_ids']),
  torch.tensor(test_reviews['attention_mask']),


In [12]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=64, shuffle=False)

In [14]:
def train(T, f, T_opt, f_opt, train_loader, test_loader, loss_func, epochs, T_ITERS=2, plot_interval=1):
    # Init of losses 
    f_loss = []
    T_loss = []
    f_on_t_accuracy = []
    f_clear_accuracy = []

    f_loss_val = []
    T_loss_val = []
    f_on_t_accuracy_val = []
    f_clear_accuracy_val = []

    # Start of training
    for step in tqdm(range(epochs)):
        # Training of T
        unfreeze(T); freeze(f)
        global_T_loss = []
        # T would have T_ITERS iterations for every 1 iteration of f
        for t_iter in range(T_ITERS):
            for batch in train_loader:
                # Zero grad
                T_opt.zero_grad()
                # Converting the input data to cuda
                inputs, attention_mask, labels = tuple(t.to('cuda') for t in batch)
                # Getting adversarial results from T
                T_x = T(inputs, attention_mask=attention_mask, labels=labels)
                # Getting labels for adversarial result of T
                x_labels = f(T_x, attention_mask=attention_mask, labels=labels)
                # Loss for T, computed via loss_function and BCELoss of f correctly predicting the labels
                T_loss = loss_func(T_x, labels) - torch.nn.BCELoss(x_labels, labels)
                global_T_loss.append(T_loss.detach().to('cpu').item())
                T_loss.backward(); T_opt.step()
            del T_loss, T_x, x_labels, batch, inputs, attention_mask, labels; gc.collect(); torch.cuda.empty_cache()
        T_loss.append(torch.mean(global_T_loss).item())

        # Trainig of f 
        freeze(T); unfreeze(f)
        global_f_loss = []
        for batch in train_loader:
            # Zero grad of f
            f_opt.zero_grad()
            # Converting input data to cuda
            inputs, attention_mask, labels = tuple(t.to('cuda') for t in batch)
            # Getting adversarial examples
            with torch.no_grad():
                T_x = T(inputs, attention_mask=attention_mask, labels=labels)
            x = torch.concat([inputs, T_x])            
            # Combining of adversarial and real data
            labels = torch.concat([labels, labels])
            attention_mask = torch.concat([attention_mask, attention_mask])
            outputs = f(x, attention_mask=attention_mask, labels=labels)
            # Backpropagation of loss
            f_loss = torch.nn.BCELoss(outputs, labels)
            global_f_loss.append(f_loss.detach().to('cpu').item())
            f_loss.backward(); f_opt.step()
        f_loss.append(torch.mean(global_f_loss).item())
    
    # Plotting the results
    if step % plot_interval == 0:
        clear_output(wait=True)
        fig, ax = loss_plot()
