<a href="https://colab.research.google.com/github/BGDNick/not_nlp/blob/main/Research.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive when running on Colab. The google.colab package only
# exists on Colab runtimes, so the import is guarded to keep the notebook
# runnable (Restart & Run All) on other kernels.
try:
    from google.colab import drive
except ImportError:
    drive = None
    print('google.colab is not available; skipping the Google Drive mount.')
else:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
import gc
import json
import string
from copy import deepcopy

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from IPython.display import clear_output
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
# NOTE: tqdm_notebook is deprecated upstream in favour of tqdm.notebook.tqdm.
from tqdm import tqdm_notebook as tqdm
from transformers import BertModel, BertTokenizer
from transformers import BertForMaskedLM, BertForSequenceClassification

%matplotlib inline

# Utility functions

In [None]:
# Load the Snowball stemmer and NLTK stopwords
nltk.download('stopwords')
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# Set copy of the stopword list so membership tests are O(1).
_STOP_WORD_SET = frozenset(stop_words)


def preprocess(text):
    """Lowercase ``text``, drop stopwords and punctuation, and stem the rest.

    Stopwords are matched both with and without their punctuation, so the
    contractions in the NLTK list (e.g. "don't") are filtered too. Stripping
    punctuation first would turn them into "dont", which is not in the list.

    Args:
        text: Raw document as a single string.

    Returns:
        The surviving word stems joined by single spaces.
    """
    stems = []
    for token in text.lower().split():
        # Equivalent to deleting punctuation from the whole string and then
        # splitting: punctuation-only tokens collapse to "" and are skipped.
        bare = token.translate(_PUNCTUATION_TABLE)
        if not bare or bare in _STOP_WORD_SET:
            continue
        # Also catch stopwords that contain an apostrophe, ignoring any
        # punctuation attached to the edges of the token (e.g. "don't,").
        if token.strip(string.punctuation) in _STOP_WORD_SET:
            continue
        stems.append(stemmer.stem(bare))
    return ' '.join(stems)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def freeze(model):
    """Turn off gradient tracking for all parameters and switch to eval mode."""
    for param in model.parameters():
        param.requires_grad = False
    model.train(False)
    
def unfreeze(model):
    """Turn on gradient tracking for all parameters and switch to train mode."""
    for param in model.parameters():
        param.requires_grad = True
    model.train()

In [None]:
def masked_lm_loss(input_ids, outputs, pad_token_id=0):
    """Mean token-level cross-entropy between ``outputs.logits`` and ``input_ids``.

    Positions whose target id equals ``pad_token_id`` do not contribute to the
    loss.

    Args:
        input_ids: Integer tensor of target token ids; it is flattened to match
            the flattened logits.
        outputs: Object exposing a ``logits`` tensor whose last dimension
            indexes the classes (vocabulary entries).
        pad_token_id: Token id treated as padding. Defaults to 0.

    Returns:
        Scalar tensor with the loss averaged over non-padding positions, or 0
        when every position is padding.
    """
    logits = outputs.logits
    # reshape (rather than view) also accepts non-contiguous tensors
    targets = input_ids.reshape(-1)
    token_loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets, reduction='none')

    # 1.0 for real tokens, 0.0 for padding
    keep = (targets != pad_token_id).float()

    # Clamp the token count so an all-padding batch yields 0 instead of 0/0 = NaN.
    return (token_loss * keep).sum() / keep.sum().clamp(min=1)


# Load the dataset and models, and preprocess the dataset

In [None]:
nltk.download('movie_reviews')
# Tokenizers are not tied to a device, so no device argument is passed;
# only the models are moved to the GPU.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Generator: needs the language-modelling head, because masked_lm_loss reads
# `outputs.logits`, which the bare BertModel encoder does not produce.
bert_gen = BertForMaskedLM.from_pretrained('bert-base-uncased').to('cuda')
# Discriminator: `num_labels=2` only has an effect on a model with a
# sequence-classification head.
bert_discr = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2).to('cuda')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model ch

In [None]:
# Collect every review in the corpus together with its category,
# then derive a binary label (1 for 'pos', 0 otherwise).
reviews = [
    (movie_reviews.raw(file_id), polarity)
    for polarity in movie_reviews.categories()
    for file_id in movie_reviews.fileids(polarity)
]
df = pd.DataFrame(reviews, columns=['review', 'sentiment'])
df['label'] = df['sentiment'].eq('pos').astype('int64')

In [None]:
tokenized = []  # not used in this cell; kept in case other cells still reference it
labels = df['label'].tolist()
reviews = [preprocess(review) for review in tqdm(df['review'].tolist())]
train_reviews, test_reviews, train_labels, test_labels = train_test_split(
    reviews, labels, test_size=0.3, random_state=42)


def encode_reviews(texts, max_length=512):
    """Tokenize a list of strings into fixed-length PyTorch tensors.

    Args:
        texts: List of review strings to encode.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer output, including ``input_ids`` and ``attention_mask``.
    """
    return tokenizer(texts,
                     add_special_tokens=True,
                     max_length=max_length,
                     truncation=True,
                     padding='max_length',
                     return_attention_mask=True,
                     return_tensors='pt')


# Both splits are encoded with identical settings.
train_reviews = encode_reviews(train_reviews)
test_reviews = encode_reviews(test_reviews)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  reviews = [preprocess(review) for review in tqdm(reviews)]


  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
# The tokenizer was called with return_tensors='pt', so input_ids and
# attention_mask are already tensors. Wrapping them in torch.tensor() again
# only copies them and raises a UserWarning. The labels are plain Python
# lists and still need converting.
train_dataset = torch.utils.data.TensorDataset(train_reviews['input_ids'],
                                               train_reviews['attention_mask'],
                                               torch.tensor(train_labels))

test_dataset = torch.utils.data.TensorDataset(test_reviews['input_ids'],
                                              test_reviews['attention_mask'],
                                              torch.tensor(test_labels))

  train_dataset = torch.utils.data.TensorDataset(torch.tensor(train_reviews['input_ids']),
  torch.tensor(train_reviews['attention_mask']),
  test_dataset = torch.utils.data.TensorDataset(torch.tensor(test_reviews['input_ids']),
  torch.tensor(test_reviews['attention_mask']),


In [None]:
# Wrap both datasets in loaders with the same settings: batches of 64, original order.
train_loader, test_loader = (
    torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False)
    for dataset in (train_dataset, test_dataset)
)

In [None]:
def train(T, f, T_opt, f_opt, train_loader, test_loader, loss_func, epochs, T_ITERS=2):
    """Alternate between a phase that updates ``T`` and a phase with ``T`` frozen.

    For each of the ``epochs`` outer steps, ``T`` is unfrozen and ``f`` frozen,
    and ``T_ITERS`` passes over ``train_loader`` step ``T_opt``. Then ``T`` is
    frozen, ``f`` is unfrozen and ``train_loader`` is iterated again.

    NOTE(review): in the lines shown here the second phase only evaluates ``T``
    under ``torch.no_grad()``; ``f_opt`` and ``test_loader`` are not used yet.
    The function looks unfinished or continues elsewhere - confirm.
    """
    for step in tqdm(range(epochs)):
        # Phase 1: update T while f is held fixed.
        unfreeze(T); freeze(f)
        for t_iter in range(T_ITERS):
            for batch in train_loader:
                # Every tensor of the batch is moved to the (hard-coded) 'cuda' device.
                inputs, attention_mask, labels = tuple(t.to('cuda') for t in batch)
                T_opt.zero_grad()
                x = T(inputs, attention_mask=attention_mask, labels=labels)
                # NOTE(review): T's output object is passed to f as its first input - confirm f accepts it.
                x_labels = f(x, attention_mask=attention_mask, labels=labels)
                # NOTE(review): torch.nn.BCELoss(...) constructs a loss module (the tensors become
                # constructor arguments) rather than evaluating a loss; presumably
                # F.binary_cross_entropy(...) or torch.nn.BCELoss()(...) was intended - confirm.
                T_loss = loss_func(x, labels) - torch.nn.BCELoss(x_labels, labels)
                T_loss.backward(); T_opt.step()
            # Drop references from the last batch, then run the garbage collector and empty the CUDA cache.
            del T_loss, x, x_labels, batch, inputs, attention_mask, labels; gc.collect(); torch.cuda.empty_cache()
    
        # Phase 2: T is frozen and f is unfrozen.
        freeze(T); unfreeze(f)
        for batch in train_loader:
            inputs, attention_mask, labels = tuple(t.to('cuda') for t in batch)
            # T is only evaluated here; no gradients are recorded for it.
            with torch.no_grad():
                T_x = T(inputs, attention_mask=attention_mask, labels=labels)
            



