<a href="https://colab.research.google.com/github/ElFosco/NLP_argument_creation/blob/main/perplexity_score.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Installation

In [None]:
!pip install pytorch_pretrained_bert

## Imports

In [None]:
import math
import torch
import numpy as np
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader
from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel

from tqdm.auto import tqdm

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
import re

import pandas as pd

## Model

In [None]:
# Load the pre-trained 'openai-gpt' language model with its LM head (weights).
# The module-level name `model` is used by the training and inference cells.
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
# Load the matching pre-trained tokenizer (vocabulary).
# The module-level name `tokenizer` is used by clean_text() and score().
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

## Dataset

In [None]:
# Colab-only: mount Google Drive so the dataset and checkpoints under
# /content/drive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

# Path of the CSV that is read into `df` by the next cell.
dataset = "/content/drive/MyDrive/NLP/arg_quality_rank_30k.csv"

In [None]:
# Read the dataset CSV; later cells use its 'argument', 'topic' and 'set' columns.
df = pd.read_csv(dataset)
# Preview the first rows as the cell output.
df.head()

In [None]:
# Shared WordNet lemmatizer instance, used word-by-word inside clean_text().
lemmatizer = WordNetLemmatizer()
def clean_text(text,topic):
  """Normalise an argument, append its topic and return the token ids.

  Steps, in order: replace the characters " - \\ ` and newlines with spaces,
  drop leading dots, collapse ". ." into ".", spell out "&" as "and",
  squeeze repeated spaces, strip the ends, lemmatize every
  whitespace-separated word with the module-level ``lemmatizer``, replace a
  single trailing ``.``, ``?`` or ``!`` (if any) by `` [SEP]``, append the
  lower-cased topic and finally convert the string to ids with the
  module-level ``tokenizer``.

  :param text: raw argument string.
  :param topic: topic string; it is lower-cased and appended after ``[SEP]``.
  :return: list of token ids as produced by ``tokenizer.convert_tokens_to_ids``.
  """
  # NOTE(review): '[SEP]' is inserted as plain text; whether the tokenizer
  # treats it as a single special token is not visible here - confirm.
  text = re.sub(r'["\\`-]', ' ', text)     # replace the chars " \ ` - with a space
  text = re.sub('\n', ' ', text)
  text = re.sub(r'^\.+', '', text)         # delete dots at the beginning of the sentence
  text = re.sub(r'\. \.', '.', text)       # collapse ". ." into "."
  text = re.sub('&', ' and ', text)        # replace & with and
  text = re.sub(' +', ' ', text)           # delete additional whitespace
  text = text.strip()
  text = " ".join(lemmatizer.lemmatize(word) for word in text.split())
  # Drop one trailing sentence terminator, then mark the end of the argument.
  # The class is [.?!] on purpose: a "|" inside a character class is a literal
  # pipe, not an alternation, so it must not be listed here.
  text = re.sub(r'[.?!]$', '', text) + ' [SEP]'
  # Append the topic after the separator.
  text = text + " " + topic.lower()
  tokenize_input = tokenizer.tokenize(text)
  tokenized = tokenizer.convert_tokens_to_ids(tokenize_input)
  return tokenized

In [None]:
# Overwrite the 'argument' text of the row labelled 2 with a hand-written string.
# NOTE(review): presumably a manual correction of a malformed entry in the CSV - confirm.
df.loc[2, "argument"] = "zero tolerance policy in schools should not be adopted as circumstances are often not black and white, being more nuanced. no one should be written off due to a mistake of judgement."
# Clean and tokenize every row (argument + topic) into a list of token ids,
# stored in the new 'tokenized' column.
df['tokenized'] = df.apply(lambda row : clean_text(row['argument'],row['topic']), axis = 1)

In [None]:
# Boolean masks selecting each split according to the 'set' column.
is_training_data = df['set'] == 'train'
is_validation_data = df['set'] == 'dev'
is_test_data = df['set'] == 'test'

# The 'train' and 'dev' rows are merged into a single training series
# (train rows first, then dev rows). pd.concat replaces Series.append,
# which was removed in pandas 2.0.
x_train = pd.concat([
    df['tokenized'][is_training_data],
    df['tokenized'][is_validation_data],
])
x_test = df['tokenized'][is_test_data]

## Padding and truncation

In [None]:
def convert_text(df, tokenizer, is_training=False, max_seq_length=None):
    """
    Pads and truncates already-tokenized sequences to a common length.

    :param df: an iterable of token-id lists (e.g. a pandas Series of lists)
    :param tokenizer: unused by this function; kept so existing callers
    keep working
    :param is_training: whether the sequences are from the training split.
    If True, max_seq_length is computed here as the 0.95 quantile of the
    sequence lengths and any value passed in is ignored.
    :param max_seq_length: the max token sequence previously computed with
    training sequences; required when is_training is False.

    :return
        text_ids: an int64 array of shape (number of sequences,
        max_seq_length); shorter sequences are right-padded with id 0 and
        longer ones are cut on the right
        max_seq_length: only returned when is_training is True

    :raises ValueError: if is_training is True and no sequences are given
    """

    # Materialise once so one-shot iterables (generators) work too.
    sequences = list(df)

    if is_training:
        if not sequences:
            raise ValueError("cannot derive max_seq_length from an empty training set")
        max_seq_length = int(np.quantile([len(seq) for seq in sequences], 0.95))
    else:
        assert max_seq_length is not None, "max_seq_length is required when is_training is False"

    # Pad on the right with 0, then cut to the target length. A negative
    # repeat count yields an empty pad, so long sequences are only truncated.
    padded = [
        (list(seq) + [0] * (max_seq_length - len(seq)))[:max_seq_length]
        for seq in sequences
    ]
    # Explicit dtype and shape: the result no longer depends on the platform's
    # default integer and stays two-dimensional even when there are no rows.
    text_ids = np.array(padded, dtype=np.int64).reshape(len(sequences), max_seq_length)

    if is_training:
        return text_ids, max_seq_length
    else:
        return text_ids

In [None]:
# Derive the target length from the training sequences, then reuse that same
# length for the test sequences so both arrays have the same width.
x_train_tokens, max_seq_length = convert_text(x_train, tokenizer, True)
x_test_tokens = convert_text(x_test, tokenizer, False, max_seq_length)
# Report the chosen length and the resulting array shapes.
print("Max token sequence: {}".format(max_seq_length))
print('X train shape: ', x_train_tokens.shape)
print('X test shape: ', x_test_tokens.shape)

## Train

In [None]:
def train_one_epoch(epoch_index, report_every=10):
    """Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``model``, ``optimizer``, ``device``,
    ``train_dataloader`` and ``progress_bar``.

    :param epoch_index: index of the current epoch. Not used by the
        computation; kept so existing callers keep working.
    :param report_every: number of batches in each reporting window.
    :return: mean per-batch loss of the last complete reporting window, or
        0.0 when fewer than ``report_every`` batches were processed.
    """
    running_loss = 0.
    last_loss = 0.

    for batch_number, inputs in enumerate(train_dataloader, start=1):
        inputs = inputs.to(device)

        # Zero the gradients for every batch
        optimizer.zero_grad()

        # Language-model objective: the batch is used as its own labels.
        # NOTE(review): the batch is passed unchanged as lm_labels, so any
        # padding ids it contains contribute to the loss - confirm whether
        # they should be masked out.
        loss = model(inputs, lm_labels=inputs)

        # Back-propagate and update the weights
        loss.backward()
        optimizer.step()
        progress_bar.update(1)

        # Gather data and report
        running_loss += loss.item()
        if batch_number % report_every == 0:
            # Average over the window that was actually accumulated.
            last_loss = running_loss / report_every
            print('  batch {} loss: {}'.format(batch_number, last_loss))
            running_loss = 0.

    return last_loss

In [None]:
# Run configuration. These assignments live in the same cell as the loop, so
# re-running the cell restarts the bookkeeping (best_loss, epoch_number).
best_loss = float('inf')
epoch_number = 0
EPOCHS = 10
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Move the model once, before the optimizer is built over its parameters.
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=6e-5)

train_dataloader = DataLoader(x_train_tokens, batch_size=64, shuffle=True)
# NOTE(review): this loader is built from x_test_tokens and drives the
# checkpoint selection below - confirm that selecting on it is intended.
test_dataloader = DataLoader(x_test_tokens, batch_size=128, shuffle=False)
progress_bar = tqdm(range(EPOCHS*len(train_dataloader)))

for epoch in range(EPOCHS):
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train_one_epoch(epoch_number)

    # Evaluation pass: no gradients, mean of the per-batch losses.
    valid_loss = 0.0
    valid_steps = 0
    model.train(False)
    with torch.no_grad():
        for inputs in test_dataloader:
            valid_steps += 1
            inputs = inputs.to(device)
            # .item() keeps the running sum a plain Python float instead of
            # a (possibly CUDA) tensor.
            valid_loss += model(inputs, lm_labels=inputs).item()
    valid_loss /= valid_steps
    print("Validation loss: ", valid_loss)

    # Keep the weights of the best epoch seen so far.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/model_perplexity_best.th")

    epoch_number += 1
print("Best valid loss: ", best_loss)

## Inference

In [None]:
def score(sentence):
    """Return exp(LM loss) of ``sentence`` under the module-level ``model``.

    The sentence is tokenized with the module-level ``tokenizer`` and fed to
    the model as both input and labels; the returned value is ``math.exp``
    of the loss the model reports.

    :param sentence: text to score.
    :return: ``math.exp(loss)`` as a Python float.
    :raises ValueError: if the sentence yields no tokens.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sentence))
    if not token_ids:
        raise ValueError("cannot score a sentence that produces no tokens")

    # Build the input on whatever device the model currently lives on.
    model_device = next(model.parameters()).device
    tensor_input = torch.tensor([token_ids], dtype=torch.long, device=model_device)

    # Scoring only: no autograd graph is needed.
    with torch.no_grad():
        loss = model(tensor_input, lm_labels=tensor_input)
    return math.exp(loss.item())

# Put the model in evaluation mode and move it to the CPU before scoring.
model.eval()
model.to(torch.device('cpu'))

# Hand-written probe sentences that are passed to score() below.
sentences = [
    'there a is cat end desk end',
    'there is a plane on the desk',
    'there is a book in the desk',
    "there is and made opinion to buy and foolish itself counted celebrate identity and priest's burned",
]

scores = [score(sentence) for sentence in sentences]
print(scores)  # Used to tune alpha
# NOTE(review): `a` is presumably the "alpha" mentioned above - confirm.
a = 70000
# Cube of each score's relative distance below `a`.
print([((a - value) / a) ** 3 for value in scores])

## Save

In [None]:
# Persist the current model weights (state dict only) to Google Drive.
torch.save(model.state_dict(), "/content/drive/MyDrive/NLP/model_perplexity_0.th")

## Load

In [None]:
# Restore model weights from a saved state dict.
# map_location="cpu" lets a checkpoint written from a CUDA model be read on a
# CPU-only runtime; load_state_dict then copies the values into the existing
# parameters on whatever device they currently live on.
# NOTE(review): other cells in this notebook write model_perplexity_best.th and
# model_perplexity_0.th, not this file name - confirm which checkpoint is meant.
model.load_state_dict(
    torch.load("/content/drive/MyDrive/NLP/model_perplexity.th", map_location="cpu")
)