# BERT

Sources:
https://towardsdatascience.com/checking-grammar-with-bert-and-ulmfit-1f59c718fe75
https://gist.github.com/sayakmisra/dbb06efec99e760cf9e5d197175ad9c5#file-grammar-checker-bert-ipynb

In [None]:
import tensorflow as tf

# Ask TensorFlow which GPU device (if any) it can see.
device_name = tf.test.gpu_device_name()

# Anything other than the first GPU's device string is treated as "no GPU"
# and aborts the cell.
# NOTE(review): the PyTorch cells further down fall back to the CPU, so this
# hard failure may be stricter than the rest of the notebook needs - confirm.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')

print('Found GPU at: {}'.format(device_name))

In [None]:
import torch

# Select the PyTorch device: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    # No CUDA device: report it and run on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU, then report how many GPUs there are and
    # the name of GPU 0.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

Package from: https://github.com/huggingface/transformers

In [None]:
!pip install transformers

# Loading Test Data

In [None]:
import pandas as pd

In [None]:
# Load the dataset into a pandas dataframe.
# The path is relative to the notebook's working directory; later cells read
# the 'full_text' column of this frame.
df = pd.read_csv("./Dataset/test.csv")

In [None]:
# Print how many rows (essays) the test set contains, with a thousands
# separator and a trailing blank line.
print('Number of test essays: {:,}\n'.format(len(df)))

# Leave the frame as the cell's last expression so the notebook renders it.
df

In [None]:
# Flatten paragraph breaks into single spaces so each essay is one run of
# text. Both the CRLF and the LF form of a blank-line break are handled.
# The two replacements must be chained: joining them with `and` evaluates to
# the second operand only, which silently leaves CRLF breaks in place.
text = df['full_text'].apply(
    lambda x: x.replace('\r\n\r\n', ' ').replace('\n\n', ' ')
)

# Import Grammar Checker BERT Model

In [None]:
!pip install transformers

from transformers import BertForSequenceClassification

# Directory that the tokenizer and the classification model are loaded from
# in the following cell (relative to the notebook's working directory).
output_dir = "./model_save/"

# Echo the path so the cell output shows which directory is in use.
print(output_dir)

In [None]:
from transformers import BertTokenizer
import torch
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
# Both the tokenizer and the sequence-classification model are read from the
# same saved directory (output_dir).
tokenizer = BertTokenizer.from_pretrained(output_dir)
model_loaded = BertForSequenceClassification.from_pretrained(output_dir)

# Create Grammar Dataframe for Test Set

In [None]:
import nltk
# Download NLTK data using the 'all' identifier.
# NOTE(review): the visible cells only call tokenize.sent_tokenize, which
# presumably needs just the sentence-tokenizer data - confirm nothing else in
# the project relies on other resources before narrowing this download.
nltk.download('all')

In [None]:
from nltk import tokenize

In [None]:
# Per-essay accumulators filled by the scoring loop: one ratio and one
# sentence count are appended for every essay.
grammar_correct_ratio, sentence_number = [], []

In [None]:
# Score every essay: split it into sentences, classify each sentence with the
# loaded BERT model, and record (a) the sentence count and (b) the sum of the
# predicted class indices divided by the sentence count.
# NOTE(review): the ratio is only "share of grammatical sentences" if the
# model is binary and class 1 means "grammatically correct" - confirm against
# the training notebook.

# The device and the model placement are the same for every essay, so set
# them up once instead of on every iteration.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_loaded = model_loaded.to(device)

essay_count = len(text)

# Iterate over the values directly: `text[i]` is a label lookup on a Series
# and would break if the index were not 0..n-1.
for essay_number, essay in enumerate(text, start=1):
    print('Running on essay ', essay_number, '/', essay_count)

    sentences = tokenize.sent_tokenize(essay)

    # An essay with no detectable sentences cannot be scored and would divide
    # by zero below. Record it as 0 sentences with an undefined ratio so both
    # lists stay aligned with `text`.
    if not sentences:
        sentence_number.append(0)
        grammar_correct_ratio.append(float('nan'))
        continue

    # Encode all sentences of the essay as one batch.
    encoded_dict = tokenizer.batch_encode_plus(
        sentences,                     # Sentences to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=64,                 # Pad & truncate all sentences.
        padding='max_length',          # Replaces deprecated pad_to_max_length.
        truncation=True,               # Make the truncation explicit.
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # return_tensors='pt' already yields tensors, so they only need to be
    # moved to the model's device. The attention mask differentiates padding
    # from non-padding tokens.
    input_id = encoded_dict['input_ids'].to(device)
    attention_mask = encoded_dict['attention_mask'].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        # Forward pass, calculate logit predictions.
        outputs = model_loaded(
            input_id,
            token_type_ids=None,
            attention_mask=attention_mask,
        )

    # Predicted class per sentence = index of the largest logit.
    logits = outputs[0]
    index = logits.argmax(dim=1)

    sentence_number.append(len(sentences))
    grammar_correct_ratio.append(torch.sum(index).item() / len(sentences))

In [None]:
# Inspect the per-essay sentence counts collected by the scoring loop.
sentence_number

In [None]:
# Inspect the per-essay ratios collected by the scoring loop
# (sum of predicted class indices divided by the essay's sentence count).
grammar_correct_ratio

In [None]:
# Assemble the per-essay results into one frame: the cleaned essay text, its
# sentence count, and its grammar ratio.
df_grammar = pd.DataFrame(
    {
        'cleaned_full_text': text,
        'sentence_number': sentence_number,
        'ratio_grammar_correct_sentences': grammar_correct_ratio,
    }
)

In [None]:
# Display the assembled results frame.
df_grammar

In [None]:
# Save the results frame to a CSV file in the notebook's working directory.
# No `index` argument is passed, so pandas' default index handling applies.
df_grammar.to_csv('./grammar_test.csv')