# BERT

Sources:
https://towardsdatascience.com/checking-grammar-with-bert-and-ulmfit-1f59c718fe75
https://gist.github.com/sayakmisra/dbb06efec99e760cf9e5d197175ad9c5#file-grammar-checker-bert-ipynb

In [None]:
import tensorflow as tf

# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

In [None]:
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

Package from: https://github.com/huggingface/transformers

In [None]:
!pip install transformers

# Loading Data

In [None]:
import pandas as pd

In [None]:
# Load the dataset into a pandas dataframe.
df = pd.read_csv("./Dataset/train.csv")

In [None]:
text = df['full_text'].apply(lambda x: x.replace('\r\n\r\n', ' ') and x.replace('\n\n', ' '))

In [None]:
text.shape

In [None]:
# Get the list grammar scores
labels = df.grammar.values

In [None]:
labels.shape

# Import Grammar Checker BERT Model

In [None]:
!pip install transformers

from transformers import BertForSequenceClassification

output_dir = "./model_save/"

print(output_dir)

In [None]:
from transformers import BertTokenizer
import torch
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained(output_dir)
model_loaded = BertForSequenceClassification.from_pretrained(output_dir)

### Try on first essay

In [None]:
import nltk
nltk.download('all')

In [None]:
from nltk import tokenize

In [None]:
essay1_sentences = [sentence for sentence in tokenize.sent_tokenize(text[0])]

In [None]:
len(essay1_sentences)

In [None]:
encoded_dict = tokenizer.batch_encode_plus(
                        essay1_sentences,                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 64,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
    
# Add the encoded sentence to the list.    
input_id = encoded_dict['input_ids']
    
# And its attention mask (simply differentiates padding from non-padding).
attention_mask = encoded_dict['attention_mask']
input_id = torch.LongTensor(input_id)
attention_mask = torch.LongTensor(attention_mask)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_loaded = model_loaded.to(device)
input_id = input_id.to(device)
attention_mask = attention_mask.to(device)

In [None]:
with torch.no_grad():
  # Forward pass, calculate logit predictions
  outputs = model_loaded(input_id, token_type_ids=None, attention_mask=attention_mask)

outputs

In [None]:
logits = outputs[0]
index = logits.argmax(dim=1)
for id in index:
  if id == 1:
    print("Gramatically correct")
  else:
    print("Gramatically in-correct")

In [None]:
type(index)

In [None]:
print('The number of grammatically correct sentences is ', torch.sum(index).item(), ' out of ', len(essay1_sentences), ' sentences')

In [None]:
print('Correct ratio is ', torch.sum(index).item()/len(essay1_sentences))

In [None]:
print('Grammar score is ', labels[0])

## Make a list of ratios corresponding grammatically correct sentences for essays in trainset

In [None]:
grammar_correct_ratio = []

In [None]:
for i in range(len(text)):
  if i%100 == 0:
    print('Running on essay ', i, '/',len(text))
  sentences = [sentence for sentence in tokenize.sent_tokenize(text[i])]
  encoded_dict = tokenizer.batch_encode_plus(
                          sentences,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
  # Add the encoded sentence to the list.    
  input_id = encoded_dict['input_ids']
      
  # And its attention mask (simply differentiates padding from non-padding).
  attention_mask = encoded_dict['attention_mask']
  input_id = torch.LongTensor(input_id)
  attention_mask = torch.LongTensor(attention_mask)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model_loaded = model_loaded.to(device)
  input_id = input_id.to(device)
  attention_mask = attention_mask.to(device)

  with torch.no_grad():
    # Forward pass, calculate logit predictions
    outputs = model_loaded(input_id, token_type_ids=None, attention_mask=attention_mask)

  logits = outputs[0]
  index = logits.argmax(dim=1)

  grammar_correct_ratio.append(torch.sum(index).item()/len(sentences))

In [None]:
# check the list of ratio
grammar_correct_ratio

In [None]:
df_grammar = pd.DataFrame({'cleaned_full_text':text, 'grammar_score': labels, 'ratio_grammar_correct_sentences': grammar_correct_ratio })

In [None]:
df_grammar

In [None]:
# Save data to csv
df_grammar.to_csv('./grammar_train.csv')

In [None]:
sentence_number = []

In [None]:
for i in range(len(text)):
  if i%100 == 0:
    print('Running on essay ', i+1, '/',len(text))
  sentence_number.append(len(tokenize.sent_tokenize(text[i])))

In [None]:
len(sentence_number)

In [None]:
df_train_sentence_number = pd.DataFrame({'sentence_number':sentence_number})

## Combine Train csv and save

In [None]:
train_comb = df_grammar

In [None]:
train_comb['sentence_number'] = df_train_sentence_number['sentence_number']

In [None]:
train_comb

In [None]:
train_comb.to_csv('./grammar_train_comb.csv')