In [None]:
# Mount Google Drive to access the dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# import the required libraries
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import csv
from sklearn.model_selection import train_test_split
import pandas as pd

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from nltk.translate import chrf_score,bleu_score

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# We’ll need a unique index per word to use as the inputs and targets of the networks
#  later. To keep track of all this we will use a helper class called Lang which has
#   word → index (word2index) and index → word (index2word) dictionaries, as well as
#    a count of each word word2count which will be used to replace rare words later.
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [None]:
# Turn a Unicode string to plain ASCII, thanks to
# The files are all in Unicode, to simplify we will turn Unicode characters to ASCII,
#  make everything lowercase, and trim most punctuation.


# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [None]:
# To read the data file we will split the file into lines, and then split lines into pairs.
#  The files are all Incorrect → Correct

def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Specify the path to your CSV file
    csv_file_path = '/content/drive/MyDrive/EE782_Grammer_Checker_Project/small_dataframe.csv'

    # Read the CSV file and split into lines
    with open(csv_file_path, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        lines = [row for row in reader]

    # Normalize the strings
    pairs = [[normalizeString(row[0]), normalizeString(row[1])] for row in lines]
    input_lang = Lang(lang1)
    output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [None]:
#  maximum length is 20 words (that includes ending punctuation) and we’re filtering
#  to sentences that translate to the form “I am” or “He is” etc. (accounting for
# apostrophes replaced earlier).
MAX_LENGTH = 20

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH #and \
        #p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [None]:
# The full process for preparing the data is:
# Normalize text, filter by length and content
# Make word lists from sentences in pairs

def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs

input_lang, output_lang, pairs = prepareData('incorrect', 'coorect', True)
print(random.choice(pairs))

Reading lines...
Read 10001 sentence pairs
Trimmed to 5306 sentence pairs
Counting words...
Counted words:
incorrect 13551
coorect 12466
['rd m grissom singled to lefted', 'rd m grissom singled to left']


In [None]:
# The Encoder
# The encoder of a seq2seq network is a RNN that outputs some value for
#  every word from the input sentence. For every input word the encoder
#  outputs a vector and a hidden state, and uses the hidden state for
#   the next input word.

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [None]:
# The Decoder
# The decoder is another RNN that takes the encoder output vector(s)
#  and outputs a sequence of words to create the translation.

# class DecoderRNN(nn.Module):
#     def __init__(self, hidden_size, output_size):
#         super(DecoderRNN, self).__init__()
#         self.embedding = nn.Embedding(output_size, hidden_size)
#         self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
#         self.out = nn.Linear(hidden_size, output_size)

#     def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
#         batch_size = encoder_outputs.size(0)
#         decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
#         decoder_hidden = encoder_hidden
#         decoder_outputs = []

#         for i in range(MAX_LENGTH):
#             decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
#             decoder_outputs.append(decoder_output)

#             if target_tensor is not None:
#                 # Teacher forcing: Feed the target as the next input
#                 decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
#             else:
#                 # Without teacher forcing: use its own predictions as the next input
#                 _, topi = decoder_output.topk(1)
#                 decoder_input = topi.squeeze(-1).detach()  # detach from history as input

#         decoder_outputs = torch.cat(decoder_outputs, dim=1)
#         decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
#         return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

#     def forward_step(self, input, hidden):
#         output = self.embedding(input)
#         output = F.relu(output)
#         output, hidden = self.gru(output, hidden)
#         output = self.out(output)
#         return output, hidden

In [None]:
# Bahdanau attention, also known as additive attention, is a commonly used
#  attention mechanism in sequence-to-sequence models, particularly in
#  neural machine translation tasks.

class CustomAttention(nn.Module):
    def __init__(self, hidden_size):
        super(CustomAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

# Attention allows the decoder network to “focus” on a different part of the encoder’s
#  outputs for every step of the decoder’s own outputs.

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = CustomAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [None]:
# To train, for each pair we will need an input tensor
#  (indexes of the words in the input sentence) and target tensor
#   (indexes of the words in the target sentence). While creating
#   these vectors we will append the EOS token to both sequences.

def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

def get_dataloader(batch_size, input_lang, output_lang, pairs):
    n = len(pairs)
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [None]:
# To train we run the input sentence through the encoder, and keep track of every
# output and the latest hidden state. Then the decoder is given the <SOS> token as
# its first input, and the last hidden state of the encoder as its first hidden state.

def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data
        input_tensor = input_tensor.to(device)
        target_tensor = target_tensor.to(device)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [None]:
# This is a helper function to print time elapsed and estimated time
#  remaining given the current time and progress %.
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [None]:
# The whole training process is desinged in followign way:
# Start a timer
# Initialize optimizers and criterion
# Create set of training pairs
# Start empty losses array for plotting

def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [None]:
# lotting is done with matplotlib, using the array of
# loss values plot_losses saved while training.

import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [None]:
# Evaluation is mostly the same as training, but there are no targets so we simply
#  feed the decoder’s predictions back to itself for each step.

def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [None]:
# We can evaluate random sentences from the training set and print out the input,
#  target, and output to make some subjective quality judgements:
# we also find the chrf_score here

def evaluateRandomly(encoder, decoder, test_pairs, n=30):
    score = 0
    for i in range(n):
        pair = random.choice(test_pairs)
        src, trg = pair
        print('>', src)
        print('=', trg)
        output_words, _ = evaluate(encoder, decoder, src, input_lang, output_lang)
        prediction = ' '.join(output_words)
        score += chrf_score.sentence_chrf(prediction, trg)
        print('<', prediction)
        print('')
    return score

In [None]:
hidden_size = 128
batch_size = 32
input_lang, output_lang, pairs = prepareData('correct', 'incorrect', True)

# Split the data into training and testing sets (80% train, 20% test)
train_pairs, test_pairs = train_test_split(pairs, test_size=0.01, random_state=42)

input_lang, output_lang, train_dataloader = get_dataloader(batch_size, input_lang, output_lang, train_pairs)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

Reading lines...
Read 10001 sentence pairs
Trimmed to 5306 sentence pairs
Counting words...
Counted words:
correct 13551
incorrect 12466


In [None]:
df = pd.DataFrame(train_pairs)

In [None]:
df

Unnamed: 0,0,1
0,the photo s of juha flinkman miia laine and sa...,photo s by juha flinkman miia laine and sarita...
1,ashburnham close is located within celebrated ...,ashburnham close is located within the ceremon...
2,sorry to be less specific on here figured most...,sorry about being less specific on here figure...
3,s how to decorate and bathroom sets safe home ...,how to decorate bathroom sets safe home inspir...
4,osa scouting updated rating potential contact ...,osa scouting updated ratings potential contact...
...,...,...
5247,prediction about its upcoming tournament perfo...,predictions about their upcoming performance f...
5248,tuition december tuition will be send home wit...,tuition december tuition will be sent home wit...
5249,thanks for the reminder alex it s really good ...,thanks for the reminder alex it s really good ...
5250,health policy and systems research a methodolo...,health policy and systems research a methodolo...


In [None]:
# start trainig
train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

0m 38s (- 9m 31s) (5 6%) 4.1135
1m 13s (- 8m 36s) (10 12%) 2.9560
1m 49s (- 7m 53s) (15 18%) 2.0953
2m 24s (- 7m 14s) (20 25%) 1.4361
3m 0s (- 6m 36s) (25 31%) 0.9666
3m 35s (- 5m 59s) (30 37%) 0.6461
4m 10s (- 5m 21s) (35 43%) 0.4404
4m 44s (- 4m 44s) (40 50%) 0.3054
5m 20s (- 4m 9s) (45 56%) 0.2218
5m 56s (- 3m 33s) (50 62%) 0.1637
6m 30s (- 2m 57s) (55 68%) 0.1251
7m 5s (- 2m 21s) (60 75%) 0.1000
7m 40s (- 1m 46s) (65 81%) 0.0781
8m 17s (- 1m 11s) (70 87%) 0.0651
8m 52s (- 0m 35s) (75 93%) 0.0546
9m 27s (- 0m 0s) (80 100%) 0.0495


In [None]:
# Set dropout layers to eval mode
encoder.eval()
decoder.eval()
chrf_score = evaluateRandomly(encoder, decoder, test_pairs)

In [3]:
""" Example1: """
example_1 = "grammar: This sentences, has bads grammar and spelling!"

output_words, _ = evaluate(encoder, decoder, example_1, input_lang, output_lang)
prediction = ' '.join(output_words)
print(prediction)

grammar this sentences has bad grammar and spelling!


In [6]:
""" Example2: """

example_2 = "grammar: I am enjoys, writtings articles ons AI and I also enjoyed write articling on AI."

output_words, _ = evaluate(encoder, decoder, example_2, input_lang, output_lang)
prediction = ' '.join(output_words)
print(prediction)

grammar i am enjoy writtings articles one two AI and I also enjoy write articling on AI.


In [None]:
# chrf score which is similarity matrixs
print(chrf_score)

22.774357419700753
