In [344]:
#!pip install thefuzz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting thefuzz
  Downloading thefuzz-0.19.0-py2.py3-none-any.whl (17 kB)
Installing collected packages: thefuzz
Successfully installed thefuzz-0.19.0


In [345]:
from gramformer import Gramformer
import torch
import spacy
import random
from thefuzz import fuzz


## Original GrammarModel

In [48]:
class GrammarModel(Gramformer):
    """
    Grammar correction model.
    """
    def __init__(self, models=1, use_gpu=False, seed=1212):
        self.gm = super().__init__(models=1, use_gpu=False)


    def grammar_correction(self,last_user_input):
        """
        Generate a corrected sentence and a message to the user with the correction.
        """
        corrected_sentence = self.correct(last_user_input, max_candidates=1)
        corrected_sentence = list(corrected_sentence)[0]
        message_styles = [
            "I think you meant: ",
            "Oh, you mean: ",
            "This would be better said like this: "
        ]

        if corrected_sentence != last_user_input:
            correction_message = f"{random.choice(message_styles)} \"{corrected_sentence}\" "
        else:
             correction_message = None

        return corrected_sentence, correction_message


    def add_correction_to_chat_history(self, chat_history):
        """
        Append the message to the user to the chat history.
        Return the corrected sentence.
        """
        last_user_input = chat_history[-1].get('text')
        corrected_sentence, correction_message = self.grammar_correction(last_user_input)
        error_types = self.get_edits(last_user_input, corrected_sentence)

        if correction_message:
            chat_history.append(
                {
                    'sender': 'bot',
                    'text': correction_message,
                    'correction': True
                }
            )
        return chat_history       


    def _get_edits(self, input_sentence, corrected_sentence):
        """
        Return the type of the error.
        """
        input_sentence = self.annotator.parse(input_sentence)
        corrected_sentence = self.annotator.parse(corrected_sentence)
        alignment = self.annotator.align(input_sentence, corrected_sentence)
        edits = self.annotator.merge(alignment)

        if len(edits) == 0:  
            return []

        edit_annotations = []
        for e in edits:
            e = self.annotator.classify(e)
            edit_annotations.append(e.type[2:])
                
        if len(edit_annotations) > 0:
            return edit_annotations
        else:    
            return []

In [4]:
gm = GrammarModel(models = 1, use_gpu=False)

[Gramformer] Grammar error correct/highlight model loaded..


### Showcase punctuation and casing errors (GrammarModel)

In [5]:
# Example sentences
ex1= "Hi bot!" # should not be corrected to "Bot"
ex2= "Hello" # should not be corrected to "Hello." or "Hello!"

In [32]:
error_types_ex1 = []
error_types_ex2 = []
corrected_sentences_ex1 = []
corrected_sentences_ex2 = []

In [43]:
correct_sentence, message = gm.grammar_correction(ex2)
corrected_sentences_ex2.append(correct_sentence)

error_types = gm.get_edits(ex2, correct_sentence)
error_types_ex2.extend(error_types)

print(correct_sentence, error_types)

Hello. ['OTHER']


In [45]:
# summary of tracked errors for sentence ex1
print(f"Original: {ex1}\nError Types: {error_types_ex1}\nSuggested Corrections: {corrected_sentences_ex1}")

Original: Hi bot!
Error Types: ['OTHER', 'ORTH', 'OTHER', 'NOUN', 'OTHER', 'OTHER', 'ORTH']
Suggested Corrections: ['Hi Bi-Bo!', 'Hi Bot!', 'Hi!', 'Hello BOTH!', 'Hi Bot!', 'Hi bot!']


In [46]:
# summary of tracked errors for sentence ex2
print(f"Original: {ex2}\nError Types: {error_types_ex2}\nSuggested Corrections: {corrected_sentences_ex2}")

Original: Hello
Error Types: ['OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER', 'OTHER']
Suggested Corrections: ['Hello!', 'Hello.', 'Hello.', 'Hello.', 'Hello.', 'Hello.']


## Improvements to grammar correction

In [413]:
class GrammarModel2(Gramformer):
    """
    Grammar correction model.
    """
    def __init__(self, models=1, use_gpu=False, seed=1212):
        self.gm = super().__init__(models=1, use_gpu=False)
        self.ignore_errors = ['OTHER', 'ORTH']


    def grammar_correction(self,last_user_input):
        """
        Generate a corrected sentence and a message to the user with the correction.
        """
        corrected_sentence = self.correct(last_user_input, max_candidates=1)
        corrected_sentence = list(corrected_sentence)[0]
        message_styles = [
            "I think you meant: ",
            "Oh, you mean: ",
            "This would be better said like this: "
        ]

        if corrected_sentence != last_user_input:
            correction_message = f"{random.choice(message_styles)} \"{corrected_sentence}\" "
        else:
             correction_message = None

        return corrected_sentence, correction_message


    def add_correction_to_chat_history(self, chat_history):
        """
        Append the message to the user to the chat history.
        Return the corrected sentence.
        """
        last_user_input = chat_history[-1].get('text')
        corrected_sentence, correction_message = self.grammar_correction(last_user_input)
        error_types = self.get_edits(last_user_input, corrected_sentence)
        relevant_error = any(error not in self.ignore_errors for error in error_types) # check if there is an error in the sentence which is not in the ignore list 
        token_sort_ratio = fuzz.token_sort_ratio(corrected_sentence, last_user_input) # calculate token similarity (ignoring punctuation and casing)
        print(f"correction_message: {correction_message}\nErrors detected: {error_types}\nPresence of a relevant error: {relevant_error}\nSimilarity Score: {token_sort_ratio}") # for debugging only
        
        if correction_message and relevant_error and token_sort_ratio != 100:
            chat_history.append(
                {
                    'sender': 'bot',
                    'text': correction_message,
                    'correction': True
                }
            )
        
        return chat_history       


    def _get_edits(self, input_sentence, corrected_sentence):
        """
        Return the type of the error.
        """
        input_sentence = self.annotator.parse(input_sentence)
        corrected_sentence = self.annotator.parse(corrected_sentence)
        alignment = self.annotator.align(input_sentence, corrected_sentence)
        edits = self.annotator.merge(alignment)

        if len(edits) == 0:  
            return []

        edit_annotations = []
        for e in edits:
            e = self.annotator.classify(e)
            edit_annotations.append(e.type[2:])
                
        if len(edit_annotations) > 0:
            return edit_annotations
        else:    
            return []

In [414]:
gm2 = GrammarModel2(models=1, use_gpu=False)

[Gramformer] Grammar error correct/highlight model loaded..


### 1. Remove correction when no relevant errors are detected(other than those in self.ignore_errors)

In [450]:
chat_history_ex1 = [{'sender': 'User', 'text': 'where are you goin?'}]

In [451]:
chat_history = gm2.add_correction_to_chat_history(chat_history_ex1)

correction_message: I think you meant:  "where are you going?" 
Errors detected: ['PUNCT']
Presence of a relevant error: True
Similarity Score: 97


In [452]:
chat_history

[{'sender': 'User', 'text': 'where are you goin?'},
 {'sender': 'bot',
  'text': 'I think you meant:  "where are you going?" ',
  'correction': True}]

### 2. Remove correction when input and correction are very similar

In [369]:
# Example of similar sentences which should not be corrected
ex1 = "Hi bot!"
ex2 = "Hi bot"
ex3 = "Hi bot."
ex4 = "Hi Bot"
ex5 = "Hi Bot."
ex6 = "Hi Bot!"
ex7 = "Hi Bot Bot!" # should lead to lower token sort ratio, but same token set ratio compared to ex1

In [364]:
# Measure the similarity between 0 and 100 to define a threshold.

def measure_similarity(sentence1, sentence2):
    simple_ratio = fuzz.ratio(sentence1, sentence2)
    print(f"simple ratio similarity score: {simple_ratio}")

    partial_ratio = fuzz.partial_ratio(sentence1, sentence2) # Return the ratio of the most similar substring.
    print(f"partial ratio similarity score: {partial_ratio}")

    ratio = fuzz.ratio(sentence1, sentence2)
    print(f"ratio similarity score: {ratio}")

    token_sort_ratio = fuzz.token_sort_ratio(sentence1, sentence2) # Return a measure of the sequences' similarity sorting the token before comparing. This is what we want to set as threshold.
    print(f"token sort ratio similarity score: {token_sort_ratio}")

    token_set_ratio = fuzz.token_set_ratio(sentence1, sentence2) # Measures similarity between unique tokens.
    print(f"token set ratio similarity score: {token_set_ratio}")

In [378]:
measure_similarity(ex1, ex7)

simple ratio similarity score: 67
partial ratio similarity score: 71
ratio similarity score: 67
token sort ratio similarity score: 75
token set ratio similarity score: 100


In [426]:
## Correction accuracy check