## Importing Libraries & Datasets

In [23]:

#For Implementing Gramformer Solution
from gramformer import Gramformer

#for implementing Bert Solution
from happytransformer import HappyTextToText, TTSettings

#For Implementing LanguageTool Solution
import language_tool_python

#For GingerIt Solution
from gingerit.gingerit import GingerIt

#For Symspellpy
import pkg_resources
from symspellpy import SymSpell

#For Web Application Development
import gradio as gr

import pandas as pd
import sys 


In [22]:
df = pd.read_csv('./grammatical error detection/NLP Assignment/test_data.csv')
df.head(20)
test_df = df.head(40)
pd.set_option('display.max_colwidth', None)

## Gramformer Model

In [2]:
gf = Gramformer(models=1, use_gpu=False) #1=corrector, 2=detector

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

[Gramformer] Grammar error correct/highlight model loaded..


In [3]:
gf.correct('hello my dear childs')

{'hello my dear child.'}

    Function to build web application using gradio

In [None]:
def correct(sentence):
    res = gf.correct(sentence) 
    return res
# app_inputs = gr.inputs.Textbox(lines=3, placeholder="Enter a grammatically incorrect sentence here...")

# interface = gr.Interface(fn=correct, 
#                         inputs=app_inputs,
#                          outputs='text', 
#                         title='Hi there, I\'m Gramformer')

#interface.launch()

    Gramformer highlighter

In [None]:
# gh = Gramformer(models=3, use_gpu=False) #1=corrector, 2=detector

In [None]:
# gh.highlight(orig='to tha store',cor='to the store')

### Trying Gramformer on Datasets

In [None]:
def gramformer_corrector(text):
    res = gf.correct(text) 
    return res

    Calling Gramformer model for correction

In [None]:
test_df['corrected_sentence'] = test_df['input'].apply(lambda text: gramformer_corrector(text))
test_df.head(40)

## Bert + Huggingface Model 

In [12]:
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
args = TTSettings(num_beams=5, min_length=1)
result = happy_tt.generate_text("spell and grammar correction: But there 's no oubt that Dudley , im he runs , would bring name familiarity to the rac given that he had two stints playing for the Trail Blazers , from 1993-97 and from 2001-03 .", args=args)
print(result.text)

But there's no doubt that Dudley, as he runs, would bring name familiarity to the rac given that he had two stints playing for the Trail Blazers , from 1993-97 and from 2001


In [13]:
def huggingface_corrector(text):
    result = happy_tt.generate_text('spell and grammar correction: ' + text, args=TTSettings(num_beams=1, min_length=1, max_length=100))
    return result.text
# app_hugginface_inputs = gr.inputs.Textbox(lines=3, placeholder="Enter a grammatically incorrect sentence here...")

# interface2 = gr.Interface(fn=huggingface_corrector, 
#                         inputs=app_hugginface_inputs,
#                         outputs='text', 
#                         title='Hi there, I\'m Huggingface')

#interface2.launch()

### Trying Bert + Hugging Face on Datasets

In [None]:
def huggingface_corrector(text):
    result = happy_tt.generate_text('grammar: ' + text, args=TTSettings(num_beams=1, min_length=1, max_length=100))
    return result.text

    Calling Bert model for correction

In [None]:
test_df['corrected_sentence'] = test_df['input'].apply(lambda text: huggingface_corrector(text))
test_df.head(40)

## Language Tool Model

In [None]:
my_tool = language_tool_python.LanguageTool('en-US')  
my_text = """LanguageTool provides utility to check grammar and spelling errors. We just have to paste the text here and click the 'Check Text' button. Click the colored phrases for for information on potential errors. or we can use this text too see an some of the issues that LanguageTool can dedect. Whot do someone thinks of grammar checkers? Please not that they are not perfect. Style problems get a blue marker: It is 7 P.M. in the evening. The weather was nice on Monday, 22 November 2021"""   

def english_text_corrector(tool, text):
    
    matches = tool.check(text)

    #empty lists
    Mistakes = [] 
    Corrections = []  
    StartPositions = []  
    EndPositions = []  

    for rules in matches:
        if len(rules.replacements) > 0:  
            StartPositions.append(rules.offset)  
            EndPositions.append(rules.errorLength + rules.offset)  
            Mistakes.append(my_text[rules.offset : rules.errorLength + rules.offset])  
            Corrections.append(rules.replacements[0]) 

    print("Mistakes made")
    print (Mistakes)
    print ("\nRecommended Corrections")
    print(Corrections)
    print ("\nMistake Starting character number")
    print(StartPositions)
    print ("\nMistake EndPoint character number")
    print(EndPositions)

    mistakes_number = len (Mistakes)

    print( "\nNumber of mistakes made " + str(mistakes_number))
    #return mistakes_number

#english_text_corrector(my_tool, my_text)

### Language Tool Model on Datasets

In [None]:
tool = language_tool_python.LanguageTool('en-US')
def language_tool_corrector(text:str):
    correction = tool.correct(text)
    return correction

    Calling language tool model for correction

In [None]:
test_df['corrected_sentence'] = test_df['input'].apply(lambda text: language_tool_corrector(text))
test_df.head(40)

## GingerIt Model

In [17]:
text = 'The smelt of fliwers bring back memories.'

parser = GingerIt()
parser.parse(text)['result']

'The smell of flowers brings back memories.'

### Trying GingerIt Model on Datasets

In [None]:
def ginger_corrector(text: str):
    
    parser = GingerIt()
    correction = parser.parse(text)['result']
    return correction

        Calling GingerIt model for correction

In [None]:
test_df['corrected_sentence'] = test_df['input'].apply(lambda text: ginger_corrector(text))
test_df.head(40)

## Symspellpy Model

In [30]:
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
)
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
input_term = (
    "whereis th elove hehad dated forImuch of thepast who "
    "couqdn'tread in sixtgrade and ins pired him"
)
# max edit distance per lookup (per single word, not per whole input string)
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)

for suggestion in suggestions:
    print(suggestion)

where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 9, 0


In [33]:
def symspelly_corrector(text):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt"
    )
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
    )

    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)

    for suggestion in suggestions:
        return suggestion

In [34]:
test_df['corrected_sentence'] = test_df['input'].apply(lambda text: symspelly_corrector(text))
test_df.head(40)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,input,corrected_sentence
0,I believe they will master Japanese soon because they were selected as scholarship recipients .,"i believe they will master japanese soon because they were selected as scholarship recipients, 4, 0"
1,I am looking for it .,"i am looking for it, 3, 0"
2,"Apple is a round fruit with smooth and colorful skin , red , green and sometimes gold .","apple is a round fruit with smooth and colourful skin red green and sometimes gold, 8, 0"
3,Let It Will Be Push .,"let it will be push, 7, 0"
4,I rode on this ship from Sendai .,"i rode on this ship from sendai, 4, 0"
5,I thought Korean is very interesting language when I was sutdying Korean .,"i thought korean is very interesting language when i was studying korean, 7, 0"
6,Nobody is perfect .,"nobody is perfect, 3, 3"
7,Can you lend me some money ?,"can you lend me some money, 3, 0"
8,Does anyone correct my previous article ?XD,"does anyone correct my previous article cd, 4, 0"
9,"Similarly , I want to thank and for all their help and the nice time spent together , and wish to continue being together .","similarly i want to thank and for all their help and the nice time spent together and wish to continue being together, 8, 0"


## Combining GingerIt with Bert

In [1]:
test_df['ginger_corrected_sentence'] = test_df['input'].apply(lambda text: ginger_corrector(text))
test_df['combined_with_bert_corrected_sentence'] = test_df['ginger_corrected_sentence'].apply(lambda text: huggingface_corrector(text))
test_df.head(40)

NameError: name 'test_df' is not defined

#### on dataset2

In [15]:
path = "spell error detection/"

real_files = ['test.1blm', 'test.bea20k', 'test.bea322', 'test.bea4660', 'test.bea4k', 'test.bea60k', 'test.jfleg', 'train.bea40k']

In [20]:
import os
import time

analysis_data = {'error': [], 'real_correct': [], 'predicted_correct': [], 'diff_r': [], 'diff_p': []}
for f in real_files:
    with open(path + f) as file:
        error_file = path + f + '.noise'
        if not os.path.isfile(error_file):
            error_file = path + f + '.noise.prob'
        errors_lines =  f = open(error_file,'r').readlines()
        real_lines = file.readlines()
        for i in range(20):
            time.sleep(0.25)
            analysis_data['error'].append(errors_lines[i].strip())
            analysis_data['real_correct'].append(real_lines[i].strip())
            predicted = huggingface_corrector(parser.parse(errors_lines[i].strip())['result'])
            analysis_data['predicted_correct'].append(predicted)
            analysis_data['diff_r'].append(set(real_lines[i].split()).difference(set(errors_lines[i].split())))
            analysis_data['diff_p'].append(set(predicted.split()).difference(set(errors_lines[i].split())))

In [22]:
data = pd.DataFrame(analysis_data)
data.to_csv(path + 'error_analysis2.csv', index=False)