In [1]:
import pandas as pd 
import pkg_resources
from symspellpy import SymSpell, Verbosity
import string
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
class SpellChecker():
    def __init__(self):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt"
        )
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
        )
        self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    def lookup(self, input_term, max_edit_distance=2):
        suggestions = self.sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=max_edit_distance,
                                            transfer_casing=True, include_unknown=True)
        return suggestions[0].term

    def lookup_compound(self, input_term, max_edit_distance=2, **kwargs):
        suggestions = self.sym_spell.lookup_compound((input_term), max_edit_distance=max_edit_distance,
                                                     transfer_casing=True, ignore_non_words=True, **kwargs)
        return suggestions[0].term if len(suggestions) > 0 else input_term

    def correct(self, text, **kwargs):
        result = ""
        start = 0
        changed = True
        for match in re.finditer(f"[{re.escape(string.punctuation)}]", text):
            end = match.start(0)
            spaces = re.search(r"^(\s+)", text[start: end])
            corrected_text = self.lookup_compound(text[start: end], **kwargs)
            corrected_text = spaces.group(0) + corrected_text if spaces is not None else corrected_text
            spaces = re.search(r"(\s+)$", text[start: end])
            corrected_text = corrected_text + spaces.group(0) if spaces is not None else corrected_text
            corrected_text += match.group(0)
            result = ''.join([result, corrected_text])
            start = match.end(0)
        spaces = re.search(r"^(\s+)", text[start:])
        corrected_text = self.lookup_compound(text[start:], **kwargs)
        corrected_text = spaces.group(0) + corrected_text if spaces is not None else corrected_text
        result = ''.join([result, corrected_text])
        if corrected_text.lower() == text.lower():
            changed = False
        return result

In [40]:
def correct_text(text, sym_spell):
    try:
        return sym_spell.correct(text)
    except:
        return text

('I am a student of the University of Colombo School of Computing', False)

In [3]:
# Read the data
df = pd.read_csv("asag_dataset.csv")
df.drop_duplicates(subset=['question', "model_answer", "student_answer"], inplace=True)

sym_spell = SpellChecker()
corrected_question = []
corrected_model_answer = []
corrected_student_answer = []

import multiprocessing as mp
from multiprocessing import Pool

def correct_text(text):
    try:
        return sym_spell.correct(text)
    except:
        return text
with Pool(mp.cpu_count()) as p:
    corrected_question = p.map(correct_text, df['question'])
    print("Question done")
    corrected_model_answer = p.map(correct_text, df['model_answer'])
    print("Model answer done")
    corrected_student_answer = p.map(correct_text, df['student_answer'])
    print("Student answer done")
    

df['question'] = corrected_question
df['student_answer'] = corrected_student_answer
df.to_csv('all_corrected.csv', index=False)

Question done
Model answer done
Student answer done


In [75]:
import numpy as np
all_corrected = pd.read_csv('all_corrected.csv')
questions = all_corrected["student_answer"]

In [76]:
questions

0          ('As ILLus are elevated in patients with CROwN...
1          ('Neuronal signals via the hepatic vagus nerve...
2          ('SPA significantly inhibited MCF-7/add cells ...
3          ('We identified a novel biologically plausible...
4          ('Breast cancer immune cell subpopulation prof...
                                 ...                        
1578004    ('Gestational age is classification is an unli...
1578005    ('Pornography has no place in the diagnosis of...
1578006    ('These results suggest little need to use pea...
1578007    ('The PRP group recorded reduced pain, swellin...
1578008    ('Paediatric SF medicines were not more erosiv...
Name: student_answer, Length: 1578009, dtype: object

In [77]:
len(r"(' ', False)")

12

In [78]:
flag = []
all = 0
for i in questions:
    if isinstance(i, float) or len(i) == 12:
        all += 1
    else:
        x = i.split(",")
        flag.append(x[-1][1:-1])


In [79]:
flag

['True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 'True',
 

In [80]:
flag = [True if x == "True" else False for x in flag ]

In [83]:
len(flag)

1304310

In [9]:
count = 0
for i in count_corrected:
    if i:
        count += 1
print(count/len(count_corrected))
print(count)
print(len(count_corrected))

0.5916487168324135
933627
1578009


In [50]:
for question in enumerate(zip(df['question'], df['model_answer'], df['student_answer'])):
    count += 1
    if count%10000 == 0:
        print(count)
    try:
        corrected_question.append(sym_spell.correct(question))
        # if m_ans != " ":
        #     corrected_model_answer.append(sym_spell.correct(m_ans))
        # if st_ans != " ":
        #     corrected_student_answer.append(sym_spell.correct(st_ans))
    except:
        corrected_question.append(question)
        # corrected_model_answer.append(m_ans)
        # corrected_student_answer.append(st_ans)

In [2]:


# get cpu count
pool = Pool(mp.cpu_count())
pool

<multiprocessing.pool.Pool state=RUN pool_size=12>

In [59]:
input_term = "This is an exampe of a text with, 0 speling, erors and some other erors then a big word like antidisestablishmentarianism and some other erors to test the spell cheker "
# max edit distance per lookup (per single word, not per whole input string)
suggestions = sym_spell.correct(input_term)
# display suggestion term, edit distance, and term frequency
print(suggestions)


This is an example of a text with, 0 spelling, errors and some other errors then a big word like antidisestablishmentarianism and some other errors to test the spell cheer


0
1000
2000
3000
What does 基督徒 (pinyin: jīdū tú) mean?
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
The Ancient Greek κλῆς (klês - "famous") helped create what famous name?
14000
15000
16000
17000
18000
Who wrote Hasan Mellâh yâhud Sırr İçinde Esrâr?
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
In Kimberly's vision, who is Dr. Kalarjian "strangling"?
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
1

In [62]:
sym_spell.correct("What does 基督徒 (pinyin: jīdū tú) mean?")

IndexError: string index out of range