In [1]:
from benchmark import ModelBenchmark

In [2]:
DATA_PATH = "./data/"


def _read_nonblank_lines(path):
    """Return the stripped, non-blank lines of the UTF-8 text file at ``path``."""
    with open(path, 'r', encoding='utf-8', newline='\n') as f:
        # Lines yielded by the file iterator still carry their trailing "\n",
        # so a blank line only becomes "" after stripping. Filter on the
        # stripped text, otherwise empty strings end up in the result.
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]


# Line i of the clean file is the reference for line i of the corrupt file.
input_clean = _read_nonblank_lines(DATA_PATH + "small_clean.txt")
input_corrupt = _read_nonblank_lines(DATA_PATH + "small_corrupt.txt")

assert len(input_clean) == len(input_corrupt), (
    f"clean/corrupt line counts differ: {len(input_clean)} vs {len(input_corrupt)}"
)

### Baseline

In [17]:
from symspellpy import symspellpy
import pkg_resources
import re

# SymSpell settings, reused by the lookup at the end of this cell.
max_edit_distance = 2
prefix_length = 7

# Replace symspellpy's module-level word pattern with one that matches runs of
# non-whitespace characters. The two prints show the pattern before and after
# the override (identical when the cell is re-run in the same kernel).
# NOTE(review): presumably this pattern drives SymSpell's word splitting - confirm
# against the installed symspellpy version.
print(symspellpy.WORD_PATTERN)
symspellpy.WORD_PATTERN = re.compile(r"(([^\s]+)|['’])")
print(symspellpy.WORD_PATTERN)

sym_spell = symspellpy.SymSpell(max_edit_distance, prefix_length)

# English frequency dictionary shipped inside the symspellpy package.
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
# load_dictionary reports a missing file through its boolean return value
# instead of raising; fail loudly rather than benchmark an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")
# Optional bigram dictionary, currently disabled:
# bigram_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# Extend the dictionary with words from the project corpus; same boolean
# "file found" contract as load_dictionary.
corpus_path = DATA_PATH + "corpus.txt"
if not sym_spell.create_dictionary(corpus_path, encoding='utf-8'):
    raise FileNotFoundError(f"SymSpell corpus not found: {corpus_path}")

# Smoke test on a short misspelled phrase; the result is a list of SuggestItem.
input_phrase = "I luk forwad to it"
suggestions = sym_spell.lookup_compound(input_phrase, max_edit_distance=max_edit_distance)



re.compile("(([^\\s]+)|['’])")
re.compile("(([^\\s]+)|['’])")
[<symspellpy.suggest_item.SuggestItem object at 0x00000163E2A86590>]


In [18]:
# Print the string form of every SuggestItem returned by lookup_compound,
# separated by spaces.
print(*suggestions)

i luck forward to it, 3, 0


In [15]:
# Benchmark harness used for the SymSpell baseline, explicitly pinned to the CPU.
benchmark = ModelBenchmark(device='cpu')

In [None]:
# Benchmark the SymSpell baseline.
# - SymSpell has no `correct_string` method (that is the NeuSpell checker API);
#   the adapter uses lookup_compound, which returns a one-element list of
#   SuggestItem, and hands back the corrected text via `.term`.
# - The run is labelled "symspell" so its results are not reported under the
#   NeuSpell model's name.
# - Data arguments follow the same (corrupt, clean) order as the NeuSpell
#   benchmark call in this notebook.
#   NOTE(review): confirm this order against ModelBenchmark.benchmark_model.
benchmark.benchmark_model(sym_spell,
                          input_corrupt,
                          input_clean,
                          "symspell",
                          lambda model, data: model.lookup_compound(
                              data, max_edit_distance=max_edit_distance)[0].term,
                          warm_up_runs=0,
                          num_runs=2)


In [35]:
# Try the whitespace-delimited word pattern on a line of source code.
re_word = re.compile(r"(([^\s]+)|['’])")
matches = re_word.findall("team_number = tr[1].p.span.contents[0]")
# The pattern has two capture groups, so findall yields one 2-tuple per match.
print(matches)
# Keep only the outer group, i.e. the full text of each match.
matches = [outer for outer, _inner in matches]
matches

[('team_number', 'team_number'), ('=', '='), ('tr[1].p.span.contents[0]', 'tr[1].p.span.contents[0]')]


['team_number', '=', 'tr[1].p.span.contents[0]']

In [18]:
# Reference split: every maximal run of non-whitespace characters is one token.
text = "team_number = tr[1].p.span.contents[0]"
result = [found.group() for found in re.finditer(r'\S+', text)]

print(result)

['team_number', '=', 'tr[1].p.span.contents[0]']


### Neuspell


In [3]:
from neuspell import BertChecker

# Create the NeuSpell BERT checker on the GPU and load its pretrained weights
# (called without arguments, so the library's default checkpoint is used).
# NOTE(review): the device is hard-coded to 'cuda'; presumably this fails on a
# machine without a CUDA GPU - confirm before running elsewhere.
checker = BertChecker(device='cuda')
checker.from_pretrained()

data folder is set to `C:\FIT\bakalarka\.venv\Lib\site-packages\neuspell\../neuspell_data` script
loading vocab from path:C:\FIT\bakalarka\.venv\Lib\site-packages\neuspell\../neuspell_data/checkpoints/subwordbert-probwordnoise\vocab.pkl
initializing model
loading pretrained weights from path:C:\FIT\bakalarka\.venv\Lib\site-packages\neuspell\../neuspell_data/checkpoints/subwordbert-probwordnoise
Loading model params from checkpoint dir: C:\FIT\bakalarka\.venv\Lib\site-packages\neuspell\../neuspell_data/checkpoints/subwordbert-probwordnoise


In [6]:
# Benchmark harness with default constructor arguments. This reuses the name
# `benchmark`, which the Baseline section also assigns.
benchmark = ModelBenchmark()

In [7]:
def _neuspell_correct(model, data):
    """Adapter given to the benchmark: return ``model.correct_string(data)``."""
    return model.correct_string(data)


# Benchmark the NeuSpell BERT checker: no warm-up iterations, two timed runs.
benchmark.benchmark_model(
    checker,
    input_corrupt,
    input_clean,
    "neuspell-bert",
    _neuspell_correct,
    warm_up_runs=0,
    num_runs=2,
)


Starting 0 warm-up iterations for neuspell-bert...
Finished warm-up after 0.0 seconds.
Starting benchmark iterations...
Finished 1/2 iteration in 2.312450647354126 seconds.
Finished 2/2 iteration in 2.131199359893799 seconds.


Benchmark results:
	Model: neuspell-bert
	Size: 706.534797668457 MB
	Inference Time: 2.2218250036239624 s
	Peak Memory: 0.040357112884521484 MB
	GPU Memory: 841.85302734375 MB
	Throughput: 349.39364587594406 tokens/sec
	Throughput: 28.853152691690866 sentences/sec
	Accuracy tokens: 20.129%
	Accuracy sentences: 0.000%
	Correct → Correct: 144.0
	Correct → Incorrect: 452.0
	Incorrect → Correct: 12.0
	Incorrect → Incorrect: 167.0
	Word Correction Rate: 6.704%
	Word Incorrection Rate: 75.839%

In [None]:
# Run NeuSpell's own evaluation on the clean/corrupt file pair.
# NOTE(review): unpacking with * requires evaluate() to return an iterable -
# confirm its return value; this cell has no recorded execution.
clean_file = DATA_PATH + "small_clean.txt"
corrupt_file = DATA_PATH + "small_corrupt.txt"
evaluation = checker.evaluate(clean_file, corrupt_file)
print(*evaluation)

In [3]:
# Inspect the first line of the corrupt input.
input_corrupt[0]

'team_number = tr[1].p.span.contents[0]'

In [14]:
# Correct the first corrupt line with NeuSpell. In the recorded output the
# identifier, brackets and dots come back as separate space-delimited tokens.
checker.correct_string(input_corrupt[0])
# Alternative natural-language probe, currently disabled:
# checker.correct_string(" I luk forawd to itd.")

_batch_orginal_sentences:['team _ number = tr [ 1 ] . p . span . contents [ 0 ]']
mystrings:['team _ number = tr [ 1 ] . p . span . contents [ 0 ]']
_batch_orginal_sentences:['team _ number = tr [ 1 ] . p . span . contents [ 0 ]']


'team _ number = tr [ 1 ] . p . span . contents [ 0 ]'

In [9]:
# Inspect the second line of the clean input.
input_clean[1]

'* The internal method that handles the pointer over event from the browser.'

In [35]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

# Experiment: remove the backend pre-tokenizer entirely (it is set to None, no
# replacement is installed) and tokenize a line of source code.
# In the recorded output the whole line comes back as a single '[UNK]' token.
backend = tokenizer.backend_tokenizer
backend.pre_tokenizer = None
tokenizer.tokenize('team_number = tr[1].p.span.contents[0]')

['[UNK]']

['[UNK]']