In [2]:
import re
from textwrap import wrap
from subprocess import Popen, PIPE
import pexpect
from timeit import timeit

In [3]:
ALL_CHARS = set("0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,!?'-")


In [10]:
def split_long(text):
    text = re.sub('- ','', text)
    text = re.sub(' +', ' ', text).strip()
    lines = wrap(text, 200, break_long_words=False)
    return lines


def tokenize_characters(text):
    text = text.strip()
    text = ''.join(ch if ch in ALL_CHARS else '#' for ch in text)
    text = re.sub(' +', ' ', text).strip()
    tokens = [ch if ch != ' ' else '▁' for ch in text]
    return tokens


def get_tokens(text):
    tokens = tokenize_characters(text)[:1023]
    return ' '.join(tokens)


def get_fairseq_output(line):
    command = ['/usr/local/bin/fairseq-interactive',
               'model7m/',
               '--path', 'model7m/checkpoint_best.pt',
               '--source-lang', 'fr',
               '--target-lang', 'en',
               '--beam', '10']
    p = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, text=True)
    out = p.communicate(line)
    outlines = out[0].split('\n')
    return outlines


def format_output(lines):
    prev_line_no = None
    for line in lines:
        match = re.match(r'^H-(\d+)', line)
        if not match:
            continue
        tokens = line.split('\t')[2].split(' ')
        text = ''.join(tokens)
        line_no = int(match.group(1))
        assert not prev_line_no or line_no == prev_line_no + 1
        prev_line_no = line_no
        text = text.replace('▁', ' ')
        return text


def spell_check(text):
    lines = split_long(text)
    corrected_lines = []
    for line in lines:
        line_toks = get_tokens(line)
        corrected_toks = get_fairseq_output(line_toks)
        out = format_output(corrected_toks)
        corrected_lines.append(out)
    return ' '.join(corrected_lines)

def spell_check_alt(text):
    lines = split_long(text)
    corrected_lines = []

    command = ['/usr/local/bin/fairseq-interactive',
               'model7m/',
               '--path', 'model7m/checkpoint_best.pt',
               '--source-lang', 'fr',
               '--target-lang', 'en',
               '--beam', '10']
    command = ' '.join(command)

    p = pexpect.spawn(command, timeout=90)
    p.expect('.*return:')

    for line in lines:
        p.sendline(get_tokens(line))
        p.expect('\d\r\n')
        out = p.before
        out = out.decode().split('\r\n')
        out = format_output(out)
        corrected_lines.append(out)

    p.close()

    return ' '.join(corrected_lines)

In [11]:
doc =  '6 rioters hurt in firing at Mahim                 By Our Staff Reporters                 BOMBAY October 3                 SIX persons were injured when the police opened fire in dif- ferent parts of Mahim late to- night in attempts to quell riotous stone-throwing mob                 1 here were reports oi in tne Fishermens Colony area and in the vicinity of Mahim police station as well as Kapad Bazaar later tonight According to the police around 1 1 pm three rounds were fired at Kapad Bazaar to disperse the mob whose scuffling left 25 people injured number of tear-gas shells were also burst sources said The enraged crowd then swarmed towards the Mahim police station and surrounded it The policemen inside were forced to barricade the doors against it The state reserve police immedi- ately rushed to the spot and sur- rounded the station Three platoons had earlier been stationed in the area According to details pieced together from various sources group of 15 lorries returning from an Id-e-Milad procession was stoned at Mahim junction and near mosque at around 94S pm Eye-witnesses said the stones thrown at the lorries appeared to                 have come from the nearby Fisher- mens Colony This apparently trig- gered retaliation from the lorries occupants and heavy exchange of stones and soda bottles ensued The trouble then spread to the adjoining areas and even to the vicinity of the Bada mosque at Ban- dra junction at around 1115 pm Swords were allegedly used in the attack eyewitnesses aver As many as 15 people with sword injuries had to be taken to Bhabha Hospital where two were admitted Late at night nearly 1000 people had gathered in the Mahim-Bandra area and top police officials were maintaining vigil Contingents of the state reserve police were rushed to the spot The commissioner of police Mr Ramamurthi at the scene of the disturbances said The situation is under control My men are on the job While the riots on the main road had been brought under control sporadic incidents of violence were reported from the Bandra by-lanes Of the 25 injured the majority were from Bandra and three were from Mahim Among them were four women All were reported to have sword injuries on the arms and legs According to unconfirmed reports Continued on page 3)'

In [12]:
spell_check_alt(doc)

'6 rioters hurt in firing at Mahim By Our Staff Reporters BOMBAY October 3 SIX persons were injured when the police opened fire in different parts of Mahim late tonight in attempts to quell rioters stone-throwing mob 1 here were reports of in the Fishermens Colony area and in the vicinity of Mahim police station as well as Kapad Bazaar later tonight According to the police around 1 1 pm three rounds were fired at Kapad Bazaar to disperse the mob whose scuffling left 25 people injured number of tear-gas shells were also burst sources said The enraged crowd then swarmed towards the Mahim. police station and surrounded it The policemen inside were forced to barricade the doors against it The state reserve police immediately rushed to the spot and surrounded the station Three platoons had earlier been stationed in the area According to details pieced together from various sources group of 15 lorries returning from an Id-e-Milad procession was stoned at Mahim junction and near mosque at aro

In [16]:
spell_check(doc)

KeyboardInterrupt: 

In [13]:
popen_version = timeit('spell_check(doc)', 'from __main__ import spell_check, doc', number=1)

TypeError: sequence item 0: expected str instance, NoneType found

In [None]:
pexpect_version = timeit('spell_check_alt(doc)', 'from __main__ import spell_check_alt, doc', number=1)

In [None]:
print('With popen:', popen_version)
print('With pexpect:', pexpect_version)