In [1]:
import os, re
from string import punctuation
import numpy as np
import json
from collections import Counter
from pprint import pprint
punct = set(punctuation)
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [3]:
# Load the reference corpus: each line of the file is treated as one sentence
# and split on whitespace into tokens. The context manager guarantees the file
# handle is closed once the text has been read.
with open('corpus_ng.txt', encoding='utf8') as corpus_file:
    corpus = [sent.split() for sent in corpus_file.read().splitlines()]

# Token -> number of occurrences over the whole corpus.
WORDS = Counter(word for sent in corpus for word in sent)

In [4]:
# Distinct corpus tokens, in the order WORDS first saw them.
vocab = list(WORDS)
# Position in `vocab` -> token; used to map row indices of X back to words.
id2word = dict(enumerate(vocab))

# Character-unigram TF-IDF: every vocabulary word becomes one row of X.
vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 1))
X = vec.fit_transform(vocab)

In [6]:
import textdistance

In [7]:
def get_closest_hybrid_match(text, X, vec, metric=textdistance.levenshtein, topn=5):
    """Pick the vocabulary word closest to `text` using a two-stage search.

    Stage 1 vectorizes `text` with `vec` and keeps the `topn` rows of `X`
    with the smallest cosine distance to it. Stage 2 re-ranks only those
    candidates with `metric.normalized_similarity` and returns the best one.

    Parameters
    ----------
    text : str
        The (possibly misspelled) word to correct.
    X : matrix
        Vectorized vocabulary. Row ``i`` must correspond to ``id2word[i]``,
        because row indices are translated to words through the module-level
        ``id2word`` mapping.
    vec : fitted vectorizer
        Object whose ``transform`` maps a list of strings into the space of `X`.
    metric : object, optional
        Anything exposing ``normalized_similarity(a, b)`` where a higher
        value means more similar. Defaults to ``textdistance.levenshtein``.
    topn : int, optional
        Number of nearest vector-space candidates passed to stage 2.
        Defaults to 5.

    Returns
    -------
    str
        The candidate with the highest normalized similarity to `text`;
        on ties, the candidate that is nearest by cosine distance wins.
    """
    query = vec.transform([text])
    # These are distances (smaller = closer), so an ascending argsort puts
    # the best candidates first.
    distances = cosine_distances(query, X)
    candidate_ids = distances.argsort()[0][:topn]
    candidates = [id2word[idx] for idx in candidate_ids]

    # max() keeps the first maximal element, so ties fall back to the
    # cosine-distance ordering of `candidates`.
    return max(candidates, key=lambda word: metric.normalized_similarity(text, word))

In [9]:
# Sentences with spelling mistakes and their corrected counterparts, one
# sentence per line; line i of one file is paired with line i of the other
# by the evaluation code. Context managers close both handles after reading.
# NOTE(review): no encoding is passed here (the platform default is used),
# unlike the corpus load which uses utf8 — confirm the files' encoding.
with open('sents_with_mistakes.txt') as bad_file:
    bad = bad_file.read().splitlines()
with open('correct_sents.txt') as true_file:
    true = true_file.read().splitlines()

In [10]:
def _clean_tokens(sentence):
    """Lowercase and whitespace-split `sentence`, trimming non-word edges.

    Tokens made up solely of characters from `punct` are dropped. For every
    remaining token, leading and trailing runs of non-word characters are
    removed; characters inside the token are left untouched.
    """
    return [
        # Raw string: '\W' in a plain literal is an invalid escape sequence.
        re.sub(r'(^\W+|\W+$)', '', token)
        for token in sentence.lower().split()
        if set(token) - punct
    ]


def align_words(sent_1, sent_2):
    """Pair up the words of two sentences position by position.

    Both sentences are lowercased, split on whitespace, and cleaned of
    punctuation-only tokens and non-word characters at token edges.

    Parameters
    ----------
    sent_1, sent_2 : str
        The sentences to align.

    Returns
    -------
    list of tuple
        ``(word_from_sent_1, word_from_sent_2)`` pairs. Pairing is purely
        positional; if the sentences yield a different number of tokens,
        the surplus tokens of the longer one are dropped by ``zip``.
    """
    return list(zip(_clean_tokens(sent_1), _clean_tokens(sent_2)))

In [15]:
# Score the hybrid corrector: every word of a noisy sentence is corrected and
# compared with the word at the same position in the reference sentence.
correct = 0
total = 0
mistakes = []  # [reference word, produced correction] for every miss

for idx, true_sent in enumerate(true):
    # bad[idx] is the noisy sentence paired with true[idx].
    for expected, observed in align_words(true_sent, bad[idx]):
        corrected = get_closest_hybrid_match(observed, X, vec)
        if corrected != expected:
            mistakes.append([expected, corrected])
        else:
            correct += 1
        total += 1

In [16]:
# Share of aligned word pairs whose correction equals the reference word.
print(correct/total)

0.8294047143427886


In [None]:
def get_closest_match_with_metric(text, lookup, metric=textdistance.levenshtein):
    """Find the entry of `lookup` most similar to `text` under `metric`.

    Every word in `lookup` is scored with ``metric.normalized_similarity``
    against `text`, and the highest-scoring entry is returned.

    Parameters
    ----------
    text : str
        The word to match.
    lookup : iterable of str
        Candidate words to compare against.
    metric : object, optional
        Anything exposing ``normalized_similarity(a, b)``; defaults to
        ``textdistance.levenshtein``.

    Returns
    -------
    tuple
        ``(word, similarity)`` for the best-scoring candidate.

    Raises
    ------
    IndexError
        If `lookup` yields no candidates.
    """
    scores = Counter(
        {candidate: metric.normalized_similarity(text, candidate) for candidate in lookup}
    )
    best = scores.most_common(1)
    return best[0]