In [1]:
import pickle as pkl 
import pandas as pd
from symspellpy.symspellpy import SymSpell, Verbosity
import pkg_resources
import re, string, json
import spacy
from tqdm import tqdm

In [2]:
# Load the pickled training sentences. The with-block closes the file handle
# as soon as unpickling finishes instead of leaving it open for the kernel's lifetime.
# NOTE(security): pickle can execute arbitrary code while loading; only open trusted files.
with open('x_train.pkl', 'rb') as file:
    data = pkl.load(file)

In [4]:
def spell_correction(sentence_list):
    """
    Spell-correct every sentence in ``sentence_list`` with SymSpell.

    A SymSpell instance is built on each call and loaded with the English
    unigram and bigram frequency dictionaries bundled with symspellpy. Each
    sentence is then passed through ``_spell_correction_text``.

    Args:
        sentence_list: iterable of sentence strings.

    Returns:
        list: the corrected sentences, in input order.
    """
    max_edit_distance_dictionary = 3
    prefix_length = 4
    # Keyword arguments make the binding explicit.
    # NOTE(review): some older symspellpy releases appear to take a different
    # first positional parameter (initial_capacity) -- confirm for the pinned version.
    spellchecker = SymSpell(
        max_dictionary_edit_distance=max_edit_distance_dictionary,
        prefix_length=prefix_length)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    spellchecker.load_dictionary(dictionary_path, term_index=0, count_index=1)
    # The bigram loader must read the bigram file; previously the unigram
    # dictionary path was passed here and bigram_path was never used.
    spellchecker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    print("Spell correcting")
    return [
        _spell_correction_text(sentence, spellchecker)
        for sentence in tqdm(sentence_list)
    ]

def _spell_correction_text(text, spellchecker):
    """
    This function does very simple spell correction normalization using pyspellchecker module. It works over a tokenized sentence and only the token representations are changed.
    """
    if len(text) < 1:
        return ""
    #Spell checker config
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.TOP # TOP, CLOSEST, ALL
    #End of Spell checker config
    token_list = text.split()
    for word_pos in range(len(token_list)):
        word = token_list[word_pos]
        if word is None:
            token_list[word_pos] = ""
            continue
        if not '\n' in word and word not in string.punctuation and not is_numeric(word) and not (word.lower() in spellchecker.words.keys()):
            suggestions = spellchecker.lookup(word.lower(), suggestion_verbosity, max_edit_distance_lookup)
            #Checks first uppercase to conserve the case.
            upperfirst = word[0].isupper()
            #Checks for correction suggestions.
            if len(suggestions) > 0:
                correction = suggestions[0].term
                replacement = correction
            #We call our _reduce_exaggerations function if no suggestion is found. Maybe there are repeated chars.
            else:
                replacement = _reduce_exaggerations(word)
            #Takes the case back to the word.
            if upperfirst:
                replacement = replacement[0].upper()+replacement[1:]
            word = replacement
            token_list[word_pos] = word
    return " ".join(token_list).strip()

def _reduce_exaggerations(text):
    """
    Auxiliary function to help with exxagerated words.
    Examples:
        woooooords -> words
        yaaaaaaaaaaaaaaay -> yay
    """
    correction = str(text)
    #TODO work on complexity reduction.
    return re.sub(r'([\w])\1+', r'\1', correction)

def is_numeric(text):
    """
    Return True when every character of ``text`` is a digit or one of ``,%.$``.

    An empty ``text`` has no offending character and therefore yields True.
    """
    return all(
        symbol in "0123456789" or symbol in ",%.$"
        for symbol in text
    )

In [5]:
# Display the sentences as loaded from the pickle, before any spell correction.
data

['hey a guy i know is breathing down my neck to get him some bud anyway you had be able to get a half track to usf tonight',
 'mr deed am i 87 these number mean nothing to me fair well mr puffy jacket man ha ha ha',
 'why nothing ok anyway give me treat',
 'no child support',
 'why do you want a massage',
 'pastor do the same thing in church they shall be right in the middle of a statement and then say hello or something like it it a way to keep u add people paying attention lol',
 'haha can but i am having dinner with my cousin',
 'can i get your number',
 'alright i shall get on fb in a couple minute',
 'so many people seems to be special at first sight but only very few will remain special to you till your last sight maintain them till life end shjas',
 'i am from loudon you know wear that is',
 'i am at the gas station go there',
 'what are youdoing later sar xxx',
 'i really do not get these psycho fanboys attacking everyone who did not like the damn movie it a fing batman movie g

In [6]:
# Spell-correct every sentence. This rebinds `data` to the corrected list, so
# re-running this cell corrects text that has already been corrected.
data = spell_correction(data)

  0%|          | 14/13105 [00:00<01:57, 111.33it/s]

Spell correcting


100%|██████████| 13105/13105 [00:59<00:00, 219.16it/s]


In [7]:
# Display the sentences again to inspect what the spell corrector changed.
data

['hey a guy i know is breathing down my neck to get him some bud anyway you had be able to get a half track to us tonight',
 'or deed am i 87 these number mean nothing to me fair well or puffy jacket man a a a',
 'why nothing of anyway give me treat',
 'no child support',
 'why do you want a massage',
 'pastor do the same thing in church they shall be right in the middle of a statement and then say hello or something like it it a way to keep a add people paying attention low',
 'hama can but i am having dinner with my cousin',
 'can i get your number',
 'alright i shall get on feb in a couple minute',
 'so many people seems to be special at first sight but only very few will remain special to you till your last sight maintain them till life end shias',
 'i am from london you know wear that is',
 'i am at the gas station go there',
 'what are outdoing later car xxx',
 'i really do not get these psycho fanboys attacking everyone who did not like the damn movie it a find batman movie grow

In [9]:
# Save the corrected sentences. The with-block closes the output file once
# the dump completes, instead of leaving an anonymous handle open.
with open("x_train_corrected.pkl", "wb") as out_file:
    pkl.dump(data, out_file)