### Preamble

In [4]:
import editdistance
import jellyfish as jf
from collections import Counter
import re
from ngram import NGram
# import nltk

# NOTE(review): `dict1` is only assigned in the "File I/O" cell further down, so in the
# notebook as shown this line needs that cell to have been run first (hidden state).
# Builds an NGram index over the dictionary words with N=2, comparing entries by their
# lower-cased form (the `key` function).
n = NGram(dict1, key=lambda x:x.lower(), N=2)
# snowball = nltk.SnowballStemmer("english", ignore_stopwords=True)

### Levenshtein Distance 

Once a candidate has been identified for normalisation, the edit-distance 
(Levenshtein distance) technique is applied first, to find the entries of the dictionary (dict.txt) 
that are within an edit distance of 2 (inclusive) of the query. The results are stored in a list. 
We refer to this set as the “First Set of Matches based on Edit Distance”, 
since it contains approximate matches based on their textual similarity to the query.

In [10]:
def levenshtein_distance(string, dictionary, max_distance=2):
    """Return the dictionary entries within `max_distance` edits of `string`.

    Parameters
    ----------
    string : str
        The query token.
    dictionary : iterable of str
        Candidate words to compare against.
    max_distance : int, optional
        Largest edit distance (inclusive) that still counts as a match.
        Defaults to 2, the threshold this notebook uses.

    Returns
    -------
    list of str
        Matching entries, in the order they appear in `dictionary`.
    """
    return [entry for entry in dictionary
            if editdistance.eval(string, entry) <= max_distance]

### Phonetic Matching

In [4]:
from pyphonetics import RefinedSoundex
from pyphonetics import Metaphone
import py_stringmatching as ps

# Phonetic encoders, created once and shared by the matching code below.
# In the code visible in this notebook only `mp` is actually called; `rs` is
# referenced solely by a commented-out alternative in phonetic_match().
rs = RefinedSoundex()
mp = Metaphone()

def phonetic_match(string, dictionary):
    """Collect the dictionary entries that sound like `string` under Metaphone.

    Entries are returned in dictionary order.

    Alternatives that were tried and are currently disabled:
      * RefinedSoundex:  rs.distance(string, x) <= 1
      * Editex:          ps.Editex.get_sim_score(string, x) >= 0.5
    """
    matches = []
    for candidate in dictionary:
        if mp.sounds_like(string, candidate):
            matches.append(candidate)
    return matches

### [Peter Norvig’s Algorithm](http://norvig.com/spell-correct.html)

The algorithm generates all possible terms within an edit distance of 2 of the query term (using deletes, transposes, replaces, and inserts) and looks them up in the word-frequency dictionary built from big.txt.

In [467]:
# author: Peter Norvig

def words(text):
    "Lower-case `text` and return its runs of word characters, in order of appearance."
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word -> occurrence count for every token in big.txt. The context manager closes
# the file as soon as its text has been read, instead of leaving the handle open
# until it happens to be garbage-collected.
with open('big.txt') as corpus:
    WORDS = Counter(words(corpus.read()))

def P(word, N=sum(WORDS.values())):
    "Relative frequency of `word`: its count in WORDS divided by N."
    # N defaults to the total token count, computed once when this def is executed.
    count = WORDS[word]
    return count / N

def correction(word):
    "Return the candidate for `word` that scores highest under P."
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    "Known spellings closest to `word`: itself, else 1 edit away, else 2 edits away, else [word]."
    exact = known([word])
    if exact:
        return exact
    # Only widen the search when the cheaper, closer set came back empty.
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]

def known(words):
    "Return, as a set, those entries of `words` that are keys of WORDS."
    found = set()
    for w in words:
        if w in WORDS:
            found.add(w)
    return found

def edits1(word):
    "Set of all strings exactly one delete, transpose, replace or insert away from `word`."
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # An insertion is possible at every cut point, including the very end.
        results.update(head + ch + tail for ch in alphabet)
        if not tail:
            continue
        # Drop the character right after the cut.
        results.add(head + tail[1:])
        # Replace the character right after the cut with each letter.
        results.update(head + ch + tail[1:] for ch in alphabet)
        # Swap the two characters right after the cut, when there are two.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results

def edits2(word):
    "Lazily produce every string obtained by applying edits1 twice to `word` (may repeat)."
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))

### Evaluation metrics
---
**Accuracy**

In [19]:
def accuracy(actual, prediction):
    """Fraction of positions at which the prediction equals the expected value.

    Parameters
    ----------
    actual : sequence
        Expected values.
    prediction : iterable
        Predicted values, index-aligned with `actual`. Pairs are formed with
        zip(), so surplus items on either side are ignored when counting hits.

    Returns
    -------
    float
        Number of equal pairs divided by len(actual); 0.0 when `actual` is
        empty (instead of raising ZeroDivisionError).
    """
    total = len(actual)
    if total == 0:
        return 0.0
    hits = sum(1 for expected, predicted in zip(actual, prediction) if expected == predicted)
    return hits / total

**Precision**

In [20]:
def precision(actual, prediction):
    """Hits divided by the total number of suggested candidates.

    Parameters
    ----------
    actual : iterable
        Expected values.
    prediction : sequence of collections
        For each expected value, the collection of suggested candidates.

    Returns
    -------
    float
        Number of positions whose expected value is among its candidates,
        divided by the total number of candidates over all positions; 0.0 when
        no candidate was suggested at all (instead of raising
        ZeroDivisionError).
    """
    # Only the count is needed, so there is no reason to build a flattened list.
    total_suggestions = sum(len(suggestions) for suggestions in prediction)
    if total_suggestions == 0:
        return 0.0
    hits = sum(1 for expected, suggestions in zip(actual, prediction)
               if expected in suggestions)
    return hits / total_suggestions

**Recall**

In [21]:
def recall(actual, prediction):
    """Fraction of expected values that appear among their suggested candidates.

    Parameters
    ----------
    actual : sequence
        Expected values.
    prediction : iterable of collections
        For each expected value, the collection of suggested candidates.

    Returns
    -------
    float
        Number of positions whose expected value is among its candidates,
        divided by len(actual); 0.0 when `actual` is empty (instead of raising
        ZeroDivisionError).
    """
    total = len(actual)
    if total == 0:
        return 0.0
    hits = sum(1 for expected, suggestions in zip(actual, prediction)
               if expected in suggestions)
    return hits / total

## Labelled Tokens
---
### File I/O 
- read in the dictionary
- extract OOV tokens

In [None]:
# Reference dictionary: the first whitespace-separated field of every non-blank line.
# `with` guarantees the file is closed even if a line fails to parse.
with open('dict.txt') as dict_file:
    dict1 = [fields[0] for fields in map(str.split, dict_file) if fields]

# Set copy used only for the membership test below (constant-time instead of a scan
# of the whole list per line). dict1 itself stays a list for the cells that use it.
dict1_lookup = set(dict1)

OOV = []      # first field of lines labelled neither 'IV' nor 'NO' whose third field is in dict1
IV = []       # first field of lines labelled 'IV'
NO = []       # first field of lines labelled 'NO'
correct = []  # third field of the same lines as OOV, index-aligned with it

# The loop variable is called `fields` rather than `words` so that it does not
# overwrite the words() function defined in the Norvig cell.
with open('labelled-tokens.txt', encoding='latin-1') as labelled_file:
    for line in labelled_file:
        fields = line.split()
        if not fields:
            continue  # blank line: nothing to classify
        token, label = fields[0], fields[1]
        if label == 'IV':
            IV.append(token)
        elif label == 'NO':
            NO.append(token)
        elif fields[2] in dict1_lookup:
            OOV.append(token)
            correct.append(fields[2])

In [2]:
# Show every (OOV token, expected form) pair; the two lists are filled together
# in the loading cell, so they are index-aligned.
print(tuple(zip(OOV, correct)))

(('comming', 'coming'), ('tomoroe', 'tomorrow'), ('dang', 'dang'), ('nd', 'and'), ('hve', 'have'), ('u', 'you'), ('ppl', 'people'), ('cuz', 'because'), ('c', 'see'), ('wat', 'what'), ('hehe', 'hehe'), ('jus', 'just'), ('iight', 'alright'), ('u', 'you'), ('soo', 'so'), ('ppl', 'people'), ('u', 'you'), ('prolly', 'probably'), ('waka', 'waka'), ('bt', 'but'), ('naw', 'no'), ('u', 'you'), ('riqht', 'right'), ('l', 'l'), ('acc', 'actually'), ('r', 'are'), ('u', 'you'), ('jus', 'just'), ('2', 'to'), ('pic', 'picture'), ('y', 'why'), ('u', 'you'), ('u', 'you'), ('u', 'you'), ('k', 'ok'), ('u', 'you'), ('rememberr', 'remember'), ('withh', 'with'), ('meh', 'me'), ('lil', 'little'), ('goood', 'good'), ('tay', 'tay'), ('m', 'am'), ('u', 'you'), ('trippn', 'tripping'), ('tnx', 'thanks'), ('birdman', 'birdman'), ('somethin', 'something'), ('t', 't'), ('tryin', 'trying'), ('fone', 'phone'), ('u', 'you'), ('r', 'are'), ('msg', 'message'), ('sayin', 'saying'), ('xi', 'xi'), ('ppl', 'people'), ('y', 'y

### Remove duplicated characters in tokens

In [5]:
# Collapse elongated tokens: every run of three or more identical word characters
# is squeezed to a single occurrence ('goood' -> 'god', 'sooo' -> 'so').
# Tokens without such a run pass through unchanged, and repeated letters outside
# the run are kept ('hellooo' -> 'hello'). De-duplicating with set() instead would
# drop every repeated letter anywhere in the token ('hellooo' -> 'helo').
_ELONGATED_RUN = re.compile(r'(\w)\1{2,}')

OOV_no_duplicate = [_ELONGATED_RUN.sub(r'\1', token) for token in OOV]

In [6]:
# Same listing as before, now pairing the cleaned tokens with their expected forms.
print(tuple(zip(OOV_no_duplicate, correct)))

(('comming', 'coming'), ('tomoroe', 'tomorrow'), ('dang', 'dang'), ('nd', 'and'), ('hve', 'have'), ('u', 'you'), ('ppl', 'people'), ('cuz', 'because'), ('c', 'see'), ('wat', 'what'), ('hehe', 'hehe'), ('jus', 'just'), ('iight', 'alright'), ('u', 'you'), ('soo', 'so'), ('ppl', 'people'), ('u', 'you'), ('prolly', 'probably'), ('waka', 'waka'), ('bt', 'but'), ('naw', 'no'), ('u', 'you'), ('riqht', 'right'), ('l', 'l'), ('acc', 'actually'), ('r', 'are'), ('u', 'you'), ('jus', 'just'), ('2', 'to'), ('pic', 'picture'), ('y', 'why'), ('u', 'you'), ('u', 'you'), ('u', 'you'), ('k', 'ok'), ('u', 'you'), ('rememberr', 'remember'), ('withh', 'with'), ('meh', 'me'), ('lil', 'little'), ('god', 'good'), ('tay', 'tay'), ('m', 'am'), ('u', 'you'), ('trippn', 'tripping'), ('tnx', 'thanks'), ('birdman', 'birdman'), ('somethin', 'something'), ('t', 't'), ('tryin', 'trying'), ('fone', 'phone'), ('u', 'you'), ('r', 'are'), ('msg', 'message'), ('sayin', 'saying'), ('xi', 'xi'), ('ppl', 'people'), ('y', 'yea

### Remove tokens that contain numbers

In [46]:
# Keep only the (token, expected form) pairs whose token contains no digit,
# then split the surviving pairs back into two index-aligned lists.
digit_free_pairs = [
    (token, target)
    for token, target in zip(OOV, correct)
    if not any(ch.isdigit() for ch in token)
]
OOV_no_number = [token for token, target in digit_free_pairs]
correct_no_number = [target for token, target in digit_free_pairs]

### Frequency distribution of OOV tokens

In [483]:
import nltk
import operator
# Count the occurrences of each cleaned OOV token and list the
# (token, count) pairs from most to least frequent.
frequency = nltk.FreqDist(OOV_no_duplicate)
sorted_x = list(frequency.items())
sorted_x.sort(key=operator.itemgetter(1), reverse=True)
print(sorted_x)

[('u', 184), ('n', 35), ('w', 22), ('jus', 21), ('ppl', 20), ('da', 19), ('2', 17), ('d', 16), ('so', 13), ('y', 12), ('cuz', 11), ('o', 10), ('r', 9), ('cont', 9), ('dat', 9), ('lil', 8), ('yu', 8), ('wat', 7), ('tht', 7), ('p', 7), ('c', 6), ('goin', 6), ('kno', 6), ('tha', 6), ('sayin', 5), ('s', 5), ('b', 5), ('x', 5), ('4', 5), ('doin', 5), ('no', 5), ('til', 5), ('1', 5), ('hehe', 4), ('tryin', 4), ('def', 4), ('pls', 4), ('bday', 4), ('youu', 4), ('lookin', 4), ('soo', 3), ('prolly', 3), ('bt', 3), ('pic', 3), ('fone', 3), ('thinkin', 3), ('hw', 3), ('gettin', 3), ('walkin', 3), ('al', 3), ('ima', 3), ('nite', 3), ('talkin', 3), ('sum1', 3), ('himm', 3), ('bro', 3), ('nah', 3), ('k', 2), ('withh', 2), ('meh', 2), ('m', 2), ('somethin', 2), ('t', 2), ('msg', 2), ('xi', 2), ('geez', 2), ('ly', 2), ('sittin', 2), ('skool', 2), ('hey', 2), ('fam', 2), ('nothin', 2), ('neway', 2), ('uu', 2), ('abt', 2), ('dha', 2), ('nuthin', 2), ('bball', 2), ('bc', 2), ('crunk', 2), ('lovin', 2), (

### [POS tagging of OOV tokens](http://www.cs.cmu.edu/~ark/TweetNLP/)

In [7]:
taggedOOV = {}   # first field of each line -> second field (the tag)
confidence = {}  # first field of each line -> third field, kept as the string read from the file

# `with` closes the file even if a line is malformed. The loop variable is named
# `fields` so it does not overwrite the words() function from the Norvig cell.
with open('OOV_tag.txt') as tag_file:
    for line in tag_file:
        fields = line.split()
        if not fields:
            continue  # blank line
        token = fields[0]
        taggedOOV[token] = fields[1]
        confidence[token] = fields[2]

### Online lookup of slang definitions (slow)

- one-to-many normalisation, e.g. lol -> laughing out loud
- online look up: https://www.noslang.com/search

In [7]:
import requests
from bs4 import BeautifulSoup as BS
import re

def slangDict(slang, timeout=10):
    """Look up the expansion of `slang` on noslang.com.

    Parameters
    ----------
    slang : str
        The token to look up.
    timeout : float, optional
        Seconds to wait for the server before requests raises a timeout
        error. Defaults to 10.

    Returns
    -------
    str
        The words between "means" and the trailing "-" in the first <h3>
        heading of the result page, or `slang` itself when the page has no
        such heading or the heading does not match that pattern.

    Raises
    ------
    requests.exceptions.RequestException
        Network failures and timeouts are not caught here.
    """
    from urllib.parse import quote  # local import: only this helper needs it

    # Percent-encode the token so characters such as '/', '?' or '#' cannot
    # change which URL is requested.
    url = 'https://www.noslang.com/search/' + quote(slang, safe='')
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    text = requests.get(url, timeout=timeout).text
    soup = BS(text, 'lxml')
    try:
        heading = soup.find(name="h3").text
        definition = " ".join(re.search("means.*-", heading).group(0).split()[1:-1])
    except AttributeError:
        # soup.find() or re.search() returned None: no usable definition on
        # the page, so fall back to the token itself. Anything else propagates.
        definition = slang
    return definition

In [8]:
# Example lookup; slangDict() performs a live HTTP request to noslang.com.
slangDict("ily")

'i love you'

In [9]:
# Second example lookup (also a live HTTP request).
slangDict("lol")

'laughing out loud'

### Matching against the dictionary

In [40]:
# Three index-aligned lists: the cleaned OOV tokens, their expected forms, and,
# for each token, the dictionary entries returned by levenshtein_distance().
sub1 = list(OOV_no_duplicate)
sub2 = [correct[position] for position in range(len(sub1))]
sub3 = [levenshtein_distance(token, dict1) for token in sub1]

In [35]:
# Share of tokens whose expected form is among their edit-distance candidates.
recall(correct, sub3)

0.4588785046728972

In [36]:
# Hits divided by the total number of candidates suggested across all tokens.
precision(correct, sub3)

0.013009724172650432

In [41]:
# NOTE(review): accuracy() tests `x == y` element-wise, but as built in the matching
# cell above each element of sub3 is a list of candidates while `correct` holds
# strings, so no pair can compare equal. The value displayed below presumably came
# from a different `sub3` (e.g. one best candidate per token) — confirm and re-run.
accuracy(correct, sub3)

0.16261682242990655

In [42]:
import aspell
import enchant
# Two spell checkers whose suggest() output is inspected in the cells below:
# `d` is an enchant dictionary for "en_US", `s` an aspell speller with lang=en.
d = enchant.Dict("en_US")
s = aspell.Speller('lang', 'en')

In [11]:
# Suggestions from the aspell speller for the token "dont".
s.suggest("dont")

['dint',
 "don't",
 'font',
 'DOT',
 'Don',
 'Dot',
 'don',
 'dot',
 'Ont',
 'dent',
 'Dona',
 'Donn',
 'dona',
 'done',
 'Dons',
 'Mont',
 'cont',
 'dolt',
 'dong',
 'dons',
 'dost',
 'wont',
 "Don's",
 "don's",
 "won't"]

In [13]:
# Suggestions from the enchant dictionary for the same token, for comparison.
d.suggest("dont")

['dot',
 'don',
 'donut',
 "don't",
 'cont',
 'font',
 'wont',
 'Mont',
 'dent',
 'dint',
 'dost',
 'dolt',
 'done',
 'dons',
 'dona']

In [385]:
# Frequency distribution of |x - y| over the paired items of lenOOV and lencorrect.
# NOTE(review): neither `lenOOV` nor `lencorrect` is defined in any cell shown in this
# notebook — presumably the per-token lengths of OOV and correct; confirm and add the
# defining cell, otherwise this cell cannot run on a fresh kernel.
# plt.plot([abs(x - y) for x, y in zip(lenOOV, lencorrect)])
# plt.show()
# print([abs(x - y) for x, y in zip(lenOOV, lencorrect)])
# import nltk
nltk.FreqDist([abs(x - y) for x, y in zip(lenOOV, lencorrect)])
# wordlen = list(zip(lenOOV,lencorrect))
# wordlen.sort(key=lambda tup: tup[0])
# print(wordlen)

FreqDist({0: 164,
          1: 392,
          2: 328,
          3: 96,
          4: 50,
          5: 22,
          6: 5,
          7: 9,
          8: 3,
          14: 1})