# Using symspellpy - spell check

[github](https://github.com/mammothb/symspellpy/)

In [None]:
!python -m pip install -U symspellpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting symspellpy
  Downloading symspellpy-6.7.7-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 5.0 MB/s 
[?25hCollecting editdistpy>=0.1.3
  Downloading editdistpy-0.1.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (126 kB)
[K     |████████████████████████████████| 126 kB 67.3 MB/s 
[?25hInstalling collected packages: editdistpy, symspellpy
Successfully installed editdistpy-0.1.3 symspellpy-6.7.7


Alternatively, you can download the dictionary files from the repository and add them to your project directory:



```
curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt
curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt
```



## Spell check

In [None]:
from itertools import islice
import pkg_resources
from symspellpy import SymSpell

input_term = "thequickbrownfoxjumpsoverthelazydog"

def spelling_correction_en(input_term):
  """Segment and spell-correct an English string with SymSpell.

  The text is passed to ``SymSpell.word_segmentation`` and the
  ``corrected_string`` of the result is returned in lowercase.

  The SymSpell instance and its two frequency dictionaries are built on the
  first call only and cached on the function object. Previously every call
  re-read both dictionary files from disk.

  Args:
    input_term (str): text to segment and correct.

  Returns:
    str: the lowercased corrected string.
  """
  sym_spell = getattr(spelling_correction_en, "_sym_spell", None)
  if sym_spell is None:
    sym_spell = SymSpell()

    dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt"
    )
    # term_index is the column of the term and count_index is the column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # Cache only after both dictionaries were loaded.
    spelling_correction_en._sym_spell = sym_spell

  result = sym_spell.word_segmentation(input_term)
  return (result.corrected_string).lower()

In [None]:
# Alternative, much noisier input to try:
# test = "whereis th elove hehad dated forImuch of thepast whocouqdn'tread in sixtgrade and ins pired him"
test = [
    "A red sedan drives forward.",
    "A red midsize sedan keep straight.",
    "A red car drove through an intersection.",
]
# Correct the first sentence only; `result` is reused by the Flair tagging cell further down.
result = spelling_correction_en(test[0])
print(result)

a red sedan drives forward


## Norvig-style spelling corrector (word frequencies from big.txt)

In [None]:
!wget https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
!wget http://norvig.com/big.txt

--2022-12-09 06:38:58--  https://raw.githubusercontent.com/dwyl/english-words/master/words.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4862992 (4.6M) [text/plain]
Saving to: ‘words.txt’


2022-12-09 06:38:59 (57.3 MB/s) - ‘words.txt’ saved [4862992/4862992]

--2022-12-09 06:38:59--  http://norvig.com/big.txt
Resolving norvig.com (norvig.com)... 158.106.138.13
Connecting to norvig.com (norvig.com)|158.106.138.13|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6488666 (6.2M) [text/plain]
Saving to: ‘big.txt’


2022-12-09 06:38:59 (31.7 MB/s) - ‘big.txt’ saved [6488666/6488666]



In [None]:
import re
from collections import Counter

# Read the whole corpus up front. The context manager closes the file handle;
# the bare open(...).read() left that to the garbage collector.
with open('/content/big.txt') as corpus_file:
    TEXT = corpus_file.read()
# Letters that edits1() uses to build replacement and insertion candidates.
alphabet = 'abcdefghijklmnopqrstuvwxyz'

def tokens(text):
    "Lowercase text and return, in order, every maximal run of the letters a-z."
    lowered = text.lower()
    word_pattern = re.compile('[a-z]+')
    return word_pattern.findall(lowered)
  
# Every word occurrence of the corpus, in order, as produced by tokens().
WORDS = tokens(TEXT)
# Word -> number of occurrences; known() also uses it as the dictionary of valid words.
COUNTS = Counter(WORDS)

def known(words):
    "Keep only the words that occur in COUNTS; the result is a set."
    found = set()
    for candidate in words:
        if candidate in COUNTS:
            found.add(candidate)
    return found

def edits0(word):
    "The zero-edit neighbourhood of word: a set holding only word itself."
    unchanged = set()
    unchanged.add(word)
    return unchanged

def edits1(word):
    "Return the set of strings reachable from word by one delete, transpose, replace or insert."
    pairs = splits(word)
    candidates = []
    # Deletes: drop the first character of the suffix.
    candidates.extend(head + tail[1:] for (head, tail) in pairs if tail)
    # Transposes: swap the first two characters of the suffix.
    candidates.extend(head + tail[1] + tail[0] + tail[2:]
                      for (head, tail) in pairs if len(tail) > 1)
    # Replaces: substitute the first character of the suffix with each letter.
    candidates.extend(head + letter + tail[1:]
                      for (head, tail) in pairs for letter in alphabet if tail)
    # Inserts: put each letter at every split point, including the very end.
    candidates.extend(head + letter + tail
                      for (head, tail) in pairs for letter in alphabet)
    return set(candidates)

def edits2(word):
    "Return the set of strings obtained by applying edits1 to every result of edits1(word)."
    twice_edited = set()
    for once_edited in edits1(word):
        for candidate in edits1(once_edited):
            twice_edited.add(candidate)
    return twice_edited

def correct(word):
    "Return the most frequent known word at the smallest edit distance (0, 1, then 2), else word itself."
    # Try the cheapest neighbourhood first and stop at the first one that
    # contains a known word, so edits2 is only generated when really needed.
    for neighbourhood in (edits0, edits1, edits2):
        candidates = known(neighbourhood(word))
        if candidates:
            return max(candidates, key=COUNTS.get)
    # Nothing known within two edits: leave the word untouched.
    return word

def splits(word):
    "Return every (prefix, suffix) pair with prefix + suffix == word, from ('', word) to (word, '')."
    pairs = []
    for cut in range(len(word) + 1):
        pairs.append((word[:cut], word[cut:]))
    return pairs

def correct_text(text):
    "Return text with every run of ASCII letters replaced by its case-preserving correction."
    word_pattern = re.compile('[a-zA-Z]+')
    return word_pattern.sub(correct_match, text)

def case_of(text):
    "Pick the str function matching text's casing: str.upper, str.lower, str.title, or plain str."
    if text.isupper():
        return str.upper
    if text.islower():
        return str.lower
    if text.istitle():
        return str.title
    # Mixed or caseless text: str() returns the word unchanged.
    return str

def correct_match(match):
    "re.sub callback: correct the matched word and re-apply its original upper/lower/title casing."
    original = match.group()
    recase = case_of(original)
    corrected = correct(original.lower())
    return recase(corrected)

# Flair (part-of-speech tagging and zero-shot NER)


In [None]:
!pip install flair

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Tagging Text

In [None]:
from flair.models import SequenceTagger
from flair.data import Sentence

# load the 'upos-fast' sequence tagger
tagger = SequenceTagger.load('upos-fast')

# wrap the text to tag in a Sentence
# NOTE: `result` is the corrected string produced by the spell-check test cell
# earlier in the notebook, so that cell has to be run first.
# sentence = Sentence('George Washington went')
sentence = Sentence(result)

# predict PoS tags
tagger.predict(sentence)

# print sentence with predicted tags
print(sentence)

# print each labelled token's text followed by its predicted tag
for label in sentence.get_labels('pos'):
  # print(label.text)
  print(label.data_point.text)
  print(label.value)

2022-12-09 06:39:04,972 loading file /root/.flair/models/upos-english-fast/b631371788604e95f27b6567fe7220e4a7e8d03201f3d862e6204dbf90f9f164.0afb95b43b32509bf4fcc3687f7c64157d8880d08f813124c1bd371c3d8ee3f7




2022-12-09 06:39:05,062 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, INTJ, PUNCT, VERB, PRON, NOUN, ADV, DET, ADJ, ADP, NUM, PROPN, CCONJ, PART, AUX, X, SYM, <START>, <STOP>
Sentence: "a red sedan drives forward" → ["a"/DET, "red"/ADJ, "sedan"/NOUN, "drives"/VERB, "forward"/ADV]
a
DET
red
ADJ
sedan
NOUN
drives
VERB
forward
ADV


### Use Case 2: Zero-shot Named Entity Recognition (NER) with TARS

In [None]:
from flair.models import TARSTagger
from flair.data import Sentence

# 1. Load zero-shot NER tagger
tars = TARSTagger.load('tars-ner')

# 2. Prepare some test sentences
sentences = [
    Sentence("A red midsize sedan keep straight."),
    Sentence("A red sedan drives forward."),
    Sentence("A red sedan keeping straight."),
]

# 3. Define some classes of named entities such as "color", "type", "motion"
labels = ["color", "type", "motion"]
# The label_type given here has to match the one requested from
# to_tagged_string() in step 4. It used to be 'pos' while the output asked
# for "ner", so the two did not line up.
tars.add_and_switch_to_new_task('task 1', labels, label_type='ner')

# 4. Predict for these classes and print results
for sentence in sentences:
    tars.predict(sentence)
    print(sentence.to_tagged_string("ner"))

## Word count with NLTK

In [None]:
import nltk
from nltk import *
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Scratch check: concatenating with + builds a new list and leaves `s` untouched.
s = ['A', 'red', 'sedan', 'drives', 'forward', '.']
print(type(s))
dict_w = [] + s
dict_w

In [None]:
sample=[
    "A red midsize sedan keep straight.",
    "A red sedan drives forward.",
    "A red sedan keeping straight.",
]
words = []       #word tokens of all sample entries
all_sents = []   #sentences of all sample entries
for s in sample:
  sent=(sent_tokenize(s))        #splitting sentence
  word=(word_tokenize(s))        #splitting words

  #accumulate both; previously only the sentences of the last entry reached
  #the FreqDist below, so the sentence count was wrong
  all_sents += sent
  words += word

  # print(sent)
  print(word)

stop_word=set(stopwords.words("english"))       #all stopwords for the English language
new=[]
for i in words:
    #NLTK's English stop words are lowercase, so compare in lowercase;
    #otherwise capitalized stop words such as "A" are kept
    if i.lower() not in stop_word:
        new.append(i)
print(new)

tot_sent=FreqDist(all_sents)     #frequency of every sentence in the sample
count=len(tot_sent)              #number of distinct sentences
print(count)
tot_word=FreqDist(new)     #frequency of every remaining (non stop) word
counts=len(tot_word)       #number of distinct remaining words
print(counts)
print(tot_word.most_common(25))     #how many times each word repeats

# Spacy + Pattern -> past tense 

In [None]:
!pip install spacy
!pip install pattern

In [None]:
import nltk
nltk.download('omw-1.4')

In [None]:
import string
import spacy
from spacy.lang.en import English
from spacy.symbols import nsubj, VERB, NOUN
from pattern.en import conjugate, PAST, PRESENT, tenses, parse, pprint, parsetree, SINGULAR, PLURAL
from itertools import tee
import string
from html.parser import HTMLParser

In [None]:
# Dependency labels that get_subjects_of_verb() treats as marking a subject.
SUBJ_DEPS = {'agent', 'csubj', 'csubjpass', 'expl', 'nsubj', 'nsubjpass'}

# spaCy pipeline; it is bound as the default `nlp` argument of change_tense().
nlp = spacy.load('en_core_web_sm')

def _get_conjuncts(tok):
    """
    Return conjunct dependents of the leftmost conjunct in a coordinated phrase,
    e.g. "Burton, [Dan], and [Josh] ...".
    """
    return [right for right in tok.rights
            if right.dep_ == 'conj']


def is_plural_noun(token):
    """
    Heuristic plural check: a token counts as a plural noun when its part of
    speech is NOUN and its lemma differs from its lowercased form.

    Args:
        token (``spacy.Token``): parent document must have POS information

    Returns:
        bool

    Raises:
        ValueError: if the parent document reports ``is_tagged`` as False
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    if not (token.pos == NOUN):
        return False
    return bool(token.lemma != token.lower)


def get_subjects_of_verb(verb):
    """Return all subjects of a verb according to the dependency parse.

    An auxiliary (``dep_ == "aux"``) with at least one ancestor is resolved
    through its first ancestor. Otherwise the subjects are the left children
    whose dependency label is in SUBJ_DEPS, plus their conjuncts. If none are
    found, the lookup is repeated on the first ancestor, if there is one.

    Args:
        verb (``spacy.Token``): token of a dependency-parsed document

    Returns:
        list: subject tokens; empty when no subject is found
    """
    # This docstring used to sit below the first statement, where Python
    # treats it as a plain expression, so the function had no __doc__.
    if verb.dep_ == "aux":
        ancestors = list(verb.ancestors)
        if ancestors:
            return get_subjects_of_verb(ancestors[0])
    subjs = [tok for tok in verb.lefts
             if tok.dep_ in SUBJ_DEPS]
    # get additional conjunct subjects
    # The generator walks `subjs` while extend() appends to it, so conjuncts
    # of conjuncts are picked up too. Do not turn this into a list built up
    # front: that would only find the first level.
    subjs.extend(tok for subj in subjs for tok in _get_conjuncts(subj))
    if not subjs:
        ancestors = list(verb.ancestors)
        if ancestors:
            return get_subjects_of_verb(ancestors[0])
    return subjs


def is_plural_verb(token):
    """
    Decide whether a verb is plural by a majority vote of its subjects.

    Returns True when more than half of the subjects found by
    get_subjects_of_verb() are plural nouns, and False when there are no
    subjects at all.

    Raises:
        ValueError: if the parent document reports ``is_tagged`` as False
    """
    if token.doc.is_tagged is False:
        raise ValueError('token is not POS-tagged')
    subjects = get_subjects_of_verb(token)
    if len(subjects) == 0:
        return False
    plural_votes = 0
    for subject in subjects:
        plural_votes += is_plural_noun(subject)
    return plural_votes / len(subjects) > .5

def preserve_caps(word, newWord):
    """Returns newWord, capitalizing it if word starts with an ASCII capital.

    Args:
        word (str): the original word whose casing should be carried over;
            may be empty.
        newWord (str): the replacement word.

    Returns:
        str: ``newWord.capitalize()`` when the first character of ``word`` is
        in 'A'..'Z', otherwise ``newWord`` unchanged.
    """
    # word[:1] is '' for an empty word, so the comparison is simply False
    # where word[0] used to raise IndexError.
    if 'A' <= word[:1] <= 'Z':
        newWord = newWord.capitalize()
    return newWord

def change_tense(text, to_tense, nlp=nlp):
    """Change the tense of text.

    The text is parsed with ``nlp`` and walked token by token. Tokens that the
    tag checks below identify as verbs are re-conjugated with pattern's
    ``conjugate``; all other tokens are copied through. The first token is
    always copied unchanged.

    Args:
        text (str): text to change.
        to_tense (str): 'present','past', or 'future'
        nlp (SpaCy model, optional): pipeline used to parse ``text``. The
            default is the module-level ``nlp`` as it was when this function
            was defined.

    Returns:
        str: changed text.

    Raises:
        KeyError: if ``to_tense`` is not one of the three supported values.

    """
    # NOTE(review): 'inf' presumably selects the infinitive in pattern's
    # conjugate(), with "will" added separately below - confirm.
    tense_lookup = {'future': 'inf', 'present': PRESENT, 'past': PAST}
    tense = tense_lookup[to_tense]

    doc = nlp(text)

    # `out` collects the output words; `words` is the list of tokens seen so far.
    # NOTE(review): doc[0] is read unconditionally, so an empty text is
    # expected to fail here - confirm.
    out = list()
    out.append(doc[0].text)
    words = []
    for word in doc:
        words.append(word)
        # The first token is already in `out`; everything below looks at the
        # previous token as well.
        if len(words) == 1:
            continue
        # Conjugate the current token when it is a base-form verb after the
        # modal "will", a VBD/VBP/VBZ/VBN form, or a base-form verb that does
        # not follow "to"/"not".
        if (words[-2].text == 'will' and words[-2].tag_ == 'MD' and words[-1].tag_ == 'VB') or \
                        words[-1].tag_ in ('VBD', 'VBP', 'VBZ', 'VBN') or \
                (not words[-2].text in ('to', 'not') and words[-1].tag_ == 'VB'):

            # After a form of "be" (or after "will be") the verb is always put
            # into the past form, whatever tense was requested.
            if words[-2].text in ('were', 'am', 'is', 'are', 'was') or\
                    (words[-2].text == 'be' and len(words) > 2 and words[-3].text == 'will'):
                this_tense = tense_lookup['past']
            else:
                this_tense = tense

            # Grammatical person and number are derived from the verb's subjects.
            subjects = [x.text for x in get_subjects_of_verb(words[-1])]
            if ('I' in subjects) or ('we' in subjects) or ('We' in subjects):
                person = 1
            elif ('you' in subjects) or ('You' in subjects):
                person = 2
            else:
                person = 3
            if is_plural_verb(words[-1]):
                number = PLURAL
            else:
                number = SINGULAR
            # Remove the last emitted word when the previous token is the
            # modal "will" or "had" (presumably that auxiliary itself).
            if (words[-2].text == 'will' and words[-2].tag_ == 'MD') or words[-2].text == 'had':
                out.pop(-1)
            if to_tense == 'future':
                if not (out[-1] == 'will' or out[-1] == 'be'):
                    out.append('will')
                # handle will as a noun in future tense
                if words[-2].text == 'will' and words[-2].tag_ == 'NN':
                    out.append('will')
            #if word_pair[0].dep_ == 'auxpass':
            oldWord = words[-1].text
            out.append(preserve_caps(oldWord, conjugate(oldWord, tense=this_tense, person=person, number=number)))
        else:
            out.append(words[-1].text)

        # negation
        # When the previous and current token texts join to one of these
        # forms, the word before the last emitted one becomes "did"/"do", or
        # is dropped for the future tense.
        if words[-2].text + words[-1].text in ('didnot', 'donot', 'willnot', "didn't", "don't", "won't"):
            if tense == PAST:
                out[-2] = 'did'
            elif tense == PRESENT:
                out[-2] = 'do'
            else:
                out.pop(-2)

        # future perfect, and progressives, but ignore for "I will have cookies"
        # i.e. drop the just-emitted word when the token is "have"/"has" used
        # as an auxiliary (dep_ 'aux') that has at least one ancestor.
        if words[-1].text in ('have', 'has') and len(list(words[-1].ancestors)) and words[-1].dep_ == 'aux':
            out.pop(-1)

    text_out = ' '.join(out)

    # Remove spaces before/after punctuation:
    # for the opening characters ( < [ ' the space after them is removed, for
    # every other punctuation character the space before it.
    for char in string.punctuation:
        if char in """(<['""":
            text_out = text_out.replace(char+' ', char)
        else:
            text_out = text_out.replace(' '+char, char)

    # Same clean-up for characters and clitics that are not in
    # string.punctuation: no space after these ...
    for char in ["-", "“", "‘"]:
        text_out = text_out.replace(char+' ', char)
    # ... and no space before these.
    for char in ["…", "”", "'s", "n't"]:
        text_out = text_out.replace(' '+char, char)

    return text_out

In [None]:
# # fix bug
# try:
#     yield line
# except StopIteration:
#     return

In [None]:
# Try the tense changer on a sample sentence; the next cell displays `r`.
test = 'It is a awesome weekend Sitting at the windows I can see bird chirping'
r = change_tense(test, 'past')

In [None]:
r

# Text augmentation

In [None]:
import nltk
nltk.download('wordnet')

In [None]:
# Easy data augmentation techniques for text classification
# Jason Wei and Kai Zou

import random
from random import shuffle
random.seed(1)

#stop words list: lowercase words that synonym_replacement() never picks for
#replacement; the final '' entry excludes empty tokens as well
stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 
			'ours', 'ourselves', 'you', 'your', 'yours', 
			'yourself', 'yourselves', 'he', 'him', 'his', 
			'himself', 'she', 'her', 'hers', 'herself', 
			'it', 'its', 'itself', 'they', 'them', 'their', 
			'theirs', 'themselves', 'what', 'which', 'who', 
			'whom', 'this', 'that', 'these', 'those', 'am', 
			'is', 'are', 'was', 'were', 'be', 'been', 'being', 
			'have', 'has', 'had', 'having', 'do', 'does', 'did',
			'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
			'because', 'as', 'until', 'while', 'of', 'at', 
			'by', 'for', 'with', 'about', 'against', 'between',
			'into', 'through', 'during', 'before', 'after', 
			'above', 'below', 'to', 'from', 'up', 'down', 'in',
			'out', 'on', 'off', 'over', 'under', 'again', 
			'further', 'then', 'once', 'here', 'there', 'when', 
			'where', 'why', 'how', 'all', 'any', 'both', 'each', 
			'few', 'more', 'most', 'other', 'some', 'such', 'no', 
			'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 
			'very', 's', 't', 'can', 'will', 'just', 'don', 
			'should', 'now', '']

#cleaning up text
import re
def get_only_chars(line):
    """Reduce a line to lowercase letters a-z separated by single spaces.

    Apostrophes (both ' and ’) are dropped; hyphens, tabs and newlines become
    spaces; the text is lowercased; every remaining character that is not a-z
    or a space is replaced by a space. Runs of spaces are then collapsed and
    one leading space, if present, is removed. A trailing space is kept.

    Args:
        line (str): raw text; may be empty.

    Returns:
        str: the cleaned text; '' when nothing usable remains.
    """
    line = line.replace("’", "")
    line = line.replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ")
    line = line.replace("\n", " ")
    line = line.lower()

    allowed = 'qwertyuiopasdfghjklzxcvbnm '
    clean_line = "".join(char if char in allowed else ' ' for char in line)

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    # startswith() is safe on an empty string, whereas clean_line[0] raised
    # IndexError for empty input or input consisting only of apostrophes.
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet
#import nltk
#nltk.download('wordnet')
from nltk.corpus import wordnet 

def synonym_replacement(words, n):
	"""Replace up to n distinct non-stop words of a sentence with WordNet synonyms.

	Candidate words are visited in shuffled order. A word with at least one
	synonym has every occurrence replaced by one randomly chosen synonym.

	Args:
		words (list of str): the tokenized sentence; it is not modified.
		n (int): number of distinct words to replace.

	Returns:
		list of str: the new token list.
	"""
	new_words = words.copy()
	# dict.fromkeys() removes duplicates but keeps first-seen order. The order
	# of list(set(...)) depends on string hashing, which changes between
	# interpreter runs, so results were not reproducible despite random.seed().
	random_word_list = list(dict.fromkeys(word for word in words if word not in stop_words))
	random.shuffle(random_word_list)
	num_replaced = 0
	for random_word in random_word_list:
		synonyms = get_synonyms(random_word)
		if len(synonyms) >= 1:
			synonym = random.choice(list(synonyms))
			new_words = [synonym if word == random_word else word for word in new_words]
			#print("replaced", random_word, "with", synonym)
			num_replaced += 1
		if num_replaced >= n: #only replace up to n words
			break

	# A synonym can be a multi-word phrase (get_synonyms turns "_" into " "),
	# so join and split again to end up with one word per list element.
	sentence = ' '.join(new_words)
	new_words = sentence.split(' ')

	return new_words

def get_synonyms(word):
	"""Return the WordNet synonyms of word as cleaned, lowercase strings.

	For every lemma of every synset of ``word``, underscores and hyphens are
	turned into spaces, the name is lowercased and all characters other than
	a-z and space are dropped. ``word`` itself is excluded.

	Args:
		word (str): the word to look up.

	Returns:
		list of str: unique synonyms in the order WordNet yields them.
	"""
	allowed = ' qwertyuiopasdfghjklzxcvbnm'
	# A dict serves as an insertion-ordered set. With a real set the order of
	# the returned list depended on string hashing and so changed between
	# interpreter runs, although add_word() picks element [0].
	synonyms = {}
	for syn in wordnet.synsets(word): 
		for l in syn.lemmas(): 
			synonym = l.name().replace("_", " ").replace("-", " ").lower()
			synonym = "".join([char for char in synonym if char in allowed])
			synonyms[synonym] = None
	synonyms.pop(word, None)
	return list(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
	"""Randomly delete each word of a sentence with probability p.

	Args:
		words (list of str): the tokenized sentence; it is not modified.
		p (float): probability of dropping each word.

	Returns:
		list of str: the surviving words. An empty or one-word input is
		returned as is. If every word gets deleted, a list holding one
		randomly chosen original word is returned.
	"""
	#nothing to delete from an empty or one-word sentence; the empty case
	#used to reach random.randint(0, -1) below and raise ValueError
	if len(words) <= 1:
		return words

	#randomly delete words with probability p
	new_words = [word for word in words if random.uniform(0, 1) > p]

	#if you end up deleting all words, just return a random word
	if len(new_words) == 0:
		rand_int = random.randint(0, len(words)-1)
		return [words[rand_int]]

	return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
	"""Return a copy of words on which swap_word() has been applied n times."""
	swapped = words.copy()
	for _ in range(n):
		swapped = swap_word(swapped)
	return swapped

def swap_word(new_words):
	"""Swap two randomly chosen, distinct positions of new_words in place.

	The first index is drawn once, then a second index is drawn repeatedly.
	The list is returned unchanged as soon as the fourth draw has been made,
	whatever its value, so only the first three draws can lead to a swap.
	The same list object is returned in every case.
	"""
	upper = len(new_words) - 1
	first = random.randint(0, upper)
	attempts = 0
	while True:
		second = random.randint(0, upper)
		attempts += 1
		if attempts > 3:
			return new_words
		if second != first:
			break
	new_words[first], new_words[second] = new_words[second], new_words[first]
	return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
	"""Return a copy of words after n calls to add_word(), each of which inserts at most one synonym."""
	augmented = words.copy()
	for _ in range(n):
		add_word(augmented)
	return augmented

def add_word(new_words):
	"""Insert a synonym of a randomly picked word at a random position, in place.

	A random word of the list is looked up with get_synonyms() until one has
	synonyms. The function gives up without inserting anything as soon as the
	tenth lookup has been made, whatever its result. Otherwise the first
	synonym is inserted at a random index between 0 and len(new_words)-1.
	"""
	last = len(new_words) - 1
	for attempt in range(1, 11):
		picked = new_words[random.randint(0, last)]
		synonyms = get_synonyms(picked)
		if attempt >= 10:
			return
		if len(synonyms) >= 1:
			break
	chosen_synonym = synonyms[0]
	position = random.randint(0, len(new_words)-1)
	new_words.insert(position, chosen_synonym)

########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
	"""Create augmented variants of a sentence with the four EDA operations.

	The sentence is cleaned with get_only_chars() and split into words. Each
	enabled technique (synonym replacement, random insertion, random swap,
	random deletion) then contributes int(num_aug/4)+1 variants. The variants
	are cleaned, shuffled and trimmed.

	Args:
		sentence (str): the raw sentence.
		alpha_sr (float): share of words to replace by synonyms; <= 0 disables.
		alpha_ri (float): share of words used as number of insertions; <= 0 disables.
		alpha_rs (float): share of words used as number of swaps; <= 0 disables.
		p_rd (float): per-word deletion probability; <= 0 disables.
		num_aug (int or float): if >= 1, the maximum number of variants kept;
			if < 1, each variant is kept with probability
			num_aug / (number of variants).

	Returns:
		list of str: the kept variants followed by the cleaned original
		sentence as the last element.
	"""
	sentence = get_only_chars(sentence)
	words = sentence.split(' ')
	words = [word for word in words if word != '']
	num_words = len(words)

	#no usable word left (e.g. the sentence held only digits or punctuation):
	#the helpers below would draw random indices from an empty list and raise
	if num_words == 0:
		return [sentence]

	augmented_sentences = []
	num_new_per_technique = int(num_aug/4)+1

	#sr
	if (alpha_sr > 0):
		n_sr = max(1, int(alpha_sr*num_words))
		for _ in range(num_new_per_technique):
			a_words = synonym_replacement(words, n_sr)
			augmented_sentences.append(' '.join(a_words))

	#ri
	if (alpha_ri > 0):
		n_ri = max(1, int(alpha_ri*num_words))
		for _ in range(num_new_per_technique):
			a_words = random_insertion(words, n_ri)
			augmented_sentences.append(' '.join(a_words))

	#rs
	if (alpha_rs > 0):
		n_rs = max(1, int(alpha_rs*num_words))
		for _ in range(num_new_per_technique):
			a_words = random_swap(words, n_rs)
			augmented_sentences.append(' '.join(a_words))

	#rd
	if (p_rd > 0):
		for _ in range(num_new_per_technique):
			a_words = random_deletion(words, p_rd)
			augmented_sentences.append(' '.join(a_words))

	augmented_sentences = [get_only_chars(sentence) for sentence in augmented_sentences]
	shuffle(augmented_sentences)

	#trim so that we have the desired number of augmented sentences
	if num_aug >= 1:
		augmented_sentences = augmented_sentences[:num_aug]
	elif augmented_sentences:
		#only when there is something to sample from: with every technique
		#disabled the list is empty and the division below would fail
		keep_prob = num_aug / len(augmented_sentences)
		augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

	#append the original sentence
	augmented_sentences.append(sentence)

	return augmented_sentences

In [None]:
def any2past_augment(train_orig, output_file, alpha_sr, alpha_ri, alpha_rs, alpha_rd, num_aug=9):
  """Augment every sentence of a file with eda() and write it out in past tense.

  Each input line is split on tabs and its first field is taken as the
  sentence. Every variant returned by eda() is converted with
  change_tense(..., 'past') and written as one line consisting of a tab, the
  converted sentence and a newline.

  Args:
    train_orig (str): path of the input file, one sentence per line.
    output_file (str): path of the file to write; it is overwritten.
    alpha_sr, alpha_ri, alpha_rs, alpha_rd (float): passed to eda() as
      alpha_sr, alpha_ri, alpha_rs and p_rd.
    num_aug (int): passed to eda() as num_aug.
  """
  # Context managers close both files even when augmentation raises.
  with open(train_orig, 'r') as reader:
    lines = reader.readlines()

  with open(output_file, 'w') as writer:
    for line in lines:
      # rstrip('\n') removes only the newline; line[:-1] also cut the last
      # character of a final line that has no trailing newline.
      parts = line.rstrip('\n').split('\t')
      sentence = parts[0]
      # Blank lines carry nothing to augment and used to crash the cleaning step.
      if not sentence.strip():
        continue
      aug_sentences = eda(sentence, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, p_rd=alpha_rd, num_aug=num_aug)
      for aug_sentence in aug_sentences:
        # change_tense() reads the first token unconditionally, so skip
        # variants that ended up empty.
        if not aug_sentence.strip():
          continue
        result = change_tense(aug_sentence, 'past')
        writer.write("\t" + result + '\n')

  print("generated augmented sentences with eda for " + train_orig + " to " + output_file + " with num_aug=" + str(num_aug))

In [None]:
from os.path import basename, dirname, join

#the input file, and the output file (when left empty it defaults to
#"eda_" + input file name, placed next to the input file)
input = '/content/test.txt'
output = None
if not output:
    output = join(dirname(input), 'eda_' + basename(input))

#number of augmented sentences to generate per original sentence
num_aug = 9 #default

#how much to replace each word by synonyms
alpha_sr = 0.1 #default

#how much to insert new words that are synonyms
alpha_ri = 0.1 #default

#how much to swap words
alpha_rs = 0.1 #default

#how much to delete words
alpha_rd = 0.1 #default

# if alpha_sr == alpha_ri == alpha_rs == alpha_rd == 0:
#      ap.error('At least one alpha should be greater than zero')

In [None]:
#augment every sentence of the input file with eda, convert each variant to
#past tense and write the results to the output file
any2past_augment(input, output, alpha_sr=alpha_sr, alpha_ri=alpha_ri, alpha_rs=alpha_rs, alpha_rd=alpha_rd, num_aug=num_aug)