# Text Analytics - Assignment 1

Grammatikopoulou Maria - f3352310

Phevos A. Margonis - f3352317

Moniaki Melina - f3352321

In [None]:
import string
import random
import math
import copy
import nltk
import pprint
import Levenshtein
from evaluate import load
from tqdm import tqdm
from nltk.corpus import brown
from nltk import WhitespaceTokenizer
from collections import Counter
from nltk.util import ngrams
from itertools import pairwise, chain, product
from more_itertools import windowed

# Uncomment the lines below for downloading NLTK resources if you haven't already.
# nltk.download('punkt')
# nltk.download('brown')

N = 10
alpha = 0.01  # Smoothing hyperparameter. For Laplace 1. Initial 0.01

In [None]:
# %% Load nltk corpus
# Load all words from Brown Corpus.
all_words = brown.words()
text = ' '.join(all_words)
# Load a specific category from the Brown Corpus.
# category = 'news'
# text = ' '.join(brown.words(categories=category))

In [None]:
# %% Split corpus to sentences
sentences = nltk.sent_tokenize(text)
sentences = [s[:-1] if s.endswith('.') else s for s in sentences]  # Remove trailing fullstops

# %% Split each sentence of the corpus into words
whitespace_wt = WhitespaceTokenizer()
sentences_tokenized = []

for sent in sentences:
    sent_tok = whitespace_wt.tokenize(sent)
    sent_tok = [word.lower() for word in sent_tok] # Convert word to lowercase
    sentences_tokenized.append(sent_tok)

# %% Train-Dev-Test Split
random.seed(4444) # Set a seed for reproducibility
random.shuffle(sentences_tokenized)

# Calculate the lengths of each part
total_len = len(sentences_tokenized)
trainSet_len = int(total_len * 0.8)
devSet_len = int(total_len * 0.1)
testSet_len = int(total_len * 0.1)

# Split the list into three parts
trainSet = sentences_tokenized[:trainSet_len]
devSet = sentences_tokenized[trainSet_len:trainSet_len + devSet_len]
testSet = sentences_tokenized[trainSet_len + devSet_len:]

In [None]:
# %% Vocabulary: For the trainSet
trainSet_words = list(chain(*trainSet))  # List of all the words in trainSet
word_freq = nltk.FreqDist(trainSet_words)  # Counter of word frequencies
vocab_words = [word for word, freq in word_freq.items() if freq > 9]  # Words that appear at least 10 times
vocab_size = len(vocab_words)  # Count of the unique words of trainSet

# %% Replace OOV with UNK
for i, sentence in enumerate(trainSet):
    trainSet[i] = ["UNK" if word not in vocab_words else word for word in sentence]
for i, sentence in enumerate(devSet):
    devSet[i] = ["UNK" if word not in vocab_words else word for word in sentence]
for i, sentence in enumerate(testSet):
    testSet[i] = ["UNK" if word not in vocab_words else word for word in sentence]

# %% Vocabulary: Add 'UNK' because it is needed for (vi) when i have to correct spellcheck 'UNK' words
vocab_words = vocab_words + ['UNK']
vocab_size += 1

In [None]:
# %% TRAINING phase: count n-grams
unigram_counter = Counter()
bigram_counter = Counter()
trigram_counter = Counter()

for sent in trainSet:
    unigram_counter.update([gram for gram in ngrams(sent, 1, pad_left=True, pad_right=True,
                                                    left_pad_symbol='<s>', right_pad_symbol='<e>')])
    bigram_counter.update([gram for gram in ngrams(sent, 2, pad_left=True, pad_right=True,
                                                   left_pad_symbol='<s>', right_pad_symbol='<e>')])
    trigram_counter.update([gram for gram in ngrams(sent, 3, pad_left=True, pad_right=True,
                                                    left_pad_symbol='<s>', right_pad_symbol='<e>')])

# %% Convert ngram counts to ngram log probabilities
bigram_model = {}  # Dictionary to store the bigram log probabilities

for ngram, count in bigram_counter.items():
    first_token, second_token = ngram
    bigram_prob = (bigram_counter[(first_token, second_token)] + alpha) / (unigram_counter[(first_token,)] + alpha*vocab_size)  # P(w2|w1)
    bigram_log_prob = math.log2(bigram_prob)
    bigram_model[ngram] = bigram_log_prob

trigram_model = {}  # Dictionary to store the trigram log probabilities

for ngram, count in trigram_counter.items():
    first_token, second_token, third_token = ngram
    trigram_prob = (trigram_counter[(first_token, second_token, third_token)] + alpha) / (bigram_counter[(first_token, second_token)] + alpha * vocab_size)
    trigram_log_prob = math.log2(trigram_prob)
    trigram_model[ngram] = trigram_log_prob

In [None]:
# %% Calculate LM Cross entropy & perplexity
sum_prob = 0  # Store sum of language probabilities
bigram_cnt = 0  # N: Number of bigrams

for sent in testSet:
    sent = ['<s>'] + sent + ['<e>']

    # Iterate over the bigrmas of the sentence
    for first_token, second_token in pairwise(sent):
        bigram_prob = (bigram_counter[(first_token, second_token)] + alpha) / (unigram_counter[(first_token,)] + alpha*vocab_size)
        sum_prob += math.log2(bigram_prob)
        bigram_cnt += 1

HC = -sum_prob / bigram_cnt
perpl = math.pow(2, HC)
print('=== Bigram Model ===')
print("Cross Entropy: {0:.3f}".format(HC))
print("perplexity: {0:.3f}".format(perpl))

=== Bigram Model ===
Cross Entropy: 7.322
perplexity: 160.059


In [None]:
# %% Tri-gram LM Cross entropy & perplexity
sum_prob = 0  # Store sum of language probabilities
trigram_cnt = 0  # N: Number of trigrams

for sent in testSet:
    sent = ['<s>'] + ['<s>'] + sent + ['<e>']

    for first_token, second_token, third_token in windowed(sent, n=3):
        trigram_prob = (trigram_counter[(first_token, second_token, third_token)] + alpha) / (bigram_counter[(first_token, second_token)] + alpha*vocab_size)
        sum_prob += math.log2(trigram_prob)
        trigram_cnt += 1

HC = -sum_prob / trigram_cnt
perpl = math.pow(2, HC)
print('=== Trigram Model ===')
print("Cross Entropy: {0:.3f}".format(HC))
print("perplexity: {0:.3f}".format(perpl))

=== Trigram Model ===
Cross Entropy: 9.252
perplexity: 609.508


In [None]:
# %% ===Bigram=== Beam search decoding
def autocomplete_bigram(input_text:str, max_depth:int=1, beam_width:int=2) -> str:
    """
    Takes half a sentence. Returns the completed sentence, using a BIGRAM LM.

    The sentence will be tokenized, and the last word will be used as seed.
    A beam search decoder will predict the most probable next words.
    The predicted sentence will be concatenated with the input_text.

    Args:
        input_text: A string sentence that will be completed.
        max_depth: The number of words to predict.
        beam_width: The number of beams/best alternatives to keep.

    Returns:
        str: The input + predicted sentence.
    """
    def generate_candidates(state:list[str]) -> list[list[str]]:
        """Given a sentence, generate possible next words (excluding UNK)"""
        last_word = state[-1]
        next_words = [word for (prev_word, word) in bigram_model if prev_word == last_word and word != 'UNK']
        return [state + [next_word] for next_word in next_words]

    def score(state:list[str]) -> float:
        """Return the probability assigned to this state by the bigram_model"""
        probability = 0.0
        for i in range(1, len(state)):
            prev_word, word = state[i-1], state[i]
            probability += bigram_model.get((prev_word, word), math.log2(1e-10))  # Σ[logP(<w1,w2,w3,...)]
        return probability

    def beam_search_decode(initial_state:list[str], max_depth:int, beam_width:int, generate_candidates_fn, score_fn) -> list[str]:
        """ Takes a word (initial_state), and returns the most probable sentence. """
        candidates = [(initial_state, 0.0)]

        for _ in range(max_depth):
            new_candidates = []
            for candidate, prob in candidates:
                for next_state in generate_candidates_fn(candidate):
                    new_prob = prob + score_fn(next_state)
                    new_candidates.append((next_state, new_prob))

            # Sort candidates. Best first.
            new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
            # If there are no generated candidates from that bigram stop the autocomplete.
            if not new_candidates:
                candidates = [(list(candidates[0][0]) + ['<e>'], candidates[0][1])]
                break
            # Keep the top 2 candidates.
            candidates = new_candidates[:beam_width]

        best_sequence, _ = max(candidates, key=lambda x: x[1])
        return best_sequence

    initial_state = whitespace_wt.tokenize(input_text)
    initial_state = [initial_state[-1]]

    best_sequence = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates, score)

    # Join the input text and the best sequence with spaces
    completed_sentence = ' '.join([input_text] + best_sequence[1:])
    return completed_sentence

# Example usage:
input_text = "I can't believe this"
autocompleted_sentence = autocomplete_bigram(input_text, max_depth=5, beam_width=2)
print(f"Autocompleted: '{autocompleted_sentence}'")

Autocompleted: 'I can't believe this is a few years ago'


In [None]:
# %% ===Trigram Autocomplete===
def autocomplete_trigram(input_text:str, max_depth:int=1, beam_width:int=2) -> str:
    """
    Takes half a sentence. Returns the completed sentence, using a TRIGRAM LM.

    The sentence will be tokenized, and the last word will be used as seed.
    A beam search decoder will predict the most probable next words.
    The predicted sentence will be concatenated with the input_text.

    Args:
        input_text: A string sentence that will be completed.
        max_depth: The number of words to predict.
        beam_width: The number of beams/best alternatives to keep.

    Returns:
        str: The input + predicted sentence.
    """
    def generate_candidates_tri(state:list[str]) -> list[list[str]]:
        """Given a sentence, generate possible next words (excluding UNK)"""
        last_word1, last_word2 = state[-2], state[-1]
        next_words = [word for (prev_word1, prev_word2, word) in trigram_model if prev_word1 == last_word1 and prev_word2 == last_word2 and word != 'UNK']
        return [state + [next_word] for next_word in next_words]

    def score_tri(state:list[str]) -> float:
        """Return the probability assigned to this state by the trigram_model"""
        probability = 0.0
        for i in range(2, len(state)):
            prev_word1, prev_word2, word = state[i-2], state[i-1], state[i]
            probability += trigram_model.get((prev_word1, prev_word2, word), math.log2(1e-10))  # Σ[logP(<w1,w2,w3,...)]
        return probability

    def beam_search_decode_tri(initial_state:list[str], max_depth:int, beam_width:int, generate_candidates_fn, score_fn) -> list[str]:
        """ Takes a word (initial_state), and returns the most probable sentence"""
        candidates = [(initial_state, 0.0)]

        for _ in range(max_depth):
            new_candidates = []
            for candidate, prob in candidates:
                for next_state in generate_candidates_fn(candidate):
                    new_prob = prob + score_fn(next_state)
                    new_candidates.append((next_state, new_prob))

            # Sort candidates. Best first.
            new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
            # If there are no generated candidates from that trigram stop the autocomplete
            if not new_candidates:
                candidates = [(list(candidates[0][0]) + ['<e>'], candidates[0][1])]
                break
            # Keep the top 2 candidates.
            candidates = new_candidates[:beam_width]

        best_sequence, _ = max(candidates, key=lambda x: x[1])
        return best_sequence

    initial_state = whitespace_wt.tokenize(input_text)
    initial_state = initial_state[-2:]

    best_sequence = beam_search_decode_tri(initial_state, max_depth, beam_width, generate_candidates_tri, score_tri)

    # Join the input text and the best sequence with spaces
    completed_sentence = ' '.join([input_text] + best_sequence[2:])  # 2 to hide the seed words
    return completed_sentence

# Example usage:
input_text = "I can't believe this is"
autocompleted_sentence = autocomplete_trigram(input_text, max_depth=6, beam_width=2)
print(f"Autocompleted: '{autocompleted_sentence}'")

Autocompleted: 'I can't believe this is a matter of fact , the'


In [None]:
# %% (iv) Bigram Spellcheck
def spellcheck_bigram(word_list:list[str], beam_width:int=2) -> list[str]:
    """
    Context-aware spelling-corrector.

    Takes a list of words with spelling errors and
    Returns a context-aware spell-corrected sentence as a list
    Using a Bigram LM model and Levenshtein edit distance.

        Args:
            word_list:  the sentence to be corrected as a list of tokenized words
            beam_width (int): the number of beams / best candidates to check

        Returns:
            List[str]: The spell-corrected list
    """
    def generate_candidates(state:list[str]) -> list[list[str]]:
        """Given a sentence, generate possible next words"""
        last_word = state[-1]
        next_words = [word for (prev_word, word) in bigram_model if prev_word == last_word]
        return [state + [next_word] for next_word in next_words]

    def score(state:list[str], word_list:list[str], distances:dict[tuple,int]) -> float:
        """
        Return the probability assigned to this state by the bigram_model
        and the Levenshtein edit distance.
        """
        probability = 0.0
        for i in range(1, len(state)):
            prev_word, word = state[i-1], state[i]
            LM_proba = bigram_model.get((prev_word, word), math.log2(1e-10))
            dist_check = distances.get((word_list[i-1], word), 1e10)
            Edit_dist = math.log2(1/(dist_check + 1))
            probability += 0.2 * LM_proba + 0.8 * Edit_dist
        return probability

    def beam_search_decode(initial_state:list[str], max_depth:int, beam_width:int, generate_candidates_fn, score_fn, word_list, distances) -> list[str]:
        """ Takes a word (initial_state), and returns the most probable sentence. """
        candidates = [(initial_state, 0.0)]

        for _ in range(max_depth):
            new_candidates = []
            for candidate, prob in candidates:
                for next_state in generate_candidates_fn(candidate):
                    new_prob = prob + score_fn(next_state, word_list, distances)
                    new_candidates.append((next_state, new_prob))

            # Sort candidates. Best first.
            new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
            # If there are no generated candidates from that bigram stop the autocomplete.
            if not new_candidates:
                candidates = [(list(candidates[0][0]) + ['<e>'], candidates[0][1])]
                break
            # Keep the top 2 candidates.
            candidates = new_candidates[:beam_width]

        best_sequence, _ = max(candidates, key=lambda x: x[1])
        return best_sequence

    Vocabulary = vocab_words
    distances = {(word, voc_token): Levenshtein.distance(word, voc_token) for word, voc_token in product(word_list, Vocabulary)}
    initial_state = ['<s>']  # To be able to correct even the first input word.
    max_depth = len(word_list)

    best_sequence = beam_search_decode(initial_state, max_depth, beam_width, generate_candidates, score, word_list, distances)

    return best_sequence[1:]  # Excluding the "<start>" token


# Example use:
inputText = ['thes', 'is', 'a', 'now', 'yrk', 'citi']  # word_list to be spell-corrected
print(spellcheck_bigram(inputText))

['this', 'is', 'a', 'new', 'york', 'city']


In [None]:
# %% (iv) TRIGRAM spellcheck
def spellcheck_trigram(word_list:list[str], beam_width=2) -> list[str]:

    """
    Takes a list of words with spelling errors and
    returns a context-aware spell-corrected sentence as a list
    using the trigram LM.

        Parameters:
        - word_list (str):  the sentence to be corrected as a list of tokenized words
        - beam_width (int): the number of beams / best candidates to check

        Returns:
        - List[str]: The spell-corrected list
    """

    def generate_candidates_tri(state:list[str]) -> list[list[str]]:
        """Given a sentence, generate possible next words"""
        last_word1, last_word2 = state[-2], state[-1]
        next_words = [word for (prev_word1, prev_word2, word) in trigram_model if prev_word1 == last_word1 and prev_word2 == last_word2]
        return [state + [next_word] for next_word in next_words]

    def score_tri(state:list[str], word_list:list[str], distances:dict[tuple,int]) -> float:
        """
        Return the probability assigned to this state by the Trigram_model
        and the Levenshtein edit distance.
        """
        probability = 0.0
        for i in range(2, len(state)):
            prev_word1, prev_word2, word = state[i-2], state[i-1], state[i]
            LM_proba = trigram_model.get((prev_word1, prev_word2, word), math.log2(1e-10))
            dist_check = distances.get((word_list[i-2], word), 1e10)
            Edit_dist = math.log2(1/(dist_check + 1))
            probability += 0.2 * LM_proba + 0.8 * Edit_dist
        return probability

    def beam_search_decode_tri(initial_state:list[str],
                               max_depth:int,
                               beam_width:int,
                               generate_candidates_fn,
                               score_fn,
                               word_list:list[str],
                               distances:dict[tuple,int]) -> list[str]:
        """ Takes a word (initial_state), and returns the most probable sentence"""
        candidates = [(initial_state, 0.0)]

        for _ in range(max_depth):
            new_candidates = []
            for candidate, prob in candidates:
                for next_state in generate_candidates_fn(candidate):
                    new_prob = prob + score_fn(next_state, word_list, distances)
                    new_candidates.append((next_state, new_prob))

            # Sort candidates. Best first.
            new_candidates = sorted(new_candidates, key=lambda x: x[1], reverse=True)
            # If there are no generated candidates from that trigram stop the autocomplete
            if not new_candidates:
                candidates = [(list(candidates[0][0]) + ['<e>'], candidates[0][1])]
                break
            # Keep the top 2 candidates.
            candidates = new_candidates[:beam_width]

        best_sequence, _ = max(candidates, key=lambda x: x[1])
        return best_sequence

    Vocabulary = vocab_words
    distances = {(word, voc_token): Levenshtein.distance(word, voc_token) for word, voc_token in product(word_list, Vocabulary)}
    initial_state = ['<s>', '<s>']  # To be able to correct even the first input word.
    max_depth = len(word_list)

    best_sequence = beam_search_decode_tri(initial_state, max_depth, beam_width, generate_candidates_tri, score_tri, word_list, distances)

    return best_sequence[2:]  # Excluding the "<start>" token


# Example use:
inputText = ['thes', 'is', 'a', 'now', 'yrk', 'citi']  # dianisma W gia na gine auto-corrected
print(spellcheck_trigram(inputText))

['this', 'is', 'a', 'new', 'york', 'city']


In [None]:
# %% (v) Generate artificial test data
artTestSet = copy.deepcopy(testSet)
for i, sentence in enumerate(artTestSet):
    for j, word in enumerate(sentence):
        if word != 'UNK':
            # Break the word into characters. Replace with a small proba each character with a random char
            scrambledWord = ''.join([random.choice(string.ascii_letters) if random.random() > 0.9 else letter for letter in word])
            artTestSet[i][j] = scrambledWord

In [None]:
# %% (vi) example use
inputText = artTestSet[98]
print(f"Inpput text:        {inputText}")
print(f"Bigram spellcheck:  {spellcheck_bigram(inputText)}")
print(f"Trigram spellcheck: {spellcheck_trigram(inputText)}")

Inpput text:        ['let', 'af', 'UNK', 'the', 'form', 'of', 'aZ']
Bigram spellcheck:  ['let', 'af', 'UNK', 'the', 'form', 'of', 'a']
Trigram spellcheck: ['let', 'us', 'UNK', 'the', 'UNK', 'of', 'a']


In [None]:
#%% Spell check the artificial test set
to_correct = artTestSet[:100]
to_compare = testSet[:100]

predictions_bigram = []
for sentence in tqdm(to_correct):
    corrected_sentence = spellcheck_bigram(sentence)
    predictions_bigram.append(corrected_sentence)

predictions_trigram = []
for sentence in tqdm(to_correct):
    corrected_sentence = spellcheck_trigram(sentence)
    predictions_trigram.append(corrected_sentence)

100%|██████████| 100/100 [06:00<00:00,  3.61s/it]
100%|██████████| 100/100 [04:55<00:00,  2.96s/it]


In [None]:
#%% Compute WER CER for bigram
references = to_compare

def compute_cer(reference, hypothesis):
    return Levenshtein.distance(reference, hypothesis) / max(len(reference), len(hypothesis))

# Flatten the lists for WER calculation
references_flat = [" ".join(sentence) for sentence in references]
predictions_flat = [" ".join(sentence) for sentence in predictions_bigram]

# Compute WER
wer = load("wer")
wer_score = wer.compute(predictions=predictions_flat, references=references_flat)
print(f"Word Error Rate: {wer_score:.4f}")

# Compute CER
cer_scores = [compute_cer(ref, hyp) for ref, hyp in zip(references_flat, predictions_flat)]
average_cer = sum(cer_scores) / len(cer_scores)
print(f"Character Error Rate: {average_cer:.4f}")

Word Error Rate: 0.3087
Character Error Rate: 0.2636


In [None]:
#%% Compute WER CER for trigram

# Flatten the lists for WER calculation
predictions_flat = [" ".join(sentence) for sentence in predictions_trigram]

# Compute WER
wer = load("wer")
wer_score = wer.compute(predictions=predictions_flat, references=references_flat)
print(f"Word Error Rate: {wer_score:.4f}")

# Compute CER
cer_scores = [compute_cer(ref, hyp) for ref, hyp in zip(references_flat, predictions_flat)]
average_cer = sum(cer_scores) / len(cer_scores)
print(f"Character Error Rate: {average_cer:.4f}")

Word Error Rate: 0.4576
Character Error Rate: 0.3621


# Examples
---

## Autocomplete:
### Notes:
- Bigram model can autocomplete almost any sentence but will tend to cycle through the most common bigrams.
- Trigram model is more fluent but if the 'seed' bigram is not present the autocompletion breakes.

In [None]:
# Example usage:
input_text = "I would like"
print(f"Autocompleted Bigram:  '{autocomplete_bigram(input_text, max_depth=5, beam_width=2)}'")
print(f"Autocompleted Trigram: '{autocomplete_trigram(input_text, max_depth=5, beam_width=2)}'")


Autocompleted Bigram:  'I would like a few years ago ,'
Autocompleted Trigram: 'I would like to see the car <e>'


In [None]:
input_text = "The most popular"
print(f"Autocompleted Bigram:  '{autocomplete_bigram(input_text, max_depth=15, beam_width=2)}'")
print(f"Autocompleted Trigram: '{autocomplete_trigram(input_text, max_depth=15, beam_width=2)}'")

Autocompleted Bigram:  'The most popular as a few years ago , and the same time , and the same time'
Autocompleted Trigram: 'The most popular man on the other hand , the first time in the world , and the'


In [None]:
input_text = "In conclusion"
print(f"Autocompleted Bigram:  '{autocomplete_bigram(input_text, max_depth=15, beam_width=2)}'")
print(f"Autocompleted Trigram: '{autocomplete_trigram(input_text, max_depth=15, beam_width=2)}'")

Autocompleted Bigram:  'In conclusion , and the same time , and the same time , and the same time'
Autocompleted Trigram: 'In conclusion <e>'


In [None]:
input_text = "federal policies"
print(f"Autocompleted Bigram:  '{autocomplete_bigram(input_text, max_depth=15, beam_width=2)}'")
print(f"Autocompleted Trigram: '{autocomplete_trigram(input_text, max_depth=15, beam_width=2)}'")

Autocompleted Bigram:  'federal policies and the same time , and the same time , and the same time ,'
Autocompleted Trigram: 'federal policies will produce a better understanding of the united states , and the other hand ,'


## Context-aware spelling corrector

In [None]:
# %% (vi) example use
inputText = artTestSet[91]
print(f"Inpput text:        {inputText}")
print(f"Bigram spellcheck:  {spellcheck_bigram(inputText)}")
print(f"Trigram spellcheck: {spellcheck_trigram(inputText)}")

Inpput text:        ['i', 'AaJted', 'him', 'p', 'with', 'a', 'UNK', 'UNK']
Bigram spellcheck:  ['i', 'have', 'him', ',', 'with', 'a', 'UNK', 'UNK']
Trigram spellcheck: ['i', 'asked', 'him', ',', 'with', 'a', 'UNK', 'UNK']


In [None]:
# %% (vi) example use
inputText = artTestSet[143]
print(f"Inpput text:        {inputText}")
print(f"Bigram spellcheck:  {spellcheck_bigram(inputText)}")
print(f"Trigram spellcheck: {spellcheck_trigram(inputText)}")

Inpput text:        ['and', 'onc', 'had', 'been', 'too', 'mPny']
Bigram spellcheck:  ['and', 'UNK', 'had', 'been', 'too', 'many']
Trigram spellcheck: ['and', 'one', 'had', 'been', 'too', 'many']


In [None]:
# %% (vi) example use
inputText = artTestSet[191]
print(f"Inpput text:        {inputText}")
print(f"Bigram spellcheck:  {spellcheck_bigram(inputText)}")
print(f"Trigram spellcheck: {spellcheck_trigram(inputText)}")

Inpput text:        ['yoc', 'cyuld', 'wish', 'that']
Bigram spellcheck:  ['you', 'could', 'wish', 'that']
Trigram spellcheck: ['you', 'could', 'wish', 'for']


In [None]:
# %% (vi) example use
inputText = artTestSet[348]
print(f"Inpput text:        {inputText}")
print(f"Bigram spellcheck:  {spellcheck_bigram(inputText)}")
print(f"Trigram spellcheck: {spellcheck_trigram(inputText)}")

Inpput text:        ['they', 'walked', 'tAwPrd', 'each', 'otheU']
Bigram spellcheck:  ['they', 'walked', 'toward', 'each', 'other']
Trigram spellcheck: ['they', 'walked', 'the', 'deck', ',']
