## N-Gram

For this solution we use the N-Gram model approach. The steps will be as follows:

1. Ingest data
2. Generate vocab, encoder, decoder
3. Generate a count dictionary of all n-grams generated from our data. This will serve as our frequency lookup table.
4. Read test data.
5. Iteratively for each character:

  * Build the context n-gram of the length specified earlier, left-padding with the padding token `<S>` where necessary to maintain that length.

  * Use our frequency lookup table to return the count of the current n-gram for each candidate next character

  * Divide by the total number of occurrences of all n-grams

  * Normalize these probabilities and return the probability matrix generated

  * Look up generated probability of our expected character

  * Calculate the base-2 logarithm of that probability and subtract it from the overall loss of the model

  * Remove the first character of the context and append the current character to its end


Part of the code modified from: https://www.youtube.com/watch?v=zz1CFBS4NaY



In [1]:
def suggest_next_char(input_, ngram_counts, vocab, encoder, len_gram):
    """Return the next-token distribution given the tail of ``input_``.

    The last ``len_gram - 1`` tokens of ``input_`` form the context. For every
    entry of ``vocab`` the count of ``context + (entry,)`` is looked up in
    ``ngram_counts`` and the counts are normalised so that they sum to 1.

    Args:
        input_: String or sequence of string tokens seen so far. When it holds
            fewer than ``len_gram - 1`` tokens it is left-padded with ``'<S>'``.
        ngram_counts: Mapping from ``len_gram``-tuples of tokens to counts.
        vocab: Iterable of candidate next tokens; the result follows its order.
        encoder: Unused; kept so existing callers keep working.
        len_gram: Length of the n-grams stored in ``ngram_counts``.

    Returns:
        A list of ``(token, probability)`` tuples, one per distinct vocab
        entry. Every probability is 0.0 when no n-gram with this context is
        present in ``ngram_counts``.
    """
    pad_token = '<S>'
    candidates = list(vocab)
    known_tokens = set(candidates)

    # Lower-case raw characters, but leave tokens that already belong to the
    # vocabulary (notably the '<S>' padding token) untouched: lower-casing
    # '<S>' yields '<s>', which never matches a stored n-gram.
    tokens = [tok if tok in known_tokens else tok.lower() for tok in input_]

    context_len = len_gram - 1
    if context_len > 0:
        context = tokens[-context_len:]
        # Left-pad short inputs so the context always has the full length.
        context = [pad_token] * (context_len - len(context)) + context
    else:
        # A unigram model has no context at all.
        context = []
    context = tuple(context)

    # A dict keeps one entry per distinct candidate, in first-seen order.
    counts = {
        candidate: ngram_counts.get(context + (str(candidate),), 0)
        for candidate in candidates
    }

    # Dividing by the corpus-wide n-gram total cancels out once the values are
    # normalised, so the raw counts are normalised directly.
    context_total = sum(counts.values())
    if context_total == 0:
        return [(candidate, 0.0) for candidate in counts]

    return [
        (candidate, count / context_total)
        for candidate, count in counts.items()
    ]

In [2]:
from math import log2, inf

def evaluate_one(input, ngram_counts, vocab, encoder, len_gram):
  """Return the average per-character cross-entropy (in bits) of ``input``.

  The text is lower-cased and scored one character at a time: each character's
  probability given the preceding context is obtained from
  ``predict_next_proba`` and its negative base-2 logarithm is accumulated.

  Args:
    input: Text to score. (The name shadows the builtin but is kept because
      it is part of the function's signature.)
    ngram_counts: Mapping from ``len_gram``-tuples of tokens to counts.
    vocab: Candidate tokens, forwarded to ``predict_next_proba``.
    encoder: Mapping from token to its index, forwarded likewise.
    len_gram: Length of the n-grams stored in ``ngram_counts``.

  Returns:
    A one-element list holding the accumulated loss divided by the number of
    characters, or ``[0.0]`` when ``input`` is empty.
  """
  # Function-scope import keeps this block self-contained.
  from collections import deque

  text = input.lower()
  if not text:
    # Nothing to score; avoid dividing by zero below.
    return [0.0]

  # Start from an all-padding context. Only the trailing tokens are ever used
  # as context, so a bounded window of len_gram tokens is sufficient and
  # discards the oldest token automatically on each append.
  history = deque(['<S>'] * len_gram, maxlen=len_gram)
  loss_anything_goes = 0

  for c in text:
    loss_anything_goes -= log2(
        predict_next_proba(c, history, ngram_counts, vocab, encoder, len_gram))
    history.append(c)

  return [loss_anything_goes / len(text)]


def predict_next_proba(c, history, ngram_counts, vocab, encoder, len_gram):
  """Return the model's probability of character ``c`` following ``history``.

  Args:
    c: The character that actually comes next.
    history: Tokens seen so far, passed to ``suggest_next_char`` as context.
    ngram_counts: Mapping from ``len_gram``-tuples of tokens to counts.
    vocab: Candidate tokens; the prediction list is indexed in this order.
    encoder: Mapping from token to its index in the prediction list.
    len_gram: Length of the n-grams stored in ``ngram_counts``.

  Returns:
    The probability assigned to ``c``. When that probability is zero, or
    ``c`` has no entry in ``encoder``, 0.5 is returned instead so that the
    caller can safely take a logarithm.
  """
  # NOTE(review): 0.5 is a generous stand-in for an unseen event (it costs
  # only one bit of loss) — confirm this is the intended smoothing.
  fallback = 0.5

  position = encoder.get(c)
  if position is None:
    # Character absent from the encoder: treat it like any other
    # zero-probability event instead of raising KeyError.
    return fallback

  # Distribution over the vocabulary for the current context.
  y_pred = suggest_next_char(history, ngram_counts, vocab, encoder, len_gram)

  proba = y_pred[position][1]
  return proba if proba != 0.0 else fallback

## Test Swahili

### Ingest Data

In [None]:
import numpy as np

path_train = r"/content/sw-train.txt"
path_test = r"/content/sw-test.txt"

# Length of the n-grams used by the model.
len_gram = 6

# Read the first line of the training file; the context manager closes the
# handle as soon as the line has been read.
# NOTE(review): any further lines in the file are ignored — confirm the corpus
# is stored on a single line.
with open(path_train) as train_file:
    corpus = train_file.readline()

corpus = corpus.replace("\n", " ")

# Character-level tokens, preceded by len_gram padding tokens so that the
# first real characters also have a full-length context.
lower_case_corpus = ['<S>'] * len_gram + [w.lower() for w in corpus]

# Sorted vocabulary plus the token <-> index lookup tables derived from it.
vocab = sorted(set(lower_case_corpus))
encoder = {c: i for i, c in enumerate(vocab)}
decoder = {i: c for i, c in enumerate(vocab)}

In [None]:
from collections import Counter

# Frequency table of every window of len_gram consecutive tokens in the
# corpus, keyed by the window as a tuple. Counter is a dict subclass, so
# .get() and .values() behave as they do on a plain dict.
ngram_counts = Counter(
    tuple(lower_case_corpus[start:start + len_gram])
    for start in range(len(lower_case_corpus) - len_gram + 1)
)
    

### Evaluate against test data

In [None]:
# Read the held-out Swahili text; the context manager closes the file handle.
# NOTE: the name `input` shadows the builtin; it is kept unchanged so that any
# cell reading this variable keeps working.
with open(path_test) as test_file:
    input = test_file.readlines()

# The training corpus had its newline replaced by a space before the vocabulary
# was built, so do the same here (as the Kwere evaluation does); otherwise a
# trailing "\n" is looked up in an encoder that has no entry for it.
input[0] = input[0].replace("\n", " ")
print("Swahili cross entropy loss: ", evaluate_one(input[0], ngram_counts, vocab, encoder, len_gram))

## Test Kwere

### Ingest Data

In [3]:
import numpy as np

path_train_ = r"/content/cwe-train.txt"
path_test_ = r"/content/cwe-test.txt"

# Length of the n-grams used by the model.
len_gram_ = 6

# Read the first line of the training file; the context manager closes the
# handle as soon as the line has been read.
# NOTE(review): any further lines in the file are ignored — confirm the corpus
# is stored on a single line.
with open(path_train_) as train_file_:
    corpus_ = train_file_.readline()

corpus_ = corpus_.replace("\n", " ")

# Character-level tokens, preceded by len_gram_ padding tokens so that the
# first real characters also have a full-length context.
lower_case_corpus_ = ['<S>'] * len_gram_ + [w.lower() for w in corpus_]

# Sorted vocabulary plus the token <-> index lookup tables derived from it.
vocab_ = sorted(set(lower_case_corpus_))
encoder_ = {c: i for i, c in enumerate(vocab_)}
decoder_ = {i: c for i, c in enumerate(vocab_)}

In [4]:
from collections import Counter

# Frequency table of every window of len_gram_ consecutive tokens in the
# corpus, keyed by the window as a tuple. Counter is a dict subclass, so
# .get() and .values() behave as they do on a plain dict.
ngram_counts_ = Counter(
    tuple(lower_case_corpus_[start:start + len_gram_])
    for start in range(len(lower_case_corpus_) - len_gram_ + 1)
)
    

### Evaluate against test data

In [11]:
# Read the held-out Kwere text; the context manager closes the file handle.
with open(path_test_) as test_file_:
    input_ = test_file_.readlines()

# Mirror the training preprocessing, which replaced the newline with a space.
input_[0] = input_[0].replace("\n", " ")
print("Kwere cross entropy loss: ", evaluate_one(input_[0], ngram_counts_, vocab_, encoder_, len_gram_))

Kwere cross entropy loss:  [1.2741946778164173]


## Generate sentences

In [None]:
input_sent = 'maji katika'
sent_len = 100

# Greedy generation: repeatedly append the most probable next character.
# The Kwere counts are paired with the Kwere vocab, encoder and n-gram length
# so that all model pieces come from the same corpus.
# NOTE(review): swap in ngram_counts / vocab / encoder / len_gram to generate
# from the Swahili model instead.
for i in range(sent_len):
  next_char = suggest_next_char(input_sent, ngram_counts_, vocab_, encoder_, len_gram_)
  print(input_sent)
  # The suggestions come back in vocabulary order, not ranked, so pick the
  # highest-probability entry explicitly. The padding token is skipped because
  # it is not a printable character.
  best_char, _ = max(
      (cand for cand in next_char if cand[0] != '<S>'),
      key=lambda cand: cand[1],
      default=('', 0.0),
  )
  input_sent = input_sent + best_char

print(input_sent)

