In [10]:
import itertools
import torch
import nltk
import re

from collections import defaultdict, Counter

# Fetch NLTK's 'punkt' data, which is used to tokenize the text.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead — confirm
# against the installed version.
nltk.download('punkt', quiet=True)

True

In [11]:
# Some utilities to manipulate the corpus

def preprocess(text):
  """Split `text` into lines, dropping #comments and empty lines.

  Everything from the first '#' to the end of a line is treated as a comment
  and removed. Surrounding whitespace is stripped *after* the comment has been
  removed, so the blanks that separate content from a trailing comment do not
  survive in the result.

  Args:
    text: the raw string to clean up.

  Returns:
    A list with one stripped string per non-empty line, in the original order.
  """
  result = []
  for line in text.split("\n"):
    # Cut the comment first, then strip: the other order leaves the whitespace
    # in front of a trailing comment attached to the line.
    content = line.partition('#')[0].strip()
    if content:
      result.append(content)
  return result

def nltk_normpunc_tokenize(str):
  """Lower-case `str` and tokenize it with nltk.tokenize.word_tokenize."""
  lowered = str.lower()
  return nltk.tokenize.word_tokenize(lowered)

def split(list, portions, offset):
  """Partition `list` by index into two lists.

  Items whose index satisfies `index % portions == offset` go to the second
  list; every other item goes to the first. Relative order is preserved.

  Returns:
    A tuple `(others, selected)`.
  """
  others, selected = [], []
  for index in range(len(list)):
    bucket = selected if index % portions == offset else others
    bucket.append(list[index])
  return (others, selected)

def tokenize_lines(lines):
  """Tokenize every line and concatenate the results into one flat list.

  Each line contributes a "<s>" marker followed by the tokens produced by
  nltk_normpunc_tokenize for that line.
  """
  tokens = []
  for text in lines:
    tokens.append("<s>")
    tokens.extend(nltk_normpunc_tokenize(text))
  return tokens

Download the corpus

In [12]:
import nltk
# Download through the nltk module already imported in this kernel, so the
# corpus lands in the environment the notebook actually runs in (a `!python`
# subprocess may resolve to a different interpreter). The trailing `;` keeps
# the boolean return value out of the cell output.
nltk.download('gutenberg');

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Load the corpus

In [13]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize, word_tokenize

# Read the raw 'carroll-alice.txt' text and clean it into a list of lines.
alice_raw = gutenberg.raw('carroll-alice.txt')
lines = preprocess(alice_raw)

# Lines whose index is a multiple of 12 form the test set; the rest is training data.
train_lines, test_lines = split(lines, 12, 0)

# Flatten each subset into a single token stream.
train_tokens, test_tokens = tokenize_lines(train_lines), tokenize_lines(test_lines)

In [14]:
# Extract vocabulary from dataset.
# Sorting makes the order deterministic: iterating a set of strings can yield a
# different order on every kernel restart, which would make anything that
# depends on vocabulary positions irreproducible.
vocabulary = sorted(set(train_tokens))

In [15]:
# Creating the n-grams
def all_ngrams(vocabulary, n):
  """Return every possible n-gram over `vocabulary` as a list of tuples.

  The result is the n-fold Cartesian product of `vocabulary` with itself, so
  it holds len(vocabulary)**n tuples.
  """
  combinations = itertools.product(vocabulary, repeat=n)
  return [combination for combination in combinations]

def ngrams(tokens, n):
  """Return the n-grams of `tokens` as a list of tuples.

  A window of length `n` is slid over `tokens` one position at a time; each
  window becomes one tuple. Sequences shorter than `n` produce an empty list.
  """
  windows = []
  last_start = len(tokens) - n
  for start in range(last_start + 1):
    windows.append(tuple(tokens[start:start + n]))
  return windows

In [16]:
# Show the first six training tokens and the trigrams built from them.
sample_tokens = train_tokens[:6]
print(sample_tokens)
print(ngrams(sample_tokens, 3))

['<s>', 'chapter', 'i.', 'down', 'the', 'rabbit-hole']
[('<s>', 'chapter', 'i.'), ('chapter', 'i.', 'down'), ('i.', 'down', 'the'), ('down', 'the', 'rabbit-hole')]


In [17]:
# Counting the ngrams
def ngram_counts(vocabulary, tokens, n, dense=True):
  """Count how often each target token follows each (n-1)-token context.

  Args:
    vocabulary: token types used to zero-fill the table.
    tokens: sequence of tokens in which n-grams are counted.
    n: n-gram order, at least 1. For n == 1 the only context is the empty
      tuple.
    dense: when True (the default) every context over `vocabulary` gets an
      explicit 0 entry for every target before the observed counts are
      written. That stores len(vocabulary)**n numbers, which grows very
      quickly with n. Pass False to store only the n-grams observed in
      `tokens`; missing entries still read as 0 because both dictionary
      levels are defaultdicts (note that reading a missing entry inserts it).

  Returns:
    A defaultdict mapping each context tuple to a defaultdict(int) that maps
    each target token to its count.

  Raises:
    ValueError: if n is smaller than 1.
  """
  if n < 1:
    raise ValueError("n must be at least 1")

  context_dict = defaultdict(lambda: defaultdict(int))

  if dense:
    # Stream the contexts straight out of itertools.product rather than
    # building the full list of contexts first.
    for context in itertools.product(vocabulary, repeat=n-1):
      for target in vocabulary:
        context_dict[context][target] = 0

  # Overwrite the zeros (or create the entries) with the observed counts.
  for ngram, count in Counter(ngrams(tokens, n)).items():
    context_dict[ngram[:-1]][ngram[-1]] = count

  return context_dict

In [None]:
# Build the count tables for n-gram orders 1, 2 and 3 from the training tokens.
# ngram_counts zero-fills one entry per (context, target) pair over the whole
# vocabulary, so each table holds len(vocabulary)**n entries; the trigram
# table is by far the most expensive of the three.
unigram_counts = ngram_counts(vocabulary, train_tokens, 1)
bigram_counts = ngram_counts(vocabulary, train_tokens, 2)
trigram_counts = ngram_counts(vocabulary, train_tokens, 3)