In [1]:
import itertools
import torch
import nltk
import re
import wget
from sys import getsizeof
import os
import pandas as pd
import numpy as np
import random

from collections import defaultdict, Counter

nltk.download('punkt', quiet=True) # this module is used to tokenize the text

SEED = 1234
random.seed(SEED)

In [2]:
# Some utilities to manipulate the corpus

def preprocess(text):
    """Strips #comments and empty lines from a string
    """
    result = []
    for line in text.split("\n"):
        line = line.strip()
        line = re.sub('#.*$', '', line)
        if line != '':
            result.append(line)
    return result

def nltk_normpunc_tokenize(str):
    return nltk.tokenize.word_tokenize(str.lower())

def split(list, portions, offset):
    return ([list[i] for i in range(0, len(list)) if i%portions != offset],
          [list[i] for i in range(0, len(list)) if i%portions == offset])

def SMSSpamCollection_tokenize(lines):
    result = []
    for line in lines:
        # tokenize
        tokens = nltk_normpunc_tokenize(line)
        if tokens[0] == "ham":
            tokens[0] = "HAM:"
        elif tokens[0] == "spam":
            tokens[0] = "SPAM:"
        # add a start of message token
        result += ["<s>"] + tokens

    return result

def postprocess(tokens):
    return ' '.join(tokens)\
                .replace("<s> ", "\n")

Download the corpus

In [3]:
corpus_filename = ("https://github.com/DanielLevi6/NLP-home-assignment/tree/main/data/"
                  "SMSSpamCollection.txt")
os.makedirs('data', exist_ok=True)
wget.download(corpus_filename, out="data/")

-1 / unknown

'data//SMSSpamCollection.txt'

Load the corpus

In [4]:
with open("data/SMSSpamCollection.txt", 'r') as fin:
    lines = preprocess(fin.read())[:40]
    train_lines, test_lines = split(lines, 12, 0)
    train_tokens = SMSSpamCollection_tokenize(train_lines)
    test_tokens = SMSSpamCollection_tokenize(test_lines)

In [5]:
print(train_tokens[:50])
print(postprocess(train_tokens[:50]))
print(test_tokens[:50])
print(postprocess(test_tokens[:50]))

['<s>', '<', 'html', 'lang=', "''", 'en', "''", 'data-color-mode=', "''", 'auto', "''", 'data-light-theme=', "''", 'light', "''", 'data-dark-theme=', "''", 'dark', "''", 'data-a11y-animated-images=', "''", 'system', "''", '>', '<s>', '<', 'head', '>', '<s>', '<', 'meta', 'charset=', "''", 'utf-8', "''", '>', '<s>', '<', 'link', 'rel=', "''", 'dns-prefetch', "''", 'href=', "''", 'https', ':', '//github.githubassets.com', "''", '>']

< html lang= '' en '' data-color-mode= '' auto '' data-light-theme= '' light '' data-dark-theme= '' dark '' data-a11y-animated-images= '' system '' > 
< head > 
< meta charset= '' utf-8 '' > 
< link rel= '' dns-prefetch '' href= '' https : //github.githubassets.com '' >
['<s>', '<', '!', 'doctype', 'html', '>', '<s>', '<', 'link', 'crossorigin=', "''", 'anonymous', "''", 'media=', "''", 'all', "''", 'rel=', "''", 'stylesheet', "''", 'href=', "''", 'https', ':', '//github.githubassets.com/assets/primer-7e8db5e0affc.css', "''", '/', '>', '<s>', '<', 'script', 

In [6]:
# Extract vocabulary from dataset
vocabulary = list(set(train_tokens)) + list(set(test_tokens))

In [7]:
# Creating the n-grams
def all_ngrams(vocabulary, n):
    return list(itertools.product(vocabulary, repeat=n))

def ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(0, len(tokens)-n+1)]

In [8]:
print(train_tokens[:6])
print(ngrams(train_tokens[:6], 3))

['<s>', '<', 'html', 'lang=', "''", 'en']
[('<s>', '<', 'html'), ('<', 'html', 'lang='), ('html', 'lang=', "''"), ('lang=', "''", 'en')]


In [9]:
# Counting the ngrams
def ngram_counts(vocabulary, tokens, n):
    context_dict = defaultdict(lambda: defaultdict(int))
    for context in all_ngrams(vocabulary, n-1):
        for target in vocabulary:
            context_dict[context][target] = 0

    for ngram, count in Counter(ngrams(tokens, n)).items():
        context_dict[ngram[:-1]][ngram[-1]] = count

    return context_dict

In [10]:
unigram_counts = ngram_counts(vocabulary, train_tokens, 1)
bigram_counts = ngram_counts(vocabulary, train_tokens, 2)
trigram_counts = ngram_counts(vocabulary, train_tokens, 3)

In [11]:
tokens_count = len(train_tokens)
unigram_count = sum(len(unigram_counts[cntxt]) for cntxt in unigram_counts)
bigram_count = sum(len(bigram_counts[cntxt]) for cntxt in bigram_counts)
trigram_count = sum(len(trigram_counts[cntxt]) for cntxt in trigram_counts)

In [12]:
# Report on the totals
print(f"Tokens: {tokens_count:6}\n"
     f"Unigrams: {unigram_count:6}\n"
     f"Bigrams: {bigram_count:6}\n"
     f"Trigrams: {trigram_count:6}\n")


Tokens:   6569
Unigrams:    392
Bigrams: 153664
Trigrams: 60236288



In [13]:
def ngram_model(ngram_counts):
    probs = defaultdict(lambda: defaultdict(int))
    for cntxt, distrib in ngram_counts.items():
        total_count = sum(distrib.values())
        for token in distrib:
            probs[cntxt][token] = distrib[token] / total_count if total_count > 0 else 0.0
    
    return probs

In [14]:
unigram_model = ngram_model(unigram_counts)
bigram_model = ngram_model(bigram_counts)
trigram_model = ngram_model(trigram_counts)

In [15]:
print(f"Tokens: {getsizeof(train_tokens):6}\n"
      f"Unigrams: {getsizeof(unigram_model):6}\n"
      f"Bigrams: {getsizeof(bigram_model):6}\n"
      f"Trigrams: {getsizeof(trigram_model):6}")

Tokens:  54712
Unigrams:    240
Bigrams:  18528
Trigrams: 5242976


In [16]:
# Perplexity calculation
import math

def neglogprob(tokens, model, n):
    score = 0.0
    context = tokens[0:n-1]
    for token in tokens[n-1:]:
        prob = model[tuple(context)][token]
        score += -math.log2(prob) if prob > 0 else math.inf
        context = (context +[token])[1:]
    
    return score

def perplexity(tokens, model, n):
    return 2**(neglogprob(tokens, model, n) / (len(tokens) -n +1))

print(f"Test perplexity - unigram: {perplexity(test_tokens, unigram_model, 1):.3f}\n"
      f"Test perplexity - bigram: {perplexity(test_tokens, bigram_model, 2):.3f}\n"
      f"Test perplexity - trigram: {perplexity(test_tokens, trigram_model, 3):.3f}\n")

Test perplexity - unigram: inf
Test perplexity - bigram: inf
Test perplexity - trigram: inf



In [17]:
# Delta smoothing

def ngram_model_smoothed(ngram_counts, delta=2):
    vocab_size = len(list(ngram_counts.items())[0][1])
    probs = defaultdict(lambda: defaultdict(int))
    for cntxt, distrib in ngram_counts.items():
        total_count = sum(distrib.values())
        for token in distrib:
            probs[cntxt][token] = (distrib[token] + delta) / (total_count + vocab_size * delta)
            if probs[cntxt][token] == 0:
                print("{context} {token} prob is zero")
                
    return probs

In [18]:
unigram_model_smoothed = ngram_model_smoothed(unigram_counts)
bigram_model_smoothed = ngram_model_smoothed(bigram_counts)
trigram_model_smoothed = ngram_model_smoothed(trigram_counts)
print(f"Test smoothed perplexity - unigram: {perplexity(test_tokens, unigram_model_smoothed, 1):.3f}\n"
      f"Test smoothed perplexity - bigram: {perplexity(test_tokens, bigram_model_smoothed, 2):.3f}\n"
      f"Test smoothed perplexity - trigram: {perplexity(test_tokens, trigram_model_smoothed, 3):.3f}\n")

Test smoothed perplexity - unigram: 123.649
Test smoothed perplexity - bigram: 50.845
Test smoothed perplexity - trigram: 58.432



In [19]:
def ngram_model_smoothed_kneser_ney(ngram_model, test_data, n):
    # Compute total number of words in test data
    total_words = len(test_data)

    # Compute the count of n-grams in the training data
    ngram_counts = defaultdict(int)
    for ngram in ngram_model:
        ngram_counts[ngram] += 1

    # Compute the count of unique n-1 grams in the training data
    n_1gram_counts = defaultdict(int)
    for ngram in ngram_model:
        n_1gram = tuple(ngram[:-1])
        n_1gram_counts[n_1gram] += 1

    # Compute the total number of unique n-1 grams in the training data
    total_n_1grams = len(n_1gram_counts)

    # Compute the total number of unique words in the training data
    total_vocabulary = len(set([word for ngram in ngram_model for word in ngram]))

    # Compute the perplexity of the test data
    log_prob = 0.0
    for i in range(n-1, len(test_data)):
        ngram = tuple(test_data[i-n+1:i+1])
        n_1gram = ngram[:-1]

        count_ngram = ngram_counts[ngram]
        count_n_1gram = n_1gram_counts[n_1gram]

        prob = max(count_ngram - 0.75, 0) / count_n_1gram
        prob += (0.75 / count_n_1gram) * (total_n_1grams / total_vocabulary)

        log_prob += math.log2(prob)

    perplexity = 2 ** (-log_prob / total_words)

    return perplexity

In [20]:
unigram_model_kneser_ney_smoothed = kneser_ney_perplexity(unigram_counts)
bigram_model_kneser_ney_smoothed = kneser_ney_perplexity(bigram_counts)
trigram_model_kneser_ney_smoothed = kneser_ney_perplexity(trigram_counts)
print(f"Test smoothed perplexity - unigram: {perplexity(test_tokens, unigram_model_smoothed, 1):.3f}\n"
      f"Test smoothed perplexity - bigram: {perplexity(test_tokens, bigram_model_smoothed, 2):.3f}\n"
      f"Test smoothed perplexity - trigram: {perplexity(test_tokens, trigram_model_smoothed, 3):.3f}\n")

NameError: name 'kneser_ney_perplexity' is not defined