# N-gram Language Models

In [None]:
from datasets import load_dataset
from collections import Counter, defaultdict
import math
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

## Import dataset

The dataset that we will use is the DreamBank dataset (https://huggingface.co/datasets/DReAMy-lib/DreamBank-dreams-en). It is a collection of ~20k textual dream reports, originally scraped from the DreamBank database by mattbierner. The DreamBank reports are divided into series; each series is a collection of dreams contributed by one individual or gathered by one research project/group.

In [None]:
# Download the DreamBank corpus and flatten its "train" split into a DataFrame;
# the "dreams" column is cast to str so every report can be joined/tokenized.
dataset = load_dataset("DReAMy-lib/DreamBank-dreams-en")
df = dataset["train"].to_pandas()
df["dreams"] = df["dreams"].astype(str)

# Hold out one complete series ('natural_scientist') as the test set and
# train on all remaining series.
train_df = df.query("series != 'natural_scientist'")
test_df = df.query("series == 'natural_scientist'")

# Plain Python lists of report strings, one entry per dream.
train_list = train_df["dreams"].tolist()
test_list = test_df["dreams"].tolist()

print("Number of dream reports in the training set:", len(train_list))
print("Number of dream reports in the test set:", len(test_list))

In [None]:
# Training corpus: concatenate all reports into one lower-cased string,
# then split it into word tokens.
train_strings = " ".join(train_list).lower()
train_tokens = word_tokenize(train_strings)
print("Number of tokens in the training set:", len(train_tokens))

# Vocabulary = distinct token types observed in the training corpus.
vocab = set(train_tokens)
print("Vocabulary size:", len(vocab))

# Test corpus: identical preprocessing (join -> lower-case -> word-tokenize).
test_strings = " ".join(test_list).lower()
test_tokens = word_tokenize(test_strings)
print("Number of tokens in the test set:", len(test_tokens))

## Train and evaluate n-gram language models with NLTK

In [None]:
from nltk.lm import MLE, StupidBackoff, Laplace
from nltk.lm.preprocessing import pad_both_ends, padded_everygram_pipeline
from nltk.util import ngrams

In [None]:
# Split each corpus string into sentences, then each sentence into word tokens,
# giving one list of tokens per sentence (the input shape the nltk.lm
# preprocessing helpers work on).
# NOTE(review): train_strings/test_strings are already lower-cased at this
# point; sentence splitting may behave differently than on the original-case
# text -- confirm this is intended.
train_sents = sent_tokenize(train_strings)
print(len(train_sents))
train_sents_tokens = list(map(word_tokenize, train_sents))

test_sents = sent_tokenize(test_strings)
print(len(test_sents))
test_sents_tokens = list(map(word_tokenize, test_sents))

In [None]:
# Sanity check: display the first two tokenized test sentences.
test_sents_tokens[:2]

In [None]:
# Highest n-gram order used by the model (3 -> trigram model).
n = 3

# Estimator under study. The two commented-out lines are the alternative
# estimators to swap in when comparing models.
#lm_mle = MLE(n) # Maximum Likelihood Estimate
#lm_sb = StupidBackoff(order = n) # Stupid Backoff
lm_laplace = Laplace(n) # Laplace smoothing

# Pad every training sentence and produce (a) all 1..n-grams per sentence and
# (b) a flat stream of padded tokens for the vocabulary.
# Both results are lazy, single-pass iterators: they are exhausted once a model
# has been fitted on them and must be rebuilt before fitting again.
# Note that `vocab` is rebound here; it no longer refers to the token set
# computed from `train_tokens` earlier in the notebook.
train, vocab = padded_everygram_pipeline(n, train_sents_tokens)

In [None]:
# Fit the Laplace model on the padded everygrams and build its vocabulary.
# NOTE: `train` and `vocab` are produced by padded_everygram_pipeline; if they
# are one-shot iterators (as the NLTK docs describe), re-running this cell
# without first re-running the cell that creates them will see no data.
lm_laplace.fit(train, vocab)
# Summary of the fitted vocabulary, followed by its size.
print(lm_laplace.vocab)
print(len(lm_laplace.vocab))

In [None]:
# Print the model's n-gram counter object.
print(lm_laplace.counts)

In [None]:
# Count stored for the single token 'dream' (string key -> unigram lookup).
lm_laplace.counts['dream']

In [None]:
# Count of 'love' given the one-word context ['i'], i.e. the bigram "i love".
lm_laplace.counts[['i']]['love']

In [None]:
# Count of 'want' given the one-word context ['i'], i.e. the bigram "i want".
lm_laplace.counts[['i']]['want']

In [None]:
# Score of the word "i" with no context supplied.
lm_laplace.score("i")

In [None]:
# Score of "davide" with no context supplied.
# Presumably this word is absent from the training data and illustrates how an
# unseen word is scored under Laplace smoothing -- TODO confirm.
lm_laplace.score("davide")

In [None]:
# Perplexity of the model on the training sentences.
#
# `perplexity()` expects an iterable of n-gram tuples, not tokenized sentences.
# Passing `train_sents_tokens` directly makes every whole sentence count as one
# "n-gram": only its last token is scored, conditioned on all preceding tokens.
# So we pad each sentence exactly as in training and score every n-gram of the
# model's order. A generator keeps the training n-grams out of memory.
train_ngrams = (
    gram
    for sent in train_sents_tokens
    for gram in ngrams(pad_both_ends(sent, n=lm_laplace.order), lm_laplace.order)
)
lm_laplace.perplexity(train_ngrams)

In [None]:
# Perplexity of the model on the held-out test sentences.
#
# `perplexity()` expects an iterable of n-gram tuples, not tokenized sentences.
# Passing `test_sents_tokens` directly makes every whole sentence count as one
# "n-gram": only its last token is scored, conditioned on all preceding tokens.
# So we pad each sentence exactly as in training and score every n-gram of the
# model's order.
test_ngrams = (
    gram
    for sent in test_sents_tokens
    for gram in ngrams(pad_both_ends(sent, n=lm_laplace.order), lm_laplace.order)
)
lm_laplace.perplexity(test_ngrams)

In [None]:
# Sample 20 tokens from the model. No random_seed is passed here, so the
# output is expected to differ between runs.
lm_laplace.generate(20)

In [None]:
# Sample 20 tokens that continue the context ['so'], with a fixed random seed
# (42) passed to the sampler.
lm_laplace.generate(20, text_seed=['so'], random_seed=42)

### Questions
- Compute and compare the perplexity of 2-gram LMs (MLE vs. Laplace vs. Stupid Backoff).