# Imports

In [None]:
from datasets import load_dataset
import torch
import nltk
import string

In [None]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 4050 Laptop GPU


In [None]:
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\reda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\reda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

## Downloading dataset

In [None]:
IMDB_Dataset = load_dataset("stanfordnlp/imdb") #DatasetDict object

### Understanding the data

In [None]:
IMDB_Dataset # contains 3 splits

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
IMDB_Dataset['unsupervised']

Dataset({
    features: ['text', 'label'],
    num_rows: 50000
})

In [None]:
IMDB_Dataset['unsupervised'].features

{'text': Value('string'), 'label': ClassLabel(names=['neg', 'pos'])}

In [None]:
IMDB_Dataset['unsupervised'].num_rows # number of "documents"

50000

In [None]:
for i in range(5):
    print(f"Row {i}:")
    print(f"{IMDB_Dataset['unsupervised'][i]}")
    print(f"Text ALONE: {IMDB_Dataset['unsupervised']['text'][i]}")
    print(f"Label ALONE: {IMDB_Dataset['unsupervised']['label'][i]}")
    print("----------------------------------------------------------------")

Row 0:
{'text': 'This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she\'s speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you\'ll regret later :)', 'label': -1}
Text ALONE: This is just a precious little diamond. The play, the script 

In [None]:
# for convenience
dataset = IMDB_Dataset

In [None]:
dataset_text = dataset['unsupervised']['text']
unsupervised_texts = dataset_text

### Text only

In [None]:
for i in range(5):
    print("Text",i)
    print(dataset_text[i])

Text 0
This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)
Text 1
When I say this is my favourite film of all time, that comment is not to be taken lightly. I 

In [None]:
len(dataset_text) # 50k strings

50000

# Actual Work

## Step 1: Data Preprocessing

### Different Methods of Tokenization

In [None]:
print("Before tokenization:", dataset_text[0])

Before tokenization: This is just a precious little diamond. The play, the script are excellent. I cant compare this movie with anything else, maybe except the movie "Leon" wonderfully played by Jean Reno and Natalie Portman. But... What can I say about this one? This is the best movie Anne Parillaud has ever played in (See please "Frankie Starlight", she's speaking English there) to see what I mean. The story of young punk girl Nikita, taken into the depraved world of the secret government forces has been exceptionally over used by Americans. Never mind the "Point of no return" and especially the "La femme Nikita" TV series. They cannot compare the original believe me! Trash these videos. Buy this one, do not rent it, BUY it. BTW beware of the subtitles of the LA company which "translate" the US release. What a disgrace! If you cant understand French, get a dubbed version. But you'll regret later :)


In [None]:
# OPTION 1
# will not use this tokenization because it leaves out punctuations
simple_tokens = dataset_text[0].split()
print("First few tokens using .split():",simple_tokens[:15])

First few tokens using .split(): ['This', 'is', 'just', 'a', 'precious', 'little', 'diamond.', 'The', 'play,', 'the', 'script', 'are', 'excellent.', 'I', 'cant']


In [None]:
# OPTION 2
# Better alternative using Natural Language Tool Kit
# -> tokenizes into words and punctuation -- not actual tokens
nltk_tokens = nltk.tokenize.word_tokenize(dataset_text[0])
print("First few tokens using nltk.tokenize.word_tokenize():", nltk_tokens[:15])

First few tokens using nltk.tokenize.word_tokenize(): ['This', 'is', 'just', 'a', 'precious', 'little', 'diamond', '.', 'The', 'play', ',', 'the', 'script', 'are', 'excellent']


In [None]:
# testing normalization
punctuation = set(string.punctuation)

# 1. Lowercase the token
# 2. Filter out tokens that are only punctuation

test_normalized_tokens = [
    token.lower() for token in nltk_tokens if token not in punctuation
]

print(test_normalized_tokens[:15])

['this', 'is', 'just', 'a', 'precious', 'little', 'diamond', 'the', 'play', 'the', 'script', 'are', 'excellent', 'i', 'cant']


In [None]:
# OPTION 3: Advanced Tokenization --- maybe not?

### Tokenization and Text Normalization

In [None]:
# I WILL USE OPTION 2: AT LEAST FOR NOW-- because it is the simplest and can be used to remove punctuation
l = len(dataset_text)

punctuation_to_filter = set(string.punctuation)
# punctuation that slips through
punctuation_to_filter.add("''")
punctuation_to_filter.add("``")
punctuation_to_filter.add("--")
punctuation_to_filter.add("'-")
punctuation_to_filter.add("'.")

normalized_tokens = [[] for _ in range(l)]
doc_tokens = ['']*l

for i in range(l):
    doc_tokens[i] = nltk.tokenize.word_tokenize(dataset_text[i]) # tokenizing to words and punctuation
    normalized_tokens[i] = [token.lower() for token in doc_tokens[i] if token not in punctuation_to_filter] # lower case and filter punctuation

In [None]:
print(doc_tokens[1])
print(normalized_tokens[1])
# problems like <br> <> removed but br kept
# limitations like n't, '99, 're, 's will have to stay

['When', 'I', 'say', 'this', 'is', 'my', 'favourite', 'film', 'of', 'all', 'time', ',', 'that', 'comment', 'is', 'not', 'to', 'be', 'taken', 'lightly', '.', 'I', 'probably', 'watch', 'far', 'too', 'many', 'films', 'than', 'is', 'healthy', 'for', 'me', ',', 'and', 'have', 'loved', 'quite', 'a', 'few', 'of', 'them', '.', 'I', 'first', 'saw', '``', 'La', 'Femme', 'Nikita', "''", 'nearly', 'ten', 'years', 'ago', ',', 'and', 'it', 'still', 'manages', 'to', 'be', 'my', 'absolute', 'favourite', '.', 'Why', '?', '<', 'br', '/', '>', '<', 'br', '/', '>', 'This', 'is', 'more', 'than', 'an', 'incredibly', 'stylish', 'and', 'sexy', 'thriller', '.', 'Luc', 'Besson', "'s", 'great', 'flair', 'for', 'impeccable', 'direction', ',', 'fashion', ',', 'and', 'appropriate', 'usage', 'of', 'music', 'makes', 'this', 'a', 'very', 'watchable', 'film', '.', 'But', 'it', 'is', 'Anne', 'Parillaud', "'s", 'perfect', 'rendering', 'of', 'a', 'complex', 'character', 'who', 'transforms', 'from', 'a', 'heartless', 'kill

In [None]:
# Option 4: Apostrophes aren't lost

import re
from nltk.tokenize import word_tokenize

contractions_pattern = re.compile(r"\b\w+'\w+\b")  # matches words like you'll, can't, doesn't etc.

def preprocessing_textNormalizing(text):
    # Lowercase
    text = text.lower()

    # Step 2: Temporarily protect contractions → you'll → you_ll
    protected = contractions_pattern.findall(text)
    for word in protected:
        text = text.replace(word, word.replace("'", "_"))  # mark with underscore instead

    # Step 3: Remove ALL punctuation except underscores (so apostrophe is now safe)
    text = re.sub(r"[^a-z0-9\s_]", "", text)

    return text

def preprocess_textTokenization(text):
    # Tokenize into words
    tokens = word_tokenize(text)
    # Step 4: Restore apostrophes → you_ll → you'll
    tokens = [t.replace("_", "'") for t in tokens]
    # Optionally remove empty tokens
    tokens = [t for t in tokens if len(t) > 0]
    return tokens

unsupervised_texts = [preprocessing_textNormalizing(t) for t in unsupervised_texts]
# Example: Apply to your dataset
processed_texts = [preprocess_textTokenization(t) for t in unsupervised_texts]
print(", ".join(processed_texts[0]))
print(processed_texts[0][0])

## Step 2: Constructing N-Gram Model

### Unigram Model

In [None]:
# Assuming greedy generation
# P(w) = count(w)/count(corpus)

count_corpus = sum(len(document_tokens) for document_tokens in normalized_tokens)
print(count_corpus)


12060456


In [None]:
# count w
def count_word(word_to_count):
    return sum(1 for document_token in normalized_tokens for word in document_token if word == word_to_count)

In [None]:
# need to look through all words and get the maximum.
# need to get a list of the vocabulary: unique words
vocabulary = set(word for document_tokens in normalized_tokens for word in document_tokens) # Make a set of the flattened words

In [None]:
print(len(vocabulary))

166753


In [None]:
# Assuming greedy generation
# P(w) = count(w)/count(corpus)
# we can get most frequent --- like getting highest probability -- to avoid floats
# for now
count_uni_dict = {}
sum_ = 0
for word in vocabulary:
    count_uni_dict[word] = count_word(word)
    sum_ += count_uni_dict[word]
print(sum_ == count_corpus)

# this code took more than 2 hrs and didn't even run.
# I am obviously doing something wrong here and my best guess is the counting method is really bad (linear counting per word)
# There might be some other improvements

KeyboardInterrupt: 

In [1]:
from collections import Counter

# Flatten all tokens from all sentences
all_tokens = [word for sentence in processed_texts for word in sentence]

# Count how many times each word appears
unigram_counts = Counter(all_tokens)
# print(unigram_counts)

V = len(unigram_counts)  # vocabulary size

# Total number of words
total_words = sum(unigram_counts.values())
# print(total_words)

# Compute probability of each word
unigram_prob = {word: (count + 1) / (total_words + V) for word, count in unigram_counts.items()}
# print(unigram_prob)

# Show top 10 (Example)
for word, prob in list(unigram_prob.items())[:10]:
    print(f"{word}: {prob:.4f}")

hendo


### Bigram Model

In [2]:
from collections import defaultdict, Counter

# Step 1: Count bigrams
bigram_counts = defaultdict(Counter)
unigram_counts = Counter()

for sentence in processed_texts:
    for i in range(len(sentence) - 1): #stop 1 word before the end
        w1, w2 = sentence[i], sentence[i + 1]
        bigram_counts[w1][w2] += 1
        unigram_counts[w1] += 1

# Laplace smoothing
V = len(unigram_counts)  # vocabulary size
# Step 2: Compute probabilities
bigram_prob = {}

for w1, next_words in bigram_counts.items():
    total_count = sum(next_words.values())
    bigram_prob[w1] = {}
    for w2 in unigram_counts:  # iterate over all possible next words
        count = bigram_counts[w1][w2]
        bigram_prob[w1][w2] = (count + 1) / (unigram_counts[w1] + V)

# Show top next words for "this" (Example)
print("Next words after 'i':")
print(bigram_prob.get("i", {}))

fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git
fatal: not a git repository (or any of the parent directories): .git


### Trigram Model

In [3]:
from collections import defaultdict, Counter

# Step 1: Count trigrams
trigram_counts = defaultdict(Counter)   # maps (w1, w2) -> next words and their counts
bigram_counts = Counter()               # counts how many times each (w1, w2) pair occurs

for sentence in processed_texts:
    for i in range(len(sentence) - 2):  # stop 2 words before the end
        w1, w2, w3 = sentence[i], sentence[i+1], sentence[i+2]

        trigram_counts[(w1, w2)][w3] += 1   # count how many times w3 follows (w1, w2)
        bigram_counts[(w1, w2)] += 1        # count how many times (w1, w2) occurs

V = len(unigram_counts)  # vocabulary size
# Step 2: Compute probabilities
trigram_prob = {}

for (w1, w2), next_words in trigram_counts.items():
    total = bigram_counts[(w1, w2)]
    trigram_prob[(w1, w2)] = {}
    for w3 in unigram_counts:
        count = trigram_counts[(w1, w2)][w3]
        trigram_prob[(w1, w2)][w3] = (count + 1) / (total + V)

# Example: show what usually comes after ("i", "love")
print("Next words after ('i', 'love'):")
print(trigram_prob.get(("i", "love"), {}))

sample_data


### Step 3: Evaluation Using Perplexity

In [None]:
import math
from collections import defaultdict, Counter

def calculate_perplexity(test_texts, ngram_prob, n_minus1_counts, V, n):
    total_log_prob = 0
    total_words = 0

    for sentence in test_texts:
        if len(sentence) < n:
            continue

        for i in range(len(sentence) - n + 1):
            ngram = tuple(sentence[i:i + n])
            context = ngram[:-1]
            word = ngram[-1]

            total_words += 1

            # Get probability (handle unseen)
            if context in ngram_prob and word in ngram_prob[context]:
                prob = ngram_prob[context][word]
            else:
                context_count = n_minus1_counts[context] if context in n_minus1_counts else 0
                prob = 1 / (context_count + V)

            total_log_prob += math.log(prob)

    perplexity = math.exp(-total_log_prob / total_words)
    return perplexity

Comparing the Three Models

In [None]:
pp_uni = calculate_perplexity(IMDB_Dataset['test'], unigram_prob, Counter({(): sum(unigram_counts.values())}), V, 1)
pp_bi  = calculate_perplexity(IMDB_Dataset['test'], bigram_prob, unigram_counts, V, 2)
pp_tri = calculate_perplexity(IMDB_Dataset['test'], trigram_prob, bigram_counts, V, 3)

print(f"Unigram Perplexity: {pp_uni:.2f}")
print(f"Bigram  Perplexity: {pp_bi:.2f}")
print(f"Trigram Perplexity: {pp_tri:.2f}")