In [1]:
from transformers import GPT2Tokenizer
from collections import Counter

# Tokenizer loaded from the "distilgpt2" checkpoint; used below to encode the
# corpus and the n-gram model's input text, and to decode predicted token ids.
tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")

In [2]:
# Small, generic English corpus from which the n-gram baseline counts are built.
# This is NOT user input and NOT transformer training data.
# NOTE: the literal begins and ends with a newline, so those newlines are part
# of the text that gets lowercased and tokenized in the following cells.

CORPUS_TEXT = """
Language is a system of communication used by humans to express ideas and emotions.
Machine learning models learn patterns from data and make predictions based on probability.
Artificial intelligence is widely used in applications such as search engines, assistants,
and recommendation systems. Learning from examples allows models to generalize to new inputs.
People read books, write code, and communicate using natural language every day.
Technology continues to evolve as data and computational power increase.
"""


In [3]:
# Lowercase the corpus so its tokens line up with ngram_predict_next, which
# lowercases its input text before encoding it.
corpus_text = CORPUS_TEXT.lower()


In [4]:
# Encode the lowercased corpus into token ids for n-gram counting.
corpus_tokens = tokenizer.encode(corpus_text)

# Sanity check: overall size and the leading ids.
print(f"Total tokens in corpus: {len(corpus_tokens)}")
print(f"First 20 tokens: {corpus_tokens[:20]}")


Total tokens in corpus: 93
First 20 tokens: [198, 16129, 318, 257, 1080, 286, 6946, 973, 416, 5384, 284, 4911, 4213, 290, 10825, 13, 198, 30243, 4673, 4981]


In [5]:
def build_ngram(tokens, n=2):
    """Count every contiguous window of ``n`` tokens.

    Args:
        tokens: Sliceable sequence of tokens (e.g. a list of token ids).
        n: Window length; 2 counts bigrams.

    Returns:
        A ``Counter`` mapping each n-token tuple to the number of times it
        occurs in ``tokens``. Empty when ``n`` is not positive or when
        ``tokens`` is shorter than ``n``.
    """
    counts = Counter()
    if n < 1:
        # No window can be formed; the result is simply empty.
        return counts

    # Slide a fixed-size window across the sequence, one position at a time.
    window_starts = range(len(tokens) - n + 1)
    for start in window_starts:
        counts[tuple(tokens[start:start + n])] += 1
    return counts

# Count adjacent token pairs across the whole corpus.
bigram_counts = build_ngram(corpus_tokens, 2)

print(f"Unique bigrams: {len(bigram_counts)}")

Unique bigrams: 87


In [6]:
def ngram_predict_next(
    input_text,
    ngram_counts,
    n=2,
    top_k=5
):
    """Predict the most likely next tokens for ``input_text`` from n-gram counts.

    The text is lowercased and encoded with the module-level ``tokenizer``;
    its last ``n - 1`` token ids form the context, and every n-gram in
    ``ngram_counts`` that starts with that context contributes a candidate.

    Args:
        input_text: Prompt whose continuation is predicted.
        ngram_counts: Mapping of n-token tuples to occurrence counts, as
            produced by ``build_ngram`` with the same ``n``.
        n: Order of the n-gram model (1 = unigram, 2 = bigram, ...).
        top_k: Maximum number of candidates to return.

    Returns:
        A list of ``(decoded_token, probability)`` pairs sorted by descending
        count, where probability is the candidate's count divided by the total
        count of all candidates for the context. Candidates with equal counts
        keep the iteration order of ``ngram_counts``. Returns ``[]`` when the
        input has fewer than ``n - 1`` tokens or the context was never seen.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")

    input_tokens = tokenizer.encode(input_text.lower())

    context_len = n - 1
    if len(input_tokens) < context_len:
        return []

    # Slice from an explicit start index. The shorthand
    # input_tokens[-context_len:] is wrong for unigrams: -0 == 0, so it would
    # select the whole input instead of an empty context.
    context = tuple(input_tokens[len(input_tokens) - context_len:])

    candidates = {
        gram[-1]: count
        for gram, count in ngram_counts.items()
        if gram[:-1] == context
    }

    if not candidates:
        return []

    # Normalize over every candidate for this context, not just the top_k.
    total = sum(candidates.values())

    sorted_candidates = sorted(
        candidates.items(),
        key=lambda x: x[1],
        reverse=True
    )[:top_k]

    return [
        (tokenizer.decode([token_id]).strip(), count / total)
        for token_id, count in sorted_candidates
    ]


In [7]:
test_input = "machine learning"

# Ask the bigram baseline for its five most frequent continuations.
predictions = ngram_predict_next(test_input, bigram_counts, top_k=5)

print(f"Input: {test_input}")
print("Predictions:")
for word, prob in predictions:
    print("{} -> {:.4f}".format(word, prob))


Input: machine learning
Predictions:
models -> 0.5000
from -> 0.5000


In [8]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Checkpoint name shared by the transformer tokenizer and the language model.
MODEL_NAME = "distilgpt2"

transformer_tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
transformer_model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
# Put the model in evaluation mode for inference. As the cell's last
# expression, this call also displays the model's module tree.
transformer_model.eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [9]:
def transformer_predict_next(text, top_k=5):
    """Predict the most likely next tokens for ``text`` with the transformer.

    Uses the module-level ``transformer_tokenizer`` and ``transformer_model``.

    Args:
        text: Prompt whose continuation is predicted.
        top_k: Maximum number of candidates to return; clamped to the
            vocabulary size.

    Returns:
        A list of ``(decoded_token, probability)`` pairs in descending
        probability order, where probability is the softmax over the logits at
        the last prompt position. Returns ``[]`` when the prompt encodes to no
        tokens.
    """
    input_ids = transformer_tokenizer.encode(text, return_tensors="pt")

    # An empty prompt has no last position to read logits from; report
    # "no prediction" the same way the n-gram predictor does.
    if input_ids.shape[-1] == 0:
        return []

    # Keep only the most recent tokens when the prompt is longer than the
    # model's position table, which would otherwise overflow the position
    # embedding lookup.
    max_positions = getattr(
        getattr(transformer_model, "config", None), "n_positions", None
    )
    if max_positions is not None and input_ids.shape[-1] > max_positions:
        input_ids = input_ids[:, -max_positions:]

    with torch.no_grad():
        logits = transformer_model(input_ids).logits

    # Distribution over the vocabulary for the token following the prompt.
    next_token_logits = logits[0, -1, :]
    probs = torch.softmax(next_token_logits, dim=0)

    # torch.topk raises if k exceeds the number of elements, so clamp it.
    k = min(top_k, probs.shape[0])
    top_probs, top_indices = torch.topk(probs, k)

    # Convert to plain Python ints/floats before decoding and returning.
    return [
        (transformer_tokenizer.decode([token_id]).strip(), prob)
        for token_id, prob in zip(top_indices.tolist(), top_probs.tolist())
    ]


In [10]:
test_text = "i want to learn"

# Default call: the transformer's five most probable next tokens.
preds = transformer_predict_next(test_text)

print("Transformer predictions:")
for word, prob in preds:
    print("{} -> {:.4f}".format(word, prob))


Transformer predictions:
more -> 0.3203
about -> 0.1343
how -> 0.0869
the -> 0.0428
from -> 0.0375


In [11]:
def compare_models(input_text, top_k=5):
    """Run both predictors on the same prompt and bundle their results.

    Args:
        input_text: Prompt passed unchanged to both predictors.
        top_k: Maximum number of candidates requested from each predictor.

    Returns:
        A dict with keys ``"input"`` (the prompt), ``"ngram"`` (output of
        ``ngram_predict_next`` using the module-level ``bigram_counts``) and
        ``"transformer"`` (output of ``transformer_predict_next``).
    """
    # The n-gram baseline is queried first, then the transformer.
    ngram_preds = ngram_predict_next(input_text, bigram_counts, top_k=top_k)
    transformer_preds = transformer_predict_next(input_text, top_k=top_k)

    comparison = {"input": input_text}
    comparison["ngram"] = ngram_preds
    comparison["transformer"] = transformer_preds
    return comparison


In [12]:
sample_inputs = [
    "i want to learn",
    "artificial intelligence is",
    "people use language to",
]

for text in sample_inputs:
    print(f"\nINPUT: {text}")

    results = compare_models(text)

    # Same report layout for both models: a heading, then one line per candidate.
    for heading, key in (("N-GRAM", "ngram"), ("TRANSFORMER", "transformer")):
        print(f"\n{heading}:")
        for w, p in results[key]:
            print(f"{w} -> {p:.4f}")



INPUT: i want to learn

N-GRAM:
patterns -> 1.0000

TRANSFORMER:
more -> 0.3203
about -> 0.1343
how -> 0.0869
the -> 0.0428
from -> 0.0375

INPUT: artificial intelligence is

N-GRAM:
a -> 0.5000
widely -> 0.5000

TRANSFORMER:
a -> 0.1444
the -> 0.0589
not -> 0.0493
an -> 0.0297
one -> 0.0278

INPUT: people use language to

N-GRAM:
express -> 0.2500
general -> 0.2500
new -> 0.2500
evolve -> 0.2500

TRANSFORMER:
communicate -> 0.1417
express -> 0.0639
understand -> 0.0556
describe -> 0.0468
make -> 0.0309
