In [9]:
import re

def clean_gutenberg_text(text):
    """Strip Project Gutenberg boilerplate and normalize the book text.

    Only the text between the ``*** START OF ... PROJECT GUTENBERG EBOOK ... ***``
    and ``*** END OF ... PROJECT GUTENBERG EBOOK ... ***`` markers is kept, so
    the catalogue header before the START marker and the license after the END
    marker are dropped along with the marker lines themselves. A marker that is
    absent is simply ignored, so plain text passes through unchanged apart from
    the normalization below.

    Args:
        text: Raw contents of a Project Gutenberg ``.txt`` file.

    Returns:
        The book body, lowercased, with every run of whitespace collapsed to a
        single space and leading/trailing whitespace removed.
    """
    # Accept both the "THIS" and "THE" wordings of the marker line.
    start_pattern = r"\*\*\*\s*START OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*"
    end_pattern = r"\*\*\*\s*END OF (?:THE|THIS) PROJECT GUTENBERG EBOOK.*?\*\*\*"
    # DOTALL lets a marker whose title wraps onto the next line still match.
    marker_flags = re.DOTALL | re.IGNORECASE

    # Drop everything up to and including the START marker (the header).
    start_match = re.search(start_pattern, text, flags=marker_flags)
    if start_match:
        text = text[start_match.end():]

    # Drop the END marker and everything after it (the license footer).
    end_match = re.search(end_pattern, text, flags=marker_flags)
    if end_match:
        text = text[:end_match.start()]

    # Lowercase
    text = text.lower()

    # Normalize whitespace
    text = re.sub(r"\s+", " ", text)

    return text.strip()


# Source books, addressed relative to the notebook's working directory.
paths = [
    r".\data\Adventures of Sherlock Holmes.txt",
    r".\data\Memories and Adventures.txt",
    r".\data\The case-book of Sherlock Holmes.txt",
    r".\data\The Memoirs of Sherlock Holmes.txt",
    r".\data\The Return of Sherlock Holmes.txt"
]


def _load_cleaned(book_path):
    """Read one UTF-8 book file and return its cleaned text."""
    with open(book_path, "r", encoding="utf-8") as book_file:
        return clean_gutenberg_text(book_file.read())


# ---------- Read, clean, and merge ----------
cleaned_texts = [_load_cleaned(book_path) for book_path in paths]

# Each cleaned book is a single line (whitespace was collapsed), so the merged
# corpus holds one book per line.
final_corpus = "\n".join(cleaned_texts)

output_path = r"C:\Users\sunka\OneDrive\Desktop\VI Sem\CS374 NLP\Lab\BigramModel\N-Gram Model\data\Arthur_Conan_Doyle_corpus.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write(final_corpus)

print(f"Corpus saved as: {output_path}")
print(f"Total characters: {len(final_corpus)}")


Corpus saved as: C:\Users\sunka\OneDrive\Desktop\VI Sem\CS374 NLP\Lab\BigramModel\N-Gram Model\data\Arthur_Conan_Doyle_corpus.txt
Total characters: 2963014


In [10]:
# Load the merged corpus written by the previous cell.
corpus_path = r"C:\Users\sunka\OneDrive\Desktop\VI Sem\CS374 NLP\Lab\BigramModel\N-Gram Model\data\Arthur_Conan_Doyle_corpus.txt"
with open(corpus_path, "r", encoding="utf-8") as f:
    corpus = f.read()

# Tokenize into words and punctuation: a token is either a run of word
# characters or one of . , ! ? ;
# NOTE(review): \w also matches "_", so italics markup such as _the_ survives
# as a token. The seed tokenizer used for generation relies on this same
# pattern, so the two must be changed together.
token_pattern = r"\b\w+\b|[.,!?;]"
tokens = re.findall(token_pattern, corpus)

print(f"Total tokens: {len(tokens)}")
print("Sample tokens:", tokens[:30])


Total tokens: 632023
Sample tokens: ['adventures', 'of', 'sherlock', 'holmes', 'adventure', 'i', 'a', 'scandal', 'in', 'bohemia', 'i', 'to', 'sherlock', 'holmes', 'she', 'is', 'always', '_the_', 'woman', '.', 'i', 'have', 'seldom', 'heard', 'him', 'mention', 'her', 'under', 'any', 'other']


In [11]:
N = 5
pad_token = "<s>"

# Left-pad the token stream with N-1 start markers so the first real tokens
# also appear as the final element of some 5-gram.
#
# This cell rebinds `tokens`, so naively prepending padding would stack another
# N-1 markers on every re-run and shift every downstream count. Any padding
# left at the front by a previous run is therefore skipped first. The tokenizer
# pattern only emits word-character runs or single punctuation marks, so
# "<s>" can never be a genuine corpus token and skipping it is safe.
first_real = next(
    (idx for idx, tok in enumerate(tokens) if tok != pad_token),
    len(tokens),
)
tokens = [pad_token] * (N - 1) + tokens[first_real:]

# Every window of N consecutive tokens, stored as a hashable tuple.
five_grams = [tuple(tokens[i:i + N]) for i in range(len(tokens) - N + 1)]

print("Total 5-grams:", len(five_grams))
print("Sample 5-gram:", five_grams[10])


Total 5-grams: 632023
Sample 5-gram: ('a', 'scandal', 'in', 'bohemia', 'i')


In [12]:
from collections import defaultdict

# Frequency tables for the maximum-likelihood estimate:
#   fivegram_counts[(w1, w2, w3, w4, w5)] -> occurrences of the full 5-gram
#   context_counts[(w1, w2, w3, w4)]      -> occurrences of its 4-word prefix
fivegram_counts = defaultdict(int)
context_counts = defaultdict(int)

for gram in five_grams:
    # Split the window into its 4-word history and the word being predicted.
    *history, target = gram
    context = tuple(history)

    context_counts[context] += 1
    fivegram_counts[gram] += 1

print(f"Unique 5-grams: {len(fivegram_counts)}")
print(f"Unique contexts (4-grams): {len(context_counts)}")


Unique 5-grams: 610706
Unique contexts (4-grams): 558656


In [13]:
def fivegram_probability(context, word):
    """Return the maximum-likelihood estimate of P(word | context).

    Args:
        context: Tuple of the preceding tokens (the 4-word history).
        word: Candidate next token.

    Returns:
        ``count(context + (word,)) / count(context)`` as a float, or ``0.0``
        when the context was never observed.
    """
    # Use .get() rather than []: both count tables are defaultdicts, and
    # indexing them with an unseen key would insert a zero-count entry as a
    # side effect of a read-only query.
    context_total = context_counts.get(context, 0)
    if context_total == 0:
        # Unseen context: no evidence, and dividing would raise ZeroDivisionError.
        return 0.0
    return fivegram_counts.get(context + (word,), 0) / context_total


In [14]:
from collections import defaultdict

# Lookup table: 4-word context -> set of every word seen right after it.
context_to_candidates = defaultdict(set)

for *context_words, word in fivegram_counts:
    # The last element is the predicted word; the rest form the context key.
    context_to_candidates[tuple(context_words)].add(word)

def generate_text_max_prob(seed_text, num_words=50):
    """Greedily extend ``seed_text`` one token at a time.

    At each step the last 4 generated tokens form the context, and the
    candidate with the highest ``fivegram_probability`` is appended. Generation
    stops early as soon as the current context has no known continuation.

    Args:
        seed_text: Starting text. It is lowercased and tokenized with the same
            pattern used for the corpus.
        num_words: Maximum number of tokens to append.

    Returns:
        The seed tokens followed by the generated tokens, joined by single
        spaces.

    Raises:
        ValueError: If the seed tokenizes to fewer than 4 tokens.
    """
    seed_tokens = re.findall(r"\b\w+\b|[.,!?;]", seed_text.lower())

    if len(seed_tokens) < 4:
        raise ValueError("Seed text must contain at least 4 words")

    generated = seed_tokens[:]

    for _ in range(num_words):
        context = tuple(generated[-4:])

        # .get() so that an unseen context never inserts an empty entry.
        candidates = context_to_candidates.get(context)
        if not candidates:
            break

        # Find word with highest probability.
        # Candidates are visited in sorted order: ties are common, and a set of
        # strings iterates in an order that changes between interpreter
        # sessions, which made the output differ from run to run. With the
        # strict ">" below, the alphabetically first word wins a tie.
        best_word = None
        best_prob = 0.0

        for word in sorted(candidates):
            prob = fivegram_probability(context, word)

            if prob > best_prob:
                best_prob = prob
                best_word = word

        if best_word is None:
            break

        generated.append(best_word)

    return " ".join(generated)

# Demo: greedy 40-token continuations for a few seed phrases.
demo_seeds = (
    "we arrived at baker street",
    "holmes looked at me",
    "i could not believe",
    "he said nothing but smiled",
)
for seed in demo_seeds:
    print(generate_text_max_prob(seed, 40))

we arrived at baker street late in the evening , and he tossed it across to me . it was a very bad day in which i failed to entirely avoid . the next few minutes were delicious . it was a very bad day
holmes looked at me thoughtfully and shook his head . i am afraid that i must leave you to your papers for a little . you appear to take it for granted that , although the door was forced , the robber never got
i could not believe it to be true . i had no idea that the case was serious , for i had a fit in the station , and we were able to find out where it was . evans had indeed done great
he said nothing but smiled
