In [None]:
import nltk
nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("nltkdata/brown-corpus")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/nltkdata/brown-corpus?dataset_version_number=3...


100%|██████████| 9.29M/9.29M [00:00<00:00, 86.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/nltkdata/brown-corpus/versions/3


In [None]:
import nltk
from nltk.corpus import brown
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

# Download necessary resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

brown_words = brown.words()
tokens = [word.lower() for word in brown_words]
tokens = [word for word in tokens if word not in string.punctuation]
stop_words = set(stopwords.words('english'))
tokens = [word for word in tokens if word not in stop_words]

print("Preprocessed text sample:", tokens[:50])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Preprocessed text sample: ['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'evidence', "''", 'irregularities', 'took', 'place', 'jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', '``', 'deserves', 'praise', 'thanks', 'city', 'atlanta', "''", 'manner', 'election', 'conducted', 'september-october', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports']


In [None]:
from collections import Counter
from nltk.util import ngrams

unigrams = tokens
bigrams = list(ngrams(tokens, 2))
trigrams = list(ngrams(tokens, 3))
quadgrams = list(ngrams(tokens, 4))

unigram_counts = Counter(unigrams)
bigram_counts = Counter(bigrams)
trigram_counts = Counter(trigrams)
quadgram_counts = Counter(quadgrams)

print("Unigram sample:", list(unigram_counts.items())[:5])
print("Bigram sample:", list(bigram_counts.items())[:5])
print("Trigram sample:", list(trigram_counts.items())[:5])

Unigram sample: [('fulton', 17), ('county', 155), ('grand', 48), ('jury', 67), ('said', 1961)]
Bigram sample: [(('fulton', 'county'), 6), (('county', 'grand'), 1), (('grand', 'jury'), 10), (('jury', 'said'), 9), (('said', 'friday'), 4)]
Trigram sample: [(('fulton', 'county', 'grand'), 1), (('county', 'grand', 'jury'), 1), (('grand', 'jury', 'said'), 1), (('jury', 'said', 'friday'), 1), (('said', 'friday', 'investigation'), 1)]


In [None]:
def probability_ngram(ngram, ngram_counts, total_ngrams):
    return ngram_counts[ngram] / total_ngrams

total_unigrams = sum(unigram_counts.values())
total_bigrams = sum(bigram_counts.values())
total_trigrams = sum(trigram_counts.values())
total_quadgrams = sum(quadgram_counts.values())

print("Total unigrams:", total_unigrams)
print("Total bigrams:", total_bigrams)
print("Total trigrams:", total_trigrams)


sample_bigram = ('linear', 'algebra')
sample_prob = probability_ngram(sample_bigram, bigram_counts, total_bigrams)

print(f"Probability of {sample_bigram}: {sample_prob}")

Total unigrams: 557984
Total bigrams: 557983
Total trigrams: 557982
Probability of ('linear', 'algebra'): 3.5843385909606566e-06


In [None]:
def laplace_smoothing(ngram, ngram_counts, total_ngrams, vocab_size):
    return (ngram_counts[ngram] + 1) / (total_ngrams + vocab_size)

vocab_size = len(unigram_counts)

sample_prob_smooth = laplace_smoothing(sample_bigram, bigram_counts, total_bigrams, vocab_size)

print(f"Smoothed probability of {sample_bigram}: {sample_prob_smooth}")

Smoothed probability of ('linear', 'algebra'): 4.937426350056945e-06


In [None]:
import random

# Function for Unigram Model (Random words)
def generate_unigram_sentence(unigram_counts, length=10):
    words = list(unigram_counts.keys())
    return ' '.join(random.choices(words, k=length))

# Function for Bigram Model (Word pairs based on probability)
def generate_bigram_sentence(bigram_counts, length=10):
    sentence = [random.choice(list(bigram_counts.keys()))[0]]
    for _ in range(length - 1):
        prev_word = sentence[-1]
        candidates = [pair[1] for pair in bigram_counts.keys() if pair[0] == prev_word]
        sentence.append(random.choice(candidates) if candidates else random.choice(list(unigram_counts.keys())))
    return ' '.join(sentence)

# Function for Trigram Model (Word triples based on probability)
def generate_trigram_sentence(trigram_counts, length=10):
    sentence = list(random.choice(list(trigram_counts.keys())))
    while len(sentence) < length:
        prev_bigram = tuple(sentence[-2:])  # Last two words
        candidates = [trigram[2] for trigram in trigram_counts.keys() if trigram[:2] == prev_bigram]
        sentence.append(random.choice(candidates) if candidates else random.choice(list(unigram_counts.keys())))
    return ' '.join(sentence)

def generate_quadgram_sentence(quadgrams_counts, length=10):
    sentence = list(random.choice(list(quadgrams_counts.keys())))
    while len(sentence) < length:
        prev_trigram = tuple(sentence[-3:])  # Last two words
        candidates = [quadgrams[3] for quadgrams in quadgrams_counts.keys() if quadgrams[:3] == prev_trigram]
        sentence.append(random.choice(candidates) if candidates else random.choice(list(unigram_counts.keys())))
    return ' '.join(sentence)


unigram_sentence1 = generate_unigram_sentence(unigram_counts)
unigram_sentence2 = generate_unigram_sentence(unigram_counts)
unigram_sentence3 = generate_unigram_sentence(unigram_counts)
unigram_sentence4 = generate_unigram_sentence(unigram_counts)
unigram_sentence5 = generate_unigram_sentence(unigram_counts)

bigram_sentence1 = generate_bigram_sentence(bigram_counts)
bigram_sentence2 = generate_bigram_sentence(bigram_counts)
bigram_sentence3 = generate_bigram_sentence(bigram_counts)
bigram_sentence4 = generate_bigram_sentence(bigram_counts)
bigram_sentence5 = generate_bigram_sentence(bigram_counts)

trigram_sentence1 = generate_trigram_sentence(trigram_counts)
trigram_sentence2 = generate_trigram_sentence(trigram_counts)
trigram_sentence3 = generate_trigram_sentence(trigram_counts)
trigram_sentence4 = generate_trigram_sentence(trigram_counts)
trigram_sentence5 = generate_trigram_sentence(trigram_counts)

quadgram_sentence1 = generate_quadgram_sentence(quadgram_counts)
quadgram_sentence2 = generate_quadgram_sentence(quadgram_counts)
quadgram_sentence3 = generate_quadgram_sentence(quadgram_counts)
quadgram_sentence4 = generate_quadgram_sentence(quadgram_counts)
quadgram_sentence5 = generate_quadgram_sentence(quadgram_counts)

print("Unigram Sentence 1:", unigram_sentence1)
print("Unigram Sentence 2:", unigram_sentence2)
print("Unigram Sentence 3:", unigram_sentence3)
print("Unigram Sentence 4:", unigram_sentence4)
print("Unigram Sentence 5:", unigram_sentence5)

print("Bigram Sentence 1:", bigram_sentence1)
print("Bigram Sentence 2:", bigram_sentence2)
print("Bigram Sentence 3:", bigram_sentence3)
print("Bigram Sentence 4:", bigram_sentence4)
print("Bigram Sentence 5:", bigram_sentence5)

print("Trigram Sentence 1:", trigram_sentence1)
print("Trigram Sentence 2:", trigram_sentence2)
print("Trigram Sentence 3:", trigram_sentence3)
print("Trigram Sentence 4:", trigram_sentence4)
print("Trigram Sentence 5:", trigram_sentence5)

print("Quadgram Sentence 1:", quadgram_sentence1)
print("Quadgram Sentence 2:", quadgram_sentence2)
print("Quadgram Sentence 3:", quadgram_sentence3)
print("Quadgram Sentence 4:", quadgram_sentence4)
print("Quadgram Sentence 5:", quadgram_sentence5)

Unigram Sentence 1: severna newman extractor colquitt burne 695 featured race-driver no-one brim
Unigram Sentence 2: 350 fille neonatal forearms civilizing compositions ted crafts spattered plaques
Unigram Sentence 3: drown inhibition chattered belle imperiled much-craved reverse hull smoothbore peed
Unigram Sentence 4: anglophobia 607-608 rumen epicurus low-foam l impertinent spirit-gum non-commissioned conceiving
Unigram Sentence 5: prussia tangy offer warmed 70 rodent thimble dawning bathrooms glottochronological
Bigram Sentence 1: additional eleven mines protection forage commercial sources scholars really failing
Bigram Sentence 2: leave sequence relationships wives expression anti-catholicism first ball tickets back
Bigram Sentence 3: swarthy complexion pale chest bubble bauble '' crucial difference present
Bigram Sentence 4: quarters -- soirees fox chapel assembly make computations set marginal
Bigram Sentence 5: wants '' squire '' poems 1912 united coddington running object
Tri