In [None]:
import pickle

# Function to load any n-gram pkl file
def load_ngram(path):
    """Read a pickled n-gram table and return it as an ``{ngram: count}`` dict.

    The object stored at ``path`` must support ``obj["Ngram"]`` and
    ``obj["Count"]`` (presumably a pandas DataFrame -- confirm against the
    code that wrote the file). The two columns are paired positionally; if
    an n-gram occurs more than once, the count from its last row wins.

    NOTE: ``pickle.load`` can execute arbitrary code while unpickling, so
    only use this on files from a trusted source.
    """
    with open(path, "rb") as handle:
        table = pickle.load(handle)

    ngrams = table["Ngram"]
    counts = table["Count"]
    return {ngram: count for ngram, count in zip(ngrams, counts)}


# Load all n-grams; the paths are consumed in order, 1-gram through 4-gram.
unigrams, bigrams, trigrams, quadrigrams = map(
    load_ngram,
    (
        "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/unigram.pkl",
        "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/bigram.pkl",
        "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/trigram.pkl",
        "C:/Users/ashis/OneDrive/Desktop/NLP/N-Grams/quadrigram.pkl",
    ),
)

In [12]:
import random
import pandas as pd
from collections import defaultdict

# ------------------ Build Probabilities ------------------
def build_probabilities(ngram_dict):
    """Turn raw n-gram counts into per-prefix next-word probabilities.

    Args:
        ngram_dict: Mapping from an n-gram (a sequence of words, used with
            ``len()``, indexing and slicing) to its count.

    Returns:
        A ``defaultdict(dict)`` mapping each prefix (the n-gram without its
        last word) to ``{last_word: probability}``. N-grams of length 1 are
        all stored under the empty-tuple prefix ``()``. Within each prefix
        the probabilities are the counts divided by that prefix's total.
    """
    table = defaultdict(dict)

    for gram, freq in ngram_dict.items():
        if len(gram) == 1:
            # Unigrams have no context, so they share the () prefix.
            table[()][gram[0]] = freq
            continue

        context, token = gram[:-1], gram[-1]
        current = table[context].get(token, 0)
        table[context][token] = current + freq

    # Convert the counts stored under every prefix into relative frequencies.
    for followers in table.values():
        denominator = sum(followers.values())
        for token in followers:
            followers[token] /= denominator

    return table

# ------------------ Greedy Generation ------------------
def generate_greedy(probs, n, max_len=10):
    """Generate one sentence by always choosing the most probable next word.

    Args:
        probs: Mapping ``prefix tuple -> {word: probability}``.
        n: Order of the n-gram model; prefixes contain ``n - 1`` words.
        max_len: Upper bound on the sequence length *including* the start
            prefix. ``<s>`` padding counts toward this limit even though it
            is removed from the returned text.

    Returns:
        The generated words joined by single spaces, with ``<s>`` tokens
        removed. For ``n == 1`` this is only the single most probable word.
        An empty string is returned when the model offers nothing to start
        from (no unigrams, or no prefix of length ``n - 1``).
    """
    end_tokens = (".", "!", "?")

    if n == 1:
        # .get() avoids inserting an empty () entry into a defaultdict model.
        unigram_dist = probs.get((), {})
        if not unigram_dist:
            return ""
        return max(unigram_dist, key=unigram_dist.get)

    start = tuple(["<s>"] * (n - 1))
    if start not in probs:
        # No sentence-start padding in this model: seed from a random prefix.
        seeds = [p for p in probs if len(p) == n - 1]
        if not seeds:
            return ""
        start = random.choice(seeds)

    sentence = list(start)
    for _ in range(max_len - len(start)):
        prefix = tuple(sentence[-(n - 1):])
        followers = probs.get(prefix)
        if not followers:
            # Unknown prefix, or a prefix with no recorded continuations.
            break
        next_word = max(followers, key=followers.get)
        sentence.append(next_word)
        if next_word in end_tokens:
            break

    return " ".join([w for w in sentence if w != "<s>"])

# ------------------ Beam Search Generation ------------------
def generate_beam_search(probs, n, beam_size=20, max_len=10):
    """Generate one sentence with beam search over an n-gram model.

    Args:
        probs: Mapping ``prefix tuple -> {word: probability}``.
        n: Order of the n-gram model; prefixes contain ``n - 1`` words.
            For ``n == 1`` the call is delegated to ``generate_greedy``.
        beam_size: Number of hypotheses kept after each expansion step.
        max_len: Upper bound on the sequence length *including* the start
            prefix (``<s>`` padding counts toward it).

    Returns:
        The highest-scoring hypothesis joined by single spaces, with ``<s>``
        tokens removed. A hypothesis's score is the product of its word
        probabilities. An empty string is returned when no prefix of length
        ``n - 1`` exists to start from.
    """
    if n == 1:
        return generate_greedy(probs, n, max_len)

    end_tokens = (".", "!", "?")

    start = tuple(["<s>"] * (n - 1))
    if start not in probs:
        # No sentence-start padding in this model: seed from a random prefix.
        seeds = [p for p in probs if len(p) == n - 1]
        if not seeds:
            return ""
        start = random.choice(seeds)

    def is_finished(seq):
        # Only a generated word can end a sentence, never the seed prefix.
        return len(seq) > len(start) and seq[-1] in end_tokens

    beams = [(list(start), 1.0)]
    for _ in range(max_len - len(start)):
        candidates = []
        for seq, score in beams:
            if is_finished(seq):
                # Carry completed sentences forward unchanged. Expanding them
                # would either run past the terminator or, when the
                # terminator has no continuation, drop the hypothesis.
                candidates.append((seq, score))
                continue
            prefix = tuple(seq[-(n - 1):])
            for next_word, next_prob in probs.get(prefix, {}).items():
                candidates.append((seq + [next_word], score * next_prob))

        if not candidates:
            break

        beams = sorted(candidates, key=lambda x: x[1], reverse=True)[:beam_size]
        if all(is_finished(seq) for seq, _ in beams):
            break

    best_sentence = beams[0][0]
    return " ".join([w for w in best_sentence if w != "<s>"])

# ------------------ Generate Unique Sentences ------------------
def generate_unique_sentences(probs, n, method, num_sentences=100):
    """Collect up to ``num_sentences`` distinct generated sentences.

    Args:
        probs: Mapping ``prefix tuple -> {word: probability}`` passed through
            to the generator.
        n: Order of the n-gram model.
        method: ``"greedy"`` to use ``generate_greedy`` or ``"beam"`` to use
            ``generate_beam_search``.
        num_sentences: Number of distinct sentences wanted. At most
            ``num_sentences * 5`` generation attempts are made, so fewer
            sentences may be returned.

    Returns:
        A list of distinct sentences, stripped and lower-cased, in the order
        they were first generated. Only outputs of 2 to 10 whitespace-
        separated words are kept.

    Raises:
        ValueError: If ``method`` is neither ``"greedy"`` nor ``"beam"``.
    """
    if method not in ("greedy", "beam"):
        # Fail early; otherwise `s` would be unbound inside the loop.
        raise ValueError(
            f"Unknown generation method {method!r}; expected 'greedy' or 'beam'"
        )

    # A dict serves as an insertion-ordered set, so the result order is
    # reproducible instead of depending on string hashing.
    sentences = {}
    attempts = 0
    max_attempts = num_sentences * 5
    while len(sentences) < num_sentences and attempts < max_attempts:
        if method == "greedy":
            s = generate_greedy(probs, n)
        else:
            s = generate_beam_search(probs, n)
        if 2 <= len(s.split()) <= 10:      # Filter very short or empty outputs
            sentences[s.strip().lower()] = None
        attempts += 1
    return list(sentences)

# ------------------ Main Execution ------------------
# Model order -> raw n-gram counts loaded earlier in the notebook.
ngram_dicts = {1: unigrams, 2: bigrams, 3: trigrams, 4: quadrigrams}

# Column label -> list of generated sentences.
results = {}

for n, ngram_dict in ngram_dicts.items():
    print(f"\nProcessing {n}-gram model...")
    probs = build_probabilities(ngram_dict)

    greedy_sents = generate_unique_sentences(probs, n, "greedy")
    beam_sents = generate_unique_sentences(probs, n, "beam")

    results.update({
        f"{n}-gram Greedy": greedy_sents,
        f"{n}-gram Beam": beam_sents,
    })

# Save results. Wrapping each list in a Series lets columns of different
# lengths share one frame (shorter columns are padded with NaN).
df = pd.DataFrame({label: pd.Series(sents) for label, sents in results.items()})
df.to_csv("unique_generated_sentences_optimized.csv", index=False)

print("\nSaved optimized output to 'unique_generated_sentences_optimized.csv'")



Processing 1-gram model...

Processing 2-gram model...

Processing 3-gram model...

Processing 3-gram model...

Processing 4-gram model...

Processing 4-gram model...

Saved optimized output to 'unique_generated_sentences_optimized.csv'

Saved optimized output to 'unique_generated_sentences_optimized.csv'
