In [11]:
import re
from collections import Counter, defaultdict
from math import log
import spacy
import urllib.request

In [12]:
# Step 1: Download the dataset
def download_dataset(url, filename):
    """Fetch the resource at ``url`` and store it locally as ``filename``.

    Args:
        url: Location of the dataset to retrieve.
        filename: Path of the local file the download is written to.

    A confirmation line is printed once the file has been saved.
    """
    urllib.request.urlretrieve(url=url, filename=filename)
    confirmation = f"Dataset downloaded and saved as {filename}"
    print(confirmation)

In [13]:
# Step 2: Preprocess the text (clean and normalize)
def preprocess_text(text):
    """Normalize raw transcript text for language modelling.

    The text is lowercased, then three substitutions are applied in order:
    spans enclosed in square brackets are dropped, spans enclosed in angle
    brackets are dropped, and every character that is neither an ASCII
    letter nor whitespace is deleted. Surrounding whitespace is trimmed.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    removal_patterns = (
        r'\[.*?\]',      # text inside brackets
        r'<.*?>',        # text inside angle brackets
        r'[^a-zA-Z\s]',  # non-alphabetic characters
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, '', cleaned)
    return cleaned.strip()


In [14]:
# Step 3: Tokenize sentences and add start/end tokens
def tokenize_sentences(text):
    """Split text into per-line token lists with sentence boundary markers.

    Each newline-separated line is treated as one sentence and split on
    whitespace. Lines that are empty or whitespace-only are skipped.

    Args:
        text: Text with one sentence per line.

    Returns:
        A list of token lists, each starting with '<s>' and ending with '</s>'.
    """
    tokenized = []
    for line in text.split('\n'):
        if not line.strip():
            continue
        tokenized.append(['<s>', *line.split(), '</s>'])
    return tokenized


In [15]:
# Step 4: Compute bigram probabilities with Add-1 smoothing
def compute_bigram_probabilities(sentences):
    """Estimate add-1 smoothed bigram probabilities from tokenized sentences.

    Args:
        sentences: Iterable of token lists (the caller in this file supplies
            lists wrapped in '<s>' ... '</s>').

    Returns:
        A tuple ``(bigram_probs, unigram_counts, vocabulary_size)`` where
        ``bigram_probs`` is a ``defaultdict(float)`` mapping each observed
        ``(w1, w2)`` pair to ``(count(w1, w2) + 1) / (count(w1) + V)``,
        ``unigram_counts`` is a ``Counter`` of how often each token occurs in
        a non-final position, and ``vocabulary_size`` (V) is the number of
        distinct tokens in ``unigram_counts``.
    """
    history_counts = Counter()
    pair_counts = Counter()
    for tokens in sentences:
        # Every token except the last one opens exactly one bigram.
        history_counts.update(tokens[:-1])
        pair_counts.update(zip(tokens, tokens[1:]))

    vocabulary_size = len(history_counts)

    smoothed = defaultdict(float)
    for pair, seen in pair_counts.items():
        smoothed[pair] = (seen + 1) / (history_counts[pair[0]] + vocabulary_size)
    return smoothed, history_counts, vocabulary_size

In [16]:
# Step 5: Compute the log-probability of a sentence
def compute_sentence_probability(sentence, bigram_probs, unigram_counts, vocabulary_size):
    """Score a sentence under the bigram model, in log space.

    The sentence is split on whitespace and wrapped in '<s>' / '</s>'. For each
    adjacent token pair the stored bigram probability is used; a pair missing
    from ``bigram_probs`` falls back to ``1 / (count(previous) + V)``, with a
    count of 0 for a previous token absent from ``unigram_counts``.

    Args:
        sentence: Whitespace-separated sentence string (used as given; no
            case folding or cleaning is applied here).
        bigram_probs: Mapping from ``(w1, w2)`` to probability.
        unigram_counts: Mapping from token to its count.
        vocabulary_size: Vocabulary size V used in the fallback estimate.

    Returns:
        The sum of the natural logarithms of the bigram probabilities.
    """
    tokens = ['<s>', *sentence.split(), '</s>']
    log_likelihood = 0
    for previous, current in zip(tokens, tokens[1:]):
        # Estimate used when the pair was never observed in training.
        unseen_prob = 1 / (unigram_counts.get(previous, 0) + vocabulary_size)
        log_likelihood += log(bigram_probs.get((previous, current), unseen_prob))
    return log_likelihood

In [17]:
# Step 6: Main execution
if __name__ == "__main__":
    # Download dataset
    # There is no cache check: the file is fetched again on every run and
    # written under a relative name, i.e. into the current working directory.
    dataset_url = "https://raw.githubusercontent.com/wooters/berp-trans/master/transcript.txt"
    dataset_filename = "transcript.txt"
    download_dataset(dataset_url, dataset_filename)

    # Load NLP model and dataset
    # NOTE(review): `nlp` is not referenced by any cell visible here — confirm it is still needed.
    nlp = spacy.load("en_core_web_sm")
    # The whole transcript is read into one string. No encoding is passed to
    # open(), so the platform default text encoding applies.
    with open(dataset_filename, "r") as f:
        text = f.read()

Dataset downloaded and saved as transcript.txt


In [20]:

    # Preprocessing and tokenization
    # preprocess_text lowercases the transcript and strips bracketed spans and
    # non-letter characters; tokenize_sentences then turns each non-blank line
    # into a token list wrapped in <s> ... </s>.
    preprocessed_text = preprocess_text(text)
    tokenized_sentences = tokenize_sentences(preprocessed_text)

    # Compute bigram probabilities
    # Unpacks the add-1 smoothed bigram table, the per-token history counts and
    # the vocabulary size; all three are needed later to score sentences.
    bigram_probs, unigram_counts, vocab_size = compute_bigram_probabilities(tokenized_sentences)



In [23]:

    # Sentences for probability calculation
    sentence1 = "show me all the Arabic food restaurants"
    sentence2 = "I am learning mathematics"

    # Compute probabilities for given sentences
    # The model was trained on text that went through preprocess_text
    # (lowercased, letters only). The query sentences get the same
    # normalization; without it, tokens such as "Arabic" or "I" can never match
    # a training bigram and are always scored as unseen.
    # The returned values are natural-log probabilities.
    probability1 = compute_sentence_probability(preprocess_text(sentence1), bigram_probs, unigram_counts, vocab_size)
    probability2 = compute_sentence_probability(preprocess_text(sentence2), bigram_probs, unigram_counts, vocab_size)


In [24]:

    # Print results
    print("Bigram Probabilities:")
    print(bigram_probs)
    print("\nSentence Probabilities:")
    # compute_sentence_probability returns a sum of natural logs (a value <= 0),
    # so the figures are labelled as log-probabilities rather than as P(...).
    print(f"log P('{sentence1}') = {probability1}")
    print(f"log P('{sentence2}') = {probability2}")

Bigram Probabilities:
defaultdict(<class 'float'>, {('<s>', 'okay'): 0.006410888647795641, ('okay', 'lets'): 0.011958997722095672, ('lets', 'see'): 0.007547169811320755, ('see', 'i'): 0.0036036036036036037, ('i', 'want'): 0.20741414600864225, ('want', 'to'): 0.26136797860145206, ('to', 'go'): 0.0856610800744879, ('go', 'to'): 0.08321238509917755, ('to', 'a'): 0.018854748603351956, ('a', 'thai'): 0.007674815235929506, ('thai', 'restaurant'): 0.020589872008903727, ('restaurant', 'with'): 0.007274283269148481, ('with', 'less'): 0.001215066828675577, ('less', 'than'): 0.09971988795518208, ('than', 'ten'): 0.05156723963599596, ('ten', 'dollars'): 0.08202716823406478, ('dollars', 'per'): 0.013548951048951048, ('per', 'person'): 0.015508684863523574, ('person', '</s>'): 0.016770186335403725, ('<s>', 'i'): 0.10770292928296676, ('i', 'like'): 0.01864907891744371, ('like', 'to'): 0.3819176319176319, ('to', 'eat'): 0.1776070763500931, ('eat', 'at'): 0.017420157610949814, ('at', 'lunch'): 0.004427