In [1]:
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import bigrams
from nltk.probability import FreqDist, ConditionalFreqDist
from nltk import word_tokenize, sent_tokenize
from gensim.models import Word2Vec

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Relative Frequency approach**

In [5]:
import string
import re
import nltk
from nltk import FreqDist, bigrams
import math

def preprocess_text(file_path):
    with open(file_path, 'r') as file:
        letters = file.read().lower()
        letters = letters.translate(str.maketrans('', '', string.punctuation))
    return letters

def calculate_relative_frequency_perplexity(cfdist, fdist_uni, tokens, smoothing_factor=0.01):
    num_words = len(tokens)
    vocab_size = len(fdist_uni)
    log_sum = 0

    for i in range(num_words - 1):
        current_word = tokens[i]
        next_word = tokens[i + 1]

        count_current_word = fdist_uni[current_word] + smoothing_factor * vocab_size
        count_bigram = cfdist[current_word][next_word] + smoothing_factor

        relative_frequency = count_bigram / count_current_word

        if relative_frequency > 0:
            log_sum += math.log2(relative_frequency)

    perplexity = 2 ** (-log_sum / (num_words - 1))
    return perplexity

# Preprocess the Warren Buffet's letters
file_path = '/content/drive/My Drive/WarrenBuffet.txt'
letters = preprocess_text(file_path)

# Tokenize the text into words
words = nltk.word_tokenize(letters)

# Create a list of bigrams
word_bigrams = list(bigrams(words))

# Calculate the frequency distribution of bigrams
freq_dist = FreqDist(word_bigrams)

print(freq_dist.most_common(5))

# Calculate the relative frequency approach perplexity
cfdist = nltk.ConditionalFreqDist(word_bigrams)
fdist_uni = FreqDist(words)
tokens = words

rf_preplexity = calculate_relative_frequency_perplexity(cfdist, fdist_uni, tokens)
print(f"Relative Frequency Approach Perplexity: {rf_preplexity}")


[(('of', 'the'), 223), (('in', 'the'), 210), (('and', 'i'), 116), (('will', 'be'), 113), (('we', 'will'), 105)]
Relative Frequency Approach Perplexity: 78.53120031377827


In [6]:
len(fdist_uni)

6875

# **Neural embedding approach**

In [7]:
import string
import re
import nltk
from nltk import FreqDist, bigrams
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
import math

def preprocess_text(file_path):
    with open(file_path, 'r') as file:
        letters = file.read().lower()
        letters = letters.translate(str.maketrans('', '', string.punctuation))
    return letters

def calculate_neural_embedding_perplexity(model, tokens):
    num_words = len(tokens)
    log_sum = 0

    for i in range(num_words - 1):
        current_word = tokens[i]
        next_word = tokens[i + 1]

        if current_word in model and next_word in model:
            similarity = model.similarity(current_word, next_word)
            if similarity > 0:
                log_sum += math.log2(similarity)

    perplexity = 2 ** (-log_sum / (num_words - 1))
    return perplexity

# Preprocess the Warren Buffet's letters
file_path = '/content/drive/My Drive/WarrenBuffet.txt'
letters = preprocess_text(file_path)

# Tokenize the text into words
words = nltk.word_tokenize(letters)

# Train Word2Vec model on the text corpus
model = Word2Vec([words], vector_size=100, window=5, min_count=1, workers=4)

# Save and load the model to get the KeyedVectors instance
model.wv.save_word2vec_format("word2vec_model.bin", binary=True)
word_vectors = KeyedVectors.load_word2vec_format("word2vec_model.bin", binary=True)

# Calculate the perplexity using the neural embedding approach
ne_perplexity = calculate_neural_embedding_perplexity(word_vectors, words)
print(f"Neural Embedding Approach Perplexity: {ne_perplexity}")



Neural Embedding Approach Perplexity: 2.209746106510776


In [10]:
if rf_preplexity < ne_perplexity:
    print("Relative Frequency Approach has lower perplexity.")
elif rf_preplexity > ne_perplexity:
    print("Neural Embedding Approach has lower perplexity.")
else:
    print("Both approaches have the same perplexity.")

Neural Embedding Approach has lower perplexity.


The Neural Embedding Approach has lower preplexity compared to the Relative frequency approach.