In [20]:
import numpy as np
import torch
import torch.nn as nn

import string
import nltk
from nltk.corpus import stopwords
from nltk import bigrams, trigrams
from nltk.tokenize import word_tokenize

import random
from tqdm import tqdm
from collections import Counter, defaultdict
from dataclasses import dataclass

In [21]:
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/akeresh/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to /Users/akeresh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/akeresh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
updated_punctuation = string.punctuation + "''--"

In [23]:
stop_words = list(stopwords.words('english'))

In [24]:
# text dataset Edgeworth Parents.:
edgeworth = nltk.corpus.gutenberg.words('edgeworth-parents.txt')
#filter from punctuation:
edgeworth_filt = [token for token in edgeworth if token not in updated_punctuation]

In [25]:
# Word count
len(edgeworth), len(edgeworth_filt)

(210663, 174879)

In [26]:
edgeworth_tokens =  word_tokenize(" ".join(edgeworth_filt).lower())
clean_edgeworth_tokens = [token for token in edgeworth_tokens if token not in stop_words and token not in updated_punctuation]

In [27]:
# Token count
len(edgeworth_tokens), len(clean_edgeworth_tokens)

(179099, 78183)

## Create bigrams, and calculate frequency, conditional frequency

In [94]:
edgeworth_bigram_freq = Counter(bigrams(clean_edgeworth_tokens))

In [145]:
list(edgeworth_bigram_freq.items())[:10]

[(('parent', 'assistant'), 1),
 (('assistant', 'maria'), 1),
 (('maria', 'edgeworth'), 1),
 (('edgeworth', 'orphans'), 1),
 (('orphans', 'near'), 1),
 (('near', 'ruins'), 1),
 (('ruins', 'castle'), 1),
 (('castle', 'rossmore'), 4),
 (('rossmore', 'ireland'), 1),
 (('ireland', 'small'), 1)]

In [114]:
total_counts_for_w1 = defaultdict(int)
for (w1, w2), freq in edgeworth_bigram_freq.items():
    total_counts_for_w1[w1] += freq

In [144]:
list(total_counts_for_w1.items())[:10]

[('parent', 1),
 ('assistant', 1),
 ('maria', 1),
 ('edgeworth', 1),
 ('orphans', 21),
 ('near', 53),
 ('ruins', 12),
 ('castle', 35),
 ('rossmore', 12),
 ('ireland', 10)]

In [115]:
bigram_conditional_freq = defaultdict(lambda: defaultdict(float))
for (w1, w2), freq in tqdm(edgeworth_bigram_freq.items()):
    bigram_conditional_freq[w1][w2] = freq / total_counts_for_w1[w1]

100%|██████████| 63855/63855 [00:00<00:00, 1365532.15it/s]


In [118]:
len(bigram_conditional_freq)

8197

In [124]:
total_counts_for_w1["ruins"]

12

In [123]:
bigram_conditional_freq["ruins"]

defaultdict(float,
            {'castle': 0.08333333333333333,
             'old': 0.08333333333333333,
             'farthest': 0.08333333333333333,
             'could': 0.08333333333333333,
             'improbability': 0.08333333333333333,
             'rossmore': 0.08333333333333333,
             'ancient': 0.08333333333333333,
             'palaces': 0.08333333333333333,
             'therefore': 0.08333333333333333,
             'herculaneum': 0.16666666666666666,
             'try': 0.08333333333333333})

In [138]:
edgeworth_bigram_freq[("ruins", "castle")], bigram_conditional_freq["ruins"]["castle"] 

(1, 0.08333333333333333)

In [139]:
edgeworth_bigram_freq[("ruins", "herculaneum")], bigram_conditional_freq["ruins"]["herculaneum"]

(2, 0.16666666666666666)

## Create trigrams, and calculate frequency, conditional frequency

In [146]:
edgeworth_triram_freq = Counter(trigrams(clean_edgeworth_tokens))

In [152]:
list(edgeworth_triram_freq.items())[:10]

[(('parent', 'assistant', 'maria'), 1),
 (('assistant', 'maria', 'edgeworth'), 1),
 (('maria', 'edgeworth', 'orphans'), 1),
 (('edgeworth', 'orphans', 'near'), 1),
 (('orphans', 'near', 'ruins'), 1),
 (('near', 'ruins', 'castle'), 1),
 (('ruins', 'castle', 'rossmore'), 1),
 (('castle', 'rossmore', 'ireland'), 1),
 (('rossmore', 'ireland', 'small'), 1),
 (('ireland', 'small', 'cabin'), 1)]

In [159]:
total_counts_for_w1_w2 = defaultdict(int)
for (w1, w2, w3), freq in edgeworth_triram_freq.items():
    total_counts_for_w1_w2[(w1, w2)] += freq

In [160]:
list(total_counts_for_w1_w2.items())[:10]

[(('parent', 'assistant'), 1),
 (('assistant', 'maria'), 1),
 (('maria', 'edgeworth'), 1),
 (('edgeworth', 'orphans'), 1),
 (('orphans', 'near'), 1),
 (('near', 'ruins'), 1),
 (('ruins', 'castle'), 1),
 (('castle', 'rossmore'), 4),
 (('rossmore', 'ireland'), 1),
 (('ireland', 'small'), 1)]

In [162]:
trigram_conditional_freq = defaultdict(lambda: defaultdict(lambda: defaultdict(float)))
for (w1, w2, w3), freq in tqdm(edgeworth_triram_freq.items()):
    trigram_conditional_freq[w1][w2][w3] = freq / total_counts_for_w1_w2[(w1, w2)]

100%|██████████| 76702/76702 [00:00<00:00, 433093.09it/s]


In [183]:
def print_trigram_conditional_freq(trigram_conditional_freq, head=10):
    counter = 0
    for w1, w2_dicts in trigram_conditional_freq.items():
        for w2, w3_dict in w2_dicts.items():
            for w3, freq in w3_dict.items():
                print(f"('{w1}', '{w2}', '{w3}'): {freq}")
                counter += 1
                if counter == head:
                    return

print_trigram_conditional_freq(trigram_conditional_freq)

('parent', 'assistant', 'maria'): 1.0
('assistant', 'maria', 'edgeworth'): 1.0
('maria', 'edgeworth', 'orphans'): 1.0
('edgeworth', 'orphans', 'near'): 1.0
('orphans', 'near', 'ruins'): 1.0
('orphans', 'soon', 'one'): 1.0
('orphans', 'left', 'alone'): 1.0
('orphans', 'removed', 'taking'): 1.0
('orphans', 'ready', 'help'): 1.0
('orphans', 'putting', 'garland'): 1.0


## Generate text by bigrams and conditional freq

In [269]:
bigrams_model = {}
for w1, w2_freq in iter(bigram_conditional_freq.items()):
    if w1 not in bigrams_model:
        bigrams_model[w1] = []
    for w2, freq in w2_freq.items():
        bigrams_model[w1].append((w2, freq))

In [290]:
def generate_sentence_bigram(bigrams_model, start_seed, length, debug=False):
    if start_seed not in bigrams_model:
        raise ValueError("The start seed is not in the trigram model.")
    
    current_word = start_seed
    generated_text = [current_word]
    
    for _ in range(length-2):
        if current_word in bigrams_model:
            possible_continuations = bigrams_model[current_word]
            words = [word for word, _ in possible_continuations]
            weights = [probability for _, probability in possible_continuations]
            assert len(words) == len(weights), "Words and weights length must be same"
            # next_word = random.choice(possible_continuations)[0]
            new_weighted_word = random.choices(words, weights, k=1)[0]
            # print("|" * 26)
            # print(sorted(possible_continuations, key=lambda x: x[1], reverse=False))
            # print(sorted(possible_continuations, key=lambda x: x[1], reverse=True))
            # print("|" * 26)
            generated_text.append(new_weighted_word)
            current_word = generated_text[-1]
            if debug:
                print("Possible continuations:", len(possible_continuations))
                print(possible_continuations)
                print("Next word:", new_weighted_word)
                print("Generated text:", " ".join(generated_text))
                print("---" * 26)
        else:
            print("No continuation available")
            break 
    
    return ' '.join(generated_text)

start_seed = ('assistant') 
generated_text = generate_sentence_bigram(bigrams_model, start_seed, 8, debug=False) 
print("Result:", generated_text)

Result: assistant maria edgeworth orphans near till morning


## Generate text by trigrams with random

In [230]:
trigrams_model = {}
for (w1, w2, w3), freq in edgeworth_triram_freq.items():
    if (w1, w2) not in trigrams_model:
        trigrams_model[(w1, w2)] = []
    trigrams_model[(w1, w2)].append((w3, freq))

def generate_sentence_trigram(trigrams_model, start_seed, length, debug=False):
    if start_seed not in trigrams_model:
        raise ValueError("The start seed is not in the trigram model.")
    
    current_pair = start_seed
    generated_text = [current_pair[0], current_pair[1]]
    
    for _ in range(length-2):
        if current_pair in trigrams_model:
            possible_continuations = trigrams_model[current_pair]
            
            next_word = random.choice(possible_continuations)[0]
            generated_text.append(next_word)
            current_pair = (generated_text[-2], generated_text[-1])
            if debug:
                print("Possible continuations:", len(possible_continuations))
                print(possible_continuations)
                print("Next word:", next_word)
                print("Generated text:", " ".join(generated_text))
                print("New generated pair:", generated_text[-2], generated_text[-1])
                print("---" * 26)
        else:
            print("No continuation available")
            break 
    
    return ' '.join(generated_text)

start_seed = ('assistant', 'maria') 
generated_text = generate_sentence_trigram(trigrams_model, start_seed, 5, debug=True) 
print("Result:", generated_text)

Possible continuations: 1
[('edgeworth', 1)]
Next word: edgeworth
Generated text: assistant maria edgeworth
New generated pair: maria edgeworth
------------------------------------------------------------------------------
Possible continuations: 1
[('orphans', 1)]
Next word: orphans
Generated text: assistant maria edgeworth orphans
New generated pair: edgeworth orphans
------------------------------------------------------------------------------
Possible continuations: 1
[('near', 1)]
Next word: near
Generated text: assistant maria edgeworth orphans near
New generated pair: orphans near
------------------------------------------------------------------------------
Result: assistant maria edgeworth orphans near


## Create and train Word2Vec model

In [52]:
import torch
import torch.optim as optim
import numpy as np


def build_vocabulary(tokenized_training_text):
    vocabulary = set()
    for token in tokenized_training_text:
        vocabulary.add(token)
    return list(vocabulary)


tokenized_text = clean_edgeworth_tokens

vocabulary = build_vocabulary(tokenized_text)
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

In [61]:
def get_input_layer(word_idx, vocabulary_size):
    x = torch.zeros(vocabulary_size)
    x[word_idx] = 1.0
    return x


def create_idx_pairs(tokens, word2idx, window_size=1):
    idx_pairs = []
    for i in range(len(tokens)):
        token = tokens[i]
        for neighbor in tokens[max(i - window_size, 0):min(i + window_size + 1, len(tokens))]:
            if neighbor != token:
                idx_pairs.append((word2idx[token], word2idx[neighbor]))
    return idx_pairs

idx_pairs = create_idx_pairs(clean_edgeworth_tokens, word2idx)

In [63]:
len(idx_pairs)

155290

In [64]:
class SkipGramModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)  # Embedding layer
        self.linear = nn.Linear(embedding_dim, vocab_size)  # Prediction layer

    def forward(self, inputs):
        embeds = self.embeddings(inputs)  # Convert words to embeddings
        out = self.linear(embeds)  # Predict context words
        return out

In [69]:
# Model parameters
vocab_size = len(vocabulary)
embedding_dim = 100  # Adjust based on experimentation
learning_rate = 0.001
epochs = 50  # Adjust based on experimentation

# Initialize model, loss, and optimizer
model = SkipGramModel(vocab_size, embedding_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Convert idx_pairs to a tensor for more efficient processing
idx_pairs_tensor = torch.tensor(idx_pairs, dtype=torch.long)

# Training loop
for epoch in range(epochs):
    total_loss = 0
    for center, context in tqdm(idx_pairs_tensor):
        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        log_probs = model(center.unsqueeze(0))  # Center word needs to be wrapped in a tensor

        # Compute the loss
        loss = criterion(log_probs, context.unsqueeze(0))  # Context word needs to be wrapped in a tensor

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    # Print loss every epoch
    print(f'Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(idx_pairs)}')


  0%|          | 0/155290 [00:00<?, ?it/s]

  7%|▋         | 10159/155290 [01:14<17:40, 136.89it/s]


KeyboardInterrupt: 

## Another datasets

In [71]:
shakham = nltk.corpus.gutenberg.words('melville-moby_dick.txt')
#filter from punctuation:
shakham_filt = [token for token in shakham if token not in string.punctuation]

In [72]:
# Token count
len(shakham), len(shakham_filt)

(260819, 221767)

In [12]:
austen = nltk.corpus.gutenberg.words('austen-emma.txt')
#filter from punctuation:
austen_filt = [token for token in austen if token not in string.punctuation]

In [19]:
# Token count
len(austen), len(austen_filt)

(192427, 167028)

In [15]:
whitman = nltk.corpus.gutenberg.words('whitman-leaves.txt')
#filter from punctuation:
whitman_filt = [token for token in whitman if token not in string.punctuation]

In [17]:
# Token count
len(whitman), len(whitman_filt)

(154883, 127995)