In [22]:
import tensorflow as tf

def one_hot_encode_tensorflow(text, vocab):
    """
    Encodes text tokens into one-hot vectors using TensorFlow.

    Args:
        text: A list of tokens (words) in the text.
        vocab: A dictionary mapping words to their indices in the vocabulary.

    Returns:
        A tensor of shape (len(text), vocab_size) containing one-hot encoded vectors.
    """
    vocab_size = len(vocab)
    indices = [vocab[word] for word in text if word in vocab]
    encoded_text = tf.one_hot(indices, depth=vocab_size, dtype=tf.float32)
    return encoded_text

# Example usage
text = ["hello", "world", "how", "are", "you"]
word_to_ix = {word: i for i, word in enumerate(text)}
encoded_text = one_hot_encode_tensorflow(text, word_to_ix)
print(encoded_text)

tf.Tensor(
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]], shape=(5, 5), dtype=float32)


In [13]:
import torch

# Define a simple list of words
vocab = ['hello', 'world', 'its', 'a', 'beautiful', 'day']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Function to convert words to one-hot vectors
def one_hot_encode(word, word_to_ix, vocab_size):
    vec = torch.zeros(vocab_size)
    vec[word_to_ix[word]] = 1
    return vec

# Example usage
one_hot_hello = one_hot_encode('day', word_to_ix, len(vocab))
print(one_hot_hello)


tensor([0., 0., 0., 0., 0., 1.])


In [9]:
import tensorflow as tf

# Define a simple list of words
vocab = ['hello', 'world']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Function to convert words to one-hot vectors
def one_hot_encode(word, word_to_ix, vocab_size):
    vec = tf.one_hot(word_to_ix[word], depth=vocab_size)
    return vec

# Example usage
one_hot_hello = one_hot_encode('hello', word_to_ix, len(vocab))
print(one_hot_hello)


tf.Tensor([1. 0.], shape=(2,), dtype=float32)


In [8]:
import torch
import torch.nn as nn

# Define a simple list of words
vocab = ['hello', 'world']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Embedding layer
embedding_dim = 5
embeddings = nn.Embedding(len(vocab), embedding_dim)
print(word_to_ix)
print(embeddings)

# Example usage
hello_idx = torch.tensor([word_to_ix['hello']], dtype=torch.long)
hello_embed = embeddings(hello_idx)
print(hello_embed)


{'hello': 0, 'world': 1}
Embedding(2, 5)
tensor([[-2.1952, -0.2398,  0.0591, -2.0666, -0.3344]],
       grad_fn=<EmbeddingBackward0>)


In [15]:
import torch
import torch.nn as nn

# Define a simple list of words
vocab = ['hello', 'world', 'how', 'are', 'you']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Embedding layer
embedding_dim = 5
embeddings = nn.Embedding(len(vocab), embedding_dim)

# Example usage
hello_idx = torch.tensor([word_to_ix['hello']], dtype=torch.long)
hello_embed = embeddings(hello_idx)
print(hello_embed)

tensor([[ 1.3312, -0.9726, -1.6614, -0.4809, -0.4103]],
       grad_fn=<EmbeddingBackward0>)


In [24]:
import tensorflow as tf

# Define a simple list of words
vocab = ['hello', 'world', 'how', 'are', 'you']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Embedding layer
embedding_dim = 5
embeddings = tf.keras.layers.Embedding(input_dim=len(vocab), output_dim=embedding_dim)

# Example usage
hello_idx = tf.constant([word_to_ix['hello']])
hello_embed = embeddings(hello_idx)
print(hello_embed)

tf.Tensor([[ 0.04670728  0.04406584  0.04991266 -0.0131113   0.00373048]], shape=(1, 5), dtype=float32)


In [26]:
vocab = ['hello', 'world', 'how', 'are', 'you']
word_to_ix = {word: i for i, word in enumerate(vocab)}

# Function to convert words to integers
def integer_encode(word, word_to_ix):
    return word_to_ix[word]

# Example usage
hello_int = integer_encode('you', word_to_ix)
print(hello_int)

4


In [27]:
import torch

# Function to convert word to character-level one-hot vectors
def char_one_hot_encode(word, char_to_ix, vocab_size):
    char_vectors = []
    for char in word:
        vec = torch.zeros(vocab_size)
        vec[char_to_ix[char]] = 1
        char_vectors.append(vec)
    return torch.stack(char_vectors)

# Define characters and create index
chars = list(set('hello world'))
char_to_ix = {char: i for i, char in enumerate(chars)}

# Example usage
char_vectors = char_one_hot_encode('hello', char_to_ix, len(chars))
print(char_vectors)

tensor([[0., 0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0.]])


In [28]:
import tensorflow as tf

# Function to convert word to character-level one-hot vectors
def char_one_hot_encode(word, char_to_ix, vocab_size):
    char_vectors = []
    for char in word:
        vec = tf.one_hot(char_to_ix[char], depth=vocab_size)
        char_vectors.append(vec)
    return tf.stack(char_vectors)

# Define characters and create index
chars = list(set('hello world'))
char_to_ix = {char: i for i, char in enumerate(chars)}

# Example usage
char_vectors = char_one_hot_encode('hello', char_to_ix, len(chars))
print(char_vectors)

tf.Tensor(
[[0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]], shape=(5, 8), dtype=float32)


In [29]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Define the tokenizer
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Training data
training_data = ["hello world"]

# Trainer
trainer = trainers.BpeTrainer(vocab_size=50, min_frequency=2)

# Train the tokenizer
tokenizer.train_from_iterator(training_data, trainer)

# Encode
encoded = tokenizer.encode("hello world")
print(encoded.ids)
print(encoded.tokens)

[7, 2, 1, 3, 3, 4, 7, 6, 4, 5, 3, 0]
['Ġ', 'h', 'e', 'l', 'l', 'o', 'Ġ', 'w', 'o', 'r', 'l', 'd']


In [31]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Define the tokenizer using Byte-Pair Encoding (BPE)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Training data
training_data = ["hello world"]

# Trainer setup
trainer = trainers.BpeTrainer(vocab_size=50, min_frequency=2)

# Train the tokenizer
tokenizer.train_from_iterator(training_data, trainer)

# Encode the text
encoded = tokenizer.encode("hello world")
print("Encoded IDs:", encoded.ids)
print("Encoded tokens:", encoded.tokens)

Encoded IDs: [7, 2, 1, 3, 3, 4, 7, 6, 4, 5, 3, 0]
Encoded tokens: ['Ġ', 'h', 'e', 'l', 'l', 'o', 'Ġ', 'w', 'o', 'r', 'l', 'd']


In [30]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers

# Define the tokenizer using Byte-Pair Encoding (BPE)
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
tokenizer.decoder = decoders.ByteLevel()

# Training data
training_data = ["hello world"]

# Trainer setup
trainer = trainers.BpeTrainer(vocab_size=50, min_frequency=2)

# Train the tokenizer
tokenizer.train_from_iterator(training_data, trainer)

# Encode the text
encoded = tokenizer.encode("hello world")
print("Encoded IDs:", encoded.ids)

Encoded IDs: [7, 2, 1, 3, 3, 4, 7, 6, 4, 5, 3, 0]


In [36]:
from transformers import BertTokenizer
import torch

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example text
text = "unaffable"

# Encode the text using WordPiece encoding
encoded = tokenizer.encode(text, add_special_tokens=False)
encoded_tokens = tokenizer.convert_ids_to_tokens(encoded)

# Convert to PyTorch tensor
encoded_tensor = torch.tensor(encoded)

print("Encoded IDs:", encoded)
print("Encoded Tokens:", encoded_tokens)
print("Encoded Tensor:", encoded_tensor)

Encoded IDs: [14477, 20961, 3468]
Encoded Tokens: ['una', '##ffa', '##ble']
Encoded Tensor: tensor([14477, 20961,  3468])


In [35]:
from transformers import BertTokenizer
import tensorflow as tf

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example text
text = "unaffable"

# Encode the text using WordPiece encoding
encoded = tokenizer.encode(text, add_special_tokens=False)
encoded_tokens = tokenizer.convert_ids_to_tokens(encoded)

# Convert to TensorFlow tensor
encoded_tensor = tf.constant(encoded)

print("Encoded IDs:", encoded)
print("Encoded Tokens:", encoded_tokens)
print("Encoded Tensor:", encoded_tensor)

Encoded IDs: [14477, 20961, 3468]
Encoded Tokens: ['una', '##ffa', '##ble']
Encoded Tensor: tf.Tensor([14477 20961  3468], shape=(3,), dtype=int32)


In [34]:
import torch

# Sample data
text = 'unaffable'

# Create a vocabulary (usually learned from a larger corpus)
vocab = ['un', 'aff', 'able', 'a', 'affable']
vocab_to_index = {word: idx for idx, word in enumerate(vocab)}

# WordPiece Encoding
def wordpiece_tokenize(text, vocab):
    tokens = []
    i = 0
    while i < len(text):
        for j in range(len(text), i, -1):
            if text[i:j] in vocab:
                tokens.append(text[i:j])
                i = j - 1
                break
        i += 1
    return tokens

# Encode text
wordpiece_encoded = wordpiece_tokenize(text, vocab)

# Convert to indices
wordpiece_indices = [vocab_to_index[token] for token in wordpiece_encoded]

# Convert to PyTorch tensor
wordpiece_tensor = torch.tensor(wordpiece_indices, dtype=torch.long)
print(wordpiece_tensor)

tensor([0, 4])
