# 1. Word Vector Initialization

In [160]:
import numpy as np
np.random.seed(1)  # Seed NumPy's legacy global RNG so the draws below are repeatable

# Toy vocabulary: ["cat", "dog", "mat"]
V = 3  # number of words in the vocabulary
d = 3  # length of each word vector

# Each word gets a Gaussian vector scaled by its own factor.
# NOTE(review): "cat" is scaled by 0.5 while the others use 0.1 -- confirm this is intentional.
init_scales = (("cat", 0.5), ("dog", 0.1), ("mat", 0.1))
word_vectors = {word: np.random.randn(d) * scale for word, scale in init_scales}
word_vectors

{'cat': array([ 0.81217268, -0.30587821, -0.26408588]),
 'dog': array([-0.10729686,  0.08654076, -0.23015387]),
 'mat': array([ 0.17448118, -0.07612069,  0.03190391])}

# 2. How These Vectors Are Used in RNNs

# Step 1: Convert each word to its vector using a lookup table (embedding layer)

In [54]:
# Embedding lookup table, randomly initialized.
# Shape: [vocab_size x embedding_dim]
embedding_layer = 0.1 * np.random.randn(V, d)
print(embedding_layer)

# Row index of each word in the table ("cat" is assumed to be the 0th word)
cat_idx, dog_idx, mat_idx = 0, 1, 2

# Look up each word's vector by indexing its row
cat_vector, dog_vector, mat_vector = (
    embedding_layer[row] for row in (cat_idx, dog_idx, mat_idx)
)

[[-0.09357694 -0.02678881  0.05303555]
 [-0.06916608 -0.03967535 -0.06871727]
 [-0.08452056 -0.06712461 -0.00126646]]


In [55]:
# Display the embedding row that was looked up for "mat"
mat_vector

array([-0.08452056, -0.06712461, -0.00126646])

# Step 2: Pass to RNN

In [56]:
# Example sentence: ["cat", "dog"], with each word looked up in word_vectors
sentence = ("cat", "dog")
input_vectors = [word_vectors[token] for token in sentence]
print(input_vectors)

[array([ 0.81217268, -0.30587821, -0.26408588]), array([-0.10729686,  0.08654076, -0.23015387])]


In [162]:
# Initial hidden state: a zero vector with d entries
h_prev = np.zeros(d)
h_prev.shape

(3,)

In [161]:
# 3x3 weight matrix applied to each input word vector, drawn from a standard normal
W_xh = np.random.randn(3, 3) 
W_xh 

array([[-0.24937038,  1.46210794, -2.06014071],
       [-0.3224172 , -0.38405435,  1.13376944],
       [-1.09989127, -0.17242821, -0.87785842]])

In [60]:
# 3x3 weight matrix applied to the previous hidden state, drawn from a standard normal
W_hh = np.random.randn(3, 3)

In [61]:
# Bias vector added inside the tanh of the recurrence
b_h = np.random.randn(3)  # Shape: [hidden_dim]

In [63]:
# Run the recurrence over the sentence, starting from a zero hidden state:
#   h_t = tanh(W_xh . x_t + W_hh . h_{t-1} + b_h)
h_prev = np.zeros(d)
for x_t in input_vectors:
    pre_activation = np.dot(W_xh, x_t) + np.dot(W_hh, h_prev) + b_h
    h_prev = np.tanh(pre_activation)
h_prev


array([-0.29018794,  0.78296709,  0.42265808])

# RNN Implementation

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string


# Data Preparation

In [163]:
text = "This is GeeksforGeeks a software training institute Benefits of GeeksforGeeks are: 1. Learn from the best instructors 2. Get hands-on experience 3. Get a certificate of completion"

# Normalize: lowercase, drop digit characters, then strip punctuation
text = ''.join(ch for ch in text.lower() if not ch.isdigit())
text = text.translate(str.maketrans('', '', string.punctuation))
words = word_tokenize(text)

# Remove English stopwords
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token not in stop_words, words))
print(filtered_words)

# Word <-> index mappings over the sorted vocabulary; indices start at 1 (0 = padding)
vocab = sorted(set(filtered_words))
word_to_idx = {word: idx for idx, word in enumerate(vocab, start=1)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

['geeksforgeeks', 'software', 'training', 'institute', 'benefits', 'geeksforgeeks', 'learn', 'best', 'instructors', 'get', 'handson', 'experience', 'get', 'certificate', 'completion']


In [167]:
# Display the tokens that remain after stopword removal
filtered_words

['geeksforgeeks',
 'software',
 'training',
 'institute',
 'benefits',
 'geeksforgeeks',
 'learn',
 'best',
 'instructors',
 'get',
 'handson',
 'experience',
 'get',
 'certificate',
 'completion']

In [168]:
# Build training pairs: each window of seq_length words (X) is paired with
# the word that immediately follows it (y).
seq_length = 3  # number of context words used to predict the next word
X = []
y = []
num_windows = len(filtered_words) - seq_length
for start in range(num_windows):
    stop = start + seq_length
    seq, label = filtered_words[start:stop], filtered_words[stop]
    print(seq)
    print(label)
    X.append([word_to_idx[w] for w in seq])
    y.append(word_to_idx[label])
# Convert the index lists to numpy arrays
x = np.array(X)
y = np.array(y)

['geeksforgeeks', 'software', 'training']
institute
['software', 'training', 'institute']
benefits
['training', 'institute', 'benefits']
geeksforgeeks
['institute', 'benefits', 'geeksforgeeks']
learn
['benefits', 'geeksforgeeks', 'learn']
best
['geeksforgeeks', 'learn', 'best']
instructors
['learn', 'best', 'instructors']
get
['best', 'instructors', 'get']
handson
['instructors', 'get', 'handson']
experience
['get', 'handson', 'experience']
get
['handson', 'experience', 'get']
certificate
['experience', 'get', 'certificate']
completion


In [103]:
# Display the index-encoded input windows (x) and their next-word labels (y)
x,y

(array([[ 6, 12, 13],
        [12, 13,  9],
        [13,  9,  1],
        [ 9,  1,  6],
        [ 1,  6, 11],
        [ 6, 11,  2],
        [11,  2, 10],
        [ 2, 10,  7],
        [10,  7,  8],
        [ 7,  8,  5],
        [ 8,  5,  7],
        [ 5,  7,  3]]),
 array([ 9,  1,  6, 11,  2, 10,  7,  8,  5,  7,  3,  4]))

In [104]:
# Map the first input window's indices back to words via idx_to_word
print([idx_to_word[idx] for idx in x[0]])

['geeksforgeeks', 'software', 'training']


# Step 4: Define the Vocabulary Size and Embedding Dimension (labels stay as integer indices; no one-hot encoding is applied)

In [172]:
# Model hyperparameters
vocab_size = len(vocab) + 1  # +1 because index 0 is reserved for padding
print(vocab_size)
embedding_dim = 8  # output dimension passed to the Embedding layer

14


# Build a RNN model

In [179]:
# Next-word classifier: Embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only triggers a warning) and is
# optional in earlier versions, so it is omitted; the sequence length is
# inferred from the data on the first call to fit/predict.
# mask_zero=True treats index 0 as padding.
model.add(Embedding(vocab_size, embedding_dim, mask_zero=True))
model.add(SimpleRNN(7, activation='tanh'))  # 7 recurrent units
model.add(Dense(vocab_size, activation='softmax'))  # one score per vocabulary index



In [180]:
# Labels are integer word indices, hence the sparse variant of cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 50 epochs with 2 samples per batch
model.fit(x, y, batch_size=2, epochs=50)

Epoch 1/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 34ms/step - accuracy: 0.1155 - loss: 2.6331  
Epoch 2/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.0560 - loss: 2.6148   
Epoch 3/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.1536 - loss: 2.6272    
Epoch 4/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.2429 - loss: 2.5897
Epoch 5/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.1952 - loss: 2.5873     
Epoch 6/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.3821 - loss: 2.5659
Epoch 7/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5131 - loss: 2.5576
Epoch 8/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.3048 - loss: 2.5718
Epoch 9/50
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[

<keras.src.callbacks.history.History at 0x2c594546e10>

In [182]:
# Print the model's summary
model.summary()

In [177]:
# Context words for next-word prediction; every word must exist in the vocabulary
input_seq = ["training", "institute", "benefits"]
input_idx = list(map(word_to_idx.__getitem__, input_seq))
print(input_idx)

[13, 9, 1]


In [178]:
# Predict a score for every vocabulary index for a batch containing one sequence
pred = model.predict(np.array([input_idx]))
print(pred)
# Take the arg-max over the single batch row explicitly rather than over the
# flattened 2-D array.
a = int(np.argmax(pred[0]))
# Index 0 is the padding slot and has no entry in idx_to_word; fall back to a
# placeholder instead of raising KeyError if the model happens to pick it.
pred_word = idx_to_word.get(a, "<pad>")
print(f"Next word: {pred_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
[[0.02843598 0.03574641 0.07744852 0.04097601 0.02225319 0.04058674
  0.24685675 0.20690927 0.04952041 0.08318052 0.07470404 0.03824499
  0.02337885 0.03175832]]
Next word: geeksforgeeks


# Building RNN and LSTM on your own

In [202]:
# Sample raw text containing stray punctuation.
# NOTE(review): the next cell reassigns `text` before this value is used.
text = "GeeksforGeeks is a ....computer science '''   portal for geeks"

# Clean the data

In [468]:
text = "I am chetan Fernandis. Learning Datascience and Generatiiev AI"

# Lowercase and drop digit characters, then show the intermediate text
text = ''.join(ch for ch in text.lower() if not ch.isdigit())
print(text)

# Strip punctuation and tokenize
text = text.translate(str.maketrans('', '', string.punctuation))
words = word_tokenize(text)
print(words)

# Remove English stopwords
stop_words = set(stopwords.words('english'))
filtered_words = list(filter(lambda token: token not in stop_words, words))

# Word <-> index mappings over the sorted vocabulary; indices start at 1
vocab = sorted(set(filtered_words))
word_to_idx = {word: idx for idx, word in enumerate(vocab, start=1)}
idx_to_word = {idx: word for word, idx in word_to_idx.items()}



i am chetan fernandis. learning datascience and generatiiev ai
['i', 'am', 'chetan', 'fernandis', 'learning', 'datascience', 'and', 'generatiiev', 'ai']


In [469]:
# Display the tokens that remain after stopword removal
filtered_words

['chetan', 'fernandis', 'learning', 'datascience', 'generatiiev', 'ai']

In [470]:
seq_length = 2  # number of context words per sample

# Slide a window of seq_length words over the tokens; the word right after
# each window is that window's label.
windows = [
    (filtered_words[start:start + seq_length], filtered_words[start + seq_length])
    for start in range(len(filtered_words) - seq_length)
]
x = np.array([[word_to_idx[w] for w in context] for context, _ in windows])
y = np.array([word_to_idx[target] for _, target in windows])

# Define RNN

In [471]:
embedding_dim = 5  # output dimension passed to the Embedding layer
vocab_size = len(vocab) + 1  # +1 because word indices start at 1, leaving index 0 unused by words
vocab_size

7

In [472]:
# Next-word classifier: Embedding -> SimpleRNN -> softmax over the vocabulary.
model = Sequential()
# `input_length` is deprecated in Keras 3 (it only triggers a warning) and is
# optional in earlier versions, so it is omitted; the sequence length is
# inferred from the data on the first call to fit.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True))
# return_sequences=False: only the final hidden state is passed on
model.add(SimpleRNN(7, activation='tanh', return_sequences=False))
model.add(Dense(vocab_size, activation='softmax'))
# Labels are integer word indices, hence the sparse variant of cross-entropy
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x, y, epochs=50, batch_size=2)
model.summary()

Epoch 1/50




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 74ms/step - accuracy: 0.0000e+00 - loss: 1.9299
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step - accuracy: 0.0000e+00 - loss: 1.9203
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - accuracy: 0.0000e+00 - loss: 1.9286
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.3333 - loss: 1.9220    
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.3333 - loss: 1.9155    
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.3333 - loss: 1.9090    
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.5000 - loss: 1.9029 
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.6667 - loss: 1.8870
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━

# Prediction

In [473]:
# Display the sorted vocabulary
vocab

['ai', 'chetan', 'datascience', 'fernandis', 'generatiiev', 'learning']

In [474]:
# Context words for prediction; every word must exist in the vocabulary
input_seq = ["datascience", "generatiiev"]
input_array = [word_to_idx[w] for w in input_seq]
print(input_array)
# Predict scores for a batch containing this single sequence
result = model.predict(np.array([input_array]))
# Take the arg-max over the single batch row explicitly rather than over the
# flattened 2-D array.
output = int(np.argmax(result[0]))
# Word indices start at 1, so index 0 has no entry in idx_to_word; fall back
# to a placeholder instead of raising KeyError if the model picks it.
pred_word = idx_to_word.get(output, "<pad>")
print(pred_word)

[3, 5]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
ai


In [475]:
# Shape of the array returned by model.predict
result.shape

(1, 7)

# Sequence-to-sequence: predicting the next 2 words for the example above

In [476]:
# Vocabulary size plus one; word indices start at 1, so index 0 is unused by words
vocab_size = len(vocab) + 1
vocab_size

7

In [477]:
# Build (input window -> next `output_tokens` words) training pairs.
x1 = []
y1 = []
seq_length = 2  # Input sequence length
output_tokens = 2  # Number of words to predict after each input window

# Number of windows that still leave room for a full set of target words
num_samples = len(filtered_words) - seq_length - output_tokens + 1
for i in range(num_samples):
    seq = filtered_words[i:i + seq_length]

    # Target tokens: the `output_tokens` words that follow the input window
    labels = filtered_words[i + seq_length : i + seq_length + output_tokens]
    print(f"{seq}:{labels}")

    # Convert words to indices
    x1.append([word_to_idx[w] for w in seq])
    y1.append([word_to_idx[w] for w in labels])

x1 = np.array(x1)  # Shape: (n_samples, seq_length)
y1 = np.array(y1)  # Shape: (n_samples, output_tokens)


['chetan', 'fernandis']:['learning', 'datascience']
['fernandis', 'learning']:['datascience', 'generatiiev']
['learning', 'datascience']:['generatiiev', 'ai']


In [383]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Reshape

In [478]:
# Two-word predictor: Embedding -> SimpleRNN -> one score vector per output position.
model = Sequential()
# `input_length` is deprecated in Keras 3 and optional earlier, so it is omitted.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=False))
# return_sequences=False: only the final hidden state is passed on
model.add(SimpleRNN(7, activation='tanh', return_sequences=False))
# Emit raw scores for every (position, word) pair. Softmax is NOT applied here:
# doing so would normalize across all vocab_size * output_tokens units at once,
# so the scores for each output position would not form their own distribution.
model.add(Dense(vocab_size * output_tokens))
model.add(Reshape((output_tokens, vocab_size)))
# Normalize over the vocabulary axis separately for each output position
model.add(tf.keras.layers.Softmax(axis=-1))
# Labels are integer word indices, hence the sparse variant of cross-entropy
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(x1, y1, epochs=50, batch_size=2)
model.summary()

Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 85ms/step - accuracy: 0.1944 - loss: 1.9488
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - accuracy: 0.1944 - loss: 1.9449
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step - accuracy: 0.2222 - loss: 1.9425     
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 94ms/step - accuracy: 0.3889 - loss: 1.9351
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.4167 - loss: 1.9340 
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 99ms/step - accuracy: 0.6111 - loss: 1.9296
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.6944 - loss: 1.9179
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.6111 - loss: 1.9211
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

In [479]:
# Display the cleaned text (lowercased, digits and punctuation removed)
text

'i am chetan fernandis learning datascience and generatiiev ai'

In [480]:
# Display the sorted vocabulary
vocab

['ai', 'chetan', 'datascience', 'fernandis', 'generatiiev', 'learning']

In [482]:
# Context words for prediction; every word must exist in the vocabulary
input_seq = ["learning", "datascience"]
input_array = [word_to_idx[w] for w in input_seq]
result = model.predict(np.array([input_array]))  # Shape: (1, output_tokens, vocab_size)
for i in range(result.shape[1]):  # For each output position
    output_idx = int(np.argmax(result[0, i]))  # Most likely word index at position i
    # Word indices start at 1, so index 0 has no entry in idx_to_word; fall
    # back to a placeholder instead of raising KeyError if the model picks it.
    pred_word = idx_to_word.get(output_idx, "<pad>")
    print(f"Predicted word {i+1}: {pred_word}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 150ms/step
Predicted word 1: generatiiev
Predicted word 2: ai
