In [41]:
import nltk
nltk.download('brown')

import numpy as np
import random
import re
import pickle

from nltk.corpus import brown
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [42]:
# sentences = brown.sents()
# sentences = [' '.join(s).lower() for s in sentences]


In [43]:
from nltk.corpus import brown
# Report how many tokenised sentences the Brown corpus exposes in total.
print(len(brown.sents()))


57340


In [45]:
def get_clean_sentences(limit=2000, sentences=None):
    """Return lower-cased, letters-only sentence strings.

    Each tokenised sentence is joined with single spaces, lower-cased, and
    stripped of every character other than ``a``-``z`` and whitespace.
    Whitespace runs left behind by removed punctuation tokens are collapsed
    to a single space, and sentences that end up empty are dropped.

    Args:
        limit: Maximum number of source sentences to read, counted from the
            start of the input.
        sentences: Optional sequence of token lists to clean. When ``None``
            (the default) the sentences are taken from ``brown.sents()``.

    Returns:
        A list of cleaned sentence strings. It can be shorter than ``limit``
        because sentences with no remaining letters are skipped.
    """
    if sentences is None:
        sentences = brown.sents()

    clean = []
    for sent in sentences[:limit]:
        s = " ".join(sent).lower()
        s = re.sub(r"[^a-z\s]", "", s)
        # Deleting punctuation-only tokens leaves doubled spaces behind;
        # split()/join collapses them and trims both ends in one step.
        s = " ".join(s.split())
        if s:
            clean.append(s)
    return clean

# Clean the default-sized slice of the corpus and print the first sentence
# as a quick sanity check of the cleaning rules.
clean_sentences = get_clean_sentences()
print(clean_sentences[0])


the fulton county grand jury said friday an investigation of atlantas recent primary election produced  no evidence  that any irregularities took place


In [46]:

from tensorflow.keras.preprocessing.text import Tokenizer

# Fit a word-level tokenizer on the cleaned sentences; its word_index is the
# vocabulary used by every later cell.
# NOTE(review): no oov_token is configured, so words outside this vocabulary
# are presumably dropped by texts_to_sequences -- confirm against Keras docs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sentences)


In [47]:
# Keep a handle on the word -> integer-id mapping and report its size.
word_index = tokenizer.word_index
print("Total unique words:", len(word_index))


Total unique words: 6893


In [48]:
# Expand each sentence into all of its prefixes of length >= 2, so that the
# last id of every prefix can serve as the "next word" for the ids before it.
sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(clean_sentences)
    for end in range(2, len(token_ids) + 1)
]


In [49]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad every prefix up to the length of the longest one.
max_len = max(map(len, sequences))
sequences = pad_sequences(sequences, maxlen=max_len, padding='pre')

# The final column is the word to predict; everything before it is the input.
X, y = sequences[:, :-1], sequences[:, -1]


In [50]:
# One slot more than the number of known words, leaving room for index 0,
# which the 'pre' padding above fills in.
# NOTE(review): assumes Tokenizer word ids start at 1 -- confirm against Keras docs.
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)


Vocabulary size: 6894


In [51]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the target word ids. The ids are read from the last column
# of the padded `sequences` array instead of from `y` itself, so re-running
# this cell cannot one-hot encode an already one-hot matrix.
# NOTE(review): this materialises a dense (n_samples, vocab_size) matrix;
# sparse integer targets would need far less memory if the loss is changed.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)


In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input

# Next-word predictor: embedding -> single LSTM -> softmax over the vocabulary.
model = Sequential()
model.add(Input(shape=(max_len-1,)))  # inputs are the padded prefixes minus the target column
model.add(Embedding(input_dim=vocab_size, output_dim=100))
model.add(LSTM(150))
model.add(Dense(vocab_size, activation='softmax'))

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()


In [53]:
# Train on every prefix/next-word pair; no validation data is held out, so the
# reported accuracy is training accuracy only.
history = model.fit(X, y, epochs=20, batch_size=64, verbose=1)


Epoch 1/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 64ms/step - accuracy: 0.0685 - loss: 7.5104
Epoch 2/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 64ms/step - accuracy: 0.0864 - loss: 6.7547
Epoch 3/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 91ms/step - accuracy: 0.1029 - loss: 6.4158
Epoch 4/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 93ms/step - accuracy: 0.1182 - loss: 6.0998
Epoch 5/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 76ms/step - accuracy: 0.1281 - loss: 5.8385
Epoch 6/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 71ms/step - accuracy: 0.1406 - loss: 5.5064
Epoch 7/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 71ms/step - accuracy: 0.1543 - loss: 5.2037
Epoch 8/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 72ms/step - accuracy: 0.1698 - loss: 4.9200
Epoch 9/20
[1m553/553[

In [54]:
# List which metrics were recorded per epoch during training.
print(history.history.keys())


dict_keys(['accuracy', 'loss'])


In [55]:
# Report the last-epoch value of each tracked training metric.
for label, metric in (("Final loss:", "loss"), ("Final accuracy:", "accuracy")):
    print(label, history.history[metric][-1])


Final loss: 2.2587697505950928
Final accuracy: 0.556269109249115


In [56]:
# Number of distinct words known to the tokenizer (excludes the padding id).
print(len(tokenizer.word_index))


6893


In [57]:
# Length every sequence was padded to; the model input is one shorter.
print(max_len)


67


In [58]:
def generate_text(seed_text, next_words=10):
    """Greedily extend ``seed_text`` with up to ``next_words`` predicted words.

    On every step the current text is tokenised with the module-level
    ``tokenizer``, left-padded to the model's input length (``max_len - 1``),
    and the most probable next word according to ``model`` is appended.

    Args:
        seed_text: Starting text.
        next_words: Maximum number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an index that has no word (such as the padding index 0).
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')

        # predict() returns one probability row per input sequence; select the
        # only row explicitly instead of taking argmax over the flattened batch.
        probabilities = model.predict(token_list, verbose=0)[0]
        predicted = int(np.argmax(probabilities))

        # index_word is the tokenizer's reverse mapping, so no scan over
        # word_index is needed for each generated word.
        word = tokenizer.index_word.get(predicted)
        if word is None:
            # The text did not change, so every further step would repeat this
            # exact prediction; stop instead of calling the model again.
            break
        seed_text += " " + word
    return seed_text


In [59]:
# Try the generator on a couple of seed phrases with different lengths.
for seed, n_words in (("the government", 10), ("he was", 12)):
    print(generate_text(seed, n_words))


the government was the noted exotic dancer patti waggin who was a
he was married on the wedding club for the first of the sixth on


In [61]:
# Write the trained model to disk twice, once under a .h5 file name and once
# under a .keras file name.
model.save("language_model_lstm.h5")
model.save("language_model_lstm.keras")




In [62]:
import pickle
# Persist the fitted tokenizer next to the model so the same word <-> id
# mapping can be restored later. Pickle files can execute code when loaded,
# so only load this file back from a trusted location.
with open("tokenizer_lm.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


In [63]:
# Confirmation message printed after the save cells have run.
print("Step-1 Language Model & Tokenizer saved successfully.")


Step-1 Language Model & Tokenizer saved successfully.


In [30]:
# def sample_with_temperature(preds, temperature=0.8):
#     preds = np.asarray(preds).astype('float64')
#     preds = np.log(preds + 1e-8) / temperature
#     exp_preds = np.exp(preds)
#     preds = exp_preds / np.sum(exp_preds)
#     return np.random.choice(len(preds), p=preds)


In [36]:
# def generate_text(seed_text, next_words=10, temperature = 0.8):
#     for _ in range(next_words):
#         token_list = tokenizer.texts_to_sequences([seed_text])[0]
#         token_list = pad_sequences([token_list], maxlen=max_len-1, padding='pre')

#         preds = model.predict(token_list, verbose=0)[0]
#         predicted = sample_with_temperature(preds, temperature)


#         for word, index in tokenizer.word_index.items():
#             if index == predicted:
#                 seed_text += " " + word
#                 break
#     return seed_text
# # NOTE: When this function is used, some of the generated words look random, because sampling also considers words other than the single highest-probability one.

In [37]:
# print(generate_text("the government", 15, temperature=0.7))
# print(generate_text("the government", 15, temperature=1.0))
# print(generate_text("the government", 15, temperature=1.2))



the government would increase from the balkan strings playing for a movement of which the democratic gubernatorial
the government was in marriage along with may only the official boards four or now that included
the government would not be special increase in portland would been for my children in the world


In [14]:
# #  MUST be written AFTER tokenization & padding
# vocab_size = len(tokenizer.word_index) + 1

# model = Sequential([
#     Embedding(
#         input_dim=vocab_size,
#         output_dim=64,
#         input_length=max_len
#     ),
#     Bidirectional(LSTM(128, return_sequences=True)),
#     Dense(vocab_size, activation='softmax')
# ])

# model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',
#     metrics=['accuracy']
# )

# # model.summary()



In [15]:
# print("Vocab size:", vocab_size)
# print("Max sequence length:", max_len)
# print("X_train shape:", X_train.shape)
# print("y_train shape:", y_train.shape)


Vocab size: 32
Max sequence length: 399
X_train shape: (4000, 399)
y_train shape: (4000, 399, 1)


In [10]:
# y_train = np.expand_dims(y_train, -1)
# y_test = np.expand_dims(y_test, -1)


In [11]:
# indices = np.arange(len(X_train))
# np.random.shuffle(indices)

# X_train = X_train[indices]
# y_train = y_train[indices]


In [12]:
# model = Sequential([
#     Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=64),
#     Bidirectional(LSTM(128, return_sequences=True)),
#     Dense(len(tokenizer.word_index)+1, activation='softmax')
# ])

# model.compile(
#     optimizer='adam',
#     loss='sparse_categorical_crossentropy',
#     metrics=['accuracy']
# )

# model.summary()
