<a href="https://colab.research.google.com/github/AnshJindalll/Ai-Stuff/blob/main/text_gen_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import sys

import nltk
import numpy
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Input, LSTM
from keras.models import Sequential
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# Correct the import for np_utils
from tensorflow.keras.utils import to_categorical

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Read the whole corpus into memory. The context manager closes the handle
# as soon as the text is read, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open("frankenstein.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read()

In [6]:
def tokenize_words(input):
    """Lower-case the text, drop English stopwords and join what remains.

    Args:
        input: Raw text to clean. (The name shadows the builtin; it is kept
            so existing keyword callers continue to work.)

    Returns:
        A single string made of the surviving `\\w+` tokens concatenated
        with no separator.
    """
    lowered = input.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)
    # Build the stopword set once. The previous version evaluated
    # `stopwords.words('english')` inside the filter lambda, i.e. once per
    # token, and then did a linear membership scan on the returned list.
    stop_words = set(stopwords.words('english'))
    # NOTE(review): tokens are joined with "" so word boundaries are lost in
    # the result; confirm this is intended (a " " separator would keep them).
    return "".join(token for token in tokens if token not in stop_words)


processed_input = tokenize_words(file)

In [7]:
# Alphabet of the cleaned corpus, in sorted order, and the lookup table that
# maps each character to its position in that alphabet.
chars = sorted(set(processed_input))
char_to_num = {character: position for position, character in enumerate(chars)}

In [8]:
# Size of the alphabet and of the cleaned corpus (in characters); both are
# reported so the numbers can be sanity-checked before training.
vocab_len = len(chars)
input_len = len(processed_input)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 232972
Total vocab: 37


In [9]:
# Number of characters in each input window.
seq_length = 100
# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [10]:
# Slide a window of `seq_length` characters over the corpus one step at a
# time: each window (encoded as integers) is an input, and the character
# that immediately follows it is the target.
#
# Both lists are rebuilt from scratch here instead of appended to, so
# re-running this cell cannot silently duplicate every pattern.
window_count = input_len - seq_length
x_data = [
    [char_to_num[char] for char in processed_input[i:i + seq_length]]
    for i in range(window_count)
]
y_data = [
    char_to_num[processed_input[i + seq_length]]
    for i in range(window_count)
]

n_patterns = len(x_data)
print ("Total Patterns:",n_patterns)

Total Patterns: 232872


In [11]:
# Arrange the encoded windows as (n_patterns, seq_length, 1), then divide
# by the vocabulary size to scale the integer codes down.
encoded_windows = numpy.asarray(x_data)
X = encoded_windows.reshape((n_patterns, seq_length, 1))
X = X / float(vocab_len)

In [12]:
# One-hot encode the targets. The class count is given explicitly so the
# width of `y` always equals the vocabulary size; when omitted it is
# inferred from the largest value present in `y_data`, which would come out
# too small if the highest-indexed character never occurs as a target.
y = to_categorical(y_data, num_classes=vocab_len)

In [13]:
# Three stacked LSTM layers, each followed by dropout, ending in a softmax
# over the target classes.
#
# The input shape is declared with an explicit `Input` layer: Keras 3
# emits a warning when `input_shape=` is passed to the first layer of a
# Sequential model and recommends this form instead.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    # return_sequences=True hands the full sequence to the next LSTM.
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    # The last LSTM returns only its final output, which feeds the classifier.
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

  super().__init__(**kwargs)


In [14]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# Save to `filepath` only on epochs where the training loss reaches a new
# minimum, and log each save decision.
filepath = "model_weights_saved.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [16]:
# Train for 4 epochs in batches of 256 with the checkpoint callback attached.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 2.9599
Epoch 1: loss improved from inf to 2.92912, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 68ms/step - loss: 2.9599
Epoch 2/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 68ms/step - loss: 2.9448
Epoch 2: loss did not improve from 2.92912
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 68ms/step - loss: 2.9448
Epoch 3/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 2.9098
Epoch 3: loss improved from 2.92912 to 2.91105, saving model to model_weights_saved.keras
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 69ms/step - loss: 2.9098
Epoch 4/4
[1m910/910[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 69ms/step - loss: 2.9088
Epoch 4: loss improved from 2.91105 to 2.90948, saving model to model_weights_saved.keras
[1m910/910

<keras.src.callbacks.history.History at 0x7931da2a7d10>

In [17]:
# Restore the weights from the saved checkpoint file, then compile again
# with the same loss and optimizer used for training.
filename = "model_weights_saved.keras"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [18]:
# Inverse lookup: integer code -> character, used to decode predictions.
num_to_char = dict(enumerate(chars))

In [19]:
# Pick a random training window to seed generation. `randint`'s upper bound
# is exclusive, so `len(x_data)` lets every pattern (including the last) be
# chosen.
start = numpy.random.randint(0, len(x_data))
# Copy the window: `pattern` is modified during generation, and aliasing the
# row inside `x_data` would let those modifications corrupt the training data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"",''.join([num_to_char[value] for value in pattern]),"\"")

Random Seed:
" ocriticalfiendmournstilllivedstillwouldobjectwouldbecomepreyaccursedvengeancepityfeellamentvictimmal "


In [23]:
# Generate 1000 characters greedily: scale the current window the same way
# the training data was scaled, take the most probable next character, emit
# it, then slide the window forward by one.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list instead of appending in place, so the list object the
    # seed came from (a row of `x_data`) is never mutated.
    pattern = pattern[1:] + [index]
# Make sure everything written so far is actually displayed.
sys.stdout.flush()

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee

KeyboardInterrupt: 