<a href="https://colab.research.google.com/github/Amey2510/Test-Repo/blob/main/Harry_Potter_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

### Text Cleaning

In [None]:
import os
# Show what is available in the current working directory.
os.listdir('.')

['.config', 'harry2.txt', 'harry.txt', 'harry3.txt', 'sample_data']

In [None]:
# Transcript files to read: the first has no numeric suffix, the others are numbered.
files = ['harry.txt'] + [f'harry{part}.txt' for part in (2, 3)]

In [None]:
import re

# Collect one cleaned sentence per usable line across all transcript files.
sentences = []
for file in files: # loop through each file
  with open(file,'r',encoding='utf-8') as f:
    for line in f:
      # Lines without a ';' separator carry no sentence field, so skip them.
      if ';' not in line:
        continue
      # Split on the FIRST ';' only: a sentence that itself contains ';' would
      # otherwise yield more than two parts and make the unpacking raise ValueError.
      _,sentence = line.split(';',1)
      # NOTE(review): a header row is not filtered out here (the preview output
      # starts with the literal word "sentence") - confirm whether that is intended.

      ## Sentence obtained from above method is now put for cleaning
      sentence = sentence.lower()
      sentence = sentence.replace('\t',' ')
      # Anything outside lowercase letters, digits, basic punctuation and space
      # becomes a space (this also splits contractions such as "i've" -> "i ve").
      sentence = re.sub(r'[^a-z0-9?.!, ]+',' ',sentence)
      # Collapse runs of whitespace and trim the ends.
      sentence = re.sub(r'\s+'," ",sentence).strip()

      # Keep only sentences that still have content after cleaning.
      if sentence:
        sentences.append(sentence)
print("Total sentences:", len(sentences))


Total sentences: 4927


In [None]:
# Print the first 500 cleaned sentences, one per line, as a sanity check.
preview = sentences[:500]
for idx in range(len(preview)):
  print(preview[idx])

sentence
i should ve known that you would be here professor mcgonagall.
good evening professor dumbledore.
are the rumors true albus?
i m afraid so professor.
the good and the bad.
and the boy?
hagrid is bringing him.
do you think it wise to trust hagrid with something as important as this?
ah professor i would trust hagrid with my life.
professor dumbledore sir.
professor mcgonagall.
no problems i trust hagrid?
no sir.
little tyke fell asleep just as we were flying over bristol.
try not to wake him.
there you go.
albus do you really think it s safe leaving him with these people?
i ve watched them all day.
they re the worst sort of muggles imaginable.
they really are...
the only family he has.
this boy will be famous.
there won t be a child in our world who doesn t know his name.
exactly.
he s far better off growing up away from all of that.
until he s ready.
there there hagrid.
it s not really goodbye after all.
good luck...harry potter.
up. get up!
now!
wake up cousin!
we re going to

### Tokenization

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer vocabulary from the cleaned sentences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

# One more than the number of distinct words in the fitted vocabulary.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)
print('Total words:', total_words)

Total words: 3236


In [None]:
from keras.preprocessing.sequence import pad_sequences
# Encode every sentence as a list of integer word ids.
sequences = tokenizer.texts_to_sequences(sentences)

# Expand each encoded sentence into its growing prefixes:
# [w1,w2], [w1,w2,w3], ... up to the full sentence.
input_sequences = [
    token_ids[:end]
    for token_ids in sequences
    for end in range(2, len(token_ids) + 1)
]
input_sequences[:20]


[[3, 142],
 [3, 142, 58],
 [3, 142, 58, 419],
 [3, 142, 58, 419, 8],
 [3, 142, 58, 419, 8, 1],
 [3, 142, 58, 419, 8, 1, 77],
 [3, 142, 58, 419, 8, 1, 77, 14],
 [3, 142, 58, 419, 8, 1, 77, 14, 51],
 [3, 142, 58, 419, 8, 1, 77, 14, 51, 65],
 [3, 142, 58, 419, 8, 1, 77, 14, 51, 65, 293],
 [61, 420],
 [61, 420, 65],
 [61, 420, 65, 92],
 [32, 2],
 [32, 2, 966],
 [32, 2, 966, 270],
 [32, 2, 966, 270, 525],
 [3, 52],
 [3, 52, 315],
 [3, 52, 315, 68]]

In [None]:
# Longest n-gram decides the common padded width.
max_seq_len = max(map(len, input_sequences))

# Left-pad every n-gram so all rows share that width.
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')

# Everything but the last column is the model input (width max_seq_len-1);
# the last column is the single-token label to predict.
x, y = input_sequences[:, :-1], input_sequences[:, -1]
print('x shape:', x.shape, 'y shape:', y.shape, 'max_seq_len:', max_seq_len)



x shape: (26305, 37) y shape: (26305,) max_seq_len: 38


In [None]:
# Vocabulary mapping taken from the fitted tokenizer (word -> integer id).
word_ind = tokenizer.word_index

In [None]:
# Invert the vocabulary so an integer id can be looked up to get its word.
reverse_word_index = dict(zip(word_ind.values(), word_ind.keys()))
reverse_word_index

{1: 'you',
 2: 'the',
 3: 'i',
 4: 'to',
 5: 's',
 6: 'it',
 7: 'a',
 8: 'that',
 9: 'of',
 10: 't',
 11: 'and',
 12: 'harry',
 13: 'is',
 14: 'be',
 15: 'what',
 16: 'he',
 17: 'in',
 18: 'me',
 19: 'we',
 20: 'on',
 21: 'this',
 22: 'your',
 23: 'no',
 24: 'not',
 25: 'have',
 26: 'do',
 27: 'was',
 28: 'but',
 29: 'for',
 30: 'there',
 31: 'come',
 32: 'are',
 33: 'my',
 34: 'don',
 35: 'll',
 36: 're',
 37: 'can',
 38: 'all',
 39: 'go',
 40: 'now',
 41: 'know',
 42: 'well',
 43: 'one',
 44: 'who',
 45: 'if',
 46: 'they',
 47: 'with',
 48: 'him',
 49: 'see',
 50: 'potter',
 51: 'here',
 52: 'm',
 53: 'just',
 54: 'will',
 55: 'at',
 56: 'up',
 57: 'think',
 58: 've',
 59: 'right',
 60: 'how',
 61: 'good',
 62: 'hagrid',
 63: 'oh',
 64: 'about',
 65: 'professor',
 66: 'get',
 67: 'like',
 68: 'so',
 69: 'as',
 70: 'got',
 71: 'ron',
 72: 'let',
 73: 'yes',
 74: 'out',
 75: 'did',
 76: 'been',
 77: 'would',
 78: 'very',
 79: 'then',
 80: 'back',
 81: 'them',
 82: 'his',
 83: 'why',
 8

In [None]:
# Vocabulary size plus one; used below as the embedding input_dim and as the
# number of units in the softmax output layer.
total_length = len(word_ind) + 1

In [None]:
# Width of one model input row: the padded n-gram minus its final (label) token.
# Also used as the padding length when generating text.
input_len = max_seq_len - 1

### Model Building

In [None]:
from keras.models import Sequential
from keras.layers import Input,Dense,LSTM,Dropout,Embedding

In [None]:
# Next-word prediction network: embedding -> two stacked LSTMs -> dense -> softmax.
model = Sequential()
## Input Layer
# The declared width must match the training inputs `x`, which hold
# max_seq_len - 1 tokens per row (the last token is split off as the label).
# Declaring max_seq_len here disagreed with both `x` and the padding length
# used at generation time.
model.add(Input((input_len,)))
## Add the Layers
model.add(Embedding(input_dim=total_length,output_dim=256,trainable=True))
model.add(LSTM(256,return_sequences=True)) # return sequences will provide the sequences to next LSTM layer
model.add(Dropout(0.2))
model.add(LSTM(128))
# Add one Hidden layer
model.add(Dense(100, activation='tanh'))
# Add output layer: one probability per vocabulary index
model.add(Dense(total_length, activation='softmax'))

In [None]:
# Labels are integer word ids, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
from keras.callbacks import EarlyStopping
# Stop once val_loss has not improved for 3 consecutive epochs, and roll the
# model back to the weights of the best epoch. Without restore_best_weights the
# model would keep the weights from the last (already degrading) epoch.
early_stop = EarlyStopping(monitor='val_loss',patience=3,restore_best_weights=True)

In [None]:
# Train for up to 30 epochs, holding out 20% of the samples for validation;
# the early-stopping callback may end training sooner.
nn = model.fit(
    x,
    y,
    epochs=30,
    validation_split=0.2,
    callbacks=[early_stop],
)

Epoch 1/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.0341 - loss: 6.7233 - val_accuracy: 0.0363 - val_loss: 6.3771
Epoch 2/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0451 - loss: 6.0913 - val_accuracy: 0.0675 - val_loss: 6.2113
Epoch 3/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0864 - loss: 5.6892 - val_accuracy: 0.1004 - val_loss: 5.9941
Epoch 4/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1224 - loss: 5.3166 - val_accuracy: 0.1148 - val_loss: 5.9630
Epoch 5/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1450 - loss: 5.0740 - val_accuracy: 0.1173 - val_loss: 5.9410
Epoch 6/30
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.1549 - loss: 4.8591 - val_accuracy: 0.1272 - val_loss: 6.0224
Epoch 7/30
[1m658/65

In [None]:
import numpy as np
def sample(preds, temperature=1.0):
    """Pick one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of non-negative probabilities (for example one
            softmax output row).
        temperature: Sharpness control. Values below 1 favour the most likely
            entries, values above 1 flatten the distribution. A value <= 0
            selects the most likely index deterministically (greedy decoding).

    Returns:
        The chosen index as a NumPy integer.
    """
    preds = np.asarray(preds).astype('float64')

    # Greedy decoding: avoids the division by zero that temperature == 0 would cause.
    if temperature <= 0:
        return np.argmax(preds)

    # Small epsilon keeps log() finite for zero-probability entries.
    scaled = np.log(preds + 1e-8) / temperature
    # Shift by the maximum before exponentiating. Without this, a small
    # temperature makes every exp() underflow to 0 and the normalisation
    # below becomes 0/0 = NaN.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)

    # Single multinomial draw; the position of the 1 is the sampled index.
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

def generate_text(seed_text, next_words=20, temperature=1.0):
    """Extend ``seed_text`` one sampled word at a time.

    Args:
        seed_text: Starting text; generated words are appended after it.
        next_words: Maximum number of words to append.
        temperature: Passed through to ``sample`` for each prediction.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation ends early if the sampled index has no word in the
        tokenizer's index-to-word mapping.
    """
    pieces = [seed_text]
    for _step in range(next_words):
        # Re-encode everything produced so far and left-pad it to the model width.
        current_text = ' '.join(pieces)
        encoded = tokenizer.texts_to_sequences([current_text])[0]
        padded = pad_sequences([encoded], maxlen=input_len, padding='pre')

        # Probabilities over the vocabulary for the next position.
        probabilities = model.predict(padded, verbose=0)[0]
        chosen = sample(probabilities, temperature)

        word = tokenizer.index_word.get(chosen, '')
        if not word:
            # No word is mapped to this index, so stop here.
            break
        pieces.append(word)
    return ' '.join(pieces)


In [None]:
# Sample a 50-word continuation with a slightly conservative temperature.
print(
    generate_text(
        seed_text="harry said",
        next_words=50,
        temperature=0.8,
    )
)

harry said it s every part of the chamber of secrets s full heads to floo quidditch secondhand cauldron is six at the noted hat of all our time at the pitch to be opened these creature that luck to see you that ended you delighted to cure a little quidditch myrtle


In [None]:
# Sample a 50-word continuation from a longer prompt at a higher temperature.
print(
    generate_text(
        seed_text="harry raised his wand and",
        next_words=50,
        temperature=0.9,
    )
)

harry raised his wand and about our train leaves in hour back for the car wing freaky school and it making been arriving messages to the little time from the wrong of that voldemort should find thanks again the predisposition i could find to have happening that is a bit of the child of my
