<a href="https://colab.research.google.com/github/Akshay8055143/Artificial-Intelligence-1446/blob/main/LSTM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Overview: Build a Text Generation Model using LSTM. The goal is to build a model that can learn language patterns from text and generate new sentence that mimic the training data

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

## Text Cleaning

In [15]:
import os
os.listdir()

['.config',
 'Harry Potter 2.csv',
 'Harry Potter 3.csv',
 'Harry Potter 1.csv',
 '.ipynb_checkpoints',
 'sample_data']

In [16]:
files = ['Harry Potter 2.csv',
 'Harry Potter 3.csv',
 'Harry Potter 1.csv']

In [17]:
import re
sentences = []
for file in files: # loop through each file
  with open(file,'r',encoding='utf-8') as f:
    for line in f:
      if ';' not in line: # split the line if ';' found, so that sentences are obtained
        continue
      _,sentence = line.split(';')

      ## Sentence obtained from above method is now put for cleaning
      sentence = sentence.lower()
      sentence = sentence.replace('\t',' ')
      sentence = re.sub(r'[^a-z0-9?.!, ]+',' ',sentence)
      sentence = re.sub(r'\s+'," ",sentence).strip()

      if sentence:
        sentences.append(sentence)
print("Total sentences:", len(sentences))

Total sentences: 4927


In [18]:
for sent in sentences[:500]:
  print(sent)

sentence
i can t let you out, hedwig.
i m not allowed to use magic outside of school.
besides, if uncle vernon
harry potter!
now you ve done it.
he s in there. vernon...
but she s bored!
if i could only let out for an hour or two
huh, huh! so you could send secret messages to you freaky little friends. no, sir!
but i haven t had any messages from any of my friends... not one all summer.
who d want to be friends with you?
i should think you d be a little more grateful.
we ve raised you since you were a baby, given you the food off our table, even let you have dudley s second bedroom, purely out of the goodness of our hearts.
not now, pupkins. for when the masons arrive.
which should be any minute!
ahem...now let s go over our schedule once again, shall we?
petunia when the masons arrive you will be...?
in the lounge, waiting to welcome them graciously into our home.
good! and and dudley, you will be...?
i ll be waiting to open the door!
excellent! ...and you...?
i ll be in my bedroom, m

## Tokenization

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

total_words = len(tokenizer.word_index) + 1
print('Total words:', total_words)

Total words: 3236


In [21]:
from keras.preprocessing.sequence import pad_sequences
sequences = tokenizer.texts_to_sequences(sentences)
input_sequences = []
for seq in sequences:
    # create n-grams: [w1,w2], [w1,w2,w3], ...
    for i in range(1, len(seq)):
        ngram = seq[:i+1]
        input_sequences.append(ngram)
input_sequences[:20]

[[3, 36],
 [3, 36, 10],
 [3, 36, 10, 72],
 [3, 36, 10, 72, 1],
 [3, 36, 10, 72, 1, 74],
 [3, 36, 10, 72, 1, 74, 793],
 [3, 51],
 [3, 51, 24],
 [3, 51, 24, 589],
 [3, 51, 24, 589, 4],
 [3, 51, 24, 589, 4, 338],
 [3, 51, 24, 589, 4, 338, 181],
 [3, 51, 24, 589, 4, 338, 181, 590],
 [3, 51, 24, 589, 4, 338, 181, 590, 9],
 [3, 51, 24, 589, 4, 338, 181, 590, 9, 125],
 [419, 44],
 [419, 44, 684],
 [419, 44, 684, 357],
 [12, 50],
 [40, 1]]

In [22]:
max_seq_len = max(len(s) for s in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_seq_len, padding='pre')
x = input_sequences[:, :-1]          # inputs (length = max_seq_len-1)
y = input_sequences[:, -1]           # labels (single token)
print('x shape:', x.shape, 'y shape:', y.shape, 'max_seq_len:', max_seq_len)

x shape: (26305, 37) y shape: (26305,) max_seq_len: 38


In [23]:
word_ind = tokenizer.word_index

In [24]:
reverse_word_index = {value:key for key,value in word_ind.items()}
reverse_word_index

{1: 'you',
 2: 'the',
 3: 'i',
 4: 'to',
 5: 's',
 6: 'it',
 7: 'a',
 8: 'that',
 9: 'of',
 10: 't',
 11: 'and',
 12: 'harry',
 13: 'is',
 14: 'be',
 15: 'what',
 16: 'he',
 17: 'in',
 18: 'me',
 19: 'we',
 20: 'on',
 21: 'this',
 22: 'your',
 23: 'no',
 24: 'not',
 25: 'have',
 26: 'do',
 27: 'was',
 28: 'but',
 29: 'for',
 30: 'there',
 31: 'come',
 32: 'my',
 33: 'are',
 34: 'don',
 35: 'll',
 36: 'can',
 37: 're',
 38: 'all',
 39: 'go',
 40: 'now',
 41: 'know',
 42: 'well',
 43: 'one',
 44: 'if',
 45: 'who',
 46: 'they',
 47: 'with',
 48: 'him',
 49: 'see',
 50: 'potter',
 51: 'm',
 52: 'here',
 53: 'just',
 54: 'will',
 55: 'at',
 56: 'up',
 57: 'think',
 58: 've',
 59: 'right',
 60: 'how',
 61: 'good',
 62: 'oh',
 63: 'about',
 64: 'hagrid',
 65: 'get',
 66: 'professor',
 67: 'like',
 68: 'so',
 69: 'as',
 70: 'got',
 71: 'ron',
 72: 'let',
 73: 'yes',
 74: 'out',
 75: 'been',
 76: 'did',
 77: 'would',
 78: 'very',
 79: 'then',
 80: 'back',
 81: 'she',
 82: 'them',
 83: 'why',
 8

In [25]:
total_length = len(word_ind) + 1

In [26]:
input_len = max_seq_len - 1

## Model Building

In [27]:
from keras.models import Sequential
from keras.layers import Input,Dense,LSTM,Dropout,Embedding

In [28]:
model = Sequential()
## Input Layer
model.add(Input((max_seq_len,)))
## Add the Layers
model.add(Embedding(input_dim=total_length,output_dim=256,trainable=True))
model.add(LSTM(256,return_sequences=True)) # return sequences will provide the sequences to next LSTM layer
model.add(Dropout(0.2))
model.add(LSTM(128))
# Add one Hidden layer
model.add(Dense(100, activation='tanh'))
# Add output layer
model.add(Dense(total_length, activation='softmax'))

In [29]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
model.summary()

In [30]:
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='val_loss',patience=3)

In [31]:
nn = model.fit(x,y,validation_split=0.2,epochs=20,callbacks=[early_stop])

Epoch 1/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 11ms/step - accuracy: 0.0321 - loss: 6.7141 - val_accuracy: 0.0390 - val_loss: 6.4282
Epoch 2/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.0452 - loss: 6.0938 - val_accuracy: 0.0787 - val_loss: 6.2380
Epoch 3/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.0954 - loss: 5.6765 - val_accuracy: 0.0992 - val_loss: 6.0581
Epoch 4/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.1162 - loss: 5.3506 - val_accuracy: 0.1146 - val_loss: 6.0387
Epoch 5/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 12ms/step - accuracy: 0.1340 - loss: 5.1076 - val_accuracy: 0.1201 - val_loss: 6.0650
Epoch 6/20
[1m658/658[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 11ms/step - accuracy: 0.1486 - loss: 4.8861 - val_accuracy: 0.1281 - val_loss: 6.0890
Epoch 7/20
[1m658/65

In [32]:
import numpy as np
def sample(preds, temperature=1.0):
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds + 1e-8) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def generate_text(seed_text, next_words=20, temperature=1.0):
    result = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([result])[0]
        token_list = pad_sequences([token_list], maxlen=input_len, padding='pre')
        preds = model.predict(token_list, verbose=0)[0]   # probabilities over vocab
        next_index = sample(preds, temperature)
        next_word = tokenizer.index_word.get(next_index, '')
        if next_word == '':
            break
        result += ' ' + next_word
    return result

In [33]:
print(generate_text("harry said", next_words=50, temperature=0.8))

harry said hagrid sir there is a worst place what s a basilisk with my year from a eyes were after the school dormitories in up by course the capture of our night of the wizard and whelk up one i am to have the school between his owlery by a ministry


In [37]:
print(generate_text("Try not to wake him", next_words=50, temperature=0.9))

Try not to wake him since it was ask in me a attention at throw a events in azkaban gave the past family in the boy has say in the castle is the end they wouldn t supposed into the guess harry if that would have be a wizard by a most foot to weasle
