<a href="https://colab.research.google.com/github/Akshay8055143/Artificial-Intelligence-1446/blob/main/LSTM_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Project Overview: Build a Text Generation Model using LSTM. The goal is to build a model that can learn language patterns from text and generate new sentence that mimic the training data

In [1]:
pip install nltk



In [2]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

## Step1: Import all the necessery libraries

In [3]:
from warnings import filterwarnings
filterwarnings(action='ignore')

# Preprocess the text
from tensorflow.keras.preprocessing.text import Tokenizer # creates work tokens, number sequences
from keras.preprocessing import sequence # padding
# Model
from keras.models import Sequential
from keras.layers import Input,Dense,LSTM,Embedding

## Step2: Load the dataset

In [5]:
with open("/content/Harry Potter.csv",'r') as file:
  data = file.read()
print(data[:70])

﻿Character;Sentence
Dumbledore;I should've known that you would be her


In [7]:
data[:70]

"\ufeffCharacter;Sentence\nDumbledore;I should've known that you would be her"

## Step3:Tokenization & Sequence Creation

In [9]:
# Intializing the tokenizer
tokenizer = Tokenizer()

# apply the tokenizer on respective words
texts = ["I should've known that you would be her"]
tokenizer.fit_on_texts(texts) # this is going to develop tokens and their frequency count
print(tokenizer.word_index)


{'i': 1, "should've": 2, 'known': 3, 'that': 4, 'you': 5, 'would': 6, 'be': 7, 'her': 8}


In [10]:
# if we want to give this text to the model
# [[1,3,4,5,2],[1,2,6,7]]
tokenizer.texts_to_sequences(texts)

[[1, 2, 3, 4, 5, 6, 7, 8]]

In [11]:
# Intializing the tokenizer
tokenizer = Tokenizer()

# Fit on texts - pass the data
tokenizer.fit_on_texts([data])
word_ind = tokenizer.word_index
word_ind

{'harry': 1,
 'you': 2,
 'the': 3,
 'hagrid': 4,
 'to': 5,
 'ron': 6,
 'i': 7,
 'a': 8,
 'hermione': 9,
 'and': 10,
 'it': 11,
 'of': 12,
 'that': 13,
 'is': 14,
 'dumbledore': 15,
 'in': 16,
 'what': 17,
 'be': 18,
 'on': 19,
 'this': 20,
 'me': 21,
 'your': 22,
 'not': 23,
 'mcgonagall': 24,
 'do': 25,
 "it's": 26,
 'there': 27,
 'for': 28,
 'no': 29,
 'he': 30,
 'but': 31,
 'are': 32,
 'all': 33,
 'go': 34,
 'have': 35,
 'up': 36,
 'snape': 37,
 "don't": 38,
 'know': 39,
 'now': 40,
 'one': 41,
 'petunia': 42,
 'my': 43,
 'was': 44,
 'malfoy': 45,
 'see': 46,
 'with': 47,
 'we': 48,
 'quirrell': 49,
 "i'm": 50,
 'will': 51,
 'potter': 52,
 'oh': 53,
 'vernon': 54,
 'well': 55,
 'come': 56,
 'as': 57,
 'right': 58,
 'got': 59,
 'if': 60,
 'voldemort': 61,
 'just': 62,
 "that's": 63,
 'him': 64,
 "you're": 65,
 'like': 66,
 'good': 67,
 'how': 68,
 'about': 69,
 "he's": 70,
 'at': 71,
 'can': 72,
 'here': 73,
 'get': 74,
 'madam': 75,
 'who': 76,
 'going': 77,
 'hooch': 78,
 'think': 

## Texts on Sequence

In [12]:
total_length = len(word_ind) + 1
# to reserve for padding

In [13]:
total_length

1794

In [14]:
input_sequences = []
for line in data.split('\n'):
  token_list = tokenizer.texts_to_sequences([line])[0]
  # [4,5,6,7]
  for i in range(1,len(token_list)): # for i in range(1,4)
    n_gram_sequence = token_list[:i+1] # token_list[0:2] = 0th index,1st index
    # token_list[0:3] = 0th index,1st index,2nd index
    # token_list[0:4] = 0th index, 1st index, 2nd index, 3rd index
    input_sequences.append(n_gram_sequence) # [[4,5],[4,5,6],[4,5,6,7]]
print(input_sequences[:15])

[[884, 885], [15, 7], [15, 7, 886], [15, 7, 886, 334], [15, 7, 886, 334, 13], [15, 7, 886, 334, 13, 2], [15, 7, 886, 334, 13, 2, 101], [15, 7, 886, 334, 13, 2, 101, 18], [15, 7, 886, 334, 13, 2, 101, 18, 73], [15, 7, 886, 334, 13, 2, 101, 18, 73, 83], [15, 7, 886, 334, 13, 2, 101, 18, 73, 83, 24], [24, 67], [24, 67, 594], [24, 67, 594, 83], [24, 67, 594, 83, 15]]


## padding to bring all the ngram tokens to same size. to bring all the setences to same length

In [15]:
max([80,79,66])

80

In [16]:
max_length = max([len(lines) for lines in input_sequences])

input_sequences = sequence.pad_sequences(input_sequences,maxlen=max_length)
input_sequences[:15]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 884, 885],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  15,   7],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  15,   7, 886],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,  15,   7, 886, 334],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,

In [17]:
print(max_length)

38


## Seperate X and Y

In [18]:
x = input_sequences[:,:-1]
y = input_sequences[:,-1]

In [19]:
x[:10]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0, 884],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,  15],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,  15,   7],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,  15,   7, 886],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,  

In [20]:
y[:10]

array([885,   7, 886, 334,  13,   2, 101,  18,  73,  83], dtype=int32)

In [21]:
y.shape

(9707,)

## Model Building

In [22]:
model = Sequential()
# Provide the input as max length indicating total number of sentences
model.add(Input((max_length,)))
# Add the layers
model.add(Embedding(input_dim=total_length,output_dim=300,trainable=False))
model.add(LSTM(200,return_sequences=True,dropout=0.3)) # return sequences will provide the sequences to next LSTM layer
model.add(LSTM(150,dropout=0.2))
# Add one Hidden layer
model.add(Dense(100, activation='tanh'))
# Add output layer
model.add(Dense(total_length, activation='softmax'))

In [23]:
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])

In [24]:
from keras.callbacks import EarlyStopping
early_stop = EarlyStopping(monitor='accuracy',patience=5)

In [63]:
nn = model.fit(x,y,validation_split=0.2,epochs=20,callbacks=[early_stop])

Epoch 1/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.0744 - loss: 4.9209 - val_accuracy: 0.0505 - val_loss: 7.0140
Epoch 2/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.0797 - loss: 4.8377 - val_accuracy: 0.0515 - val_loss: 7.0696
Epoch 3/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.0851 - loss: 4.7692 - val_accuracy: 0.0530 - val_loss: 7.1100
Epoch 4/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.0840 - loss: 4.6944 - val_accuracy: 0.0520 - val_loss: 7.1606
Epoch 5/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.1014 - loss: 4.6290 - val_accuracy: 0.0494 - val_loss: 7.2435
Epoch 6/20
[1m243/243[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.1011 - loss: 4.5647 - val_accuracy: 0.0520 - val_loss: 7.3036
Epoch 7/20
[1m243/243

## Text Generation Function

In [64]:
import numpy as np
from nltk.tokenize import word_tokenize
def generate_text(seed_text, next_words=50):
    for _ in range(next_words):
        # preparing our seed text ready for the model
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = sequence.pad_sequences([token_list],maxlen=max_length-1)
        # give the proceesed text to model for prediction of next 50words
        predicted = np.argmax(model.predict(token_list, verbose=0)) # it gives you the index of next word
        for word, index in tokenizer.word_index.items():
            if index == predicted:
                seed_text += ' ' + word
                break

    return seed_text

## Generate Output

In [65]:
seed = "the king"
generated_text = generate_text(seed, 50)
print(generated_text)


the king of a legendary of a unicorn and at to hogwarts to kill you are you flamel to themselves you flamel while nicholas of course flamel arse in the golden snitch of their rubbish celebrated that to even themselves of the pub wing corridor is death death death death death death
