<a href="https://colab.research.google.com/github/Anshiag11/machine_learning-projects/blob/master/text_generator_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import dependancies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense,Dropout,LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Using TensorFlow backend.


In [2]:
#data loading
from google.colab import files
uploaded=files.upload()
#load data
#loading data and opening our input data in the form of a txt file
file=open("frankenstein-2.txt").read()

Saving frankenstein-2.txt to frankenstein-2.txt


In [5]:
#tokenisation 
#standardization
def tokenize_words(input):
  #lowercase everything to standardize it
  input=input.lower()
  #initiating the tokenizer
  tokenizer=RegexpTokenizer(r'\w+')
  #tokenize the txt into tokens
  tokens=tokenizer.tokenize(input)
  #filtering the stopwords using lambda
  filtered=filter(lambda token:token not in stopwords.words('english'),tokens)
  return "".join(filtered)
  
#preprocess the input data and make tokens
processed_inputs=tokenize_words(file)

In [7]:
#chars to numbers
#convert characters in our input to numbers
#we wil sort the list of the set of all characters that appear in our i/p txt and then use the enumerate fn to get numbers that represent characters
#we will then create a dictionary that stores the keys and values,or the characters and numbers that represent them
chars=sorted(list(set(processed_inputs)))
char_to_num=dict((c,i) for i,c in enumerate(chars))

In [9]:
#check if words to chars or chars to num(?!)has worked?

input_len=len(processed_inputs)
vocab_len=len(chars)
print("Total number of characters:",input_len)
print("Total vocab:",vocab_len)


Total number of characters: 7038
Total vocab: 27


In [10]:
#sequence length
seq_length=100
x_data=[]
y_data=[]

In [11]:
#loop through the sequence
for i in range(0,input_len - seq_length,1):
  in_seq=processed_inputs[i:i + seq_length]
  out_seq=processed_inputs[i+seq_length]
  x_data.append([char_to_num[char]for char in in_seq])
  y_data.append(char_to_num[out_seq])
n_patterns=len(x_data)
print("Total Patterns:",n_patterns)

Total Patterns: 6938


In [13]:
#convert input sequence to np array and so on
X=numpy.reshape(x_data,(n_patterns,seq_length,1))
X=X/float(vocab_len)

In [14]:
#one-hot encoding
y=np_utils.to_categorical(y_data)

In [15]:
#creating the model
model=Sequential()
model.add(LSTM(256,input_shape=(X.shape[1],X.shape[2]),return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1],activation='softmax'))

In [16]:
#compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [17]:
#saving weights
filepath="model_weights_saved.hdf5"
checkpoint=ModelCheckpoint(filepath,monitor='loss',verbose=1,save_best_only=True,mode='min')
desired_callbacks=[checkpoint]

In [28]:
#fit model and train
model.fit(X,y,epochs=4,batch_size=256,callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss did not improve from 2.90453
Epoch 2/4

Epoch 00002: loss improved from 2.90453 to 2.90205, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.90205 to 2.89572, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.89572 to 2.89511, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x7f205b735630>

In [20]:
#recompile model with saved weights
filename="model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [22]:
#output back into characters
num_to_char=dict((i,c) for i,c in enumerate(chars))

In [23]:
#random seed to help generate
start=numpy.random.randint(0,len(x_data)-1)
pattern=x_data[start]
print("Random Seed: ")
print("\"", ' '.join([num_to_char[value] for value in pattern]),"\"")

Random Seed: 
" a r v i c t o r w a s t e t i m e u p o n s a d t r a s h i n s t e a d r e m a r k f a t h e r t a k e n p a i n s e x p l a i n p r i n c i p l e s a g r i p p a e n t i r e l y e x p l o d e d m o "


In [27]:
#generate the text
for i in range(1000):
    x=numpy.reshape(pattern, (1, len(pattern), 1))
    x=x/float(vocab_len)
    prediction=model.predict(x, verbose=0)
    index=numpy.argmax(prediction)
    result=num_to_char[index]
    seq_in=[num_to_char[value]for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee