## Reading dataset from drive

In [None]:
import os
import shutil

from google.colab import drive

# Project folder on Google Drive and the local path the dataset is copied to.
pth = "/content/drive/MyDrive/BE_Project/poem/"
dest = "/temp/test.csv"

# The mount point is the leading "/content/drive" part of `pth`.
drive.mount(pth[:len("/content/drive")])

# shutil.copy does not create missing parent directories, so make sure the
# destination folder exists before copying the CSV out of Drive.
os.makedirs(os.path.dirname(dest), exist_ok=True)
shutil.copy(pth + "dataset/test.csv", dest)

## Importing Libraries

In [None]:
import pickle
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Attention, Dropout, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

## Reading and cleaning dataset

In [None]:
# Read the 'Content' column of the first 600 rows of the copied CSV.
data = pd.read_csv(dest, usecols=['Content'], nrows=600)['Content']

# Flatten the poems into one list of lower-cased lines with carriage returns
# removed. Rows with a missing 'Content' value are skipped: pandas loads them
# as NaN (a float), which has no .lower() method.
corpus = []
for poem in data.dropna():
    corpus.extend(poem.lower().replace('\r', '').split("\n"))

## Tokenizing texts

In [None]:
# Build the vocabulary from every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Save the fitted tokenizer next to the model so the prediction cells can
# reuse the same word-to-index mapping. The context manager guarantees the
# file is flushed and closed.
with open(pth + 'tokenizer(test_csv).pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

## Tokenizing texts to sequences

In [None]:
# Vocabulary size: one more than the number of distinct words known to the
# tokenizer.
total_words = len(tokenizer.word_index) + 1

# For every corpus line, emit each prefix of its token ids that holds at
# least two tokens (first 2 tokens, first 3 tokens, ... the whole line).
input_sequences = [
    token_ids[:prefix_end]
    for token_ids in (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    for prefix_end in range(2, len(token_ids) + 1)
]

## Adding padding to sequences

In [None]:
# The longest n-gram sets the common width; shorter sequences are padded at
# the front ('pre') so the last column always holds a real token.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

## Helper for plotting accuracy and loss graphs

In [None]:
def plot_graphs(history, string):
    """Plot one recorded training metric against the epoch number.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            their per-epoch values (e.g. the value returned by ``model.fit``).
        string: Name of the metric to plot; also used as the y-axis label.
    """
    metric_values = history.history[string]
    plt.plot(metric_values)
    plt.ylabel(string)
    plt.xlabel("Epochs")
    plt.show()

## Model

### Preparing data for model

In [None]:
# Split each padded sequence into predictors (every token except the last)
# and the target (the last token).
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the target token ids over the whole vocabulary.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

### Creating Layers

In [None]:
# Next-word prediction network: embedding -> BiLSTM -> LSTM -> dense softmax.
model = Sequential()
# Inputs are the padded sequences minus their last token (the label).
model.add(Embedding(total_words, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# `total_words / 2` is a float in Python 3, but a layer width must be an
# integer; floor division keeps the intended "half the vocabulary" size.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
# One output unit per vocabulary entry.
model.add(Dense(total_words, activation='softmax'))

### Compiling Model

In [None]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])

### Creating checkpoints and reducing learning rate

In [None]:
# Keep only the weights of the epoch with the lowest training loss.
checkpoint = ModelCheckpoint(pth + "poem_generation(test_csv).h5", monitor='loss', verbose=1, save_best_only=True, mode='auto')

# Multiply the learning rate by 0.2 after 3 epochs without loss improvement.
# The floor has to sit below the optimizer's starting rate (0.0001 in the
# compile cell); with min_lr equal to that rate the callback could never
# lower it.
reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=1e-6, verbose = 1)

### Fitting model

In [None]:
# Train for 200 epochs; the callbacks save the best weights and adjust the
# learning rate. The returned history feeds the plots below.
history = model.fit(
    xs,
    ys,
    epochs=200,
    callbacks=[checkpoint, reduce],
    verbose=1,
)

### Plotting accuracy and loss

In [None]:
# Draw one figure per tracked metric, accuracy first and then loss.
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)

# Prediction

## Importing Libraries

In [None]:
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Loading model and tokenizer

In [None]:
from google.colab import drive

pth = "/content/drive/MyDrive/BE_Project/poem/"
# The mount point is the leading "/content/drive" part of `pth`.
drive.mount(pth[:len("/content/drive")])

# NOTE(review): the training cells in this notebook save
# 'poem_generation(test_csv).h5' and 'tokenizer(test_csv).pkl', while the
# names loaded here differ. Confirm these files exist on Drive and that the
# model and tokenizer belong to the same training run.
model = load_model(pth + 'poem_generation(csv_txt).h5')

# pickle.load can execute arbitrary code stored in the file, so only load
# tokenizer files you created yourself. The context manager closes the file
# handle once loading is done.
with open(pth + 'tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

## Poem Generation

In [None]:
# Inputs
# Example phrase: I dont know where hes stationed, be it Cork or in Killarney
seed_text = input("\033[1mEnter phrase:\033[0m ")
# Despite its name, `next_words` holds the number of stanzas to generate.
next_words = int(input("\033[1mEnter number of stanzas:\033[0m "))
len_phrase = len(seed_text)

# Poem layout: 8 generated words per line, 4 lines per stanza.
words_per_line = 8
lines_per_stanza = 4
words_per_stanza = words_per_line * lines_per_stanza
total_generated = next_words * words_per_stanza

# Reverse vocabulary (token id -> word), built once so each prediction is a
# single dictionary lookup instead of a scan over the whole vocabulary.
word_index = tokenizer.word_index
index_to_word = {index: word for word, index in word_index.items()}

# Pad to the sequence length the loaded model was built with; fall back to
# the previously hard-coded 85 when the model does not fix that dimension.
# NOTE(review): assumes a single-input model whose input shape is
# (batch, sequence_length) - confirm for the loaded model.
input_len = model.input_shape[1] or 85

for i in range(total_generated):
    # Re-encode the text generated so far and predict the most likely next word.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=input_len, padding='pre')
    predicted = int(np.argmax(model.predict_on_batch(token_list)))
    # Ids missing from the vocabulary (e.g. the padding id 0) give an empty word.
    output_word = index_to_word.get(predicted, "")
    seed_text += " " + output_word
    # Insert line and stanza breaks, but not after the very last line.
    if i <= (total_generated - words_per_line):
        if (i + 1) % words_per_stanza == 0:
            # Extra newline: leaves a blank line between stanzas.
            seed_text += "\n"
        if (i + 1) % words_per_line == 0:
            seed_text += "\n"

# Bold starts at the "Generated Poem" header and is reset right after the
# user's phrase, so only the header and the seed phrase are printed in bold.
seed_text = seed_text[ : len_phrase] + "\033[0m" + seed_text[len_phrase: ]
# Drop the space that follows each inserted newline.
print("\n\n\033[1mGenerated Poem\n" + seed_text.replace("\n ", "\n"))