In [None]:
!pip install tensorflow
!pip install keras
!pip install transformers

# English Poem

In [None]:
import os
import re
import nltk
import json
import torch
import random
import pickle
import string
import itertools
import unicodedata
import numpy as np
import pandas as pd
import torch.nn as nn
import tensorflow as tf
import torch.optim as optim
from google.colab import drive
import torch.nn.functional as F
from nltk.corpus import stopwords
from tokenizers.models import BPE
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from transformers import GPT2Tokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from torchvision.transforms import ToTensor
from torch.nn.utils.rnn import pad_sequence
from tensorflow.keras.models import Sequential
from sklearn.metrics import mean_squared_error
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFKC
from tokenizers.processors import TemplateProcessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, Dense, Embedding
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import BertForSequenceClassification, BertTokenizer
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

In [None]:
# Authorize access to Google Drive and expose it under /content/drive/
drive.mount(mountpoint='/content/drive/')

Mounted at /content/drive/


## Preprocessing

In [None]:
# Location of the poetry CSV on the mounted Drive, loaded as a DataFrame
dataset_path = '/content/drive/MyDrive/PoetryFoundationData.csv'
dataset = pd.read_csv(filepath_or_buffer=dataset_path)

In [None]:
def get_data_statistics(data):
    """Summarise the size of the poem corpus stored in ``data['Poem']``.

    Words are whitespace-separated chunks (``str.split()``), so punctuation
    attached to a word counts as part of that word.

    Parameters
    ----------
    data : pandas.DataFrame or similar
        Must support ``len(data)`` and ``data['Poem']``; iterating over the
        ``'Poem'`` column must yield strings.

    Returns
    -------
    dict
        Keys: 'Number of Samples', 'Number of Characters', 'Number of Words',
        'Number of Unique Words', 'Average Words per Sample'. The average is
        0.0 when ``data`` holds no samples (instead of dividing by zero).
    """
    num_samples = len(data)
    num_characters = 0
    num_words = 0
    unique_words = set()

    # Single pass over the corpus: each poem is split only once.
    for text in data['Poem']:
        words = text.split()
        num_characters += len(text)
        num_words += len(words)
        unique_words.update(words)

    # Guard the empty-corpus case so the summary is still returned.
    avg_words_per_sample = num_words / num_samples if num_samples else 0.0

    statistics = {
        'Number of Samples': num_samples,
        'Number of Characters': num_characters,
        'Number of Words': num_words,
        'Number of Unique Words': len(unique_words),
        'Average Words per Sample': avg_words_per_sample
    }

    return statistics

In [None]:
get_data_statistics(data=dataset)

{'Number of Samples': 13854,
 'Number of Characters': 20856813,
 'Number of Words': 3461519,
 'Number of Unique Words': 274687,
 'Average Words per Sample': 249.85700880612097}

In [None]:
# Keep only the title and poem columns of the first 900 rows.
# .copy() makes `data` an independent frame, so assigning to its columns later
# does not write into a slice of `dataset` (pandas SettingWithCopyWarning).
data = dataset[['Title', 'Poem']].head(900).copy()

# NLTK resources and helpers used by preprocess_text:
# the stop-word list, the WordNet lemmatizer data and the punkt tokenizer.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one poem into a space-separated string of lemmatised words.

    Steps: lowercase, strip punctuation, tokenize with NLTK, drop English
    stop words (module-level ``stop_words``), lemmatize the remaining tokens
    (module-level ``lemmatizer``) and join them with single spaces.

    Parameters
    ----------
    text : str
        Raw poem text.

    Returns
    -------
    str
        The cleaned poem.
    """
    # lowercase the text
    text = text.lower()

    # Remove punctuation and special characters.
    # Apostrophes are simply dropped so contractions/possessives stay a single
    # token; every other non-alphanumeric character is replaced by a space so
    # that words joined only by punctuation (e.g. a dash) are not fused into
    # one bogus token.
    cleaned_chars = []
    for char in text:
        if char.isalnum() or char.isspace():
            cleaned_chars.append(char)
        elif char not in "'\u2019":
            cleaned_chars.append(' ')
    text = ''.join(cleaned_chars)

    # tokenization
    tokens = nltk.word_tokenize(text)
    # remove stopwords, then lemmatize what is left
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    # join the tokens back into a single string
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
get_data_statistics(data=data)

{'Number of Samples': 900,
 'Number of Characters': 1542475,
 'Number of Words': 256615,
 'Number of Unique Words': 45321,
 'Average Words per Sample': 285.1277777777778}

In [None]:
# Apply preprocessing to the poem column.
# assign() builds a new frame instead of writing a column into `data` in place,
# which avoids chained-assignment warnings when `data` is a slice of `dataset`.
data = data.assign(Poem=data['Poem'].apply(preprocess_text))

In [None]:
data.loc[0, 'Poem']

'dog bone stapler cribbage board garlic press window looselacks suction lack grip bungee cord bootstrap dog leash leather belt window sash cord frayed broke feather duster thatch straw empty bottle elmers glue window loudits hinge clack open clack shut stuffed bear baby blanket single crib newel window split dividing two velvet moss sagebrush willow branch robin wing window paneless frame air'

In [None]:
# Learn the word -> integer index mapping from the cleaned poems
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Poem'])

# One slot more than the number of known words
# (presumably because index 0 is reserved and never assigned to a word - confirm)
vocabulary_size = 1 + len(tokenizer.word_index)

# Each poem becomes a list of word indices
encoded_poems = tokenizer.texts_to_sequences(data['Poem'])

In [None]:
# Bring every encoded poem to the length of the longest one
max_sequence_length = max(map(len, encoded_poems))
padded_poems = pad_sequences(sequences=encoded_poems, maxlen=max_sequence_length)

In [None]:
# Context length: the model sees 3 consecutive words and predicts the one that follows
window_size = 3

# Model inputs (X) and the word that follows each input (y)
input_sequences = []
target_labels = []

for poem in encoded_poems:
    # Number of full windows that still have a following word inside this poem
    last_start = len(poem) - window_size
    for start in range(last_start):
        stop = start + window_size
        # words [start, stop) form the context, the word at `stop` is the label
        input_sequences.append(poem[start:stop])
        target_labels.append(poem[stop])

## Training

In [None]:
# split the dataset into training and validation sets (80% / 20%, fixed seed)
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    input_sequences, target_labels, test_size=0.2, random_state=42
)

# convert sequences to numpy arrays
train_inputs = np.array(train_inputs)
train_targets = np.array(train_targets)
val_inputs = np.array(val_inputs)
val_targets = np.array(val_targets)

# NOTE: the inputs are deliberately NOT divided by vocabulary_size.
# They are integer word indices consumed by an Embedding layer, which looks up
# one vector per index; scaling them into [0, 1) would make every index
# collapse to 0 once cast back to an integer, so the model would see the same
# input for every sample.

# convert target sequences to one-hot encoded vectors
# NOTE(review): this materialises a (samples x vocabulary_size) matrix; integer
# targets with a sparse categorical loss would avoid it, but that requires
# changing the compile step as well.
train_targets = to_categorical(train_targets, num_classes=vocabulary_size)
val_targets = to_categorical(val_targets, num_classes=vocabulary_size)

In [None]:
# Next-word model: 100-dim word embeddings -> 128-unit LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(vocabulary_size, 100, input_length=window_size),
    LSTM(128),
    Dense(vocabulary_size, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, cross-entropy against the one-hot targets
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=32,
    epochs=10,
    validation_data=(val_inputs, val_targets),
)


Epoch
Epoch
Epoch
Epoch
Epoch
Epoch
Epoch
Epoch
Epoch
Epoch
<keras.callbacks.History at 0x1eb6b5eecd0>


In [None]:
def calculate_perplexity(model, input_sequences, target_labels):
    """Compute perplexity and mean cross-entropy of ``model`` on labelled windows.

    Parameters
    ----------
    model : object with ``predict(array)``
        ``predict`` must return a 2-D array with one row of class
        probabilities per input sequence.
    input_sequences : sequence of sequences of int
        Model inputs; converted with ``np.array`` before prediction.
    target_labels : sequence of int
        Index of the correct class for each input sequence.

    Returns
    -------
    tuple
        ``(perplexity, average_loss)`` where ``average_loss`` is the mean
        negative log-probability assigned to the targets and
        ``perplexity = exp(average_loss)``.
    """
    # convert input sequences and target labels to numpy arrays
    input_sequences = np.array(input_sequences)
    target_labels = np.array(target_labels)
    # generate predictions using the model
    predictions = model.predict(input_sequences)
    # probability the model assigned to the correct word of every sample
    target_probs = predictions[np.arange(len(target_labels)), target_labels]
    # floor the probabilities: a value that underflowed to exactly 0 would give
    # log(0) = -inf and turn the whole average (and the perplexity) into inf
    target_probs = np.maximum(target_probs, 1e-12)
    # calculate cross-entropy loss
    cross_entropy = -np.log(target_probs)
    # calculate average loss
    average_loss = np.mean(cross_entropy)
    # calculate perplexity
    perplexity = np.exp(average_loss)

    return perplexity, average_loss

# Score the model on every (window, next word) pair built above and report both metrics
perplexity, avg_loss = calculate_perplexity(
    model=model,
    input_sequences=input_sequences,
    target_labels=target_labels,
)
print("Perplexity:", perplexity)
print("Avg_LOss:", avg_loss)

 Perplexity: 6324.4854
 Avg_LOss: 8.752184


In [None]:
def generate_poem(model, tokenizer, seed_text, num_words, context_size=None):
    """Greedily extend ``seed_text`` by up to ``num_words`` predicted words.

    At every step the most recent ``context_size`` word indices are fed to the
    model and the highest-probability word is appended.

    Parameters
    ----------
    model : object with ``predict(array, verbose=0)``
        Must return one row of next-word probabilities per input row.
    tokenizer : fitted tokenizer
        Must provide ``texts_to_sequences`` and an ``index_word`` mapping.
    seed_text : str
        Starting text; returned unchanged at the front of the result.
    num_words : int
        Maximum number of words to generate.
    context_size : int, optional
        Number of trailing words given to the model. When omitted it is taken
        from ``model.input_shape[1]`` if that is available, otherwise from the
        module-level ``window_size`` used to build the training windows.

    Returns
    -------
    str
        ``seed_text`` followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no
        word in ``tokenizer.index_word`` (e.g. the padding index 0).
    """
    if context_size is None:
        # Feed the model the same context length it was trained on rather than
        # padding to the length of the longest poem.
        input_shape = getattr(model, 'input_shape', None)
        inferred = None
        if isinstance(input_shape, tuple) and len(input_shape) > 1:
            inferred = input_shape[1]
        context_size = inferred if isinstance(inferred, int) else window_size

    # Tokenize the seed once; predicted indices are appended as we go, so the
    # growing text does not have to be re-tokenized on every step.
    token_ids = list(tokenizer.texts_to_sequences([seed_text])[0])
    generated_words = [seed_text]

    for _ in range(num_words):
        # keep the last `context_size` indices (padding with 0 if there are fewer)
        model_input = pad_sequences([token_ids], maxlen=context_size)
        # generate the next word probabilities using the model
        predicted_probs = model.predict(model_input, verbose=0)[0]
        # get the index of the predicted word with the highest probability
        predicted_index = int(np.argmax(predicted_probs))
        # convert the predicted word index to the actual word
        predicted_word = tokenizer.index_word.get(predicted_index)
        if predicted_word is None:
            # no vocabulary entry for this index: stop instead of raising KeyError
            break
        generated_words.append(predicted_word)
        token_ids.append(predicted_index)

    return ' '.join(generated_words)

# Seed word supplied by the user and the number of words to generate after it
user_topic = "sun"
num_words = 20

generated_poem = generate_poem(
    model=model,
    tokenizer=tokenizer,
    seed_text=user_topic,
    num_words=num_words,
)
print("Generated Poem:", generated_poem, sep="\n")

Generated Poem:sun like like like like like like like like like like like like like like like like like like like like
