In [2]:
import pandas as pd

In [3]:
# Load the dataset to inspect the contents
shakespeare = pd.read_csv("Shakespeare_data.csv")

In [4]:
# Display the first few rows of the dataset
shakespeare.head()

Unnamed: 0,Dataline,Play,PlayerLinenumber,ActSceneLine,Player,PlayerLine
0,1,Henry IV,,,,ACT I
1,2,Henry IV,,,,SCENE I. London. The palace.
2,3,Henry IV,,,,"Enter KING HENRY, LORD JOHN OF LANCASTER, the ..."
3,4,Henry IV,1.0,1.1.1,KING HENRY IV,"So shaken as we are, so wan with care,"
4,5,Henry IV,1.0,1.1.2,KING HENRY IV,"Find we a time for frighted peace to pant,"


In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
#nltk.download('punkt')
#nltk.download('stopwords')

In [7]:
# Filter dataset to only include rows where PlayerLine is not NaN
spoken_lines = shakespeare['PlayerLine'].dropna()

# Convert all lines to lowercase
spoken_lines = spoken_lines.str.lower()

# Remove stop words
stop_words = set(stopwords.words('english'))
tokenized_lines = spoken_lines.apply(lambda line: [word for word in word_tokenize(line) if word.isalpha() and word not in stop_words])

# Display a sample of tokenized lines
tokenized_lines.head()

0                                                [act]
1                              [scene, london, palace]
2    [enter, king, henry, lord, john, lancaster, ea...
3                                  [shaken, wan, care]
4                  [find, time, frighted, peace, pant]
Name: PlayerLine, dtype: object

In [8]:
from collections import Counter

In [9]:
# Flatten all tokenized lines into a single list of words
all_words = [word for line in tokenized_lines for word in line]

# Create a vocabulary by counting the frequency of each word
word_counts = Counter(all_words)
vocabulary = {word: i+1 for i, (word, _) in enumerate(word_counts.most_common())}

# Convert the tokenized lines into sequences of integers
sequences = tokenized_lines.apply(lambda line: [vocabulary[word] for word in line])

# Display a sample of the vocabulary and sequences
vocabulary_sample = dict(list(vocabulary.items())[:10])  # Show first 10 entries of the vocabulary
sequences.head(), vocabulary_sample

(0                                                [211]
 1                                       [55, 679, 448]
 2    [12, 20, 193, 6, 273, 1021, 736, 2155, 7, 2716...
 3                                    [9004, 9005, 318]
 4                           [106, 38, 3719, 107, 9006]
 Name: PlayerLine, dtype: object,
 {'thou': 1,
  'thy': 2,
  'shall': 3,
  'thee': 4,
  'good': 5,
  'lord': 6,
  'sir': 7,
  'come': 8,
  'let': 9,
  'would': 10})

In [10]:
import numpy as np

In [11]:
#pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Define the sequence length
sequence_length = 5

# Prepare the input sequences and their corresponding targets
input_sequences = []
targets = []

for seq in sequences:
    for i in range(sequence_length, len(seq)):
        # Extract sequences of the given length
        input_sequences.append(seq[i-sequence_length:i])
        # Target is the next word in the sequence
        targets.append(seq[i])

# Convert the lists to numpy arrays
input_sequences = np.array(input_sequences)
targets = np.array(targets)

# Pad the input sequences so all are of the same length
input_sequences = pad_sequences(input_sequences, maxlen=sequence_length, padding='pre')

# One-hot encode the target words
vocab_size = len(vocabulary) + 1  # Add 1 for padding
targets = to_categorical(targets, num_classes=vocab_size)

# Display a sample of the input sequences and targets
print(input_sequences[:5], targets[:5])

[[  12   20  193    6  273]
 [  20  193    6  273 1021]
 [ 193    6  273 1021  736]
 [   6  273 1021  736 2155]
 [ 273 1021  736 2155    7]] [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [25]:
import pickle

# Assuming 'vocabulary' is a dictionary mapping words to indices
# Save the vocabulary to a file
with open('vocabulary.pkl', 'wb') as f:
    pickle.dump(vocabulary, f)

# If you have other data structures like reverse vocabulary or list of words, save them similarly

In [15]:
# Define model architecture
model = Sequential()

# Embedding layer to convert word indexes into dense embeddings
model.add(Embedding(input_dim=vocab_size, output_dim=200, input_length=sequence_length))

# LSTM layer to learn sequential patterns
model.add(LSTM(units=300))

# Output layer with softmax activation to predict the next word
model.add(Dense(units=vocab_size, activation='softmax'))

# Compile the model with categorical crossentropy loss and Adam optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Display model summary
model.summary()



In [16]:
# Train the model
history = model.fit(input_sequences, targets, epochs=80, batch_size=128, validation_split=0.2)


Epoch 1/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 117ms/step - accuracy: 0.0135 - loss: 9.4728 - val_accuracy: 0.0241 - val_loss: 8.4780
Epoch 2/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 112ms/step - accuracy: 0.0183 - loss: 7.5586 - val_accuracy: 0.0244 - val_loss: 8.6393
Epoch 3/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 114ms/step - accuracy: 0.0146 - loss: 7.3605 - val_accuracy: 0.0241 - val_loss: 8.7691
Epoch 4/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 113ms/step - accuracy: 0.0137 - loss: 7.2635 - val_accuracy: 0.0244 - val_loss: 8.9083
Epoch 5/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 114ms/step - accuracy: 0.0126 - loss: 7.2112 - val_accuracy: 0.0230 - val_loss: 9.2495
Epoch 6/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 117ms/step - accuracy: 0.0201 - loss: 7.0484 - val_accuracy: 0.0223 - val_loss: 9.2970
Epoch 7/80
[1m87/87[

Epoch 51/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 143ms/step - accuracy: 0.9948 - loss: 0.0789 - val_accuracy: 0.0101 - val_loss: 15.7312
Epoch 52/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 144ms/step - accuracy: 0.9926 - loss: 0.0773 - val_accuracy: 0.0097 - val_loss: 15.7597
Epoch 53/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 143ms/step - accuracy: 0.9929 - loss: 0.0706 - val_accuracy: 0.0090 - val_loss: 15.8001
Epoch 54/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 143ms/step - accuracy: 0.9932 - loss: 0.0670 - val_accuracy: 0.0083 - val_loss: 15.8499
Epoch 55/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 150ms/step - accuracy: 0.9926 - loss: 0.0592 - val_accuracy: 0.0090 - val_loss: 15.8876
Epoch 56/80
[1m87/87[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 143ms/step - accuracy: 0.9946 - loss: 0.0542 - val_accuracy: 0.0086 - val_loss: 15.9078
Epoch 57/8

In [17]:
# Save the model to a file
model.save("shakespeare_lstm_model3.h5")



In [24]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the pre-trained model
model = tf.keras.models.load_model("shakespeare_lstm_model3.h5")

# Function to convert a sequence of words to indices using vocabulary
def text_to_sequence(text, vocabulary):
    words = text.lower().split()  # Tokenize and lowercase input text
    # Use 0 as a fallback value if a word is not in the vocabulary
    sequence = [vocabulary.get(word, 0) for word in words]  # Convert words to indices, use 0 for unknown words
    return sequence

# Function to convert index to a word using the reverse vocabulary
def index_to_word(index, vocabulary):
    reverse_vocab = {i: word for word, i in vocabulary.items()}  # Reverse the vocabulary
    return reverse_vocab.get(index, '<unk>')  # Get word from index, return '<unk>' if not found

# Function to predict the next word given a sequence of text
def predict_next_word(model, input_text, vocabulary, sequence_length):
    # Convert the input text to a sequence of indices
    sequence = text_to_sequence(input_text, vocabulary)
    
    # Ensure the sequence is of the correct length (padding/truncating if necessary)
    if len(sequence) < sequence_length:
        sequence = [0] * (sequence_length - len(sequence)) + sequence  # Pad with 0s (assuming 0 is padding index)
    else:
        sequence = sequence[-sequence_length:]  # Truncate to the correct sequence length
    
    # Reshape sequence for prediction (batch size of 1)
    input_sequence = pad_sequences([sequence], maxlen=sequence_length, padding='pre')
    
    # Get the model's prediction (output probabilities for the next word)
    predicted_probs = model.predict(input_sequence)
    
    # Get the index of the predicted word (the one with the highest probability)
    predicted_index = np.argmax(predicted_probs, axis=-1)[0]
    
    # Convert the predicted index back to a word
    predicted_word = index_to_word(predicted_index, vocabulary)
    
    return predicted_word

# Example:
input_text = "Poins!"
predicted_word = predict_next_word(model, input_text, vocabulary, sequence_length=1)
print(f"Input text: {input_text}")
print(f"Predicted next word: {predicted_word}")




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 186ms/step
Input text: Poins!
Predicted next word: sin


In [53]:
# pd.DataFrame(vocabulary.items(), columns=['word', 'index']).to_csv("vocabulary.csv", index=False)

In [54]:
# from flask import Flask, request, jsonify
# from flask_ngrok import run_with_ngrok
# from tensorflow.keras.models import load_model

#%pip install Flask flask-ngrok

In [None]:
# # Load the model
# model = load_model("shakespeare_lstm_model.h5")

# # Load your vocabulary
# vocabulary = pd.read_csv("vocabulary.csv", index_col=0).to_dict()['index']  # Adjust according to how you saved it

# # Create Flask app
# app = Flask(__name__)
# run_with_ngrok(app)  # Start ngrok when app is run

# # Define a function to preprocess input text
# def preprocess_input(text):
#     # Tokenize and filter words
#     stop_words = set(nltk.corpus.stopwords.words('english'))
#     tokenized_line = [word for word in word_tokenize(text.lower()) if word.isalpha() and word not in stop_words]
#     # Convert words to indexes
#     sequence = [vocabulary[word] for word in tokenized_line if word in vocabulary]
#     return sequence

# @app.route('/predict', methods=['POST'])
# def predict():
#     # Get the partial sentence from the request
#     data = request.json
#     partial_sentence = data['sentence']
    
#     # Preprocess the input
#     input_sequence = preprocess_input(partial_sentence)
#     input_sequence = input_sequence[-5:]  # Get the last 5 words
#     input_sequence = pad_sequences([input_sequence], maxlen=5, padding='pre')  # Pad if necessary

#     # Make prediction
#     predicted_probs = model.predict(input_sequence)
#     predicted_word_index = np.argmax(predicted_probs, axis=-1)[0]
    
#     # Reverse the index to get the word
#     predicted_word = [word for word, index in vocabulary.items() if index == predicted_word_index][0]

#     return jsonify({'next_word': predicted_word})

# # Start the Flask app (do not specify port here)
# app.run()



In [52]:
# Run on BASH?

#curl -X POST <http://127.0.0.1:5000>/predict -H "Content-Type: application/json" -d '{"sentence": "To be or not to"}'


In [14]:
#from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
#from scikeras.wrappers import KerasClassifier
#from sklearn.model_selection import GridSearchCV
#from tensorflow.keras.optimizers import Adam

# Function to build the model
# def build_model(lstm_units=100, embedding_dim=100, learning_rate=0.001):
#     model = Sequential()
#     model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=sequence_length))
#     model.add(LSTM(units=lstm_units))
#     model.add(Dense(units=vocab_size, activation='softmax'))
    
#     optimizer = Adam(learning_rate=learning_rate)
#     model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    
#     return model

In [15]:
# model = KerasClassifier(build_fn=build_model, verbose=1)

In [16]:
# # Hyperparameters to tune
# param_grid = {
#     'lstm_units': [50, 100, 150],            # Number of LSTM units
#     'embedding_dim': [50, 100],              # Embedding output dimensions
#     'batch_size': [64, 128],                 # Batch size during training
#     'epochs': [10, 20],                      # Number of epochs
#     'learning_rate': [0.001, 0.0001]         # Learning rates
# }


In [18]:
# GridSearch with 3-fold cross-validation
# grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, verbose=1)

# Perform the grid search on the training data
# grid_result = grid.fit(input_sequences, targets)