# Loading the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from collections import Counter

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import to_categorical

# Loading the Dataset

In [None]:
# Load the spell table; the CSV file uses ';' as its field separator.
data = pd.read_csv("Harry_Potter_Spells.csv", sep = ';')

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Count the rows whose "Incantation" value is missing.
data["Incantation"].isna().sum()

In [None]:
# Keep only the rows that actually carry an incantation.
data = data.dropna(subset=["Incantation"])

In [None]:
# Re-count missing incantations; after the drop above this should be 0.
data["Incantation"].isna().sum()

In [None]:
# Gather every incantation except the placeholder value 'Unknown', sort the
# resulting strings and wrap them in a single-column DataFrame.
incantations = data["Incantation"]
L = sorted(incantations[incantations != 'Unknown'].tolist())
df = pd.DataFrame(L, columns = ["Spells"])

In [None]:
df.head()

In [None]:
# Write the spells to a plain-text file, one spell per line.
with open('my_file.txt', 'w') as f_out:
    f_out.writelines(spell + "\n" for spell in df.iloc[:, 0])

# Data Visualization

In [None]:
# Read the whole spell file back as one string. The context manager makes sure
# the file handle is closed (the bare open(...).read() left it dangling).
# NOTE: from here on `data` is the corpus text, no longer the DataFrame.
with open('my_file.txt', 'r') as f_in:
    data = f_in.read()

# Normalise: lowercase everything and drop non-breaking spaces ("\xa0").
data = data.lower().replace("\xa0", "")

chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print(f'There are {data_size} total characters and {vocab_size} unique characters in our data.')

In [None]:
# Histogram of spell lengths. The final element of the split is dropped
# because the file ends with a newline, which would yield an empty entry.
lengths = list(map(len, data.split("\n")[:-1]))
d = Counter(lengths)

plt.bar(list(d.keys()), list(d.values()))
plt.ylabel("Frequency")
plt.xlabel("Length of Spell")
plt.show()

In [None]:
# Count how often every character occurs in the corpus.
res = Counter(data)
# Make sure 'z' has an entry for the chart, but only when it is absent:
# a plain assignment would wipe out a real count if 'z' does occur.
res.setdefault("z", 0)
# Order the entries by character so the bar chart is sorted.
res = dict(sorted(res.items()))
# The newline separator is not a spell character; remove it from the chart
# data (tolerating a corpus that contains no newline at all).
res.pop("\n", None)

In [None]:
# Bar chart of per-character frequencies collected in `res`.
characters = list(res.keys())
frequencies = list(res.values())
plt.bar(characters, frequencies)
plt.ylabel("Frequency")
plt.xlabel("Character")
plt.show()

# Processing the Text - Encoding

In [None]:
# Characters to strip: ASCII punctuation plus typographic quotes and dashes,
# with '.' deliberately kept. A local name is used instead of rebinding
# string.punctuation, so the stdlib module is not altered for other code and
# re-running this cell does not keep growing the constant.
punctuation_to_strip = (string.punctuation + '“' + '”' + '-' + '’' + '‘' + '—').replace('.', '')

# Turn the one-spell-per-line corpus into a single space-separated string.
file_nl_removed = data.replace("\n", " ")

# Delete every unwanted character in one pass.
file_p = file_nl_removed.translate(str.maketrans("", "", punctuation_to_strip))
preprocessed_text = file_p.lower()

In [None]:
raw_text = preprocessed_text

# Vocabulary: every distinct character of the corpus in sorted order, followed
# by 'z' and the newline. Each of the two is appended only when it is not
# already present; adding it unconditionally would create a duplicate entry and
# leave char_to_int / int_to_char inconsistent with each other.
chars = sorted(set(raw_text))
chars += [extra for extra in ("z", "\n") if extra not in chars]

# Lookup tables in both directions between characters and integer ids.
char_to_int = {c : i for i, c in enumerate(chars)}
int_to_char = {i : c for i, c in enumerate(chars)}

n_chars = len(raw_text)
n_vocab = len(chars)

In [None]:
# Length of the character window the model sees before predicting the next one.
seq_length = 7

# Encode the corpus once, then slide a window over it:
#   dataX[i] = ids of characters i .. i+seq_length-1
#   dataY[i] = id of the character that follows that window
encoded_text = [char_to_int[char] for char in raw_text]
dataX = [encoded_text[i:i + seq_length] for i in range(n_chars - seq_length)]
dataY = encoded_text[seq_length:]

n_patterns = len(dataX)
print ("Total Patterns: ", n_patterns)

# The network starts with an Embedding layer, which takes integer ids shaped
# (samples, seq_length). The former trailing axis of size 1 only worked through
# implicit squeezing, so the inputs are shaped 2-D explicitly.
X = np.reshape(dataX, (n_patterns, seq_length))
# One-hot encode the targets for categorical cross-entropy.
y = to_categorical(dataY)

# Building and Training the RNN Model

In [None]:
# Character-level network: embedding -> two stacked bidirectional LSTMs with
# dropout after each -> softmax over the target classes found in `y`.
embedding_dim = 128
max_length = 7
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(n_vocab, embedding_dim, input_length = max_length),
    # return_sequences=True so the second recurrent layer receives a sequence.
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences = True)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(y.shape[1], activation = 'softmax'),
])

In [None]:
# Configure training: categorical cross-entropy loss with the Adam optimizer.
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

In [None]:
# Train for 100 epochs in batches of 128; `history` keeps the per-epoch metrics.
history = model.fit(X, y, epochs = 100, batch_size = 128)

In [None]:
# Plot the training loss recorded for each epoch.
loss_per_epoch = history.history['loss']
plt.plot(loss_per_epoch)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Model loss')
plt.show()

# Creating New Spells

In [None]:
def predict_next_n_chars(pattern, n):
    """Generate and print ``n`` characters that continue ``pattern``.

    At every step the module-level ``model`` is asked for the distribution of
    the next character, the most likely one is printed (without a newline),
    and the window is slid forward by one position.

    Args:
        pattern: Sequence of integer character ids used as the seed window.
            It is copied, so the caller's list is left untouched (the previous
            implementation appended to it on the first iteration).
        n: Number of characters to generate.

    Returns:
        The window of integer ids after the last step, with the same length
        as ``pattern``. Note that this is the final window, not the generated
        text; the text itself is only printed.
    """
    window = list(pattern)
    for _ in range(n):
        # The model starts with an Embedding layer, so it is fed integer ids
        # shaped (batch, sequence_length).
        x = np.reshape(window, (1, len(window)))
        prediction = model.predict(x, verbose = 0)
        # Greedy decoding: always take the most probable class.
        next_index = int(np.argmax(prediction))
        print(int_to_char[next_index], end = '')
        # Drop the oldest id and append the new one.
        window = window[1:] + [next_index]
    return window

In [None]:
# Number of characters to generate per spell, and number of spells to produce.
length = 15
count = 10
spells = []
for i in range(count):
    # np.random.randint excludes its upper bound, so len(dataX) (not
    # len(dataX) - 1) makes every training pattern, including the last one,
    # eligible as a seed.
    start = np.random.randint(0, len(dataX))
    # Hand over a copy so the stored training pattern can never be altered.
    pattern = list(dataX[start])
    # The generated characters are printed by predict_next_n_chars itself;
    # its return value (the final id window) is what gets collected here.
    spell = predict_next_n_chars(pattern, length)
    spells.append(spell)
    print("\n")