In [1]:
import pandas as pd
import tensorflow as tf



In [2]:
data = pd.read_csv('medium_articles.csv')

In [3]:
data["text"].values[0]

'Photo by Josh Riemer on Unsplash\n\nMerry Christmas and Happy Holidays, everyone!\n\nWe just wanted everyone to know how much we appreciate everyone and how thankful we are for all our readers and writers here. We wouldn’t be anywhere without you, so thank you all for bringing informative, vulnerable, and important pieces that destigmatize mental illness and mental health.\n\nWithout further ado, here are ten of our top stories from last week, all of which were curated:\n\n“Just as the capacity to love and inspire is universal so is the capacity to hate and discourage. Irrespective of gender, race, age or religion none of us are exempt from aggressive proclivities. Those who are narcissistically disordered, and accordingly repress deep seated feelings of inferiority with inflated delusions of grandeur and superiority, are more prone to aggression and violence. They infiltrate our interactions in myriad environments from home, work, school and the cyber world. Hence, bullying does not 

In [4]:
text_data = data["text"].values.tolist()

In [5]:
BATCH_SIZE = 64
MIN_STRING_LEN = 512
SEQ_LEN = 128

EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000

EPOCHS = 15
NUM_TOKENS_TO_GENERATE = 80

In [6]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(text_data)

In [7]:
sequences = tokenizer.texts_to_sequences(text_data)

In [8]:
padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=SEQ_LEN)

In [9]:
inputs = tf.keras.layers.Input(shape=(SEQ_LEN,), dtype="int32")
x = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBED_DIM, input_length=SEQ_LEN)(inputs)



In [10]:
for _ in range(NUM_LAYERS):
    x = tf.keras.layers.MultiHeadAttention(num_heads=NUM_HEADS, key_dim=EMBED_DIM)(x, x)
    x = tf.keras.layers.Dense(FEED_FORWARD_DIM, activation="relu")(x)
    x = tf.keras.layers.Dense(EMBED_DIM)(x)

In [11]:
outputs = tf.keras.layers.Dense(VOCAB_SIZE, activation='softmax')(x)

In [12]:
model = tf.keras.models.Model(inputs=inputs, outputs=outputs)

In [13]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

In [14]:
labels = []
for seq in sequences:
    labels.append(seq[1:] + [0] * (SEQ_LEN - len(seq) + 1))

labels = tf.keras.preprocessing.sequence.pad_sequences(labels, maxlen=SEQ_LEN)

In [15]:
model.fit(x=padded_sequences, y=labels, epochs=EPOCHS)

Epoch 1/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m220s[0m 35ms/step - accuracy: 0.4094 - loss: 3.8152
Epoch 2/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m214s[0m 36ms/step - accuracy: 0.9253 - loss: 0.4732
Epoch 3/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 36ms/step - accuracy: 0.9287 - loss: 0.4704
Epoch 4/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 33ms/step - accuracy: 0.1936 - loss: 5.8903
Epoch 5/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 33ms/step - accuracy: 0.0546 - loss: 6.4354
Epoch 6/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 33ms/step - accuracy: 0.0540 - loss: 6.4362
Epoch 7/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 33ms/step - accuracy: 0.0553 - loss: 6.4317
Epoch 8/15
[1m6012/6012[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 33ms/step - accuracy: 0.0550 - loss: 6.4324


<keras.src.callbacks.history.History at 0x7804a801d190>

In [16]:
def generate_text(prompt, num_tokens_to_generate=5):

    prompt_tokens = tokenizer.texts_to_sequences([prompt])[0]

    prompt_padded = tf.keras.preprocessing.sequence.pad_sequences([prompt_tokens], maxlen=SEQ_LEN)[0]

    output_sequence = []

    for _ in range(num_tokens_to_generate):
        batch = tf.expand_dims(prompt_padded, axis=0)
        predictions = model.predict(batch)
        last_token_pred = predictions[0, -1, :].reshape(1, -1)
        next_token = tf.random.categorical(last_token_pred, num_samples=1)[0, 0]
        output_sequence.append(next_token)
        prompt_padded = tf.concat([prompt_padded[1:], [next_token]], axis=0)

    output_sequence = tf.stack(output_sequence).numpy()
    generated_text = tokenizer.sequences_to_texts([output_sequence])[0]

    return generated_text

In [17]:
prompt = "hello"
generated_text = generate_text(prompt)
print(generated_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
animals wonderful limitations fancy reported
