In [2]:
%pip install tensorflow pandas

import tensorflow as tf
import numpy as np
import pandas as pd
import random
import sys

Note: you may need to restart the kernel to use updated packages.


In [3]:
tensorflow_version = tf.__version__

In [4]:
print("TensorFlow version:", tensorflow_version)

TensorFlow version: 2.20.0


In [5]:
print(pd.__version__)

2.3.2


In [6]:
df=pd.read_csv("train.csv")

In [10]:
df.head()

Unnamed: 0,title,text,subject,date
0,Greens say no support for Macron's EZ budget i...,BERLIN (Reuters) - None of the German parties ...,worldnews,"October 25, 2017"
1,Trump faces uphill battle to overcome court's ...,(Reuters) - U.S. President Donald Trump faces ...,politicsNews,"February 6, 2017"
2,Ukraine president denies hampering anti-corrup...,VILNIUS/KIEV (Reuters) - Ukrainian President P...,worldnews,"December 8, 2017"
3,U.S. defense chief: White House shakeup will n...,BRUSSELS (Reuters) - U.S. Defense Secretary Ji...,politicsNews,"February 14, 2017"
4,Irish government set to fall weeks before Brex...,DUBLIN (Reuters) - Ireland s minority governme...,worldnews,"November 24, 2017"


In [11]:
text = " ".join(df['text'].dropna().astype(str)).lower()
print(f"Corpus length: {len(text)} characters")

Corpus length: 35695884 characters


In [13]:
vocab = sorted(set(text))
print(f'Vocabulary size: {len(vocab)}')

char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = np.array(vocab)

text_as_int = np.array([char2idx[c] for c in text])

Vocabulary size: 104


In [14]:
seq_length = 100 

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

BATCH_SIZE = 64
BUFFER_SIZE = 10000

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [15]:
vocab_size = len(vocab)
embedding_dim = 64
rnn_units = 128

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_shape=(None,)),
    tf.keras.layers.LSTM(rnn_units, return_sequences=True, recurrent_initializer='glorot_uniform'),
    tf.keras.layers.Dense(vocab_size)
])


def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

model.summary()

  super().__init__(**kwargs)


In [16]:
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS)

Epoch 1/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m644s[0m 116ms/step - loss: 1.8128
Epoch 2/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m652s[0m 118ms/step - loss: 1.4587
Epoch 3/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m636s[0m 115ms/step - loss: 1.3889
Epoch 4/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m697s[0m 126ms/step - loss: 1.3569
Epoch 5/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m576s[0m 104ms/step - loss: 1.3373
Epoch 6/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m563s[0m 102ms/step - loss: 1.3236
Epoch 7/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m631s[0m 114ms/step - loss: 1.3137
Epoch 8/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m620s[0m 112ms/step - loss: 1.3060
Epoch 9/20
[1m5522/5522[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m642s[0m 116ms/step - loss: 1.3000
Epoch 10/20
[1m5522/5522[0m [32m━━

In [22]:
def generate_text(model, start_string, num_generate=100, temperature=1.0):
    input_eval = [char2idx.get(s, 0) for s in start_string.lower()]
    input_eval = tf.expand_dims(input_eval, 0)

    text_generated = []

    model.layers[1].reset_states()  # Reset LSTM layer states

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

print(generate_text(model, start_string="The Hero of Pakistan", num_generate=200, temperature=0.8))

The Hero of Pakistancthen t cofipringrngen eathorin an’sched heonal ue khemesternosthe r pat niangemer ay ine barecepechee alinome atrand ance r therarameral thede bed lerour ulinigs jownon fousaly ba anivedineanedopo n 
