In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import time
import re

In [None]:
data = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/NLP Project/Inshorts Cleaned Data.xlsx')

In [None]:
data.head()

Unnamed: 0,Headline,Short,Source,Time,Publish Date
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...,The New Indian Express,09:25:00,2017-03-26
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...,Outlook,22:18:00,2017-03-25
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a...",Hindustan Times,23:39:00,2017-03-25
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...,Livemint,23:08:00,2017-03-25
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...,YouTube,23:24:00,2017-03-25


In [None]:
data.drop(columns=['Source ', 'Time ', 'Publish Date'], axis=1, inplace=True)

data.head()

Unnamed: 0,Headline,Short
0,4 ex-bank officials booked for cheating bank o...,The CBI on Saturday booked four former officia...
1,Supreme Court to go paperless in 6 months: CJI,Chief Justice JS Khehar has said the Supreme C...
2,"At least 3 killed, 30 injured in blast in Sylh...","At least three people were killed, including a..."
3,Why has Reliance been barred from trading in f...,Mukesh Ambani-led Reliance Industries (RIL) wa...
4,Was stopped from entering my own studio at Tim...,TV news anchor Arnab Goswami has said he was t...


In [None]:
new = {'Headline': 'Summary', 'Short': 'News'}
data = data.rename(new, axis=1)

In [None]:
data.head()

Unnamed: 0,Summary,News
0,<start> 4 ex bank officials booked for cheatin...,the cbi on saturday booked four former officia...
1,<start> supreme court to go paperless in 6 mon...,chief justice js khehar has said the supreme c...
2,<start> at least 3 killed 30 injured in blast ...,at least three people were killed including a ...
3,<start> why has reliance been barred from trad...,mukesh ambani led reliance industries ril was ...
4,<start> was stopped from entering my own studi...,tv news anchor arnab goswami has said he was t...


In [None]:
#preprocessing
data['Summary'] = data['Summary'].apply(lambda x: '<start> '+x+' <end>')
#removing extra spaces
data['News'] = data['News'].apply(lambda x: x.strip())
data['Summary'] = data['Summary'].apply(lambda x: x.strip())
#keeping only alphabets and numbers
data['News'] = data['News'].apply(lambda x: re.sub("@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", " ", x))
data['Summary'] = data['Summary'].apply(lambda x: re.sub("@\S+|https?:\S+|http?:\S|[^<>A-Za-z0-9]+", " ", x))
#lower casing
data['News'] = data['News'].apply(lambda x: x.lower())
data['Summary'] = data['Summary'].apply(lambda x: x.lower())

In [None]:
newsTokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token='<unk>')
summaryTokenizer = tf.keras.preprocessing.text.Tokenizer(filters="", oov_token='<unk>')

In [None]:
newsTokenizer.fit_on_texts(data['News'])
summaryTokenizer.fit_on_texts(data['Summary'])

In [None]:
inputs = newsTokenizer.texts_to_sequences(data['News'])
targets = summaryTokenizer.texts_to_sequences(data['Summary'])

In [None]:
print('News example: ', data['News'][0])
print('News to sequences: ', inputs[0])
print()
print('Summary example: ', data['Summary'][0])
print('Summary to sequences: ', targets[0])

News example:  the cbi on saturday booked four former officials of syndicate bank and six others for cheating forgery criminal conspiracy and causing 209 crore loss to the state run bank the accused had availed home loans and credit from syndicate bank on the basis of forged and fabricated documents these funds were fraudulently transferred to the companies owned by the accused persons
News to sequences:  [2, 1146, 9, 120, 1917, 156, 112, 172, 6, 11820, 181, 8, 212, 331, 12, 3897, 17598, 1471, 3469, 8, 1909, 13127, 56, 730, 3, 2, 64, 295, 181, 2, 241, 35, 11821, 243, 1627, 8, 1807, 21, 11820, 181, 9, 2, 1440, 6, 7438, 8, 13128, 1515, 292, 868, 39, 17599, 3736, 3, 2, 453, 919, 17, 2, 241, 1718]

Summary example:  <start> <start> 4 ex bank officials booked for cheating bank of 209 crore <end> <end>
Summary to sequences:  [2, 2, 59, 130, 133, 734, 713, 8, 2313, 133, 9, 13316, 46, 3, 3]


In [None]:
news_vocab_size = len(newsTokenizer.word_index)+1
summary_vocab_size = len(summaryTokenizer.word_index)+1

In [None]:
print('News vocabulary size: ', news_vocab_size)
print('Summary vocabulary size:', summary_vocab_size)

News vocabulary size:  68204
Summary vocabulary size: 28285


In [None]:
#getting appropriate lens
news_lengths = pd.Series([len(x) for x in data['News']])
summary_lengths = pd.Series([len(x) for x in data['Summary']])
print(news_lengths)

0        373
1        312
2        376
3        339
4        326
        ... 
55099    345
55100    360
55101    317
55102    396
55103    353
Length: 55104, dtype: int64


In [None]:
news_lengths.describe()

count    55104.000000
mean       357.178190
std         24.743104
min        212.000000
25%        341.000000
50%        358.000000
75%        376.000000
max        435.000000
dtype: float64

In [None]:
summary_lengths.describe()

count    55104.000000
mean        78.479149
std          6.762844
min         29.000000
25%         74.000000
50%         78.000000
75%         84.000000
max        101.000000
dtype: float64

In [None]:
#taking values > and round figured to 75th percentile
news_max_len = 400
summary_max_len = 70

In [None]:
padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, maxlen=news_max_len, padding='post', truncating='post')
padded_targets = tf.keras.preprocessing.sequence.pad_sequences(targets, maxlen=summary_max_len, padding='post', truncating='post')


In [None]:
#creating dataset pipeline
final_news_data = tf.cast(padded_inputs, dtype=tf.int32)
final_summary_data = tf.cast(padded_targets, dtype=tf.int32)

In [None]:
BUFFER_SIZE = 20000
BATCH_SIZE = 64

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((final_news_data, final_summary_data)).shuffle(BUFFER_SIZE).batch(BATCH_SIZE)

BUILDING TRANSFORMER MODEL

In [None]:
#positional encoding

def get_angles(position, i, d_model):
    angle_rates = 1/np.power(10000, (2*(i//2))/np.float32(d_model))
    return position*angle_rates


def positional_encoding(position, d_model):
    angle_rads = get_angles(
        np.arange(position)[:, np.newaxis],
        np.arange(d_model)[np.newaxis, :],
        d_model)

    # apply sin to even indices in the array; 2i
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])

    # apply cos to odd indices in the array; 2i+1
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])

    pos_encoding = angle_rads[np.newaxis, ...]

    return tf.cast(pos_encoding, dtype=tf.float32)

#creating padding mask for padded sequences
def create_padding_mask(seq):
    seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return seq[:, tf.newaxis, tf.newaxis, :]

#creating look ahead mask for masking future words from contributing in prediction of current words in self attention
def create_look_ahead_mask(size):
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask

#scaled dot product
def scaled_dot_product_attention(q, k, v, mask):
    matmul_qk = tf.matmul(q, k, transpose_b=True)

    dk = tf.cast(tf.shape(k)[-1], tf.float32)
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

    if mask is not None:
        scaled_attention_logits += (mask * -1e9)

    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)

    output = tf.matmul(attention_weights, v)
    return output, attention_weights

#Multi-head attention
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0

        self.depth = d_model // self.num_heads

        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]

        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)

        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)

        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)

        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)

        return output, attention_weights

#Feed-forward network
def point_wise_feed_forward_network(d_model, dff):
    return tf.keras.Sequential([
        tf.keras.layers.Dense(dff, activation='relu'),
        tf.keras.layers.Dense(d_model)])

#Fundamental unit of transformer encoder
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(EncoderLayer, self).__init__()

        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        attn_output, _ = self.mha(x, x, x, mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)

        return out2

#Fundamental unit of transformer decoder
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)


    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        attn1, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(attn1 + x)

        attn2, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(attn2 + out1)

        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        out3 = self.layernorm3(ffn_output + out2)

        return out3, attn_weights_block1, attn_weights_block2

#Encoder having multiple encoder layers
class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)

        return x

#Decoder having multiple decoder layers
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        super(Decoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        seq_len = tf.shape(x)[1]
        attention_weights = {}

        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)

        for i in range(self.num_layers):
            x, block1, block2 = self.dec_layers[i](x, enc_output, training, look_ahead_mask, padding_mask)

            attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
            attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

        return x, attention_weights


#The Transformer
class Transformer(tf.keras.Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        enc_output = self.encoder(inp, training, enc_padding_mask)

        dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)

        final_output = self.final_layer(dec_output)

        return final_output, attention_weights


In [None]:
#hyper-parameter
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
EPOCHS = 20

In [None]:
#adam optimizer with custom learning rate scheduler

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        step = tf.cast(step, tf.float32)  # Ensure step is a float32 tensor
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)


In [None]:
#defining loss and loss function
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_object(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_sum(loss_)/tf.reduce_sum(mask)

train_loss = tf.keras.metrics.Mean(name='train_loss')

In [None]:
transformer = Transformer(
    num_layers,
    d_model,
    num_heads,
    dff,
    news_vocab_size,
    summary_vocab_size,
    pe_input=news_vocab_size,
    pe_target=summary_vocab_size,)

In [None]:
#creating masks for training
def create_masks(inp, tar):
    enc_padding_mask = create_padding_mask(inp)
    dec_padding_mask = create_padding_mask(inp)

    look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    dec_target_padding_mask = create_padding_mask(tar)
    combined_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
#creating checkpoints for saving model config and weights
checkpoint_path = "/content/drive/MyDrive/Colab Notebooks/NLP Project/Transformer"

ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print ('Latest checkpoint restored!')


Latest checkpoint restored!


In [None]:
#training steps

@tf.function
def train_step(inp, tar):
    tar_inp = tar[:, :-1]
    tar_real = tar[:, 1:]

    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp, tar_inp,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )
        loss = loss_function(tar_real, predictions)

    gradients = tape.gradient(loss, transformer.trainable_variables)
    optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    train_loss(loss)

TRAINING

In [None]:
for epoch in range(EPOCHS):
    start = time.time()

    train_loss.reset_states()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        if batch % 429 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    if (epoch + 1) % 5 == 0:
        ckpt_save_path = ckpt_manager.save()
        print ('Saving checkpoint for epoch {} at {}'.format(epoch+1, ckpt_save_path))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))

    print ('Time taken for this epoch: {} secs\n'.format(time.time() - start))

Epoch 1 Batch 0 Loss 4.0718
Epoch 1 Batch 429 Loss 2.3019
Epoch 1 Batch 858 Loss 2.2386
Epoch 1 Loss 2.2388
Time taken for this epoch: 450.6708116531372 secs

Epoch 2 Batch 0 Loss 2.1128
Epoch 2 Batch 429 Loss 2.0958
Epoch 2 Batch 858 Loss 2.1126
Epoch 2 Loss 2.1130
Time taken for this epoch: 320.64541125297546 secs

Epoch 3 Batch 0 Loss 2.0559
Epoch 3 Batch 429 Loss 2.0445
Epoch 3 Batch 858 Loss 2.0636
Epoch 3 Loss 2.0637
Time taken for this epoch: 320.6678204536438 secs

Epoch 4 Batch 0 Loss 1.7983
Epoch 4 Batch 429 Loss 2.0066
Epoch 4 Batch 858 Loss 2.0214
Epoch 4 Loss 2.0214
Time taken for this epoch: 320.1987714767456 secs

Epoch 5 Batch 0 Loss 2.1040
Epoch 5 Batch 429 Loss 1.9588
Epoch 5 Batch 858 Loss 1.9820
Saving checkpoint for epoch 5 at /content/drive/MyDrive/Colab Notebooks/NLP Project/Transformer/ckpt-5
Epoch 5 Loss 1.9820
Time taken for this epoch: 326.2505192756653 secs

Epoch 6 Batch 0 Loss 1.8312
Epoch 6 Batch 429 Loss 1.9223
Epoch 6 Batch 858 Loss 1.9424
Epoch 6 Loss 

INFERENCE

In [None]:
def evaluate(input_document):
    input_document = newsTokenizer.texts_to_sequences([input_document])
    input_document = tf.keras.preprocessing.sequence.pad_sequences(input_document,
                                                                   maxlen=news_max_len,
                                                                   padding='post',
                                                                   truncating='post')

    encoder_input = tf.expand_dims(input_document[0], 0)

    decoder_input = [summaryTokenizer.word_index["<start>"]]
    output = tf.expand_dims(decoder_input, 0)

    for i in range(summary_max_len):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )

        predictions = predictions[: ,-1:, :]
        predicted_id = tf.cast(tf.argmax(predictions, axis=-1), tf.int32)

        if predicted_id == summaryTokenizer.word_index["<end>"]:
            return tf.squeeze(output, axis=0), attention_weights

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights

def summarize(input_document):
    summarized = evaluate(input_document=input_document)[0].numpy()
    summarized = np.expand_dims(summarized[1:], 0)  # not printing <start> token
    return summaryTokenizer.sequences_to_texts(summarized)[0]  # since there is just one translated document

In [34]:
summary = summarize("The CBI on Saturday booked four former officials of Syndicate Bank and six others for cheating, forgery, criminal conspiracy and causing ₹209 crore loss to the state-run bank. The accused had availed home loans and credit from Syndicate Bank on the basis of forged and fabricated documents. These funds were fraudulently transferred to the companies owned by the accused persons.")
print(summary)

women will shut down jobs as they are work maneka


In [None]:
women will shut down jobs as they are work maneka