<a href="https://colab.research.google.com/github/Arup3201/Summarization-Project-using-Pointer-Gen/blob/main/Get_To_The_Point_Summarization_with_Pointer_Generator_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For the wraping the outputs of colab

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [2]:
import os
import pathlib
import re
import random
import numpy as np
import tensorflow as tf
import pickle

# For tokenizing and processing the examples for the model training
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Layers for the Encoder, Attention and Decoder
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import RepeatVector, Concatenate, Activation, Dot
from tensorflow.keras.layers import Dense

# For model initialization
from tensorflow.keras import Model

# For training the model
from tensorflow.keras.optimizers.experimental import Adagrad

In [3]:
# Download the CNN stories from the url into cnn_stories_tgz file
cnn_stories_tgz = tf.keras.utils.get_file(
    origin="https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/cnn_stories.tgz",
)

# Download the Dailymail stories from the url into dailymail_stories_tgz file
dailymail_stories_tgz = tf.keras.utils.get_file(
    origin="https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/dailymail_stories.tgz",
)

Downloading data from https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/cnn_stories.tgz
Downloading data from https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/dailymail_stories.tgz


In [4]:
cnn_stories_tgz, dailymail_stories_tgz

('/root/.keras/datasets/cnn_stories.tgz',
 '/root/.keras/datasets/dailymail_stories.tgz')

In [5]:
!tar -xzf /root/.keras/datasets/cnn_stories.tgz
!tar -xzf /root/.keras/datasets/dailymail_stories.tgz

In [6]:
cnn_stories_dir = pathlib.Path('/content/cnn/stories')
dailymail_stories_dir = pathlib.Path('/content/cnn/stories')

In [7]:
cnn_stories_dir, dailymail_stories_dir

(PosixPath('/content/cnn/stories'), PosixPath('/content/cnn/stories'))

In [8]:
def print_filenames(dir_path, num_files=5):
  '''Prints the name of the files that are present at `dir_path`.
  Maximum `num_files` number of files are shown.

  Arguments:
    dir_path: PosixPath, pointing to the directory of which the user
              wants to prints the file names.
    num_files: int, number of files user wants to print.

  returns:
    nothing
  '''

  count = 0
  for f in dir_path.glob('*.story'):
    print(f.name)
    count += 1

    if count == num_files:
      break
  else:
    print(f"Less than {num_files} is present!")

In [9]:
print_filenames(cnn_stories_dir)

79001768da5b42b4d55171d7ff3ae0c3b31db124.story
b12b92b703ff0743a0ddf58fba2de53407b72386.story
0b4f2f1ee7c893ae0c1e0373b84b3ace03a113e9.story
3f7b1fa2f2db92f5aa1cc4c23584296e040e8042.story
095c831720238ca7f7475a41e556e3b4ac12275e.story


In [10]:
print_filenames(dailymail_stories_dir)

79001768da5b42b4d55171d7ff3ae0c3b31db124.story
b12b92b703ff0743a0ddf58fba2de53407b72386.story
0b4f2f1ee7c893ae0c1e0373b84b3ace03a113e9.story
3f7b1fa2f2db92f5aa1cc4c23584296e040e8042.story
095c831720238ca7f7475a41e556e3b4ac12275e.story


In [11]:
# Define the global variables
dm_single_close_quote = u'\u2019' # unicode for closing single quote
dm_double_close_quote = u'\u201d' # unicode for closing double quote
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"',
              dm_single_close_quote, dm_double_close_quote, ")"]

# Maximum stories to process from cnn and dailymail each
MAX_STORIES = 50000

# From the total data how to split into train, val and test
TRAIN_SIZE = 0.8 # Fraction of the total dataset to use for training
VAL_SIZE = 0.1 # Fraction of the total dataset to use for validation
TEST_SIZE = 0.1 # Fraction of the total dataset to use for testing

# For tokenization
VOCAB_SIZE = 20000 # Vocabulary size or no of unique words
OOV_TOKEN = "<OOV>" # Word token to represent the out-of-vocabulary words

# For standardization
START_TOKEN = '<START>' # Starting word of each sentence
END_TOKEN = '<END>' # Ending word of each sentence

# For the number of tokens to use in representing articles and summaries, hyperparameters
MAX_ARTICLE_TOKENS = 400 # Maximum no of tokens to consider in article when processing them for model
MAX_SUMMARY_TOKENS = 100 # Maximum no of tokens to consider in summary when processing them for model

# For dataset creation hyperparameters
BUFFER_SIZE = 5000 # Buffer size when using shuffle
BATCH_SIZE = 16 # No of examples in each batch

# Model Archietecture hyperparameters
EMB_OUT = 32 # Embedding output dimension
ENCODER_STATE_DIM = 32 # Encoder hidden(also cell) state dimension
DECODER_STATE_DIM = 64 # Decoder hidden(also cell) state dimension
DENSE1_UNITS = 16 # Attention first dense layer units(calculates partial energy)
DENSE2_UNITS = 1 # Attention secodn dense layer units(calculated final energy)
DENSE_UNITS = 64 # Units of the Dense layers before output layer

# Model Optimizer hyperparameters
LEARNING_RATE=0.15 # Learning rate
INIT_ACC_VAL=0.1 # Initial accumulator value
MAX_GRAD_NORM=2 # Gardient norm

# Model Checkpoint hyperparameters
BASELINE_MODEL_CHECKPOINT = "baseline-model/cp-{epoch:04d}.ckpt"
PATIENCE = 5

# Model Fit
EPOCHS = 35
STEPS_PER_EPOCHS = 5000

# Coverage mechanism
LAMBDA_VAL = 1

In [12]:
# Taking a sample .story file from cnn stories
sample_filename = "438411e10e1ef79b47cc48cd95296d85798c1e38.story"
sample_filedir = cnn_stories_dir

sample_filepath = sample_filedir / sample_filename
with open(sample_filepath, 'r') as f:
  sample_story = f.read()

print(f"A sample story:\n{sample_story}")

A sample story:
New York (CNN) -- The U.S. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on New Year's Eve, according to new census data released Thursday.

The figure represents a 0.7% increase from last year, adding 2,250,129 people to the U.S. population since the start of 2011, and a 1.3% increase since Census Day, April 1, 2010.

The agency estimates that beginning in January, one American will be born every eight seconds and one will die every 12 seconds.

U.S.-bound immigrants are also expected to add one person every 46 seconds.

That combination of births, deaths and migration is expected to add a single person to the U.S. population every 17 seconds, the Census Bureau said.

Meanwhile, millions are set to ring in the new year.

In New York, authorities are preparing for large crowds in Manhattan's Times Square, where Lady Gaga is expected to join Mayor Michael Bloomberg to push the button that drop

I am creating a function `fix_missing_period` where I am taking 2 arguements, one for the `line` for which I am checking and fixing the period and other is `end_tokens` which is a list that has all the tokens that I should consider as ending of a sentence.

These are the steps -
1. Check if line contains `@highlight`, if True then just return the line.
2. Check if line is empty, then return line as it is.
3. Check is line ends with any of the `end_tokens`, if so then return line as it is.
4. Only is none of the above conditions match then append `.` to the current line.

In [13]:
def fix_missing_period(line, end_tokens=END_TOKENS):
  '''function to fix the missing periods for some story lines which do not end with
  any of the end_tokens mentioned.

  Argument:
    line: string, line of the story to fix the missing the period of.
    end_tokens: list of strings, all the tokens that are considered as line end.

  Returns:
    new line with fixed the ending part by adding an ending token if not present.
  '''
  if "@highlight" in line:
    return line
  elif line == "":
    return line
  elif line[-1] in end_tokens:
    return line

  return line + '.'

In [14]:
sample_text = "i have a bad habit of not giving full-stop after sentence\nLike this setence"
print(f"Fixing {fix_missing_period(sample_text)}")

Fixing i have a bad habit of not giving full-stop after sentence
Like this setence.


I am creating a function `split_article_summary` which will split the story into article and summary parts.

The function takes only 1 arguement and that is the `story` which will be splitted into article and summary.

The steps to follow are -
1. Split the story by new line `\n`. I will get a list of lines.
2. Strip the lines by using list comprehension.
3. Use list comprehension to make lower case each line by using `.lower()`.
4. Fix each line by adding period if there is none in that line using `fix_missing_period` function.
5. Make 2 empty list for `article` and `summary`.
6. Go through each line. In each line, I need to check 4 things,
  * line contains `@highlight` or not, if True then set `next_highlight` to `True` because the next to next line is going to be a summary line.
  * line is `""` empty or not, if True then ignore.
  * `next_highlight` is True or not, if True then append the line to `summary`.
  * If non of the ebove then append to `article`.
7. After done with filling the `article` and `summary` list with lines, join those sentences to make the whole article and summary. Here, I am using `.join()` method.

In [15]:
def split_article_summary(story):
  '''Splits the story into 2 parts, one for article and other for summary of that
  article. Returns the article and summary.

  Argument:
    story: string file that contains both article and summary combiningly.

  Returns:
    article, summary seperately from the story.

  '''
  lines = story.split('\n')
  lines = [line.strip() for line in lines]
  lines = [line.lower() for line in lines]

  # Fix the ending period
  lines = [fix_missing_period(line) for line in lines]

  # List to contain the article and summary lines
  article = []
  summary = []

  # Indicator of whether the next line is the summary or not
  next_highlight = False

  for line in lines:
    if "@highlight" in line:
      next_highlight = True
    elif line=="":
      continue
    elif next_highlight:
      summary.append(line)
    else:
      article.append(line)

  article = ' '.join(article)
  summary = ' '.join(summary)

  return article, summary

In [16]:
sample_article, sample_summary = split_article_summary(sample_story)

print(f"Sample Article after spliting:\n{sample_article}")
print(f"Sample Summary after spliting:\n{sample_summary}")

Sample Article after spliting:
new york (cnn) -- the u.s. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on new year's eve, according to new census data released thursday. the figure represents a 0.7% increase from last year, adding 2,250,129 people to the u.s. population since the start of 2011, and a 1.3% increase since census day, april 1, 2010. the agency estimates that beginning in january, one american will be born every eight seconds and one will die every 12 seconds. u.s.-bound immigrants are also expected to add one person every 46 seconds. that combination of births, deaths and migration is expected to add a single person to the u.s. population every 17 seconds, the census bureau said. meanwhile, millions are set to ring in the new year. in new york, authorities are preparing for large crowds in manhattan's times square, where lady gaga is expected to join mayor michael bloomberg to push the button 

I am creating a function `get_articles_summaries` which will process each of the stories present in the directory of cnn and dailymail and return the articles, summaries in the form of list.

This function will take 2 arguements. One will be the `stories_dir` which is a Posix format string from `pathlib` library and another arguement is of `max_stories` which is the maximum number of stories that we will extract from those directories.

The process is simple. We will follow this steps -
1. Create 2 empty lists of `articles` and `summaries`.
2. Loop through all the files present in the directory `stories_dir` using `.glob` generator method.
3. Make a `count` variable which will count the number of processed strories and when it hits `max_stories`, break from the loop.
4. Inside the loop, you will open the file in `r` reading format, then just use `.read()` method to read the story.
5. Everytime after reading the story, split the article and summary part from it and then append them inside the `articles` and `summaries` list.
6. Return the 2 lists.

In [17]:
def get_articles_summaries(stories_dir, max_stories):
  '''stores the stories from stories_dir folder into a list and returns the list

  Arguments:
    stories_dir: Posix string, the directory where the stories are stored
    max_stories: maximum number of stories to store

  Returns:
    list of stories.

  '''
  articles = []
  summaries = []

  count = 0
  for f in stories_dir.glob("*.story"):
    count += 1
    with open(f, 'r') as reader:
      story = reader.read()

      article, summary = split_article_summary(story)

      articles.append(article)
      summaries.append(summary)

    if count == max_stories:
      break

  return articles, summaries

Out of all available .story files, we will only take `MAX_STORIES` number of files and then open them.

In [18]:
cnn_articles, cnn_summaries = get_articles_summaries(cnn_stories_dir, MAX_STORIES)

In [19]:
print(f"Total no of cnn stories captured are {len(cnn_articles)}\n\n")
print(f"One of the CNN articles: {cnn_articles[0]}\n\n")
print(f"The summary of this article: {cnn_summaries[0]}\n\n")

Total no of cnn stories captured are 50000


One of the CNN articles: in south korea, the abduction and rape of a 7-year-old girl last week outraged the public and prompted  president lee myung-bak to consider various measures including chemical castration to combat child sex crimes, according to local media. chemical castration involves administering medication -- via injection or tablets -- to take away sexual interest and make it impossible for a person to perform sexual acts. the effects are reversible, after the person stops taking the drug. lee said this week that all detering measures - including chemical castration - should be considered, according to the korea times. after high-profile child rape cases, politicians worldwide tend to pledge a crackdown and harsher punishments for sex offenders, involving chemical castration, said don grubin, professor of forensic psychiatry at newcastle university. "in a way, i liken it to cutting the hand off the thief," he said. "it's very sy

In [20]:
dailymail_articles, dailymail_summaries = get_articles_summaries(dailymail_stories_dir,
                                                                 MAX_STORIES)

In [21]:
print(f"Total no of dailymail stories captured are {len(dailymail_articles)}\n\n")
print(f"One of the Dailymail articles: {dailymail_articles[0]}\n\n")
print(f"The summary of this article: {dailymail_summaries[0]}\n\n")

Total no of dailymail stories captured are 50000


One of the Dailymail articles: in south korea, the abduction and rape of a 7-year-old girl last week outraged the public and prompted  president lee myung-bak to consider various measures including chemical castration to combat child sex crimes, according to local media. chemical castration involves administering medication -- via injection or tablets -- to take away sexual interest and make it impossible for a person to perform sexual acts. the effects are reversible, after the person stops taking the drug. lee said this week that all detering measures - including chemical castration - should be considered, according to the korea times. after high-profile child rape cases, politicians worldwide tend to pledge a crackdown and harsher punishments for sex offenders, involving chemical castration, said don grubin, professor of forensic psychiatry at newcastle university. "in a way, i liken it to cutting the hand off the thief," he said. "

I am creating another function -
`split_dataset(train_size, val_size, test_size)`: I am creating this function to split the original 1,00,000 examples into 80,000 training samples, 10,000 val samples and 10,000 test samples.

In [22]:
def split_dataset(dataset, train_size, val_size, test_size):
  first_split = train_size
  second_split = train_size+val_size
  third_split = train_size+val_size+test_size
  return dataset[:first_split, :], dataset[first_split:second_split, :], dataset[second_split:third_split, :]

Let us create a function `make_datasets`, that will be make training, validation and testing datasets. This function will -
1. This functions will have many argumenets and among them 2 argumenets `cnn_stories` and `dailymail_stories` are lists which has list of articles and summaries at 0 and 1 index. It means `cnn_stories[0]` is articles of cnn news and `cnn_stories[1]` is summaries of cnn news. It applies to `dailymail_stories` as well.
Objective of this step is to concatenate the cnn articles with dailymail articles and cnn summaries with dailymail summaries.
```python
[1, 2] + [3, 4] = [1, 2, 3, 4]
```

3. Convert the articles and summaries list into tensors and then concatenate them along a new axis. To create new axis I can use `tf.newaxis` in the indexing. E.g.
```python
  np.concatenate([articles[:, tf.newaxis], summaries[:, tf.newaxis]], axis=-1)
```
4. Shuffle the dataset using `random.sample` method.
```python
random.seed(seed_value) # To make sure that everytime it gives the same shuffle
random.sample(list_to_shuffle, len(list_to_shuffle))
```
5. Split the dataset into 3 parts, one for training, other for validation and last one for testing. All the tensors are of shape `(num_samples, 2)`.

In [23]:
def make_datasets(cnn_stories, dailymail_stories, train_fraction, val_fraction, test_fraction, seed_value=0):
  '''Create 3 datasets each for training, validation and testing respectively.
  This function concatenates the articles, summaries of cnn and dailymail news. After that it will tokenize
  them one by one in a loop. After it is done with the tokenization, it will shuffle the articles and
  summaries using random.sample method (although we have a helper function for it). Finally we do the
  splitting of the whole dataset. Remember here the returned values become tensors.

  Arguments:
    cnn_stories: list of 2 values, one for cnn articles and other for cnn summaries.
    dailymail_stories: list of 2 values, one for dailymail articles and other for dailymail summaries.
    train_size: float, specifying how much fraction of the original dataset to take for training.
    val_size: float, specifying how much fraction of the original dataset to take for validation.
    test_size: float, specifying how much fraction of the original dataset to take for testing.

  Returns:
    returns a tuple with 3 values inside it, `training_data`, `validation_data` and `testing_data`
    with the specified amount of data in it.
    Each one of them are tensor with shape `(num_samples, 2)`. `shape[1]=2` for article and summary.
  '''
  articles = cnn_stories[0] + dailymail_stories[0]
  summaries = cnn_stories[1] + dailymail_stories[1]

  articles = np.array(articles, dtype=object)
  summaries = np.array(summaries, dtype=object)

  dataset = np.concatenate((articles[:, tf.newaxis], summaries[:, tf.newaxis]), axis=-1)

  random.seed(seed_value)
  shuffled_indices = random.sample(list(range(dataset.shape[0])), dataset.shape[0])

  dataset = dataset[shuffled_indices, :]

  train_size = int(train_fraction * dataset.shape[0])
  val_size = int(val_fraction * dataset.shape[0])
  test_size = dataset.shape[0] - (train_size + val_size)

  training_samples, validation_samples, testing_samples = split_dataset(dataset,
                                                                        train_size,
                                                                        val_size,
                                                                        test_size)

  return (training_samples, validation_samples, testing_samples)

In [24]:
train_dataset, val_dataset, test_dataset = make_datasets([cnn_articles, cnn_summaries], [dailymail_articles, dailymail_summaries], TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

In [25]:
print(f"Type of the datasets: {type(train_dataset)}\n")

print(f"Training dataset shape: {train_dataset.shape}")
print(f"Validation dataset shape: {val_dataset.shape}")
print(f"Testing dataset shape: {test_dataset.shape}\n")

print(f"First example in the training dataset looks like: \n {train_dataset[0]}\n")

Type of the datasets: <class 'numpy.ndarray'>

Training dataset shape: (80000, 2)
Validation dataset shape: (10000, 2)
Testing dataset shape: (10000, 2)

First example in the training dataset looks like: 
 ['london, england (cnn) -- rafael nadal\'s shock french open exit at the hands of a player ranked outside the top 20 is already attracting suggestions that it may be the biggest upset in tennis history. rafael nadal faces media after his shock loss to sweden\'s robin soderling at the french open on sunday. nadal, the world number one had never lost a game at roland garros, winning the previous four titles in a row. however, sweden\'s robin soderling proved too good for the "king of clay," beating him 6-2 6-7 6-4 7-6. the official french open web site called it "one of the greatest upsets in grand slam history," while other news outlets have rushed to praise the swede for the "game of his life." the result leaves the men\'s draw at the french open tournament wide open -- with roger fe

Before the tokenization, we need to preprocess the text data so that it can be properly tokenized. In this step we need to choose whether we want to keep punctuations or not, whether we should keep the numbers or not and so on. There are 2 functions I will create, one for simple `standardize` and other to feed the Tokenizer class when creating the `tokenizer`. `standardize` function implements the following steps -

1. Lower case the strings passed to it. It is already done but for user data it might not be the case so, we will still perform this step.
2. Replace the single and double opening and closing quotes like `‘ → \u2018`, `’ → \u2019`, `“ → \u201c` and `” → \u201d` by `'` and `"` respectively.
3. Replace the punctutations ``['.', '?', '!', ',', ':', '-', ''', '"', '_', '(', ')', '{', '}', '[', ']', '`', ';', '...']`` by `[SPACE]punctutations`.
In this process we need to make sure that the floating point numbers like `1.78` do not become `1 .78`. To do that the correct regex expression is ``(?<!\d)\s*([!"#$£%&\'\(\)*+,-./:;<=>?@\[\]\\^_`{|}~])\s*(?!\d)``.
4. Strip the texts from extra starting or ending spaces. Finally, remove extra spaces using regex expression like `\s{2,}`.

`custom_analyzer` function which will be feed to the Tokenizer as the value for `analyzer`, has some more steps to implement -
1. Remove the `START_TOKEN` and `END_TOKEN` from the text. So that tokenizer does not standardize them.
2. Standardize the text with `standardizer`.
3. Add back the `START_TOKEN` and `END_TOKEN` because you want your tokenizer to learn them.
4. Remove unwanted spaces in between words.
5. Split the text into words which are seperated by ' '.
6. Strip each of the words in the sentence. Finally, return it.

In [26]:
# Standardize the text data
def standardizer(text):
  '''Standardize the text provided to the function
  The text is lower cased. Then, the opening and closing quotes are removed. I add spaces before the
  punctuations like `don't` becomes `don ' t`, ignoring the numerical values so that `1.78` does not become
  `1 . 78`. Finally, it strips the text and removes any type of unwanted spaces in it.

  Argument:
    text: str, the text to standardize

  Returns:
    returns the standadized text
  '''

  # Lower case the text
  text = text.lower()

  # Replace the special single and double opening and closing quotes
  text = re.sub(r'[\u2019\u2018]', "'", text)
  text = re.sub(r'[\u201c\u201d]', '"', text)

  # Add space before punctuations and ignore floating point numbers.
  text = re.sub(r'(?<!\d)\s*([!"#$£%&\'\(\)*+,-./:;<=>?@\[\]\\^_`{|}~])\s*(?!\d)',
                  r' \1 ', text)  # It used to also remove commas after numbers like '27,' will be removed

  # Remove spaces after sentence end and other unwanted spaces from text
  text = text.strip()
  text = re.sub('\s{2,}', ' ', text)

  return text

# custom analyzer for the Tokenizer class
def custom_analyzer(text):
  '''Custom analyzer to provide to the `Tokenizer` class when creating the tokenizer.

  Argument:
    text: str, the text that will be tokenized

  Returns:
    returns the splitted sentence
  '''
  # Remove START and END before standardizing
  if START_TOKEN in text:
    text = re.sub(f'{START_TOKEN} ', '', text)
  if END_TOKEN in text:
    text = re.sub(f'{END_TOKEN} ', '', text)

  # Standardize the text first
  text = standardizer(text)

  # Add back the START and END tokens
  text = ' '.join([START_TOKEN, text, END_TOKEN])

  # Split the sentence into words to tokenize
  words = text.split(' ')
  words = [word.strip() for word in words]

  return words

In [27]:
sample_texts = ["I have been working on, \nbut \tnever did it in this way.",
                "U.S won the world cup and bagged 1.78 million dollars.",
                "India had M.S. Dhoni won made it this far.",
                "My email address is arupjana7365@gmail.com.",
                "It can take care of dailymail single opening quote’ also.",
                "I have 10,000 Rs in my bank",
                "This sentence has , after a number 12,",
                "This sentence contains <START> token and <END> token."]

print(f"After Standardizing the sample texts:\n{[standardizer(text) for text in sample_texts]}\n")
print(f"After applying custom analyzer on sample texts:\n{[custom_analyzer(text) for text in sample_texts]}")

After Standardizing the sample texts:
['i have been working on , but never did it in this way .', 'u . s won the world cup and bagged 1.78 million dollars .', 'india had m . s . dhoni won made it this far .', 'my email address is arupjana7365@gmail . com .', "it can take care of dailymail single opening quote ' also .", 'i have 10,000 rs in my bank', 'this sentence has , after a number 12,', 'this sentence contains < start > token and < end > token .']

After applying custom analyzer on sample texts:
[['<START>', 'i', 'have', 'been', 'working', 'on', ',', 'but', 'never', 'did', 'it', 'in', 'this', 'way', '.', '<END>'], ['<START>', 'u', '.', 's', 'won', 'the', 'world', 'cup', 'and', 'bagged', '1.78', 'million', 'dollars', '.', '<END>'], ['<START>', 'india', 'had', 'm', '.', 's', '.', 'dhoni', 'won', 'made', 'it', 'this', 'far', '.', '<END>'], ['<START>', 'my', 'email', 'address', 'is', 'arupjana7365@gmail', '.', 'com', '.', '<END>'], ['<START>', 'it', 'can', 'take', 'care', 'of', 'daily

Now, I need to find the tokens from the articles. I need to use only training articles not any other and also I will not use summaries data because that will be my target and I won't know what type of words I will encounter when summarizing the source article. So, the only words that I know will be from the articles of training dataset. Here, I am going to use the `tensorflow.keras.preprocessing.text.Tokenizer` in short `Tokenizer` to find the tokens from the articles and then finally converting the articles into sequence of integers. One thing to remember is here we are going to use `oov_token` arguement of `Tokenizer` to mention the token we want to use for out-of-vocabulary words.

When fiting the texts on `tokenizer` make sure to remove floating point and integer numbers using the regex expression - `[+-]?[0-9]*[.]?[0-9]+`. I am making sure that tokenizer does learn the numbers because it can always be taken from the original articles data and we do not to remember them in vocab.

In [28]:
def get_tokenizer(texts, num_words, oov_token=None, filters = '#*+/:<=>@[\\]/^{|}~\t\n'):
  '''This will create the tokenizer needed for the task in hand.
  The tokenizer will be trained on the `texts`. Tokenizer will have vocabulary length `num_words`.
  The `oov_token` will be used as the token represent the out-of-vocabulary words. The `filters` are
  the ones which the tokenizer will remove when tokenizing any sentence given to it. The returned
  tokenizer is using a custom analyzer that can standardize the sentence before tokenizing using the
  `standardizer` function and then splits the sentence into words. After that it tokenizes the sentence.
  As for the vocabulary, the returned tokenizer's vocabulary does not contain any number, as I have removed
  them before feeding them into `Tokenizer.fit_on_texts` method.

  Arguments:
    texts: list of strings, the tokenizer will be trained on this strings
    num_words: int, number of vocabulary words the tokenizer will consider
    oov_token: str, token to represent out-of-vocabulary words
    filters: str, all the characters that the tokenizer will remove before tokenizing

  Returns:
    tokenzier of the `Tokenizer` class after learning vocabulary from `texts`
  '''

  # Create the tokenizer usinf Tokenizer class
  tokenizer = Tokenizer(num_words=num_words,
                        filters=filters,
                        oov_token=oov_token,
                        analyzer=custom_analyzer)

  # Remove the numbers from the dataset so that tokenizer does not add them inside vocabulary
  texts = [re.sub(r"[+-]?[0-9]*[.]?[0-9]+", "", text) for text in texts]

  # Fit the data with fit_on_texts method
  tokenizer.fit_on_texts(texts)

  return tokenizer

In [29]:
print(f"Length the articles dataset: {len(list(train_dataset[:, 0]))}")

Length the articles dataset: 80000


Create the `tokenizer` using the articles from training dataset by using `train_dataset[:, 0]`, with a vocabulary size of `VOCAB_SIZE` and use `OOV_TOKEN` token to represent out-of-vocabulary words.

In [30]:
tokenizer = get_tokenizer(list(train_dataset[:, 0]), VOCAB_SIZE, OOV_TOKEN)

In [31]:
print(f"The vocabulary for the tokenizer has a length {len(tokenizer.word_index.keys())}\n\n")


print(f"{OOV_TOKEN} word has index: {tokenizer.word_index[OOV_TOKEN]}")
print(f"{START_TOKEN} word has index: {tokenizer.word_index[START_TOKEN]}")
print(f"{END_TOKEN} word has index: {tokenizer.word_index[END_TOKEN]}\n\n")


print(f"'teacher' word has index: {tokenizer.word_index['teacher']}\n")

print(f"Text:\n{train_dataset[0, 0]}\n\n")
sample_sequence = tokenizer.texts_to_sequences([train_dataset[0, 0]])
print(f"Text to Sequence of the first article:\n{sample_sequence}\n")
print(f"Sequence to Text of the first acrticle:\n{tokenizer.sequences_to_texts(sample_sequence)}")

The vocabulary for the tokenizer has a length 171557


<OOV> word has index: 1
<START> word has index: 80
<END> word has index: 81


'teacher' word has index: 2110

Text:
london, england (cnn) -- rafael nadal's shock french open exit at the hands of a player ranked outside the top 20 is already attracting suggestions that it may be the biggest upset in tennis history. rafael nadal faces media after his shock loss to sweden's robin soderling at the french open on sunday. nadal, the world number one had never lost a game at roland garros, winning the previous four titles in a row. however, sweden's robin soderling proved too good for the "king of clay," beating him 6-2 6-7 6-4 7-6. the official french open web site called it "one of the greatest upsets in grand slam history," while other news outlets have rushed to praise the swede for the "game of his life." the result leaves the men's draw at the french open tournament wide open -- with roger federer and andy murray now the highest-ran

The oddness you might see if you are that much familiar with `Tokenizer` class is, even though I have specified that `num_words=VOCAB_SIZE` which is `20,000` still the length of the `word_index` is more that that. Does that mean we are doing something wrong?
NO, here although tokenizer computes the word_index of all other words apart from those first 20000 words, it will not use them when we convert them into sequence. Let's look at one example to understand that.

In [32]:
list(tokenizer.word_index.keys())[21000]

'pfizer'

In [33]:
oov_word = list(tokenizer.word_index.keys())[21000]
sample_text = f"This example is to test the above fact with the word `{oov_word}`"
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: This example is to test the above fact with the word `pfizer`


Tokenized text: ['<START> this example is to test the above fact with the word <OOV> <OOV> <OOV> <END>']
Sequence: [[80, 33, 946, 16, 5, 965, 2, 1176, 477, 22, 2, 1095, 1, 1, 1, 81]]


Although the word was present in the `word_index` mapping still tokenizer represented it with `<OOV>`.

In [34]:
sample_text = "What happens when I add a number 2.1 in this sentence!"
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: What happens when I add a number 2.1 in this sentence!


Tokenized text: ['<START> what happens when i add a number <OOV> in this sentence ! <END>']
Sequence: [[80, 61, 1709, 59, 24, 2011, 9, 279, 1, 10, 33, 1424, 308, 81]]


In [35]:
sample_text = "What happens when I add parenthesis (I am inside it!)."
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: What happens when I add parenthesis (I am inside it!).


Tokenized text: ['<START> what happens when i add <OOV> ( i am inside it ! ) . <END>']
Sequence: [[80, 61, 1709, 59, 24, 2011, 1, 44, 24, 481, 572, 18, 308, 43, 3, 81]]


We have the `tokenizer` to tokenize the articles and summaries. We need to pad those sequences to fit the requirements.

In the paper, the articles are limited to have 400 tokens and summary has, 100 tokens at training and 120 tokens for testing.

I will be using `pad_sequences` method to pad or truncate the articles and summaries based on their length.

NOTE: I am using same tokenizer for article and summary. But, later I might change that to 2 different tokenizers each having different `num_words`.

In [36]:
def tokenize_pad(texts, tokenizer, padding, truncating, maxlen):
  '''Tokenize the `texts` using the tokenizer. Then, pad the sequences or truncate the sequences
  depending the length. If the length exceeds `maxlen` then it will be truncated and if not then it will be
  padded. The padding and truncating can happend at the beginning or at the end of the sequence depending
  on the value of `padding` and `truncating` respectively.

  Arguments:
    texts: list of strings, the sentences to tokenize and pad
    tokenizer: Tokenizer class object, helps in tokenizing the `texts`
    padding: str, can take 2 values `pre` or `post`. If `pre` then padding will happen at the beginning,
    if `post` then padding will happen at the end.
    truncating: str, can take 2 values `pre` or 'truncating`, works the same as `padding`
    maxlen: int, maximum length after padding or truncating

  Returns:
    returns the tokenized and padded sentences
  '''
  sequences = tokenizer.texts_to_sequences(texts)

  padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding=padding, truncating=truncating)

  return padded_sequences

In [37]:
sample_texts

['I have been working on, \nbut \tnever did it in this way.',
 'U.S won the world cup and bagged 1.78 million dollars.',
 'India had M.S. Dhoni won made it this far.',
 'My email address is arupjana7365@gmail.com.',
 'It can take care of dailymail single opening quote’ also.',
 'I have 10,000 Rs in my bank',
 'This sentence has , after a number 12,',
 'This sentence contains <START> token and <END> token.']

In [38]:
tokenize_pad(sample_texts, tokenizer, padding="post", truncating="post", maxlen=20)

array([[   80,    24,    27,    55,   316,    19,     4,    31,   211,
          146,    18,    10,    33,   139,     3,    81,     0,     0,
            0,     0],
       [   80,    78,     3,    13,   269,     2,    95,   634,     8,
            1,     1,   168,  1592,     3,    81,     0,     0,     0,
            0,     0],
       [   80,   770,    51,   133,     3,    13,     3,     1,   269,
          138,    18,    33,   309,     3,    81,     0,     0,     0,
            0,     0],
       [   80,    90,  4381,  1008,    16,     1,     3,   401,     3,
           81,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [   80,    18,    63,   158,   297,     7,     1,   863,  1070,
         7014,    11,    69,     3,    81,     0,     0,     0,     0,
            0,     0],
       [   80,    24,    27,     1, 17628,    10,    90,   976,    81,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  

After this, we need the model that we can train on this dataset. The model archietecture will be 3-
1. Base-line model: Seq-Seq model with attention mechanism.
2. Pointer Generetor model: With seq-seq attention model will be implementing the pointer generator that can either copy words from article or generate words from the pre-defined vocabulary.
3. Coverage mechanism: Along with the pointer generator that will take case of the out-of-vocabulary words. Coverage mechanism will help prevent the repetition of the words in the summary.

### Base-Line Model: Seq-seq with Attention

#### Creating `tf_train_dataset`, `tf_val_dataset` using tf.data API

Now, I need write the `generate_example` function that can help me generate model inputs for training, validation and testing set. For different type of dataset, we will create different generator with the help of the `example_generator` method to create `tf.data.Dataset` object for our model.

We can use `tf.data` API to create the input data pipeline for our model. I will use the `tf.data.Dataset` class to get the the examples from the `train_example_generator` function which uses `generate_example`, we can save the generator inside `example_gen` which we can iterate over later to get the examples. We can yield the examples according to the need of the problem.

Remember, along with input article tokens and input summary tokens, we need the initial states as an input to the model. So, as we process the examples we can create this zero-value tensors and yield them along with 2 original inputs.

For more about datasets from generator, refer to [here](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator).

Create a Geneator function that generates the source and target for training the model.

In [None]:
def generate_example_v1(inputs, targets, input_tokenizer, target_tokenizer, input_len, target_len):
  '''Generates examples for the model. Processes the `inputs` and `targets` with their respective
  tokenizers and tokenize them to `input_len` and `target_len` length.

  Arguments:
    inputs: list of input sentences
    targets: list of target sentences
    input_tokenizer: Tokenizer class object, tokenizer for inputs
    target_tokenizer: Tokenizer class object, tokenizer for targets
    input_len: int, the length of the tokenization for inputs
    target_len: int, the length of the tokenization for targets

  Returns:
    returns 2 values, a tuple containing 2 numpy arrays (input_tokens, target_tokens[:-1]) and
    another numpy array target_tokens[1:]
  '''

  for inp, tar in zip(inputs, targets):
    # Tokenizing article words
    inp_tokens = tokenize_pad([inp],
                              input_tokenizer,
                              padding="post",
                              truncating="post",
                              maxlen=input_len)

    # Tokenizing summary words
    tar_tokens = tokenize_pad([tar],
                 target_tokenizer,
                 padding="post",
                 truncating="post",
                 maxlen=target_len)

    yield (inp_tokens[0], tar_tokens[0][:-1]), tar_tokens[0][1:]

In [None]:
print(f"Example generated by the generator v1:")

# (inp_art_tokens, inp_sum_tokens), tar_sum_tokens = generate_example(list(train_dataset[:, 0]),
example_gen = generate_example_v1(list(train_dataset[:, 0]),
                               list(train_dataset[:, 1]),
                               input_tokenizer=tokenizer,
                               target_tokenizer=tokenizer,
                               input_len=MAX_ARTICLE_TOKENS,
                               target_len=MAX_SUMMARY_TOKENS)

inps, tar = next(example_gen)
print(f"Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Target:\n{tar}")

Example generated by the generator :
Inputs:
[   80    44    42    43    12    12   831   127  3951  3915   152    36
 18545   821     5     9     1  1296    10  3733     4  1330  4243  1151
     4    31  9579   812    50     2   258   173  7545    11    13  9225
    15 17603     9   587  1606     3   831   127  3064     9   978     8
  2466  1296    10  3733    19   233     3  3733    11    13  4038     1
  6059   267    73   108   292    57    52   396  2268    15     9   227
   591    10  1087     8    17    14   127    11    13  1029    12    12
    66  5408  5142    32   201    79     3    13     3  3087   377  1582
     3  1501     8  4316  2867    59     2  3546   150    20    10     2
  1556   237 18546    12    12    99   112    27    55   135    10     2
  1330   635     3     6   831   127    11    13  1029   280    38    27
    55     1     8    18  2239   567    11    41     1    12  5157     3
    31    22    71 13873     8     2   740     7    33     1    12   151
     4

In [None]:
print(f"Shapes of the Inputs:\n{inps[0].shape}\n{inps[1].shape}\n\n")
print(f"Shape of the Target:\n{tar.shape}")

Shapes of the Inputs:
(400,)
(99,)


Shape of the Target:
(99,)


In [None]:
print(f"Data Type of the Inputs data:\n{inps[0].dtype}\n{inps[1].dtype}\n\n")
print(f"Data Type of the Target data:\n{tar.dtype}")

Data Type of the Inputs data:
int32
int32


Data Type of the Target data:
int32


In [None]:
inps, tar = next(example_gen)
print(f"Second Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Second Target:\n{tar}")

Second Inputs:
[   80    44    42    43    12    12   222  6837     1     1     8     1
    10   454   695    19   230  1234    60  7293    10  8812  1046  1015
    49  5048   378    10     2   118     3  1450     1     8  3349    29
     2   695   254     7  1881    48   145   321    10     2   308     4
  9036  1031 12160     1    17     3     6  2072  1280     5  1046     8
  1351  1274    72    25    62   476     4     6    46    17     3     6
   344    15  1491     8  1450     3     6  8812  1046    52  9698   647
     5  7643   549  6953     7  5879     1  7318     7     9     1  4173
     7     2  5879    48  1231    10   193  7266    29  1675  8390     5
  2537     1     4     1    17     3    58    48   282  4843     4   185
     9   105   620    39    20   442    32     9   419   117   359     5
   114     5   174  1479    19  5879     1     2   620    20  4267   233
  1439    22  2781  1373     4    42  1260     1   234     3  1239  1450
    69  1015     2  5048     7    79

In [None]:
def train_example_generetor_v1():
  example_gen = generate_example_v1(list(train_dataset[:, 0]),
                                list(train_dataset[:, 1]),
                                input_tokenizer=tokenizer,
                                target_tokenizer=tokenizer,
                                input_len=MAX_ARTICLE_TOKENS,
                                target_len=MAX_SUMMARY_TOKENS)

  for example in example_gen:
    s0 = np.zeros((DECODER_STATE_DIM, ), dtype=np.int32)
    c0 = np.zeros((DECODER_STATE_DIM, ), dtype=np.int32)

    (input_0, input_1), target = example
    yield (input_0, input_1, s0, c0), target

In [None]:
output_signature = (
    (tf.TensorSpec(shape=(MAX_ARTICLE_TOKENS, ), dtype=tf.int32),
     tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32),
     tf.TensorSpec(shape=(DECODER_STATE_DIM, ), dtype=tf.int32),
     tf.TensorSpec(shape=(DECODER_STATE_DIM, ), dtype=tf.int32)),
    tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32)
)

tf_train_dataset = tf.data.Dataset.from_generator(generator=train_example_generetor_v1,
                                                  output_signature=output_signature)
tf_train_dataset = tf_train_dataset.shuffle(BUFFER_SIZE)
tf_train_dataset = tf_train_dataset.batch(BATCH_SIZE, drop_remainder=True)
tf_train_dataset = tf_train_dataset.prefetch(1)

In [None]:
def val_example_generetor_v1():
  example_gen = generate_example_v1(list(val_dataset[:, 0]),
                                list(val_dataset[:, 1]),
                                input_tokenizer=tokenizer,
                                target_tokenizer=tokenizer,
                                input_len=MAX_ARTICLE_TOKENS,
                                target_len=MAX_SUMMARY_TOKENS)

  for example in example_gen:
    s0 = np.zeros((DECODER_STATE_DIM, ), dtype=np.int32)
    c0 = np.zeros((DECODER_STATE_DIM, ), dtype=np.int32)

    (input_0, input_1), target = example
    yield (input_0, input_1, s0, c0), target


output_signature = (
    (tf.TensorSpec(shape=(MAX_ARTICLE_TOKENS, ), dtype=tf.int32),
     tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32),
     tf.TensorSpec(shape=(DECODER_STATE_DIM, ), dtype=tf.int32),
     tf.TensorSpec(shape=(DECODER_STATE_DIM, ), dtype=tf.int32)),
    tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32)
)

tf_val_dataset = tf.data.Dataset.from_generator(generator=val_example_generetor_v1,
                                                  output_signature=output_signature)
tf_val_dataset = tf_val_dataset.shuffle(BUFFER_SIZE)
tf_val_dataset = tf_val_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
for (art_inp, sum_inp, s0, c0), sum_tar in tf_train_dataset.take(1):
  print(f"Input tokenized article shape: {art_inp.shape}")
  print(f"Input tokenized summary shape: {sum_inp.shape}\n")

  print(f"Target tokenized summary shape: {sum_tar.shape}")

Input tokenized article shape: (16, 400)
Input tokenized summary shape: (16, 99)

Target tokenized summary shape: (16, 99)


A small demonstration of how Dot layer works

In [None]:
x = np.arange(10).reshape(1, 2, 5)
example_layer = tf.keras.layers.Dense(units=DENSE1_UNITS)

print(f"After applying dense layer on x of shape:{x.shape}, output has {example_layer(x).shape} shape")

After applying dense layer on x of shape:(1, 2, 5), output has (1, 2, 16) shape


In [None]:
x1 = np.arange(10).reshape(1, 2, 5)
x2 = np.arange(10, 22).reshape(1, 2, 6)
print(f"x1: {x1}\nx2: {x2}")

Dot(axes=1)([x1, x2])

x1: [[[0 1 2 3 4]
  [5 6 7 8 9]]]
x2: [[[10 11 12 13 14 15]
  [16 17 18 19 20 21]]]


<tf.Tensor: shape=(1, 5, 6), dtype=int64, numpy=
array([[[ 80,  85,  90,  95, 100, 105],
        [106, 113, 120, 127, 134, 141],
        [132, 141, 150, 159, 168, 177],
        [158, 169, 180, 191, 202, 213],
        [184, 197, 210, 223, 236, 249]]])>

#### Attention Mechanism

In [None]:
def one_time_attention_v1(a, s_prev,
                       repeater, concatenator, densor_1, densor_2, softmax_layer, dotter):
  '''Calculates the attention score and returns the context for the current timestep in the decoder.
  Attention mechanism uses encoder outputs `a` of shape `(batch, timesteps, features)` and decoder
  previous hidden state `s_prev` of shape `(batch, features)`, then calculates alignment scores `alphas`
  for each encoder timestep with the help of energies computed with 2 dense layers using `a` and `s_prev`.

  Arguments:
    a: tf.Tensor object, encoder output of shape `(batch, timesteps, features)` or `(batch, Tx, 2*n_a)`
    s_prev: tf.Tensor object, decoder previous hidden state of shape `(batch, features)` or `(batch, n_s)`
    repeater: RepeatVector layer, repeat the `s_prev` `Tx` times
    concatenator: Concatenate layer, concatenates `a` and repeated `s_prev`, Concatenates along axis=-1
    densor_1: Dense layer, calculates the pertial energies `e`, with `units=d1_units`
    refer to `baseline_model` function for details about this variable
    densor_2: Dense layer, calculated the energies `energies`, with `units=d2_units`
    refer to `baseline_model` function for details about this variable
    softmax_layer: Activation layer, computes softmax of the energies and calculates `alphas`, with
    `units=article_vocab_size` refer to `baseline_model` function for details about this variable
    dotter: Dot layer, Performs dot operation between `alphas` and `a` along axis=1

  Returns:
    returns the context of shape `(batch, features)`
  '''

  # Repeat the `s_prev` `Tx` times
  s_prev = repeater(s_prev) # (batch, Tx, n_s)

  # Concatenate `a` and `s_prev` along axis=-1
  concat = concatenator([a, s_prev]) # (batch, Tx, n_a + n_s)

  # Apply dense layer to get partial energies e
  e = densor_1(concat) # (batch, Tx, d1_units)

  # Apply dense layer again to get energies
  energies = densor_2(e) # (batch, Tx, d2_units)

  # Apply softmax over the energies
  alphas = softmax_layer(energies) # (batch, Tx, d2_units)

  # Dot the alphas and a along axes=1
  context = dotter([alphas, a]) # (batch, d2_units, 2*n_a)

  return context

#### Encoder-decoder Model using Attention mechanism

In [None]:
def baseline_model(Tx, Ty,
                   emb_dim, n_a, n_s, d1_units, d2_units, d_units,
                   article_vocab_size, summary_vocab_size):
  '''This implements the bas-line model archietecture for summarization.
  It is a seq-seq model with attention mechanism implemented in it. The encoder take an input
  with `Tx` time-steps and summarizes with the help of decoder into Ty words. The encoder and decoder
  hidden states are `n_a` and `n_s` dimension respectively. The words are taken from the vocabulary of
  article and summary `article_vocab` and `summary_vocab` with size `article_vocab_size` and
  `summary_vocab_size` respectively.

  Arguments:
    Tx: int, length of the input article
    Ty: int, length of the output summary
    n_a: int, dimension of the encoder hidden states
    n_s: int, dimension of the deocder hidden states
    d1_units: int, units for the first dense layer in attention mechanism
    d2_units: int, units for the second dense layer in attention mechanism
    d_units: int, units for the dense layer before output layer
    article_vocab_size: int, length of the article vocabulary
    summary_vocab_size: int, length of the summary vocabulary

  Returns:
    returns the base line model
  '''
  # Defining the input for our model with shape (None, Tx) and (None, Ty) for encoder input and decoder input
  X_inp = Input(shape=(Tx))
  X_tar = Input(shape=(Ty))

  # Initialize s0
  s0 = Input(shape=(n_s, ), name="s0")
  # Initialize c0
  c0 = Input(shape=(n_s, ), name="c0")

  # Initialize the a and s with a0 and s0
  s = s0 # (batch, n_s)
  c = c0 # (batch, n_s)

  # Define the outputs as empty list
  outputs = []

  # First embedding layer for the article input
  encoder_inp = Embedding(article_vocab_size, emb_dim)(X_inp) # (batch, Tx, emb_dim)

  # Encoder: Bidirectional layer with LSTM cells
  a = Bidirectional(LSTM(units=n_a, return_sequences=True))(encoder_inp) # (batch, Tx, n_a)

  # Define the embedding for decoder
  decoder_inp = Embedding(summary_vocab_size, emb_dim)(X_tar) # (batch, Ty, emb_dim)

  # Define the layers for Attention so that we can use the same weights for all decoder timesteps
  repeater = RepeatVector(Tx)
  concatenator = Concatenate(axis=-1)
  attn_densor1 = Dense(units=d1_units, activation='tanh')
  attn_densor2 = Dense(units=d2_units, activation='linear', use_bias=False)
  softmax_layer = Activation('softmax', name="attention_weights")
  dotter = Dot(axes=1)

  # Define the Decoder unidirectional LSTM for shared weights
  post_attention_lstm = LSTM(units=n_s, return_state=True)

  # Define the last dense layer before output layer with linear activation
  densor = Dense(units=d_units, activation='linear')

  # Define the output layer so that it does not initalize again and again for shared weights
  output_layer = Dense(units=summary_vocab_size, activation='softmax')

  # Decoder: Appends outputs from the output layer in each timestep
  for t in range(Ty):
    # Get the decoder input for current timestep
    curr_dec_in = decoder_inp[:, t:t+1, :] # (batch, 1, emb_dim)

    # Get the context from the attention mechanism
    context = one_time_attention_v1(a, s, # (batch, d2_units, 2*n_a)
                                 repeater, concatenator, attn_densor1, attn_densor2, softmax_layer, dotter)

    concat = Concatenate(axis=-1)([curr_dec_in, context]) # (batch, d2_units, emb_dim+2*n_a); d2_units=1 otherwise error
    _, s, c = post_attention_lstm(concat, initial_state=[s, c]) # _, (batch, n_s), (batch, n_s)

    # Calculate the output after using 2 linear dense layers
    out = densor(s) # (batch, d_units)
    out = densor(out) # (batch, d_units)
    # Use the output_layer to get the output
    out  = output_layer(out) # (batch, summary_vocab_size)

    # Append the final output to the outputs list
    outputs.append(out)

  # Stack the list of each timesteps output along axis=1
  outputs = tf.stack(outputs, axis=1) # (batch, Ty, summary_vocab_size)

  model = Model(inputs=[X_inp, X_tar, s0, c0], outputs=outputs)

  return model

In [None]:
# Reset states generated by Keras
tf.keras.backend.clear_session()

In [None]:
Tx = MAX_ARTICLE_TOKENS
Ty = MAX_SUMMARY_TOKENS - 1
emb_dim = EMB_OUT
n_a = ENCODER_STATE_DIM
n_s= DECODER_STATE_DIM
d1_units = DENSE1_UNITS
d2_units = DENSE2_UNITS
d_units = DENSE_UNITS
article_vocab_size = VOCAB_SIZE
summary_vocab_size = VOCAB_SIZE

model = baseline_model(Tx, Ty,
                       emb_dim, n_a, n_s, d1_units, d2_units, d_units,
                       article_vocab_size, summary_vocab_size)

In [None]:
print(f"Model has {model.count_params():,} parameters.")

Model has 2,644,096 parameters.


A look into how model will output on the above input.

In [None]:
sample_model_out = model((art_inp, sum_inp, s0, c0))

print(f"Model output has a type: {type(sample_model_out)}")
print(f"Model Output list for the Inputs above are of length: {len(sample_model_out)}")
print(f"Model Output list has each output of shape: {sample_model_out[0].shape}")

Model output has a type: <class 'tensorflow.python.framework.ops.EagerTensor'>
Model Output list for the Inputs above are of length: 16
Model Output list has each output of shape: (99, 20000)


In [None]:
def custom_loss_v1(y_true, y_pred):
  '''Calculates the loss for the baseline model. The loss is calculated by taking the negative
  log-likelihood of the target word(w*_t) in the current timestep. Then the overall loss
  is the summation over all timesteps divided by T (not Ty because it would include paddings also).

  Arguments:
    y_true: tf.Tensor object, true values for the target
    y_pred: list of tf.Tensor objects, predicted probablities of the summary words

  Returns:
    returns the loss on the predicted values for the model
  '''
  # Calculate the loss for each item in the batch.
  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
  loss = loss_fn(y_true, y_pred)

  # Remove the paddings from calculation of loss
  mask = tf.cast(y_true != 0, loss.dtype)
  loss *= mask

  # Divide the total loss after masking out paddings divided by total words which are not paddings
  return tf.reduce_sum(loss) / tf.reduce_sum(mask)


def custom_accuracy_v1(y_true, y_pred):
  '''Calculates accuracy of the baseline model. The accuracy is calculated by matching how many correct
  words were predicted excluding the paddings. Then, just add those which are correct and you will get the
  the accuracy and then just divide it by total words not including padding.

  Arguments:
    y_true: tf.Tensor object, expected target values
    y_pred: list of tf.Tensor object, predicted target values by model

  Returns:
    returns the total accuracy over the batch of data
  '''
  # Find the word index with maximum probablity
  y_pred = tf.argmax(y_pred, axis=-1)
  y_pred = tf.cast(y_pred, y_true.dtype)

  # Count the words that matches with true values
  match = tf.cast(y_pred == y_true, tf.float32)
  mask = tf.cast(y_true != 0, tf.float32)

  # Mask out the paddings
  match *= mask

  return tf.reduce_sum(match) / tf.reduce_sum(mask)

In [None]:
print(f"Sample y true values: {sum_tar}")
print(f"Sample y pred values(first 10 values of first 2 timestep): {sample_model_out[:2]}")

sample_loss = custom_loss_v1(sum_tar, sample_model_out)
print(f"Loss of the sample y_true and y_pred: {sample_loss}")

sample_acc = custom_accuracy_v1(sum_tar, sample_model_out)
print(f"Accuracy of the sample y_true and y_pred: {sample_acc}")

Sample y true values: [[   2 1594 1589 ...    0    0    0]
 [  36 2526 7765 ...    0    0    0]
 [1024  358 5136 ...    0    0    0]
 ...
 [ 446  178    2 ...    0    0    0]
 [  68   47 1745 ...    0    0    0]
 [ 241 1043    1 ...    0    0    0]]
Sample y pred values(first 10 values of first 2 timestep): [[[4.9766826e-05 4.9918344e-05 4.9897033e-05 ... 4.9849703e-05
   5.0315728e-05 5.0180632e-05]
  [4.9540908e-05 4.9899852e-05 4.9744962e-05 ... 4.9732076e-05
   5.0619914e-05 5.0087474e-05]
  [4.9342078e-05 4.9915510e-05 4.9560989e-05 ... 4.9633010e-05
   5.0835100e-05 4.9942733e-05]
  ...
  [4.8487480e-05 5.0055794e-05 4.8468002e-05 ... 4.9327318e-05
   5.1671042e-05 4.9163184e-05]
  [4.8487480e-05 5.0055794e-05 4.8468002e-05 ... 4.9327318e-05
   5.1671042e-05 4.9163184e-05]
  [4.8487480e-05 5.0055794e-05 4.8468009e-05 ... 4.9327322e-05
   5.1671042e-05 4.9163184e-05]]

 [[4.9986847e-05 4.9694067e-05 5.0172279e-05 ... 5.0206174e-05
   5.0213788e-05 5.0044073e-05]
  [5.0031969e-05 4

In [None]:
lr = LEARNING_RATE
initial_accumulator_value = INIT_ACC_VAL
clipnorm = MAX_GRAD_NORM

opt = Adagrad(learning_rate=lr,
              initial_accumulator_value=initial_accumulator_value,
              clipnorm=clipnorm)

model.compile(loss=custom_loss_v1, optimizer=opt, metrics=[custom_loss_v1, custom_accuracy_v1])

In [None]:
# Mention the checkpoint path and it's directory where you will save the model
checkpoint_path = BASELINE_MODEL_CHECKPOINT
checkpoint_dir = os.path.dirname(checkpoint_path)

# Calculate no of batches, I am taking floor because when creating the training data I used drop_remainder
n_batches = int(train_dataset.shape[0] / BATCH_SIZE)

# Create the checkpoint for model saving, monitoring val_custom_accuracy_v1 and save only weights of the model
saving_chkpnt = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                monitor='val_custom_accuracy_v1',
                                                verbose=1,
                                                save_weights_only=True,
                                                save_freq=n_batches//2)

# Create the checkpoint for stopping early after noticing that val_custom_accuracy_v1 is not increasing even after 5 consecutive epochs
earlystop_chkpnt = tf.keras.callbacks.EarlyStopping(monitor='val_custom_accuracy_v1',
                                                    patience=PATIENCE,
                                                    mode='max',
                                                    )

# Store the checkpoints in a list
checkpoints = [saving_chkpnt, earlystop_chkpnt]

In [None]:
epochs = EPOCHS
steps_per_epoch = STEPS_PER_EPOCHS

history = model.fit(tf_train_dataset.repeat(),
                    epochs=epochs,
                    validation_data=tf_val_dataset,
                    steps_per_epoch=steps_per_epoch,
                    checkpoint=checkpoints
                    )

### Pointer Generetor model: Adding Pointer Generator to avoid getting OOV tokens in the summary

#### Creating `tf_train_dataset`, `tf_val_dataset` dataset using tf.data API

I already have created to the `generate_example` function, I can use it again to generate examples for training and validation respectively.

But this time I have to also consider the `<OOV>` with token value `1`. Instead of using `1`, I have to generate a new token value for each of the `<OOV>` words.

How to do it?

In [77]:
def find_oovs(sent, sent_token, sent_len):
  '''Finds the out of vocabulary words from `sent` with the help of already tokenized `sent_tokens`.

  Arguments:
    sent: str, sentence to find the oov from
    sent_token: 2D np.array, the tokenized form of the sentence with sent_len length
    sent_len: int, length of the tokenized sentence

  Returns:
    Returns a list of all oov words in the `sent`
  '''
  analyzed_sent = custom_analyzer(sent)
  oov_words = [w for i, w in enumerate(analyzed_sent[:sent_len]) if (sent_token[0][i] == 1)]
  return oov_words

def map_oovs(oovs, oov_start_token):
  '''Stores the out of vocabulary words in a dictionary and sets the values of each oov key to
  a temporary unique tokens.

  Arguments:
    oovs: list of oov words
    oov_start_token: int, the first value to use as oov token then increase by 1

  Returns:
    dictionary of (oov, token) as (key, value) pairs
  '''
  unique_oovs = list(set(oovs))
  oov_tokens = [oov_start_token+i for i in range(len(unique_oovs))]

  oov_dict = dict(zip(unique_oovs, oov_tokens))

  return oov_dict

def tokenize_oovs(sent, sent_token, oov_dict, sent_len):
  '''Tokenize the sent by replacing the oov tokens by new unique tokens from oov_dict.

  Arguments:
    sent: str, sentence to handle the oovs
    sent_token: 2D np.array of tokens
    oov_dict: dictionary, oov words and their tokens are stored here
    sent_len: int, length of the sentence token array

  Returns:
    tokenized sentence with oov words tokenized to temporary oov tokens
  '''
  analyzed_sent = custom_analyzer(sent)

  for i, w in enumerate(analyzed_sent[:sent_len]):
    if w in oov_dict.keys():
      sent_token[0, i] = oov_dict[w]

  return sent_token

In [78]:
print(f"Article:\n{sample_article}\n\n")

sample_article_tokens = tokenize_pad([sample_article],
                              tokenizer,
                              padding="post",
                              truncating="post",
                              maxlen=MAX_ARTICLE_TOKENS)
print(f"sample article tokens:\n{sample_article_tokens}\n\n")

sample_oovs = find_oovs(sample_article, sample_article_tokens, MAX_ARTICLE_TOKENS)
print(f"OOVs in the sample article:\n{sample_oovs}\n\n")

sample_oov_dict = map_oovs(sample_oovs, VOCAB_SIZE)
print(f"OOV dictionary:\n{sample_oov_dict}\n\n")

sample_article_tokens_with_oovs = tokenize_oovs(sample_article, sample_article_tokens, sample_oov_dict, MAX_ARTICLE_TOKENS)
print(f"sample article tokens with oov tokens:\n{sample_article_tokens_with_oovs}")

Article:
new york (cnn) -- the u.s. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on new year's eve, according to new census data released thursday. the figure represents a 0.7% increase from last year, adding 2,250,129 people to the u.s. population since the start of 2011, and a 1.3% increase since census day, april 1, 2010. the agency estimates that beginning in january, one american will be born every eight seconds and one will die every 12 seconds. u.s.-bound immigrants are also expected to add one person every 46 seconds. that combination of births, deaths and migration is expected to add a single person to the u.s. population every 17 seconds, the census bureau said. meanwhile, millions are set to ring in the new year. in new york, authorities are preparing for large crowds in manhattan's times square, where lady gaga is expected to join mayor michael bloomberg to push the button that drops the waterfo

In [79]:
def generate_example_v2(inputs, targets,
                        input_tokenizer, target_tokenizer,
                        input_len, target_len,
                        padding="post", truncating="post",
                        vocab_size=VOCAB_SIZE):

  '''Generates examples for the Pointer Generator model. Processes the `inputs` and `targets`
  with their respective tokenizers and tokenize them to `input_len` and `target_len` length.
  After tokenizing the article words, it looks for the out-of-vocabulary words and creates unique tokens
  for each of those OOV words. Then, instead of keeping the oov word tokens as 1, it replaces them
  with their respective newly generated tokens. This tokens are temporary

  Arguments:
    inputs: list of input sentences
    targets: list of target sentences
    input_tokenizer: Tokenizer class object, tokenizer for inputs
    target_tokenizer: Tokenizer class object, tokenizer for targets
    input_len: int, the length of the tokenization for inputs
    target_len: int, the length of the tokenization for targets

  Returns:
    returns 2 values, a tuple containing 2 numpy arrays (input_tokens, target_tokens[:-1]) and
    another numpy array target_tokens[1:]
  '''

  for inp, tar in zip(inputs, targets):
    # Tokenizing article words
    inp_token = tokenize_pad([inp],
                              input_tokenizer,
                              padding=padding,
                              truncating=truncating,
                              maxlen=input_len)

    oov_words = find_oovs(inp, inp_token, input_len)
    oov_dict = map_oovs(oov_words, oov_start_token=vocab_size)
    inp_token = tokenize_oovs(inp, inp_token, oov_dict, input_len)

    # Tokenizing summary words
    tar_token = tokenize_pad([tar],
                 target_tokenizer,
                 padding=padding,
                 truncating=truncating,
                 maxlen=target_len)
    tar_token = tokenize_oovs(tar, tar_token, oov_dict, target_len)

    yield (inp_token[0], tar_token[0][:-1]), tar_token[0][1:]

In [80]:
print(f"Example generated by the generator v2:")

# (inp_art_tokens, inp_sum_tokens), tar_sum_tokens = generate_example(list(train_dataset[:, 0]),
example_gen = generate_example_v2(list(train_dataset[:, 0]),
                               list(train_dataset[:, 1]),
                               input_tokenizer=tokenizer,
                               target_tokenizer=tokenizer,
                               input_len=MAX_ARTICLE_TOKENS,
                               target_len=MAX_SUMMARY_TOKENS)

inps, tar = next(example_gen)
print(f"Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Target:\n{tar}")

Example generated by the generator v2:
Inputs:
[   80   452     4   814    44    42    43    12    12  4414  2853    11
    13  2577   674   331  3716    25     2   895     7     9   878  3280
   419     2   257 20000    16   337  7635  6508    14    18   135    30
     2   909  2876    10  2225   396     3  4414  2853  1326   273    60
    26  2577  1182     5  3533    11    13  4556 14519    25     2   674
   331    19   249     3  2853     4     2    95   279    52    51   211
   442     9   313    25  6555  9110     4   964     2  1101   204  2719
    10     9  2762     3   335     4  3533    11    13  4556 14519  2884
   244   199    15     2     6   670     7  4662     4     6  2439    97
 20012 20003 20008 20013     2   258   674   331   859   418   187    18
     6    52     7     2  1992 19527    10   843  2709   396     4     6
   117    83   191  4145    27  4440     5  5293     2 15749    15     2
     6   313     7    26   160     3     6     2   752  2889     2   266
    

In [81]:
inps, tar = next(example_gen)
print(f"Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Target:\n{tar}")

Inputs:
[   80    44    42    43    12    12  4556  1260    20    10     2   324
  4058     7 10986    11    13   852    59    21   348    33   162     4
    26   519   100     3  1260    20   188   562    10    26  1014   425
   151   215    60    61   908  1091    20     9  1374    32  3891     3
    61    16 10986    11    13   852    79 10986    11    13   852    16
     9     6  5402  3466     7     2  4288   326     4     6   104     5
     2  9879  2864     4    14  4351  5081     9  1806    11    13   955
     3    18   438  2850    22     9   423 20003    10     2   708    56
  5553 20009     8  1219  1619    91    74     3    58    16    77   965
    15 10986    11    13     4    73    18    16  5342 20010     3 10986
    11    13  1024   438    27  2017  1764     8   992     3  2529   768
 20005     7   955     4     9  1182     7  2637     8 20006   838     3
    22 10986    11    13   852     4     6    40   135    27     9  8781
  1259     5  2937  7085  4252     4   182 

#### Pointer Generator

The function `generate_pointer` will return the $p_{gen}$. $p_{gen}$ will be used to decide whether the next word needs to be copied from the article or we should generate a new word from the vocabulary. This decision will be taken using the equation- $$P(w) = p_{gen}*P_{vocab}(w) + (1-p_{gen})*\sum_{i:w_i=w}a^t_i$$

Where
- $P$ is the extended vocabulary containing words from vocabulary and article both.
- $a^t_i$ is the attention $t^{th}$ time step need to give to $i^{th}$ article word.
- $P_{vocab}$ is the previous vocabulary distribution without considering words from article.

In [None]:
def generate_pointer():
  pass