<a href="https://colab.research.google.com/github/Arup3201/Summarization-Project-using-Pointer-Gen/blob/main/Get_To_The_Point_Summarization_with_Pointer_Generator_Networks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from IPython.display import HTML, display

def set_css():
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [97]:
import pathlib
import re
import random
import numpy as np
import tensorflow as tf

# For tokenizing and processing the examples for the model training
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Layers for the Encoder, Attention and Decoder
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM
from tensorflow.keras.layers import RepeatVector, Concatenate, Activation, Dot
from tensorflow.keras.layers import Dense

# For model initialization
from tensorflow.keras import Model

In [3]:
path_to_cnn_stories = tf.keras.utils.get_file(
    origin="https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/cnn_stories.tgz",
    extract=True
)

path_to_dailymail_stories = tf.keras.utils.get_file(
    origin="https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/dailymail_stories.tgz",
    extract=True
)

Downloading data from https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/cnn_stories.tgz
Downloading data from https://huggingface.co/datasets/cnn_dailymail/resolve/main/data/dailymail_stories.tgz


In [4]:
path_to_cnn_stories, path_to_dailymail_stories

('/root/.keras/datasets/cnn_stories.tgz',
 '/root/.keras/datasets/dailymail_stories.tgz')

In [5]:
!ls -l /root/.keras/datasets

total 521956
drwxr-xr-x 3 root root      4096 Jul 28 05:38 cnn
-rw-r--r-- 1 root root 158577824 Jul 28 05:38 cnn_stories.tgz
drwxr-xr-x 3 root root      4096 Jul 28 05:39 dailymail
-rw-r--r-- 1 root root 375893739 Jul 28 05:39 dailymail_stories.tgz


In [6]:
cnn_stories_dir = pathlib.Path('/root/.keras/datasets/cnn/stories')
dailymail_stories_dir = pathlib.Path('/root/.keras/datasets/dailymail/stories')

In [7]:
cnn_stories_dir, dailymail_stories_dir

(PosixPath('/root/.keras/datasets/cnn/stories'),
 PosixPath('/root/.keras/datasets/dailymail/stories'))

In [8]:
def print_filenames(dir_path, num_files=5):
  '''Prints the name of the files that are present at `dir_path`.
  Maximum `num_files` number of files are shown.

  Arguments:
    dir_path: PosixPath, pointing to the directory of which the user
              wants to prints the file names.
    num_files: int, number of files user wants to print.

  returns:
    nothing
  '''

  count = 0
  for f in dir_path.glob('*.story'):
    print(f.name)
    count += 1

    if count == num_files:
      break
  else:
    print(f"Less than {num_files} is present!")

In [9]:
print_filenames(cnn_stories_dir)

dfc761dc9979d48f81622209420ec256c27b064b.story
7a91fabdfe78b9357173135abd8a1eb3d6f54883.story
7366de0ba2f4d1d1dea009d28770643cc373f0f0.story
a71b9b280bd0fd28e47407956cde2844c812acd4.story
ede76765d99a416d0a353c858267e0be3ce44f53.story


In [10]:
print_filenames(dailymail_stories_dir)

fee8d56ace20661cec362490adaa5d3af2860a19.story
6de92207db5537a42078856ece9fd190f5d961ea.story
b842f69a37319d3e9dcb9e3068023275fd10adc7.story
af7a438d642ea2be63ac4a16f6dce42cde0344bc.story
8803ae457b25fd727e922c03730c7f1631688996.story


In [92]:
# Define the global variables
dm_single_close_quote = u'\u2019' # unicode
dm_double_close_quote = u'\u201d'
END_TOKENS = ['.', '!', '?', '...', "'", "`", '"',
              dm_single_close_quote, dm_double_close_quote, ")"]

# Maximum stories to process from cnn and dailymail each
MAX_STORIES = 50000

# From the total data how to split into train, val and test
TRAIN_SIZE = 0.8
VAL_SIZE = 0.1
TEST_SIZE = 0.1

# For tokenization
VOCAB_SIZE = 20000
OOV_TOKEN = "<OOV>"

# For standardization
PAD_TOKEN = '<PAD>'
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# For the number of tokens to use in representing articles and summaries, hyperparameters
MAX_ARTICLE_TOKENS = 400
MAX_SUMMARY_TOKENS = 100

# For dataset creation hyperparameters
BUFFER_SIZE = 5000
BATCH_SIZE = 16

## Model Archietecture hyperparameters
# Embedding output dimension
EMB_OUT = 32
# Encoder hidden(also cell) state dimension
ENCODER_STATE_DIM = 32
# Decoder hidden(also cell) state dimension
DECODER_STATE_DIM = 64
# Attention first dense layer units(calculates partial energy)
DENSE1_UNITS = 16
# Attention secodn dense layer units(calculated final energy)
DENSE2_UNITS = 1
# Units of the Dense layers before output layer
DENSE_UNITS = 16

In [12]:
# Taking a sample .story file from cnn stories
sample_filename = "438411e10e1ef79b47cc48cd95296d85798c1e38.story"
sample_filedir = cnn_stories_dir

sample_filepath = sample_filedir / sample_filename
with open(sample_filepath, 'r') as f:
  sample_story = f.read()

print(sample_story)

New York (CNN) -- The U.S. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on New Year's Eve, according to new census data released Thursday.

The figure represents a 0.7% increase from last year, adding 2,250,129 people to the U.S. population since the start of 2011, and a 1.3% increase since Census Day, April 1, 2010.

The agency estimates that beginning in January, one American will be born every eight seconds and one will die every 12 seconds.

U.S.-bound immigrants are also expected to add one person every 46 seconds.

That combination of births, deaths and migration is expected to add a single person to the U.S. population every 17 seconds, the Census Bureau said.

Meanwhile, millions are set to ring in the new year.

In New York, authorities are preparing for large crowds in Manhattan's Times Square, where Lady Gaga is expected to join Mayor Michael Bloomberg to push the button that drops the Waterford 

I am creating a function `fix_missing_period` where I am taking 2 arguements, one for the `line` for which I am checking and fixing the period and other is `end_tokens` which is a list that has all the tokens that I should consider as ending of a sentence.

These are the steps -
1. Check if line contains `@highlight`, if True then just return the line.
2. Check if line is empty, then return line as it is.
3. Check is line ends with any of the `end_tokens`, if so then return line as it is.
4. Only is none of the above conditions match then append `.` to the current line.

In [13]:
def fix_missing_period(line, end_tokens=END_TOKENS):
  '''function to fix the missing periods for some story lines which do not end with
  any of the end_tokens mentioned.

  Argument:
    line: string, line of the story to fix the missing the period of.
    end_tokens: list of strings, all the tokens that are considered as line end.

  Returns:
    new line with fixed the ending part by adding an ending token if not present.
  '''
  if "@highlight" in line:
    return line
  elif line == "":
    return line
  elif line[-1] in end_tokens:
    return line

  return line + '.'

In [14]:
fix_missing_period(sample_story.split('\n')[0])

"New York (CNN) -- The U.S. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on New Year's Eve, according to new census data released Thursday."

I am creating a function `split_article_summary` which will split the story into article and summary parts.

The function takes only 1 arguement and that is the `story` which will be splitted into article and summary.

The steps to follow are -
1. Split the story by new line `\n`. I will get a list of lines.
2. Strip the lines by using list comprehension.
3. Use list comprehension to make lower case each line by using `.lower()`.
4. Fix each line by adding period if there is none in that line using `fix_missing_period` function.
5. Make 2 empty list for `article` and `summary`.
6. Go through each line. In each line, I need to check 4 things,
  * line contains `@highlight` or not, if True then set `next_highlight` to `True` because the next to next line is going to be a summary line.
  * line is `""` empty or not, if True then ignore.
  * `next_highlight` is True or not, if True then append the line to `summary`.
  * If non of the ebove then append to `article`.
7. After done with filling the `article` and `summary` list with lines, join those sentences to make the whole article and summary. Here, I am using `.join()` method.

In [15]:
def split_article_summary(story):
  '''Splits the story into 2 parts, one for article and other for summary of that
  article. Returns the article and summary.

  Argument:
    story: string file that contains both article and summary combiningly.

  Returns:
    article, summary seperately from the story.

  '''
  lines = story.split('\n')
  lines = [line.strip() for line in lines]
  lines = [line.lower() for line in lines]

  # Fix the ending period
  lines = [fix_missing_period(line) for line in lines]

  # List to contain the article and summary lines
  article = []
  summary = []

  # Indicator of whether the next line is the summary or not
  next_highlight = False

  for line in lines:
    if "@highlight" in line:
      next_highlight = True
    elif line=="":
      continue
    elif next_highlight:
      summary.append(line)
    else:
      article.append(line)

  article = ' '.join(article)
  summary = ' '.join(summary)

  return article, summary

In [16]:
split_article_summary(sample_story)

('new york (cnn) -- the u.s. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on new year\'s eve, according to new census data released thursday. the figure represents a 0.7% increase from last year, adding 2,250,129 people to the u.s. population since the start of 2011, and a 1.3% increase since census day, april 1, 2010. the agency estimates that beginning in january, one american will be born every eight seconds and one will die every 12 seconds. u.s.-bound immigrants are also expected to add one person every 46 seconds. that combination of births, deaths and migration is expected to add a single person to the u.s. population every 17 seconds, the census bureau said. meanwhile, millions are set to ring in the new year. in new york, authorities are preparing for large crowds in manhattan\'s times square, where lady gaga is expected to join mayor michael bloomberg to push the button that drops the waterford cr

I am creating a function `get_articles_summaries` which will process each of the stories present in the directory of cnn and dailymail and return the articles, summaries in the form of list.

This function will take 2 arguements. One will be the `stories_dir` which is a Posix format string from `pathlib` library and another arguement is of `max_stories` which is the maximum number of stories that we will extract from those directories.

The process is simple. We will follow this steps -
1. Create 2 empty lists of `articles` and `summaries`.
2. Loop through all the files present in the directory `stories_dir` using `.glob` generator method.
3. Make a `count` variable which will count the number of processed strories and when it hits `max_stories`, break from the loop.
4. Inside the loop, you will open the file in `r` reading format, then just use `.read()` method to read the story.
5. Everytime after reading the story, split the article and summary part from it and then append them inside the `articles` and `summaries` list.
6. Return the 2 lists.

In [17]:
def get_articles_summaries(stories_dir, max_stories):
  '''stores the stories from stories_dir folder into a list and returns the list

  Arguments:
    stories_dir: Posix string, the directory where the stories are stored
    max_stories: maximum number of stories to store

  Returns:
    list of stories.

  '''
  articles = []
  summaries = []

  count = 0
  for f in stories_dir.glob("*.story"):
    count += 1
    with open(f, 'r') as reader:
      story = reader.read()

      article, summary = split_article_summary(story)

      articles.append(article)
      summaries.append(summary)

    if count == max_stories:
      break

  return articles, summaries

```
cnn
  stories
    438411e10e1ef79b47cc48cd95296d85798c1e38.story
    e453e379e8a70af2d3dff1c75c41b0a35edbe9cc.story
    2079f35aca44978a7985afe0ddacdf02bedf98f2.story
    4702f28c198223157bb8f69665b039d560eebb0f.story
    db3e2ea79323a98379228b17cd3b9dec17dbd2cb.story
    ...
    ...
    ...

dailymail
  stories
    f4ba18635997139c751311b9f2ad18f455dd7c98.story
    4a3ef32cff589c85ad0d22724e2ed747c0dacf87.story
    5375ed75939108c72001b043d3b4799c47f32be9.story
    fe9e57c21e21fb4ec26e394f0e92824f38d18a95.story
    6a544b5cdd2384be6cc657b265d7aa2de72a99e0.story
    ...
    ...
    ...

```

Out of all available .story files, we will only take `MAX_STORIES` number of files and then open them.

In [18]:
cnn_articles, cnn_summaries = get_articles_summaries(cnn_stories_dir, MAX_STORIES)

len(cnn_articles)

50000

In [19]:
print(f"Total no of cnn stories captured are {len(cnn_articles)}\n\n")
print(f"One of the CNN articles: {cnn_articles[0]}\n\n")
print(f"The summary of this article: {cnn_summaries[0]}\n\n")

Total no of cnn stories captured are 50000


One of the CNN articles: (cnn)here's a tip for traveling in busan: whatever time you've allotted for a venue or attraction, triple it. no, quadruple it. everything in south korea's second largest city (after seoul) is better than you think it's going to be, and you'll want more time to explore and enjoy. case in point is spa land centum city, a gigantic, modern jjimjilbang (korean-style sauna/spa) located inside shinsegae department store centum city, reportedly the world's largest department store, which opened four years ago. before seeing spa land, i had set aside an hour for a visit -- the usual amount of time i allot for soaking and scrubbing at a korean bathhouse. coming from seoul, i was skeptical about how different one jjimjilbang could be from any other. the capital, after all, is well known for its extravagant spas. boy, was i mistaken. a spa like no other. centum city's spa land takes the jjimjilbang concept to an entirely new pl

In [20]:
dailymail_articles, dailymail_summaries = get_articles_summaries(dailymail_stories_dir,
                                                                 MAX_STORIES)

In [21]:
print(f"Total no of cnn stories captured are {len(dailymail_articles)}\n\n")
print(f"One of the CNN articles: {dailymail_articles[0]}\n\n")
print(f"The summary of this article: {dailymail_summaries[0]}\n\n")

Total no of cnn stories captured are 50000


One of the CNN articles: by. helen pow. the first pictures have emerged of a 4-year-old girl who was tragically killed wednesday when a driver crashed a vehicle into her florida childcare center as she was waiting for her afternoon snack and then fled off. little lily quintus was killed and 14 others were injured in the horror crash at kindercare daycare center in winter park. on thursday afternoon, police announced robert alex corchado - the man they believe caused the fatal wreck - had turned himself in. lily was sitting at a table waiting. for her afternoon snack when the car crashed into the building, and. her heartbroken mother, nicole quintus, said a teacher called her soon after screaming but. unable to say what happened. scroll down for video. in custody: on thursday afternoon, authorities announced robert alex corchado, left, the man they believe caused the fatal wreck that killed lily quintus, right, had turned himself in. tragic: 

In [22]:
[1, 2] + [3, 4]

[1, 2, 3, 4]

In [23]:
random.seed(0) # Keeps the shuffling same as before
random.sample([1, 2, 3, 4, 5], 5)

[4, 5, 1, 2, 3]

In [24]:
sample_article, sample_summary = split_article_summary(sample_story)

In [25]:
sample_article

'new york (cnn) -- the u.s. population is expected to top out at close to 312.8 million people just around the time crowds gather to watch the ball drop on new year\'s eve, according to new census data released thursday. the figure represents a 0.7% increase from last year, adding 2,250,129 people to the u.s. population since the start of 2011, and a 1.3% increase since census day, april 1, 2010. the agency estimates that beginning in january, one american will be born every eight seconds and one will die every 12 seconds. u.s.-bound immigrants are also expected to add one person every 46 seconds. that combination of births, deaths and migration is expected to add a single person to the u.s. population every 17 seconds, the census bureau said. meanwhile, millions are set to ring in the new year. in new york, authorities are preparing for large crowds in manhattan\'s times square, where lady gaga is expected to join mayor michael bloomberg to push the button that drops the waterford cry

I am creating another function -
`split_dataset(train_size, val_size, test_size)`: I am creating this function to split the original 1,00,000 examples into 80,000 training samples, 10,000 val samples and 10,000 test samples.

In [26]:
def split_dataset(dataset, train_size, val_size, test_size):
  first_split = train_size
  second_split = train_size+val_size
  third_split = train_size+val_size+test_size
  return dataset[:first_split, :], dataset[first_split:second_split, :], dataset[second_split:third_split, :]

Utilize the 4 functions created above into one function called `make_datasets`. This function will -
1. This functions will have many argumenets and among them 2 argumenets `cnn_stories` and `dailymail_stories` are lists which has list of articles and summaries at 0 and 1 index. It means `cnn_stories[0]` is articles of cnn news and `cnn_stories[1]` is summaries of cnn news. It applies to `dailymail_stories` as well.
Objective of this step is to concatenate the cnn articles with dailymail articles and cnn summaries with dailymail summaries.
```python
[1, 2] + [3, 4] = [1, 2, 3, 4]
```

3. Convert the articles and summaries list into tensors and then concatenate them along a new axis. To create new axis I can use `tf.newaxis` in the indexing. E.g.
```python
  np.concatenate([articles[:, tf.newaxis], summaries[:, tf.newaxis]], axis=-1)
```
4. Shuffle the dataset using `random.sample` method.
```python
random.seed(seed_value) # To make sure that everytime it gives the same shuffle
random.sample(list_to_shuffle, len(list_to_shuffle))
```
5. Split the dataset into 3 parts, one for training, other for validation and last one for testing. All the tensors are of shape `(num_samples, 2)`.

In [27]:
def make_datasets(cnn_stories, dailymail_stories, train_fraction, val_fraction, test_fraction, seed_value=0):
  '''Create 3 datasets each for training, validation and testing respectively.
  This function concatenates the articles, summaries of cnn and dailymail news. After that it will tokenize
  them one by one in a loop. After it is done with the tokenization, it will shuffle the articles and
  summaries using random.sample method (although we have a helper function for it). Finally we do the
  splitting of the whole dataset. Remember here the returned values become tensors.

  Arguments:
    cnn_stories: list of 2 values, one for cnn articles and other for cnn summaries.
    dailymail_stories: list of 2 values, one for dailymail articles and other for dailymail summaries.
    train_size: float, specifying how much fraction of the original dataset to take for training.
    val_size: float, specifying how much fraction of the original dataset to take for validation.
    test_size: float, specifying how much fraction of the original dataset to take for testing.

  Returns:
    returns a tuple with 3 values inside it, `training_data`, `validation_data` and `testing_data`
    with the specified amount of data in it.
    Each one of them are tensor with shape `(num_samples, 2)`. `shape[1]=2` for article and summary.
  '''
  articles = cnn_stories[0] + dailymail_stories[0]
  summaries = cnn_stories[1] + dailymail_stories[1]

  articles = np.array(articles, dtype=object)
  summaries = np.array(summaries, dtype=object)

  dataset = np.concatenate((articles[:, tf.newaxis], summaries[:, tf.newaxis]), axis=-1)

  random.seed(seed_value)
  shuffled_indices = random.sample(list(range(dataset.shape[0])), dataset.shape[0])

  dataset = dataset[shuffled_indices, :]

  train_size = int(train_fraction * dataset.shape[0])
  val_size = int(val_fraction * dataset.shape[0])
  test_size = dataset.shape[0] - (train_size + val_size)

  training_samples, validation_samples, testing_samples = split_dataset(dataset,
                                                                        train_size,
                                                                        val_size,
                                                                        test_size)

  return (training_samples, validation_samples, testing_samples)

In [28]:
train_dataset, val_dataset, test_dataset = make_datasets([cnn_articles, cnn_summaries], [dailymail_articles, dailymail_summaries], TRAIN_SIZE, VAL_SIZE, TEST_SIZE)

In [29]:
print(f"Type of the datasets: {type(train_dataset)}\n")

print(f"Training dataset shape: {train_dataset.shape}")
print(f"Validation dataset shape: {val_dataset.shape}")
print(f"Testing dataset shape: {test_dataset.shape}\n")

print(f"First example in the training dataset looks like: \n {train_dataset[0]}\n")

Type of the datasets: <class 'numpy.ndarray'>

Training dataset shape: (80000, 2)
Validation dataset shape: (10000, 2)
Testing dataset shape: (10000, 2)

First example in the training dataset looks like: 
 ["in steven gerrard’s dreams, wednesday night ends with him celebrating like it is 2009. liverpool blow real madrid away, their stuttering season catches fire and more memories are created to last a lifetime. his nightmares, however, involve an embarrassment, of liverpool finding themselves on the receiving end of a scoreline similar to the 4-0 drubbing they administered to real, when these teams last collided five years ago. the first scenario, he believes, is possible, particularly if brendan rodgers’ side are swept along on a wave of emotion. if liverpool use the energy that a highly-charged anfield crowd will give them, gerrard hopes they can swarm all over los blancos. video\xa0scroll down to watch liverpool host real in european epic. steven gerrard has told his liverpool team-

Before the tokenization, we need to preprocess the text data so that it can be properly tokenized. In this step we need to choose whether we want to keep punctuations or not, whether we should keep the numbers or not and so on. There are 2 functions I will create, one for simple `standardize` and other to feed the Tokenizer class when creating the `tokenizer`. `standardize` function implements the following steps -

1. Lower case the strings passed to it. It is already done but for user data it might not be the case so, we will still perform this step.
2. Replace the single and double opening and closing quotes like `‘ → \u2018`, `’ → \u2019`, `“ → \u201c` and `” → \u201d` by `'` and `"` respectively.
3. Replace the punctutations ``['.', '?', '!', ',', ':', '-', ''', '"', '_', '(', ')', '{', '}', '[', ']', '`', ';', '...']`` by `[SPACE]punctutations`.
In this process we need to make sure that the floating point numbers like `1.78` do not become `1 .78`. To do that the correct regex expression is ``(?<!\d)\s*([!"#$£%&\'\(\)*+,-./:;<=>?@\[\]\\^_`{|}~])\s*(?!\d)``.
4. Strip the texts from extra starting or ending spaces. Finally, remove extra spaces using regex expression like `\s{2,}`.

`custom_analyzer` function which will be feed to the Tokenizer as the value for `analyzer`, has some more steps to implement -
1. Remove the `START_TOKEN` and `END_TOKEN` from the text. So that tokenizer does not standardize them.
2. Standardize the text with `standardizer`.
3. Add back the `START_TOKEN` and `END_TOKEN` because you want your tokenizer to learn them.
4. Remove unwanted spaces in between words.
5. Split the text into words which are seperated by ' '.
6. Strip each of the words in the sentence. Finally, return it.

In [30]:
# Standardize the text data
def standardizer(text):
  '''Standardize the text provided to the function
  The text is lower cased. Then, the opening and closing quotes are removed. I add spaces before the
  punctuations like `don't` becomes `don ' t`, ignoring the numerical values so that `1.78` does not become
  `1 . 78`. Finally, it strips the text and removes any type of unwanted spaces in it.

  Argument:
    text: str, the text to standardize

  Returns:
    returns the standadized text
  '''

  # Lower case the text
  text = text.lower()

  # Replace the special single and double opening and closing quotes
  text = re.sub(r'[\u2019\u2018]', "'", text)
  text = re.sub(r'[\u201c\u201d]', '"', text)

  # Add space before punctuations and ignore floating point numbers.
  text = re.sub(r'(?<!\d)\s*([!"#$£%&\'\(\)*+,-./:;<=>?@\[\]\\^_`{|}~])\s*(?!\d)',
                  r' \1 ', text)  # It used to also remove commas after numbers like '27,' will be removed

  # Remove spaces after sentence end and other unwanted spaces from text
  text = text.strip()
  text = re.sub('\s{2,}', ' ', text)

  return text

# custom analyzer for the Tokenizer class
def custom_analyzer(text):
  '''Custom analyzer to provide to the `Tokenizer` class when creating the tokenizer.

  Argument:
    text: str, the text that will be tokenized

  Returns:
    returns the splitted sentence
  '''
  # Remove START and END before standardizing
  if START_TOKEN in text:
    text = re.sub(f'{START_TOKEN} ', '', text)
  if END_TOKEN in text:
    text = re.sub(f'{END_TOKEN} ', '', text)

  # Standardize the text first
  text = standardizer(text)

  # Add back the START and END tokens
  text = ' '.join([START_TOKEN, text, END_TOKEN])

  # Split the sentence into words to tokenize
  words = text.split(' ')
  words = [word.strip() for word in words]

  return words

In [31]:
sample_texts = ["I have been working on, \nbut \tnever did it in this way.",
                "U.S won the world cup and bagged 1.78 million dollars.",
                "India had M.S. Dhoni won made it this far.",
                "My email address is arupjana7365@gmail.com.",
                "It can take care of dailymail single opening quote’ also.",
                "I have 10,000 Rs in my bank",
                "This sentence has , after a number 12,",
                "This sentence contains <START> token and <END> token."]

print(f"After Standardizing the sample texts:\n{[standardizer(text) for text in sample_texts]}\n")
print(f"After applying custom analyzer on sample texts:\n{[custom_analyzer(text) for text in sample_texts]}")

After Standardizing the sample texts:
['i have been working on , but never did it in this way .', 'u . s won the world cup and bagged 1.78 million dollars .', 'india had m . s . dhoni won made it this far .', 'my email address is arupjana7365@gmail . com .', "it can take care of dailymail single opening quote ' also .", 'i have 10,000 rs in my bank', 'this sentence has , after a number 12,', 'this sentence contains < start > token and < end > token .']
After applying custom analyzer on sample texts:
[['<START>', 'i', 'have', 'been', 'working', 'on', ',', 'but', 'never', 'did', 'it', 'in', 'this', 'way', '.', '<END>'], ['<START>', 'u', '.', 's', 'won', 'the', 'world', 'cup', 'and', 'bagged', '1.78', 'million', 'dollars', '.', '<END>'], ['<START>', 'india', 'had', 'm', '.', 's', '.', 'dhoni', 'won', 'made', 'it', 'this', 'far', '.', '<END>'], ['<START>', 'my', 'email', 'address', 'is', 'arupjana7365@gmail', '.', 'com', '.', '<END>'], ['<START>', 'it', 'can', 'take', 'care', 'of', 'dailym

Now, I need to find the tokens from the articles. I need to use only training articles not any other and also I will not use summaries data because that will be my target and I won't know what type of words I will encounter when summarizing the source article. So, the only words that I know will be from the articles of training dataset. Here, I am going to use the `tensorflow.keras.preprocessing.text.Tokenizer` in short `Tokenizer` to find the tokens from the articles and then finally converting the articles into sequence of integers. One thing to remember is here we are going to use `oov_token` arguement of `Tokenizer` to mention the token we want to use for out-of-vocabulary words.

When fiting the texts on `tokenizer` make sure to remove floating point and integer numbers using the regex expression - `[+-]?[0-9]*[.]?[0-9]+`. I am making sure that tokenizer does learn the numbers because it can always be taken from the original articles data and we do not to remember them in vocab.

In [32]:
def get_tokenizer(texts, num_words, oov_token=None, filters = '#*+/:<=>@[\\]/^{|}~\t\n'):
  '''This will create the tokenizer needed for the task in hand.
  The tokenizer will be trained on the `texts`. Tokenizer will have vocabulary length `num_words`.
  The `oov_token` will be used as the token represent the out-of-vocabulary words. The `filters` are
  the ones which the tokenizer will remove when tokenizing any sentence given to it. The returned
  tokenizer is using a custom analyzer that can standardize the sentence before tokenizing using the
  `standardizer` function and then splits the sentence into words. After that it tokenizes the sentence.
  As for the vocabulary, the returned tokenizer's vocabulary does not contain any number, as I have removed
  them before feeding them into `Tokenizer.fit_on_texts` method.

  Arguments:
    texts: list of strings, the tokenizer will be trained on this strings
    num_words: int, number of vocabulary words the tokenizer will consider
    oov_token: str, token to represent out-of-vocabulary words
    filters: str, all the characters that the tokenizer will remove before tokenizing

  Returns:
    tokenzier of the `Tokenizer` class after learning vocabulary from `texts`
  '''

  # Create the tokenizer usinf Tokenizer class
  tokenizer = Tokenizer(num_words=num_words,
                        filters=filters,
                        oov_token=oov_token,
                        analyzer=custom_analyzer)

  # Remove the numbers from the dataset so that tokenizer does not add them inside vocabulary
  texts = [re.sub(r"[+-]?[0-9]*[.]?[0-9]+", "", text) for text in texts]

  # Fit the data with fit_on_texts method
  tokenizer.fit_on_texts(texts)

  return tokenizer

In [33]:
print(f"Length the articles dataset: {len(list(train_dataset[:, 0]))}")

Length the articles dataset: 80000


Create the `tokenizer` using the articles from training dataset by using `train_dataset[:, 0]`, with a vocabulary size of `VOCAB_SIZE` and use `OOV_TOKEN` token to represent out-of-vocabulary words.

In [34]:
tokenizer = get_tokenizer(list(train_dataset[:, 0]), VOCAB_SIZE, OOV_TOKEN)

In [35]:
print(f"The vocabulary for the tokenizer has a length {len(tokenizer.word_index.keys())}\n\n")


print(f"{OOV_TOKEN} word has index: {tokenizer.word_index[OOV_TOKEN]}")
print(f"{START_TOKEN} word has index: {tokenizer.word_index[START_TOKEN]}")
print(f"{END_TOKEN} word has index: {tokenizer.word_index[END_TOKEN]}\n\n")


print(f"'teacher' word has index: {tokenizer.word_index['teacher']}\n")

print(f"Text:\n{train_dataset[0, 0]}\n\n")
sample_sequence = tokenizer.texts_to_sequences([train_dataset[0, 0]])
print(f"Text to Sequence of the first article:\n{sample_sequence}\n")
print(f"Sequence to Text of the first acrticle:\n{tokenizer.sequences_to_texts(sample_sequence)}")

The vocabulary for the tokenizer has a length 251348


<OOV> word has index: 1
<START> word has index: 76
<END> word has index: 77


'teacher' word has index: 1595

Text:
in steven gerrard’s dreams, wednesday night ends with him celebrating like it is 2009. liverpool blow real madrid away, their stuttering season catches fire and more memories are created to last a lifetime. his nightmares, however, involve an embarrassment, of liverpool finding themselves on the receiving end of a scoreline similar to the 4-0 drubbing they administered to real, when these teams last collided five years ago. the first scenario, he believes, is possible, particularly if brendan rodgers’ side are swept along on a wave of emotion. if liverpool use the energy that a highly-charged anfield crowd will give them, gerrard hopes they can swarm all over los blancos. video scroll down to watch liverpool host real in european epic. steven gerrard has told his liverpool team-mates not to shy away against real madri

The oddness you might see if you are that much familiar with `Tokenizer` class is, even though I have specified that `num_words=VOCAB_SIZE` which is `20,000` still the length of the `word_index` is more that that. Does that mean we are doing something wrong?
NO, here although tokenizer computes the word_index of all other words apart from those first 20000 words, it will not use them when we convert them into sequence. Let's look at one example to understand that.

In [36]:
list(tokenizer.word_index.keys())[21000]

'restraints'

In [37]:
oov_word = list(tokenizer.word_index.keys())[21000]
sample_text = f"This example is to test the above fact with the word `{oov_word}`"
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: This example is to test the above fact with the word `restraints`


Tokenized text: ['<START> this example is to test the above fact with the word ` <OOV> ` <END>']
Sequence: [[76, 38, 1134, 18, 6, 877, 3, 754, 548, 22, 3, 1290, 15392, 1, 15392, 77]]


Although the word was present in the `word_index` mapping still tokenizer represented it with `<OOV>`.

In [38]:
sample_text = "What happens when I add a number 2.1 in this sentence!"
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: What happens when I add a number 2.1 in this sentence!


Tokenized text: ['<START> what happens when i add a number <OOV> in this sentence ! <END>']
Sequence: [[76, 70, 2070, 54, 27, 1969, 7, 264, 1, 10, 38, 1112, 300, 77]]


In [39]:
sample_text = "What happens when I add parenthesis (I am inside it!)."
sample_sequence = tokenizer.texts_to_sequences([sample_text])

print(f"Text: {sample_text}\n\n")
print(f"Tokenized text: {tokenizer.sequences_to_texts(sample_sequence)}")
print(f"Sequence: {sample_sequence}")

Text: What happens when I add parenthesis (I am inside it!).


Tokenized text: ['<START> what happens when i add <OOV> ( i am inside it ! ) . <END>']
Sequence: [[76, 70, 2070, 54, 27, 1969, 1, 50, 27, 318, 506, 19, 300, 49, 2, 77]]


We have the `tokenizer` to tokenize the articles and summaries. We need to pad those sequences to fit the requirements.

In the paper, the articles are limited to have 400 tokens and summary has, 100 tokens at training and 120 tokens for testing.

I will be using `pad_sequences` method to pad or truncate the articles and summaries based on their length.

NOTE: I am using same tokenizer for article and summary. But, later I might change that to 2 different tokenizers each having different `num_words`.

In [40]:
def tokenize_pad(texts, tokenizer, padding, truncating, maxlen):
  '''Tokenize the `texts` using the tokenizer. Then, pad the sequences or truncate the sequences
  depending the length. If the length exceeds `maxlen` then it will be truncated and if not then it will be
  padded. The padding and truncating can happend at the beginning or at the end of the sequence depending
  on the value of `padding` and `truncating` respectively.

  Arguments:
    texts: list of strings, the sentences to tokenize and pad
    tokenizer: Tokenizer class object, helps in tokenizing the `texts`
    padding: str, can take 2 values `pre` or `post`. If `pre` then padding will happen at the beginning,
    if `post` then padding will happen at the end.
    truncating: str, can take 2 values `pre` or 'truncating`, works the same as `padding`
    maxlen: int, maximum length after padding or truncating

  Returns:
    returns the tokenized and padded sentences
  '''
  sequences = tokenizer.texts_to_sequences(texts)

  padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding=padding, truncating=truncating)

  return padded_sequences

In [41]:
sample_texts

['I have been working on, \nbut \tnever did it in this way.',
 'U.S won the world cup and bagged 1.78 million dollars.',
 'India had M.S. Dhoni won made it this far.',
 'My email address is arupjana7365@gmail.com.',
 'It can take care of dailymail single opening quote’ also.',
 'I have 10,000 Rs in my bank',
 'This sentence has , after a number 12,',
 'This sentence contains <START> token and <END> token.']

In [42]:
tokenize_pad(sample_texts, tokenizer, padding="post", truncating="post", maxlen=20)

array([[   76,    27,    28,    46,   334,    17,     4,    33,   202,
          146,    19,    10,    38,   143,     2,    77,     0,     0,
            0,     0],
       [   76,   111,     2,    13,   297,     3,    97,   389,     8,
        18336,     1,   161,  2108,     2,    77,     0,     0,     0,
            0,     0],
       [   76,  1039,    43,   136,     2,    13,     2,     1,   297,
          120,    19,    38,   328,     2,    77,     0,     0,     0,
            0,     0],
       [   76,    87,  2646,  1263,    18,     1,     2,   621,     2,
           77,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [   76,    19,    66,   153,   360,     9, 19051,   871,  1078,
         8151,     5,    67,     2,    77,     0,     0,     0,     0,
            0,     0],
       [   76,    27,    28,     1, 17025,    10,    87,   881,    77,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0],
       [  

Create a Geneator function that generates the source and target for training the model.

In [75]:
def generate_example(inputs, targets, input_tokenizer, target_tokenizer, input_len, target_len):
  '''Generates examples for the model. Processes the `inputs` and `targets` with their respective
  tokenizers and tokenize them to `input_len` and `target_len` length.

  Arguments:
    inputs: list of input sentences
    targets: list of target sentences
    input_tokenizer: Tokenizer class object, tokenizer for inputs
    target_tokenizer: Tokenizer class object, tokenizer for targets
    input_len: int, the length of the tokenization for inputs
    target_len: int, the length of the tokenization for targets

  Returns:
    returns 2 values, a tuple containing 2 numpy arrays (input_tokens, target_tokens[:-1]) and
    another numpy array target_tokens[1:]
  '''

  for inp, tar in zip(inputs, targets):
    inp_tokens = tokenize_pad([inp],
                              input_tokenizer,
                              padding="post",
                              truncating="post",
                              maxlen=input_len)

    tar_tokens = tokenize_pad([tar],
                 target_tokenizer,
                 padding="post",
                 truncating="post",
                 maxlen=target_len)

    yield (inp_tokens[0], tar_tokens[0][:-1]), tar_tokens[0][1:]

In [76]:
print(f"Example generated by the generator :")

# (inp_art_tokens, inp_sum_tokens), tar_sum_tokens = generate_example(list(train_dataset[:, 0]),
example_gen = generate_example(list(train_dataset[:, 0]),
                               list(train_dataset[:, 1]),
                               input_tokenizer=tokenizer,
                               target_tokenizer=tokenizer,
                               input_len=MAX_ARTICLE_TOKENS,
                               target_len=MAX_SUMMARY_TOKENS)

inps, tar = next(example_gen)
print(f"Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Target:\n{tar}")

Example generated by the generator :
Inputs:
[   76    10  2412  3866     5    13  3906     4   295   203  3276    22
    74  3546    95    19    18     1   850  2709   347  1230   222     4
    44     1   294 12027   396     8    56  3245    34   938     6    82
     7  3665     2    25 11679     4   272     4  4955    37  7668     4
     9   850  1593   766    17     3  2215   239     9     7 18464   781
     6     3     1     1    35  8013     6   347     4    54   162  1330
    82  8055   201    88   309     2     3    86  5192     4    20  1048
     4    18   542     4  1197    73  5044  3465     5   301    34  4515
   446    17     7  2853     9  5693     2    73   850   250     3   888
    14     7  1714    11   632  5279  1305    48   374    89     4  3866
  1321    35    66 15812    65    85   946     1     2     1   106     6
   447   850  1354   347    10   660  6436     2  2412  3866    32    83
    25   850   177    11  3547    40     6  5665   222   125   347  1230
    17

In [45]:
print(f"Shapes of the Inputs:\n{inps[0].shape}\n{inps[1].shape}\n\n")
print(f"Shape of the Target:\n{tar.shape}")

Shapes of the Inputs:
(400,)
(99,)


Shape of the Target:
(99,)


In [46]:
print(f"Data Type of the Inputs data:\n{inps[0].dtype}\n{inps[1].dtype}\n\n")
print(f"Data Type of the Target data:\n{tar.dtype}")

Data Type of the Inputs data:
int32
int32


Data Type of the Target data:
int32


In [77]:
inps, tar = next(example_gen)
print(f"Second Inputs:\n{inps[0]}\n{inps[1]}\n\n")
print(f"Second Target:\n{tar}")

Second Inputs:
[   76    19    16    53     9     3   116   790   563   608     9    84
    71     8   523    45  2864    54    19  2872    30    44  5694  3376
    47    79    69  3903     2    33    69     9     1     5   116    11
  1193  2944   120     7  2284   570     6     3   750   361    38   163
     6  2101     3   752     9     7    68  2978   544    55     3   341
     9     3  7391     1   608     2     1  1244  1872  1647     1    11
  3101     8    36  4865  6276     1  4641     4    69     9     3   196
     5    13   116   790     1     4    45   349     7  6823    11   498
   935     9    91     9   242     5    13   234  3295  7273     4   182
 19193     1     4   105  1647     1    16  2796     7  3107  1911    22
     5    36  9838     3  1155     5     8   349     7   720     6     1
    58     6  1823     1     5   820  7969     5     2   541   106    15
   187     2  1681     1     4    20     5    13   336   254   300     3
     1   417  1462  1197 11386     9

As we have the generator to generate the examples, we can use `tf.data` API to create the input data pipeline for our model. I will use the `tf.data.Dataset` class to get the the examples from the generator `generate_example`, we already have the generator object `example_gen`. We can use that in creating the dataset. For more about datasets from generator, refer to [here](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#from_generator).

In [88]:
def train_example_generetor():
  example_gen = generate_example(list(train_dataset[:, 0]),
                                list(train_dataset[:, 1]),
                                input_tokenizer=tokenizer,
                                target_tokenizer=tokenizer,
                                input_len=MAX_ARTICLE_TOKENS,
                                target_len=MAX_SUMMARY_TOKENS)

  for example in example_gen:
    (input_0, input_1), target = example
    yield (input_0, input_1), target

In [91]:
output_signature = (
    (tf.TensorSpec(shape=(MAX_ARTICLE_TOKENS, ), dtype=tf.int32),
     tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32)),
    tf.TensorSpec(shape=(MAX_SUMMARY_TOKENS-1, ), dtype=tf.int32)
)

tf_train_dataset = tf.data.Dataset.from_generator(generator=train_example_generetor,
                                                  output_signature=output_signature)
tf_train_dataset = tf_train_dataset.shuffle(BUFFER_SIZE)
tf_train_dataset = tf_train_dataset.batch(BATCH_SIZE, drop_remainder=True)
tf_train_dataset = tf_train_dataset.prefetch(1)

In [90]:
for (art_inp, sum_inp), sum_tar in tf_train_dataset.take(1):
  print(f"Input tokenized article shape: {art_inp.shape}")
  print(f"Input tokenized summary shape: {sum_inp.shape}\n")

  print(f"Target tokenized summary shape: {sum_tar.shape}")

Input tokenized article shape: (16, 400)
Input tokenized summary shape: (16, 99)

Target tokenized summary shape: (16, 99)


After this, we need the model that we can train on this dataset. The model archietecture will be 3-
1. Base-line model: Seq-Seq model with attention mechanism.
2. Pointer Generetor model: With seq-seq attention model will be implementing the pointer generator that can either copy words from article or generate words from the pre-defined vocabulary.
3. Coverage mechanism: Along with the pointer generator that will take case of the out-of-vocabulary words. Coverage mechanism will help prevent the repetition of the words in the summary.

#### Base-Line Model: Seq-seq with Attention

A small demonstration of how Dot layer works

In [94]:
x1 = np.arange(10).reshape(1, 2, 5)
x2 = np.arange(10, 22).reshape(1, 2, 6)
print(f"x1: {x1}\nx2: {x2}")

Dot(axes=1)([x1, x2])

x1: [[[0 1 2 3 4]
  [5 6 7 8 9]]]
x2: [[[10 11 12 13 14 15]
  [16 17 18 19 20 21]]]


<tf.Tensor: shape=(1, 5, 6), dtype=int64, numpy=
array([[[ 80,  85,  90,  95, 100, 105],
        [106, 113, 120, 127, 134, 141],
        [132, 141, 150, 159, 168, 177],
        [158, 169, 180, 191, 202, 213],
        [184, 197, 210, 223, 236, 249]]])>

In [101]:
def one_time_attention(a, s_prev,
                       repeater, concatenator, densor_1, densor_2, softmax_layer, dotter):
  '''Calculates the attention score and returns the context for the current timestep in the decoder.
  Attention mechanism uses encoder outputs `a` of shape `(batch, timesteps, features)` and decoder
  previous hidden state `s_prev` of shape `(batch, features)`, then calculates alignment scores `alphas`
  for each encoder timestep with the help of energies computed with 2 dense layers using `a` and `s_prev`.

  Arguments:
    a: tf.Tensor object, encoder output of shape `(batch, timesteps, features)`
    s_prev: tf.Tensor object, decoder previous hidden state
    repeater: RepeatVector layer, repeat the `s_prev` `Tx` times
    concatenator: Concatenate layer, concatenates `a` and repeated `s_prev`, Concatenates along axis=-1
    densor_1: Dense layer, calculates the pertial energies `e`
    densor_2: Dense layer, calculated the energies `energies`
    softmax_layer: Activation layer, computes softmax of the energies and calculates `alphas`
    dotter: Dot layer, Performs dot operation between `alphas` and `a` along axis=1

  Returns:
    returns the context of shape `(batch, features)`
  '''

  # Repeat the `s_prev` `Tx` times
  s_prev = repeater(s_prev)

  # Concatenate `a` and `s_prev` along axis=-1
  concat = concatenator([a, s_prev])

  # Apply dense layer to get partial energies e
  e = densor_1(concat)

  # Apply dense layer again to get energies
  energies = densor_2(e)

  # Apply softmax over the energies
  alphas = softmax_layer(energies)

  # Dot the alphas and a along axes=1, result shape (batch, densor_2_units, n_a*2)
  context = dotter([alphas, a])

  return context

In [108]:
def baseline_model(Tx, Ty,
                   emb_dim, n_a, n_s, d1_units, d2_units, d_units,
                   article_vocab_size, summary_vocab_size):
  '''This implements the bas-line model archietecture for summarization.
  It is a seq-seq model with attention mechanism implemented in it. The encoder take an input
  with `Tx` time-steps and summarizes with the help of decoder into Ty words. The encoder and decoder
  hidden states are `n_a` and `n_s` dimension respectively. The words are taken from the vocabulary of
  article and summary `article_vocab` and `summary_vocab` with size `article_vocab_size` and
  `summary_vocab_size` respectively.

  Arguments:
    Tx: int, length of the input article
    Ty: int, length of the output summary
    n_a: int, dimension of the encoder hidden states
    n_s: int, dimension of the deocder hidden states
    article_vocab_size: int, length of the article vocabulary
    summary_vocab_size: int, length of the summary vocabulary

  Returns:
    returns the base line model
  '''
  # Defining the input for our model with shape (None, Tx) and (None, Ty) for encoder input and decoder input
  X_inp = Input(shape=(Tx))
  X_tar = Input(shape=(Ty))

  # Initialize s0
  s0 = Input(shape=(n_s, ), name="s0")
  # Initialize c0
  c0 = Input(shape=(n_s, ), name="c0")

  # Initialize the a and s with a0 and s0
  s = s0
  c = c0

  # Define the outputs as empty list
  outputs = []

  # First embedding layer for the article input
  encoder_inp = Embedding(article_vocab_size, emb_dim)(X_inp)

  # Encoder: Bidirectional layer with LSTM cells
  a = Bidirectional(LSTM(units=n_a, return_sequences=True))(encoder_inp)

  # Define the embedding for decoder
  decoder_inp = Embedding(summary_vocab_size, emb_dim)(X_tar)

  # Define the layers for Attention so that we can use the same weights for all decoder timesteps
  repeater = RepeatVector(Tx)
  concatenator = Concatenate(axis=-1)
  attn_densor1 = Dense(units=16, activation='tanh')
  attn_densor2 = Dense(units=1, activation='linear', use_bias=False)
  softmax_layer = Activation('softmax', name="attention_weights")
  dotter = Dot(axes=1)

  # Define the Decoder unidirectional LSTM for shared weights
  post_attention_lstm = LSTM(units=n_s, return_state=True)

  # Define the last dense layer before output layer with linear activation
  densor = Dense(units=16, activation='linear')

  # Define the output layer so that it does not initalize again and again for shared weights
  output_layer = Dense(units=summary_vocab_size, activation='softmax')

  # Decoder: Appends outputs from the output layer in each timestep
  for t in range(Ty):
    # Get the decoder input for current timestep
    curr_dec_in = decoder_inp[:, t:t+1, :]

    # Get the context from the attention mechanism
    context = one_time_attention(a, s,
                                 repeater, concatenator, attn_densor1, attn_densor2, softmax_layer, dotter)

    concat = Concatenate(axis=-1)([curr_dec_in, context])
    _, s, c = post_attention_lstm(concat, initial_state=[s, c])

    # Calculate the output after using 2 linear dense layers
    out = densor(s)
    out = densor(s)
    # Use the output_layer to get the output
    out = output_layer(out)

    # Append the final output to the outputs list
    outputs.append(out)

  model = Model(inputs=[X_inp, X_tar, s0, c0], outputs=outputs)

  return model

In [109]:
Tx = MAX_ARTICLE_TOKENS
Ty = MAX_SUMMARY_TOKENS - 1
emb_dim = EMB_OUT
n_a = ENCODER_STATE_DIM
n_s= DECODER_STATE_DIM
d1_units = DENSE1_UNITS
d2_units = DENSE2_UNITS
d_units = DENSE_UNITS
article_vocab_size = VOCAB_SIZE
summary_vocab_size = VOCAB_SIZE

model = baseline_model(Tx, Ty,
                       emb_dim, n_a, n_s, d1_units, d2_units, d_units,
                       article_vocab_size, summary_vocab_size)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_9 (InputLayer)           [(None, 400)]        0           []                               
                                                                                                  
 embedding_8 (Embedding)        (None, 400, 32)      640000      ['input_9[0][0]']                
                                                                                                  
 s0 (InputLayer)                [(None, 64)]         0           []                               
                                                                                                  
 bidirectional_4 (Bidirectional  (None, 400, 64)     16640       ['embedding_8[0][0]']            
 )                                                                                            