<a href="https://colab.research.google.com/github/Ajay-user/DataScience/blob/master/Natural%20Language%20Processing/Text_classification_author_of_Illiad_translations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Predict the author of Illiad translations
The following provides an example of using `tf.data.TextLineDataset` to load examples from text files, and `tf.text` to preprocess the data. In this example, you will use three different English translations of the same work, Homer's Illiad, and train a model to identify the translator given a single line of text.

Download and explore the dataset

The texts of the three translations are by:

* William Cowper — text

* Edward, Earl of Derby — text

* Samuel Butler — text

In [1]:
pip install tensorflow-text-nightly

Collecting tensorflow-text-nightly
  Downloading tensorflow_text_nightly-2.7.0.dev20210825-cp37-cp37m-manylinux1_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 8.0 MB/s 
Installing collected packages: tensorflow-text-nightly
Successfully installed tensorflow-text-nightly-2.7.0.dev20210825


In [2]:
pip install tensorflow-text

Collecting tensorflow-text
  Downloading tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 7.9 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.6.0


## Imports

In [3]:
import tensorflow as tf
import pathlib
import tensorflow_text as tf_text
import collections

## Data

In [4]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt', 'derby.txt', 'butler.txt']

text_dir = ''

for author in FILE_NAMES:
  text_dir = tf.keras.utils.get_file(fname=author, origin=DIRECTORY_URL+author)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/cowper.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/derby.txt
Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/illiad/butler.txt


In [5]:
parent_dir =  pathlib.Path(text_dir).parent

print('Parent directory ',parent_dir)
for dir in parent_dir.iterdir():
  print('File :',dir)

Parent directory  /root/.keras/datasets
File : /root/.keras/datasets/derby.txt
File : /root/.keras/datasets/cowper.txt
File : /root/.keras/datasets/butler.txt


## Model constant

In [6]:
BUFFER_SIZE = 50000
BATCH_SIZE = 64
VALIDATION_SIZE = 5000
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 250

## Load the dataset

 `TextLineDataset` is designed to create a `tf.data.Dataset` from a text file in which each example is a line of text from the original file, whereas `text_dataset_from_directory` treats all contents of a file as a single example.

In [7]:
def labeler(sample, label):
  return (sample, tf.cast(label, tf.int64))


lines_set = []
for label, fname in enumerate(FILE_NAMES):
  lines_ds = tf.data.TextLineDataset(str(parent_dir/fname))
  labeled_ds = lines_ds.map(lambda sample:labeler(sample, label))
  lines_set.append(labeled_ds)


In [8]:
print('Length of lines set ', len(lines_set))

Length of lines set  3


In [9]:
text_lines_ds = lines_set[0]

# concatenation
for ds in lines_set[1:]:
  text_lines_ds = text_lines_ds.concatenate(ds)

# shuffle the dataset
text_lines_ds = text_lines_ds.shuffle(buffer_size=BUFFER_SIZE, seed=42, reshuffle_each_iteration=False, )

In [10]:
for line, label in text_lines_ds.take(5):
  print('Line : ',line.numpy())
  print('Label : ',label.numpy())

Line :  b'taught to use the bow.'
Label :  2
Line :  b"This said, he sat; and Atreus' godlike son,"
Label :  1
Line :  b'Is gone to Chrysa, and with her we send'
Label :  0
Line :  b"He cut the boar's throat as he spoke, whereon Talthybius whirled it"
Label :  2
Line :  b"Redden'd the east, then, thronging forth, all Troy"
Label :  0


## Prepare the dataset for training

Tokenization

In [11]:
#tokenizer
tokenizer = tf_text.UnicodeScriptTokenizer()

# utility for standardizing and tokenizing the text
def tokenize(text):
  lowercase = tf_text.case_fold_utf8(text)
  return tokenizer.tokenize(lowercase)


In [12]:
# eg: tokenization
(line, label) = next(iter(text_lines_ds))

print('Line : ',line.numpy())
print('Tokenization : ',tokenize(line).numpy())

Line :  b'taught to use the bow.'
Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.
Tokenization :  [b'taught' b'to' b'use' b'the' b'bow' b'.']


In [13]:
# create a tokenized dataset
tokenized_ds = text_lines_ds.map(lambda text, label: tokenize(text))

Building Vocabulary

In [14]:
# create a default dict for storing vocab
vocab_dict = collections.defaultdict(lambda : 0)

In [15]:
# iterate through the tokenized dataset and create the vocabulary
for tokens in tokenized_ds.as_numpy_iterator():
  for toks in tokens:
    vocab_dict[toks] += 1

In [16]:
# sorting
vocab = sorted(vocab_dict.items(), key=lambda tup: tup[1], reverse=True)

print('Length of vocab', len(vocab))

Length of vocab 14262


In [17]:
# we only need vocabs not the counts
vocab = [token for token, count in vocab]

# keep the top VOCAB_SIZE only
vocab = vocab[:VOCAB_SIZE]

vocab_size = len(vocab)
print('Lenght of vocab', vocab_size)
print('First five vocab', vocab[:5])

Lenght of vocab 10000
First five vocab [b',', b'the', b'and', b"'", b'of']


Vocab look-up table

In [18]:
# vocab look-up

keys = vocab
 # reserve 0 for padding, 1 for OOV
values = range(2,vocab_size+2)

# key-value intializer
init = tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.string, value_dtype=tf.int64)

# look-up table
num_oov = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov)

Standardize, Tokenize, Vectorize

In [19]:
def preprocess_text(text, label):
  standardize = tf_text.case_fold_utf8(text)
  tokenize = tokenizer.tokenize(standardize)
  vectorize = vocab_table.lookup(tokenize)
  return vectorize, label

In [20]:
(text, label) = next(iter(text_lines_ds))
print('Line', text.numpy())
preprocessed_text, preprocessed_label = preprocess_text(text, label)
print('Preprocessed output', preprocessed_text.numpy())

Line b'taught to use the bow.'
Preprocessed output [1595    8 1596    3  310    7]


In [21]:
# standardize, tokenize and vectorize the dataset using the tokenizer and lookup table
encoded_ds = text_lines_ds.map(preprocess_text)

## Train Test Split

In [22]:
train_ds = encoded_ds.skip(VALIDATION_SIZE)
val_ds = encoded_ds.take(VALIDATION_SIZE)

Shuffling , Padding and Batching

In [23]:
train_ds = train_ds.shuffle(BUFFER_SIZE).padded_batch(BATCH_SIZE)
val_ds = val_ds.padded_batch(BATCH_SIZE)

In [24]:
text_batch, label_batch = next(iter(train_ds))

print('Shape of text batch', text_batch.shape)
print('Shape of label batch', label_batch.shape)
print('Sample text ',text_batch[0].numpy())
print('Sample label ',label_batch[0].numpy())

Shape of text batch (64, 18)
Shape of label batch (64,)
Sample text  [ 20  47 387  22  32 415 122   2 816 143   0   0   0   0   0   0   0   0]
Sample label  0


## Configure for performance

In [25]:
def config_for_performance(ds):
  ds = ds.cache()
  ds = ds.prefetch(tf.data.AUTOTUNE)
  return ds

In [26]:
train_ds = config_for_performance(train_ds)
val_ds = config_for_performance(val_ds)

## Model Building

In [27]:
#  1D ConvNet
model_1 = tf.keras.Sequential([
                               tf.keras.layers.Embedding(input_dim=vocab_size+2, output_dim=64, mask_zero=True),
                               tf.keras.layers.Conv1D(filters=64, kernel_size=5, padding='valid', activation='relu', strides=2),
                               tf.keras.layers.GlobalAveragePooling1D(),
                               tf.keras.layers.Dense(3)                              
])
# compile the model
model_1.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
# train the model
model_1_history = model_1.fit(train_ds, validation_data=val_ds, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [28]:
loss, accuracy = model_1.evaluate(val_ds)
print('Model loss', loss)
print('Model accuracy', accuracy)

Model loss 0.4006834030151367
Model accuracy 0.8353999853134155


In [43]:
for layer in model_1.layers:
  print('layer ',layer, 'supports masking', layer.supports_masking)

layer  <keras.layers.embeddings.Embedding object at 0x7fd8ed738e50> supports masking True
layer  <keras.layers.convolutional.Conv1D object at 0x7fd8ed6239d0> supports masking False
layer  <keras.layers.pooling.GlobalAveragePooling1D object at 0x7fd8ed731ad0> supports masking True
layer  <keras.layers.core.Dense object at 0x7fd8ef2b0a50> supports masking True


## Export model

In [48]:
preprocessing_layer = tf.keras.layers.TextVectorization(max_tokens=vocab_size+2,
                                                        standardize=tf_text.case_fold_utf8,
                                                        split=tokenizer.tokenize,
                                                        output_mode='int',
                                                        output_sequence_length=MAX_SEQUENCE_LENGTH)

In [49]:
# setting vocabulary
preprocessing_layer.set_vocabulary(vocab)

In [64]:
# export model
export_model = tf.keras.Sequential([preprocessing_layer, model_1, tf.keras.layers.Activation('sigmoid')])

# compile the model
export_model.compile(optimizer='adam',
                     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
                     metrics=['accuracy'])

In [59]:
# dataset of raw strings
test_ds = text_lines_ds.take(VALIDATION_SIZE).batch(BATCH_SIZE)

In [67]:
# congif for performance
test_ds = config_for_performance(test_ds)

In [68]:
loss, accuracy = export_model.evaluate(test_ds)
print('Model loss', loss)
print('Model accuracy', accuracy)

Model loss 0.7100275158882141
Model accuracy 0.7265999913215637


## Inference on New data

In [69]:
inputs = [
    "Join'd to th' Ionians with their flowing robes,",  # Label: 1
    "the allies, and his armour flashed about him so that he seemed to all",  # Label: 2
    "And with loud clangor of his arms he fell.",  # Label: 0
]

In [71]:
# making predictions
predictions = export_model.predict(inputs)

In [77]:
for text, pred in zip(inputs, predictions):
  print('Text : ', text)
  print('Predicted label : ', tf.argmax(pred).numpy())

Text :  Join'd to th' Ionians with their flowing robes,
Predicted label :  1
Text :  the allies, and his armour flashed about him so that he seemed to all
Predicted label :  2
Text :  And with loud clangor of his arms he fell.
Predicted label :  0
