<a href="https://colab.research.google.com/github/Anjasfedo/Learning-TensorFlow/blob/main/eat_tensorflow2_in_30_days/Chapter1_3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1-3 Example: Modeling Procedure for Texts

## 1. Data Preparation

The goal of the IMDB dataset task is to predict the sentiment label of a movie review from its text.

There are 20,000 text reviews in the training dataset and 5,000 in the test dataset; in each, half of the reviews are positive and half are negative.

Pre-processing a text dataset is fairly complex. It includes word segmentation (needed for Chinese only, so not relevant to this demo), dictionary construction, encoding, sequence padding, data pipeline construction, etc.

There are two popular methods of text preparation in TensorFlow:
1. Construct a text data generator using `Tokenizer` in `tf.keras.preprocessing`, together with `tf.keras.utils.Sequence`.
2. Use `tf.data.Dataset`, together with the pre-processing layer `tf.keras.layers.experimental.preprocessing.TextVectorization`.

This notebook demonstrates the second method.

In [41]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as py
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing, optimizers, losses, metrics
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
import re, string, os

In [42]:
# Remote folder that hosts the IMDB train/test CSV files.
base_url = "https://raw.githubusercontent.com/lyhue1991/eat_tensorflow2_in_30_days/master/data/imdb/"
train_filename, test_filename = "train.csv", "test.csv"

train_url = "".join((base_url, train_filename))
test_url = "".join((base_url, test_filename))

# Both files are cached under ./data, so the same cache options are shared.
cache_options = dict(cache_dir='.', cache_subdir='data')
train_data_path = tf.keras.utils.get_file(train_filename, origin=train_url, **cache_options)
test_data_path = tf.keras.utils.get_file(test_filename, origin=test_url, **cache_options)

# Report where each file landed.
print(f"Train data downloaded to: {train_data_path}")
print(f"Test data downloaded to: {test_data_path}")

# Sanity check: each file exists, and its size in KB.
print(f"Train data exists: {os.path.exists(train_data_path)}, Size: {os.path.getsize(train_data_path) / 1024:.2f} KB")
print(f"Test data exists: {os.path.exists(test_data_path)}, Size: {os.path.getsize(test_data_path) / 1024:.2f} KB")

Downloading data from https://raw.githubusercontent.com/lyhue1991/eat_tensorflow2_in_30_days/master/data/imdb/train.csv
Downloading data from https://raw.githubusercontent.com/lyhue1991/eat_tensorflow2_in_30_days/master/data/imdb/test.csv
Train data downloaded to: ./data/train.csv
Test data downloaded to: ./data/test.csv
Train data exists: True, Size: 26058.23 KB
Test data exists: True, Size: 6482.65 KB


In [57]:
# Vocabulary size: only the 10000 most frequent words are considered.
MAX_WORDS = 10_000
# Sequence length: only the first 200 words of each sample are preserved.
MAX_LEN = 200
# Batch size passed to the dataset pipelines.
BATCH_SIZE = 32

In [51]:
# Construct data pipeline
def split_line(line):
  """Parse one tab-separated line into a `(text, label)` pair.

  Args:
    line: string tensor holding one line of the file; the field before the
      first tab is the numeric label and the next field is the review text.

  Returns:
    A tuple `(text, label)`. Each element has a new leading axis added;
    `label` is cast to `tf.int32`.
  """
  fields = tf.strings.split(line, sep='\t')
  # Field 0 is parsed as a number first, then cast to an integer label.
  numeric_label = tf.strings.to_number(fields[0])
  label = tf.expand_dims(tf.cast(numeric_label, tf.int32), axis=0)
  text = tf.expand_dims(fields[1], axis=0)
  return text, label

In [52]:
# Let tf.data choose the parallelism / prefetch buffer sizes.
autotune = tf.data.experimental.AUTOTUNE

# Training pipeline: parse each line, shuffle, batch, prefetch.
ds_train_raw = (
    tf.data.TextLineDataset(filenames=[train_data_path])
    .map(split_line, num_parallel_calls=autotune)
    .shuffle(buffer_size=10000)
    .batch(BATCH_SIZE)
    .prefetch(autotune)
)

# Test pipeline: same parsing and batching, but no shuffling.
ds_test_raw = (
    tf.data.TextLineDataset(filenames=[test_data_path])
    .map(split_line, num_parallel_calls=autotune)
    .batch(BATCH_SIZE)
    .prefetch(autotune)
)

In [53]:
# Construct dictionary
def clean_text(text):
  """Standardize raw review text before it is split into tokens.

  Applies, in order: lowercasing, replacement of every `<br />` tag with a
  single space, and removal of every character in `string.punctuation`.

  Args:
    text: string tensor of raw review text.

  Returns:
    String tensor with the cleaned text.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)

  text = tf.strings.lower(text)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [54]:
# Text-to-integer layer: cleans each review with `clean_text`, splits it on
# whitespace, and maps each word to an integer id from a vocabulary of at
# most MAX_WORDS entries.
vectorize_layer = TextVectorization(
    standardize=clean_text,
    split='whitespace',
    max_tokens=MAX_WORDS,
    output_mode='int',
    # Must be MAX_LEN (the constant defined above); the previous spelling
    # `MAX_len` is undefined and raises NameError on a fresh kernel.
    output_sequence_length=MAX_LEN
)

# Build the vocabulary from the training texts only (labels are dropped).
ds_text = ds_train_raw.map(lambda text, label: text)
vectorize_layer.adapt(ds_text)
print(vectorize_layer.get_vocabulary()[0:100])

['', '[UNK]', 'the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this', 'that', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'on', 'not', 'you', 'his', 'are', 'have', 'be', 'he', 'one', 'its', 'at', 'all', 'by', 'an', 'they', 'from', 'who', 'so', 'like', 'her', 'just', 'or', 'about', 'has', 'if', 'out', 'some', 'there', 'what', 'good', 'more', 'when', 'very', 'she', 'even', 'my', 'no', 'would', 'up', 'time', 'only', 'which', 'story', 'really', 'their', 'were', 'had', 'see', 'can', 'me', 'than', 'we', 'much', 'well', 'get', 'been', 'will', 'into', 'people', 'also', 'other', 'do', 'bad', 'because', 'great', 'first', 'how', 'him', 'most', 'dont', 'made', 'then', 'them', 'films', 'movies', 'way', 'make', 'could', 'too', 'any']


In [56]:
# Word encoding
def _encode_batch(text, label):
  """Swap the raw text for the output of `vectorize_layer`; keep the label."""
  return vectorize_layer(text), label

ds_train = (
    ds_train_raw
    .map(_encode_batch)
    .prefetch(tf.data.experimental.AUTOTUNE)
)
ds_test = (
    ds_test_raw
    .map(_encode_batch)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## 2. Model Definition

Here we define a custom model by subclassing the base class `Model`.

In [58]:
# Note: in practice, building the model with Sequential() or the functional
# API should be prioritized; subclassing is shown here for demonstration.

tf.keras.backend.clear_session()

class CnnModel(models.Model):
  """Text classifier: embedding, two Conv1D/MaxPool1D stages, sigmoid output.

  The sub-layers are created in `build()`, so `build()` must be called
  (directly or by Keras) before `call()` is used.
  """

  def __init__(self):
    super().__init__()

  def build(self, input_shape):
    """Create the sub-layers and mark the model as built.

    Args:
      input_shape: shape of the model input, e.g. `(None, MAX_LEN)`.
    """
    self.embedding = layers.Embedding(MAX_WORDS, 7, input_length=MAX_LEN)
    self.conv_1 = layers.Conv1D(16, kernel_size=5, name='conv_1', activation='relu')
    self.pool_1 = layers.MaxPool1D(name='pool_1')
    self.conv_2 = layers.Conv1D(128, kernel_size=2, name='conv_2', activation='relu')
    self.pool_2 = layers.MaxPool1D(name='pool_2')
    self.flatten = layers.Flatten()
    self.dense = layers.Dense(1, activation='sigmoid')
    # Delegate to the base class so the model is flagged as built. Without
    # this, Keras may invoke build() again on the first real call and
    # replace every sub-layer created above with a fresh one.
    super().build(input_shape)

  def call(self, inputs):
    """Run the forward pass.

    Args:
      inputs: tensor of integer word ids.

    Returns:
      Output of the final sigmoid `Dense(1)` layer.
    """
    x = self.embedding(inputs)
    x = self.conv_1(x)
    x = self.pool_1(x)
    x = self.conv_2(x)
    x = self.pool_2(x)
    x = self.flatten(x)
    x = self.dense(x)
    return x

  def summary(self, *args, **kwargs):
    """Print a per-layer summary that includes output shapes.

    Wraps `call()` in a temporary functional model fed by an
    `Input(shape=(MAX_LEN,))` and prints that model's summary. Any arguments
    are forwarded to the wrapped model's `summary()`, so the method stays
    call-compatible with `Model.summary`.
    """
    x_input = layers.Input(shape=(MAX_LEN,), dtype=tf.int32)
    output = self.call(x_input)
    model = models.Model(inputs=x_input, outputs=output)
    return model.summary(*args, **kwargs)

model = CnnModel()
model.build(input_shape=(None, MAX_LEN))
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 200)]             0         
                                                                 
 embedding (Embedding)       (None, 200, 7)            70000     
                                                                 
 conv_1 (Conv1D)             (None, 196, 16)           576       
                                                                 
 pool_1 (MaxPooling1D)       (None, 98, 16)            0         
                                                                 
 conv_2 (Conv1D)             (None, 97, 128)           4224      
                                                                 
 pool_2 (MaxPooling1D)       (None, 48, 128)           0         
                                                                 
 flatten (Flatten)           (None, 6144)              0     