<a href="https://colab.research.google.com/github/Amanuel94/kaggle/blob/main/movie_review_tfds.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import pathlib
import requests
import zipfile
import tarfile
import random
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

In [2]:
DOWNLOAD_DIR = "./data"
DATA_URL = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"


Download the [Large Movie Review Dataset](https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz), which contains 50,000 movies reviews from the [Internet Movie Database](https://www.imdb.com/). The data is organized in two directories, train and test, each containing a pos subdirectory with 12,500 positive reviews and a neg subdirectory with 12,500 negative reviews. Each review is stored in a
separate text file.

In [3]:
def get_file(url, download_dir = DOWNLOAD_DIR):

  data_path = pathlib.Path(download_dir)
  if data_path.exists() and data_path.is_dir():
        shutil.rmtree(data_path)
        print(f'Removed existing directory: {data_path}')

  file_name = url.split('/')[-1]
  file_path = os.path.join(download_dir, file_name)
  os.makedirs(download_dir, exist_ok=True)
  response = requests.get(url)
  response.raise_for_status()
  with open(file_path, 'wb') as f:
    f.write(response.content)

  if file_path.endswith('.zip'):
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(download_dir)
            print(f'Extracted files to {download_dir}')
  elif file_path.endswith('.tar.gz') or file_path.endswith('.tgz'):
      with tarfile.open(file_path, 'r:gz') as tar_ref:
          tar_ref.extractall(download_dir)
          print(f'Extracted files to {download_dir}')

  train_path = f"{download_dir}/{file_name.split('_')[0]}/train/"
  test_path = f"{download_dir}/{file_name.split('_')[0]}/test/"

  return train_path, test_path

In [4]:
trainPath, testPath = get_file(DATA_URL, DOWNLOAD_DIR)

Removed existing directory: data
Extracted files to ./data


In [5]:
def count_instances(dir_path):
  dir_path = pathlib.Path(dir_path)
  file_count = sum(1 for item in dir_path.iterdir() if item.is_file())
  return file_count

print(f"Positive train instances: {count_instances(trainPath + 'pos/')}")
print(f"Negative train instances: {count_instances(trainPath + 'neg/')}")
print(f"Positive test instances: {count_instances(testPath + 'pos/')}")
print(f"Negative test instances: {count_instances(testPath + 'neg/')}")

Positive train instances: 12500
Negative train instances: 12500
Positive test instances: 12500
Negative test instances: 12500


Split the test set into a validation set (15,000) and a test set (10,000).


In [6]:
def copy_files(sources, dest_dir):

  dest_path = pathlib.Path(dest_dir)

  if dest_path.exists() and dest_path.is_dir():
        shutil.rmtree(dest_path)
        print(f'Removed existing directory: {dest_path}')

  dest_path.mkdir(parents = True, exist_ok = True)

  for file_path in sources:
    shutil.copy(file_path, dest_path / file_path.name)


def partition_files(file_path, n):
  filePath = pathlib.Path(file_path)
  all_files = [item for item in filePath.iterdir() if item.is_file()]
  random.shuffle(all_files)
  return all_files[:n], all_files[n:]



def val_test_split(dir_path, val_size):

  val_pos, test_pos = partition_files(dir_path + 'pos/', val_size//2)
  val_neg, test_neg = partition_files(dir_path + 'neg/', val_size//2)

  if dir_path[-1] == '/': dir_path = dir_path[:-1]
  root_path = dir_path[:dir_path.rindex('/')]

  val_pos_path = os.path.join(root_path, 'val', 'pos')
  val_neg_path = os.path.join(root_path, 'val', 'neg')

  test_pos_path = os.path.join(root_path, 'test_v2', 'pos')
  test_neg_path = os.path.join(root_path, 'test_v2', 'neg')

  copy_files(val_pos, val_pos_path)
  copy_files(test_pos, test_pos_path)
  copy_files(val_neg, val_neg_path)
  copy_files(test_neg, test_neg_path)

  return root_path + "/val/" , root_path + "/test_v2/"


In [7]:
valPath_, testPath_ = val_test_split(testPath, 15000)

Use `tf.data` to create an efficient dataset for each set.


In [8]:
train_filepath_ds = tf.data.Dataset.list_files(trainPath + "*/*", seed = 42)
val_filepath_ds = tf.data.Dataset.list_files(valPath_ + "*/*", seed = 42)
test_filepath_ds = tf.data.Dataset.list_files(testPath_ + "*/*", seed = 42)


In [9]:
def instance_label(file_path):
  label = 0 if tf.strings.split(file_path, sep = '/')[4] == 'neg' else 1
  return  tf.data.Dataset.from_tensors(label)

In [10]:
train_ds = train_filepath_ds.interleave(
    lambda x: tf.data.Dataset.zip((tf.data.TextLineDataset(x), instance_label(x))),
    cycle_length = 5,
    num_parallel_calls = tf.data.experimental.AUTOTUNE
  )

val_ds = val_filepath_ds.interleave(
    lambda x: tf.data.Dataset.zip((tf.data.TextLineDataset(x), instance_label(x))),
    cycle_length = 5,
    num_parallel_calls = tf.data.experimental.AUTOTUNE
  )

test_ds = test_filepath_ds.interleave(
    lambda x: tf.data.Dataset.zip((tf.data.TextLineDataset(x), instance_label(x))),
    cycle_length = 5,
    num_parallel_calls = tf.data.experimental.AUTOTUNE
  )

In [11]:
for text, label in train_ds.take(1):
  print(text)
  print(label)

tf.Tensor(b"John Rivers' life as an architect and family man has taken a turn for the worst when his wife has disappeared and has been concluded dead after a freakish accident that involved changing a tyre on her car. During the days she has been missing, he confronts a man that's been following and he tells him that his been in contact with his dead wife from the other-side through E.V.P - Electronic Voice Phenomenon. Naturally he doesn't believe it but then hear gets weird phone calls from her phone and so he contacts the man to find out more about E.V.P. Soon enough John is hooked onto it, but something supernatural doesn't like him interfering with the dead, as now other then contacting his wife, the white noise is foretelling events before they happen.<br /><br />Since this DVD has been sitting on my shelf for a while now, I thought I better get around to watching it since it wasn't my copy. But then again I don't think the owners were in a hurry to get it back, as they haven't qu

Create a binary classification model, using a `TextVectorization` layer to pre- process each review. If the `TextVectorization` layer is not yet available (or if you like a challenge), try to create your own custom preprocessing layer: you can use the functions in the `tf.strings` package, for example `lower()` to make everything lowercase, `regex_replace()` to replace punctuation with spaces, and `split()` to split words on spaces. You should use a lookup table to output word indices, which must be prepared in the `adapt()` method.

In [174]:
class CustomTextVectorization(tf.keras.Layer):

  def __init__(self,
               max_tokens = None,
               standardization = "lower_and_strip_punctuation",
               split = True,
               num_oov_buckets = 100,
               max_len = None,
               paddings = None,
               **kwargs):

    super(CustomTextVectorization, self).__init__(**kwargs)
    self.max_tokens = max_tokens
    self.standardization = standardization
    self.split = split
    self.num_oov_buckets = num_oov_buckets
    self.max_len = max_len
    self.paddings = paddings
    self.init = None
    self.table = None


  def standardize(self, text):
    text = tf.strings.split(text)
    if self.standardization == "lower_and_strip_punctuation":
      text = tf.map_fn(lambda x: tf.strings.lower(x), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'[\p{P}]', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'/<br\s*/?/>', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'[^a-z\s]', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'\s+', ' '), text)

    elif self.standardization == "strip_punctuation":
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'[\p{P}]', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'/<br\s*/?/>', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'[^a-z\s]', ''), text)
      text = tf.map_fn(lambda x: tf.strings.regex_replace(x, r'\s+', ' '), text)

    elif self.standardization == "lower":
      text = tf.map_fn(lambda x: tf.strings.lower(x), text)
    row_lengths = tf.shape(text)[0]
    paddings = max(self.max_len - row_lengths, 0)
    pad_and_slice = lambda x: tf.pad(x, [[0, paddings]], constant_values = "<pad>")[:self.max_len]
    text = tf.map_fn(pad_and_slice, text)
    return text


  def adapt(self, data):
    words = []
    for text, label in data.take(10):
      text = self.standardize(text)
      text = tf.reshape(text, [-1])
      for substring in text:
         words.append(substring)

    keys, _ = tf.unique(tf.reshape(words, [-1]))
    values = tf.constant(tf.cast(np.arange(tf.size(keys)), tf.int64))
    self.init = tf.lookup.KeyValueTensorInitializer(keys, values)
    self.table = tf.lookup.StaticVocabularyTable(self.init, self.num_oov_buckets)
    return self

  def call(self, X):
    print(tf.shape(X))
    # X = tf.reshape(X, [-1])
    X = self.standardize(X)
    # print(tf.shape(X))
    word_vector = self.table.lookup(X).to_tensor()
    # print(tf.shape(word_vector))
    return tf.reshape(word_vector, shape = (-1, self.max_len))

  def compute_output_shape(self, input_shape):
    return (input_shape[0], self.max_len)



In [175]:
l = CustomTextVectorization(max_len = 1000).adapt(train_ds.batch(1))
# l.table.lookup(tf.constant(["this", "is", "a", "test"]))
for a, b in train_ds.batch(32, drop_remainder=True).take(10):
  print("-"*10)
  (l(a))
  print()

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)

----------
tf.Tensor([32], shape=(1,), dtype=int32)



In [180]:
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Input(shape = (), dtype = tf.string),
        CustomTextVectorization(max_len = 100).adapt(train_ds.batch(1)),
        tf.keras.layers.Dense(200,  activation = "selu", kernel_initializer="lecun_normal"),
        tf.keras.layers.Dense(200,  activation = "selu", kernel_initializer="lecun_normal"),
        tf.keras.layers.Dense(100,  activation = "selu", kernel_initializer="lecun_normal"),
        tf.keras.layers.Dense(10,  activation = "selu", kernel_initializer="lecun_normal"),
        tf.keras.layers.Dense(2, activation = 'softmax')
    ]
)
model.summary()

In [181]:
model.compile(
    optimizer = "Nadam",
    loss = tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics = ["accuracy"],
    run_eagerly = True
)

In [None]:
n_batch = 32
n_epoch = 100
escb = tf.keras.callbacks.EarlyStopping(patience = 20)
batched_train_ds = train_ds.batch(n_batch, drop_remainder=True)
batched_val_ds = val_ds.batch(n_batch,  drop_remainder=True)

model.fit(batched_train_ds, validation_data = batched_val_ds, epochs = n_epoch, callbacks = [escb])

Epoch 1/100
tf.Tensor([32], shape=(1,), dtype=int32)
      1/Unknown [1m1s[0m 889ms/step - accuracy: 0.6562 - loss: 24.8340tf.Tensor([32], shape=(1,), dtype=int32)
      2/Unknown [1m1s[0m 313ms/step - accuracy: 0.6953 - loss: 26.3288tf.Tensor([32], shape=(1,), dtype=int32)
      3/Unknown [1m2s[0m 320ms/step - accuracy: 0.7066 - loss: 26.0174tf.Tensor([32], shape=(1,), dtype=int32)
      4/Unknown [1m2s[0m 311ms/step - accuracy: 0.7194 - loss: 25.7982tf.Tensor([32], shape=(1,), dtype=int32)
      5/Unknown [1m2s[0m 306ms/step - accuracy: 0.7130 - loss: 25.6637tf.Tensor([32], shape=(1,), dtype=int32)
      6/Unknown [1m2s[0m 309ms/step - accuracy: 0.7122 - loss: 25.2387tf.Tensor([32], shape=(1,), dtype=int32)
      7/Unknown [1m3s[0m 305ms/step - accuracy: 0.7125 - loss: 24.7389tf.Tensor([32], shape=(1,), dtype=int32)
      8/Unknown [1m3s[0m 304ms/step - accuracy: 0.7153 - loss: 24.1777tf.Tensor([32], shape=(1,), dtype=int32)
      9/Unknown [1m3s[0m 309ms/step - acc