<a href="https://colab.research.google.com/github/FurrukhJamal/ColabML/blob/main/chapter_13_q10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import sys
assert sys.version_info >= (3,7)

import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from packaging import version
assert version.parse(tf.__version__) >= version.parse('2.8.0')

In [None]:
from pathlib import Path

# Where the IMDb sentiment archive lives and what it is called.
fileName = "aclImdb_v1.tar.gz"
rootUrl = "https://ai.stanford.edu/~amaas/data/sentiment/"

# Download (cached under ./datasets because cache_dir='.') and unpack the
# archive, keeping whatever path get_file reports as a pathlib.Path.
filePath = Path(
  tf.keras.utils.get_file(fileName, rootUrl + fileName, extract=True, cache_dir='.')
)

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
[1m84125825/84125825[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [None]:
# Last path component of what get_file returned.
filePath.name

'aclImdb_v1_extracted'

In [None]:
# Point filePath at the extracted "aclImdb" folder.
# The guard makes this cell safe to re-run: without it every execution would
# append another "aclImdb" component to the path.
filePath = Path(filePath)
if filePath.name != "aclImdb":
  if filePath.is_dir():
    # get_file returned the extraction directory (as in the recorded output).
    filePath = filePath / "aclImdb"
  else:
    # Older Keras releases return the archive path and extract next to it.
    filePath = filePath.with_name("aclImdb")

In [None]:
# Rich repr of the dataset root path.
filePath

PosixPath('datasets/aclImdb_v1_extracted/aclImdb')

In [None]:
# Plain-text form of the dataset root path.
print(filePath)

datasets/aclImdb_v1_extracted/aclImdb


In [None]:
def tree(path, level = 0, indent = 4, max_files = 3):
  """Print the directory tree rooted at `path`.

  Sub-directories are listed (and descended into) before plain files; at most
  `max_files` files are shown per directory, followed by "..." if more exist.

  Args:
    path: pathlib.Path of the directory to print.
    level: current nesting depth; callers normally leave it at 0, in which
      case the root path itself is printed first.
    indent: number of spaces added per nesting level.
    max_files: maximum number of file names printed for each directory.
  """
  if level == 0:
    print(f"{path}/")
    level += 1
  sub_paths = sorted(path.iterdir())
  sub_dirs = [sub_path for sub_path in sub_paths if sub_path.is_dir()]
  file_paths = [sub_path for sub_path in sub_paths if not sub_path.is_dir()]
  indent_str = " " * indent * level
  for sub_dir in sub_dirs:
    print(f"{indent_str}{sub_dir.name}/")
    # Forward indent/max_files so nested levels honour the caller's settings
    # instead of silently falling back to the defaults.
    tree(sub_dir, level + 1, indent, max_files)
  for file_path in file_paths[:max_files]:
    print(f"{indent_str}{file_path.name}")
  if len(file_paths) > max_files:
    print(f"{indent_str}...")


In [None]:
# Print the layout of the extracted dataset (default: 3 files per directory).
tree(filePath)

datasets/aclImdb_v1_extracted/aclImdb/
    test/
        neg/
            0_2.txt
            10000_4.txt
            10001_1.txt
            ...
        pos/
            0_10.txt
            10000_7.txt
            10001_9.txt
            ...
        labeledBow.feat
        urls_neg.txt
        urls_pos.txt
    train/
        neg/
            0_3.txt
            10000_4.txt
            10001_4.txt
            ...
        pos/
            0_9.txt
            10000_8.txt
            10001_10.txt
            ...
        unsup/
            0_0.txt
            10000_0.txt
            10001_0.txt
            ...
        labeledBow.feat
        unsupBow.feat
        urls_neg.txt
        ...
    README
    imdb.vocab
    imdbEr.txt


In [None]:
def reviewPaths(paths):
  """Return the `*.txt` files directly inside directory `paths` as strings.

  The result is sorted because glob order depends on the filesystem; a stable
  order is needed for any later seeded shuffle or split to be reproducible.

  Args:
    paths: pathlib.Path of the directory to scan (not recursive).

  Returns:
    A sorted list of file paths as `str`.
  """
  return sorted(str(path) for path in paths.glob("*.txt"))

# Collect the review files for each split, positives first, then negatives.
trainPositive, trainNegative = (
  reviewPaths(filePath / "train" / sentiment) for sentiment in ("pos", "neg")
)
testPositive, testNegative = (
  reviewPaths(filePath / "test" / sentiment) for sentiment in ("pos", "neg")
)

# Sanity check: number of reviews found in each of the four folders.
tuple(
  len(paths)
  for paths in (trainPositive, trainNegative, testPositive, testNegative)
)

(12500, 12500, 12500, 12500)

In [None]:
# First five positive training review paths.
trainPositive[:5]

['datasets/aclImdb_v1_extracted/aclImdb/train/pos/10642_8.txt',
 'datasets/aclImdb_v1_extracted/aclImdb/train/pos/2177_8.txt',
 'datasets/aclImdb_v1_extracted/aclImdb/train/pos/11353_9.txt',
 'datasets/aclImdb_v1_extracted/aclImdb/train/pos/1162_9.txt',
 'datasets/aclImdb_v1_extracted/aclImdb/train/pos/4181_9.txt']

In [None]:
# Split the original test folders into a validation set and a smaller test set.
# A dedicated seeded generator makes the split identical on every run.
split_rng = np.random.default_rng(42)

# Shuffle sorted copies rather than the original lists: the input order is
# then fixed and re-running this cell cannot reshuffle already-shuffled data.
shuffledPositive = sorted(testPositive)
shuffledNegative = sorted(testNegative)
split_rng.shuffle(shuffledPositive)
split_rng.shuffle(shuffledNegative)

# Number of reviews per class that go to validation; the rest stay in test.
validationPerClass = 7500
val_pos = shuffledPositive[:validationPerClass]
val_neg = shuffledNegative[:validationPerClass]
test_pos = shuffledPositive[validationPerClass:]
test_neg = shuffledNegative[validationPerClass:]

print(f"validation size : {len(val_pos) + len(val_neg)}")
print(f"test size : {len(test_pos) + len(test_neg)}")

validation size : 15000
test size : 10000


In [None]:
def imdb_datasets(filePathsPositive, filePathsNegative):
  """Load every review into memory and wrap them in a tf.data.Dataset.

  Args:
    filePathsPositive: iterable of paths to positive reviews (label 1).
    filePathsNegative: iterable of paths to negative reviews (label 0).

  Returns:
    A dataset of (review, label) pairs, all positives followed by all
    negatives, in the order the paths were given.
  """
  reviews = []
  labels = []
  for filePaths, label in ((filePathsPositive, 1), (filePathsNegative, 0)):
    for filePath in filePaths:
      # Decode explicitly as UTF-8 so the result does not depend on the
      # platform's default text encoding.
      with open(filePath, encoding="utf-8") as f:
        reviews.append(f.read())
      labels.append(label)
  return tf.data.Dataset.from_tensor_slices((tf.constant(reviews), tf.constant(labels)))

In [None]:
# Peek at the first three (review, label) pairs of the in-memory training dataset.
for X, y in imdb_datasets(trainPositive, trainNegative).take(3):
  print(X)
  print(y)
  print()

tf.Tensor(b"I just viewed Detention last night and i liked what i saw. It was a cool fun movie.Dolph looked superbly cool on the Bike.He also looked good in this movie as compared to his other recent movies.He is now in a pretty good shape.The story was ok and the other actors were also passable.I wouldn't call this movie his best but its still a good movie.<br /><br />But it also had its share of Problems. The first one was the way bullets were flying everywhere and even when they were being fired at point blank range they missed the target.They should've had shown the ppl escaping the bullets in a better way. Another problem which i had was the way the students were swearing. I dont know in which school the students can swear in front of their teacher and even in the classroom. The third problem was that the bad guys were very few in numbers. There should've been more bad guys. Last problem was definately the fact that the set looked cheesy , but that was due to the small budget. Ove

Timing how long it takes to build the in-memory dataset and iterate over it 10 times.

In [None]:
# Single timing run (-r1); the timed statement includes imdb_datasets() reading every file.
%timeit -r1 for X, y in imdb_datasets(trainPositive, trainNegative).repeat(10): pass

1min 15s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


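Creating a dataset that reads the reviews from disk with `TextLineDataset` instead of loading them all into memory first.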

In [None]:
def imdb_dataset2(filepathsPositive, filePathsNegative, n_thread = 5):
  """Build a (text line, label) dataset that reads the review files lazily.

  Args:
    filepathsPositive: paths of the positive reviews; their lines get label 1.
    filePathsNegative: paths of the negative reviews; their lines get label 0.
    n_thread: value passed as `num_parallel_reads` to each TextLineDataset.

  Returns:
    The positive dataset concatenated with the negative dataset.
  """
  def labeled_lines(file_paths, label):
    # One element per text line; assumes each review file holds a single
    # line — TODO confirm.
    lines = tf.data.TextLineDataset(file_paths, num_parallel_reads=n_thread)
    return lines.map(lambda line: (line, label))

  positives = labeled_lines(filepathsPositive, 1)
  negatives = labeled_lines(filePathsNegative, 0)
  return positives.concatenate(negatives)

In [None]:
# Same single-run measurement for the TextLineDataset-based pipeline.
%timeit -r1 for X, y in imdb_dataset2(trainPositive, trainNegative).repeat(10): pass

1min 49s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
# With cache() before repeat(10), so passes after the first can be served from the cache.
%timeit -r1 for X, y in imdb_dataset2(trainPositive, trainNegative).cache().repeat(10): pass

1min 29s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [None]:
batchSize = 32

# Training pipeline: shuffle with a buffer of 25000 elements and a fixed seed,
# then batch and prefetch one batch ahead.
train_dataset = (
  imdb_dataset2(trainPositive, trainNegative)
  .shuffle(25000, seed = 42)
  .batch(batchSize)
  .prefetch(1)
)

# Validation and test pipelines keep their order; they are only batched and prefetched.
valid_dataset = (
  imdb_dataset2(val_pos, val_neg)
  .batch(batchSize)
  .prefetch(1)
)
test_dataset = (
  imdb_dataset2(test_pos, test_neg)
  .batch(batchSize)
  .prefetch(1)
)

In [None]:
# Vectorization layer that turns raw review strings into TF-IDF weighted vectors.
# NOTE(review): no max_tokens is passed, so the vocabulary size is not capped
# here — confirm the resulting input width is intended.
text_vec_layer = tf.keras.layers.TextVectorization(output_mode = "tf_idf")
# adapt() only needs the text, so the labels are dropped first.
text_vec_layer.adapt(train_dataset.map(lambda x, y : x))

In [None]:
# First ten vocabulary entries learned by adapt().
text_vec_layer.get_vocabulary()[:10]

['[UNK]',
 np.str_('the'),
 np.str_('and'),
 np.str_('a'),
 np.str_('of'),
 np.str_('to'),
 np.str_('is'),
 np.str_('in'),
 np.str_('it'),
 np.str_('i')]

In [None]:
# Fix TensorFlow's global seed before building and training the model.
tf.random.set_seed(42)

# TF-IDF features -> one hidden layer -> sigmoid for the binary sentiment label.
model = tf.keras.Sequential([
    text_vec_layer,
    tf.keras.layers.Dense(100, activation = "relu"),
    tf.keras.layers.Dense(1, activation = "sigmoid")
])

model.compile(loss = "binary_crossentropy", optimizer = "nadam", metrics = ["accuracy"])

# The recorded run shows val_loss rising from the first epoch on while the
# training loss keeps falling. Stop once val_loss stalls and keep the weights
# of the best epoch instead of the last, most overfit, one.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = "val_loss", patience = 2, restore_best_weights = True)

# Keep the History object so the learning curves can be inspected afterwards.
history = model.fit(train_dataset, epochs = 10, validation_data = valid_dataset,
                    callbacks = [early_stopping])

Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 310ms/step - accuracy: 0.8413 - loss: 0.3835 - val_accuracy: 0.8714 - val_loss: 0.3442
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 310ms/step - accuracy: 0.9809 - loss: 0.0557 - val_accuracy: 0.8547 - val_loss: 0.5088
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m243s[0m 309ms/step - accuracy: 0.9978 - loss: 0.0093 - val_accuracy: 0.8528 - val_loss: 0.6873
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m228s[0m 289ms/step - accuracy: 0.9997 - loss: 0.0032 - val_accuracy: 0.8527 - val_loss: 0.8040
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m268s[0m 296ms/step - accuracy: 1.0000 - loss: 5.6279e-04 - val_accuracy: 0.8523 - val_loss: 0.8925
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 284ms/step - accuracy: 1.0000 - loss: 2.4564e-04 - val_accuracy: 0.8515 - val_loss: 0.9

<keras.src.callbacks.history.History at 0x79cab712a150>