In [10]:
import io
import os
import re
import shutil
import string

import tensorflow
import tensorflow as tf

In [51]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, GlobalAveragePooling1D, Embedding
from tensorflow.keras.layers import TextVectorization

In [12]:
# Fetch the IMDB review archive into the current working directory and unpack it.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    fname="aclImdb_v1.tar.gz",
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)

# The extracted `aclImdb` folder is looked up beside the path get_file returned.
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')
os.listdir(dataset_dir)

['imdb.vocab', 'imdbEr.txt', 'README', 'test', 'train']

In [15]:
# Training split of the extracted dataset; list it to see which folders exist.
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsup',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [19]:
# Drop the `unsup` folder so that only `neg` and `pos` remain as class
# directories under train_dir.
remove_dir = os.path.join(train_dir, 'unsup')
# Guarded so the cell can be re-run: a bare rmtree raises FileNotFoundError
# once the folder has already been deleted.
if os.path.isdir(remove_dir):
    shutil.rmtree(remove_dir)

In [20]:
batch_size = 1024
seed = 123

# Both subsets must be built with the same directory, seed and
# validation_split so the training/validation partition is consistent.
split_kwargs = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Read from train_dir (the folder that had `unsup` removed) instead of a
# hard-coded relative path, so the location is defined in one place only.
train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='training', **split_kwargs)
validation_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir, subset='validation', **split_kwargs)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.
Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [21]:
# Re-list the training folder to confirm that `unsup` is no longer present.
os.listdir(train_dir)

['labeledBow.feat',
 'neg',
 'pos',
 'unsupBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt']

In [24]:
# Peek at up to five labelled reviews from the first training batch.
for text_batch, labels in train_ds.take(1):
    # Convert each tensor to NumPy once instead of on every loop iteration.
    texts = text_batch.numpy()
    label_values = labels.numpy()
    # Slicing (rather than indexing 0..4) also tolerates a batch with < 5 items.
    for label, text in zip(label_values[:5], texts[:5]):
        print(label, text)

0 b"Wow. Some movies just leave me speechless. This was undeniably one of those movies. When I left the theatre, not a single word came to my mouth. All I had was an incredible urge to slam my head against the theatre wall to help me forget about the last hour and a half. Unfortunately, it didn't work. Honestly, this movie has nothing to recommend. The humor was at the first grade level, at best, the acting was overly silly, and the plot was astronomically far-fetched. I hearby pledge never to see an other movie starring Chris Kattan or any other cast-member of SNL."
1 b'If any show in the last ten years deserves a 10, it is this rare gem. It allows us to escape back to a time when things were simpler and more fun. Filled with heart and laughs, this show keeps you laughing through the three decades of difference. The furniture was ugly, the clothes were colorful, and the even the drugs were tolerable. The hair was feathered, the music was accompanied by roller-skates, and in the words 

In [26]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Cache each split and overlap input loading with training via prefetch.
train_ds = (
    train_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)
val_ds = (
    validation_ds
    .cache()
    .prefetch(buffer_size=AUTOTUNE)
)

In [52]:
# embedding_layer=tf.keras.layers.Embedding(1000,5)

In [68]:
# Hyper-parameters shared by the vectorization layer and the model.
embedding_dim = 16        # width of each learned word vector
vocab_size = 10_000       # passed as max_tokens and as the Embedding input size
sequence_length = 100     # every review is padded/truncated to this many tokens

In [73]:
def custom_vectorization(input_data):
    """Standardize raw review text before it is tokenized.

    Lowercases the input, replaces each literal ``<br />`` tag with a single
    space, and deletes every character listed in ``string.punctuation``.

    Args:
        input_data: String tensor of raw text (this function is passed as the
            ``standardize`` callable of ``TextVectorization``).

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single ASCII punctuation character.
    punctuation_class = '[%s]' % re.escape(string.punctuation)

    text = tf.strings.lower(input_data)
    text = tf.strings.regex_replace(text, '<br />', ' ')
    return tf.strings.regex_replace(text, punctuation_class, '')
    

In [74]:
# Turn each review into a fixed-length sequence of integer token ids, using
# the custom standardization defined above instead of the default one.
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
    output_mode='int',
    standardize=custom_vectorization,
)


In [75]:
# Text-only view of the *training* data (labels dropped); despite its name,
# `test_ds` is derived from train_ds. It is used to build the vocabulary.
test_ds = train_ds.map(lambda text, label: text)
vectorize_layer.adapt(test_ds)

In [91]:

# lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', oov_token='<OOV>')
# lang_tokenizer.fit_on_texts(train_ds)
# tensor = lang_tokenizer.texts_to_sequences(lang) 


# tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

In [92]:
# Assemble the classifier layer by layer:
# raw strings -> token ids -> embeddings -> mean over tokens -> dense head.
model = tf.keras.Sequential()
model.add(vectorize_layer)
# mask_zero=True marks id 0 as masked for downstream layers that support masking.
model.add(Embedding(vocab_size, embedding_dim, name='embedding', mask_zero=True))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
# No activation on the output: the model emits a single raw logit per review.
model.add(Dense(1))

In [93]:
# The loss is configured with from_logits=True, i.e. the model output is a raw
# logit whose decision boundary is 0.0. The plain 'accuracy' string resolves to
# binary accuracy with its default 0.5 threshold, which would count logits in
# (0, 0.5] as the negative class. Use an explicit threshold of 0.0 instead and
# keep the metric name 'accuracy' so the history keys are unchanged.
model.compile(
    optimizer='Adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

In [94]:
# Validate on `val_ds`, the cached and prefetched view of the validation split,
# rather than the raw `validation_ds`, which would re-read the files from disk
# on each of the 15 epochs.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=15,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [96]:
# First (and only) weight array of the layer named 'embedding': the learned
# embedding matrix.
weights = (
    model
    .get_layer(name='embedding')
    .get_weights()[0]
)

In [100]:
# Vocabulary learned by the vectorization layer during adapt().
vocab_layer = vectorize_layer.get_vocabulary()

In [102]:
# vocab_layer