# Sentence Classifier using 1D CNN

In [1]:
import matplotlib.pyplot as plt
import os
import re
import string
import shutil
import tensorflow as tf
from tensorflow.keras import layers, losses, callbacks, Sequential

2024-04-03 17:25:01.094655: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-03 17:25:01.097687: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-03 17:25:01.141596: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
dataset = tf.keras.utils.get_file(
    "aclImdb_v1" , url,
    untar=True, cache_dir='',
    cache_subdir=''
)
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

In [3]:
os.listdir(dataset_dir)

['test', 'imdbEr.txt', 'imdb.vocab', 'README', 'train']

In [4]:
train_dir = os.path.join(dataset_dir, 'train')
os.listdir(train_dir)

['neg',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat',
 'urls_neg.txt',
 'urls_pos.txt',
 'urls_unsup.txt',
 'unsup']

In [5]:
shutil.rmtree(os.path.join(train_dir, "unsup"))

In [6]:
batch_size = 32
seed = 42

raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed
)
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed
)
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    os.path.join(dataset_dir, 'test'),
    batch_size=batch_size,
)


Found 25000 files belonging to 2 classes.
Using 20000 files for training.


2024-04-03 17:25:19.027573: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-04-03 17:25:19.027962: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Found 25000 files belonging to 2 classes.
Using 5000 files for validation.
Found 25000 files belonging to 2 classes.


In [7]:
for x in iter(raw_test_ds):
    print(x)
    break

(<tf.Tensor: shape=(32,), dtype=string, numpy=
array([b'the acting is good.thats the positives out of the way! SOSN is shallow and superficial.Almost all the characters are middle class and English.The gay men are depicted as fickle sexual predators aiming to use children in their empty lives.This film could only appeal to people who know hampstead heath and would get minor satisfaction from pointing out any landmarks.There is no time to engage with the characters and has a result you really don,t care about them,Catherine Tate at the height of her comedic fame stars as a woman seeking a divorce from her husband and on screen for about the same time as her Nana sketches failed to convince,however if she had said "what a f****** liberty" i would have agreed <br /><br />I\'d rather take a walk in the Park;unintelligent rubbish!',
       b'This movie is of interest to the fans of the famous rock group "The Band" in that singer/ keyboardist Richard Manuel appears in several scenes. It look

In [8]:
def custom_standardization(input_data):
    l_case = tf.strings.lower(input_data)
    stp_html = tf.strings.regex_replace(
        l_case, '<br/>', ' '
    )
    return tf.strings.regex_replace(
        stp_html,
        f'[{re.escape(string.punctuation)}]',
        ''
    )

def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [9]:
max_features = 10000
seq_len = 250

vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=seq_len 
)

In [10]:
train_text = raw_train_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)


2024-04-03 17:25:24.231080: W tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [11]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = raw_train_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
val_ds = raw_val_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)
test_ds = raw_test_ds.map(vectorize_text).cache().prefetch(buffer_size=AUTOTUNE)


In [41]:
emb_dims = 128

model = Sequential([
    layers.Embedding(max_features+1, emb_dims),
    layers.Conv1D(16, 5, activation="relu"),
    layers.GlobalAveragePooling1D(),
    layers.Dropout(0.2),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy']
)

model.summary()

In [42]:
model.fit(
    train_ds, 
    validation_data=val_ds,
    epochs=10,
    callbacks=[
        callbacks.TensorBoard(log_dir="logs/1dcnn"),
    ]
)

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 30ms/step - accuracy: 0.6604 - loss: 0.5513 - val_accuracy: 0.8520 - val_loss: 0.3111
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.8873 - loss: 0.2762 - val_accuracy: 0.8650 - val_loss: 0.2906
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.9181 - loss: 0.2085 - val_accuracy: 0.8770 - val_loss: 0.3062
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.9415 - loss: 0.1511 - val_accuracy: 0.8746 - val_loss: 0.3758
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 28ms/step - accuracy: 0.9537 - loss: 0.1201 - val_accuracy: 0.8720 - val_loss: 0.3764
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 29ms/step - accuracy: 0.9620 - loss: 0.0967 - val_accuracy: 0.8670 - val_loss: 0.4255
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x7f4e839ee7a0>

In [14]:
%load_ext tensorboard
%tensorboard --logdir logs

In [43]:
model.evaluate(test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8389 - loss: 0.6776


[0.6867113709449768, 0.8408799767494202]

In [44]:
export_model = tf.keras.Sequential([
    vectorize_layer,
    model,
    layers.Activation('sigmoid')
])

export_model.compile(
    loss=losses.BinaryCrossentropy(from_logits=False),
    optimizer="adam",
    metrics=["accuracy"]
)


export_model.evaluate(raw_test_ds)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.8380 - loss: 0.6798


[0.6888637542724609, 0.8415200114250183]

In [55]:
export_model(tf.constant(['The movie is very boring', 'A Good Movie' , 'very bad worst movie',  'Worst movie, boring' ]))

<tf.Tensor: shape=(4, 1), dtype=float32, numpy=
array([[0.06500715],
       [0.5285808 ],
       [0.02967874],
       [0.11571622]], dtype=float32)>

In [52]:
export_model.save('sentence_classificatoin_model.keras')