In [1]:
import pandas as pd

# Load the training split from disk and preview its first rows.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_train.head()

Unnamed: 0,text,target
0,LOLLL IT IS OFFICIALLY DAYLIGHT. Obviously NOW...,0
1,@Ange1isa Robert Pattinson was hit by a cab....,0
2,They killed CORNBREAD!!!! http://bit.ly/3PCl7R,0
3,@reemakoul why is twitterfox not working! i wr...,0
4,@verabeltran hi.you're still talking to me? ...,0


In [2]:
# Load the validation split from disk and preview its first rows.
df_val = pd.read_csv(filepath_or_buffer='validation.csv')
df_val.head()

Unnamed: 0,text,target
0,@rubyredtees You know you will ....... This...,1
1,@actionsmotives Besides proving your natural h...,1
2,@corkyloowho me too,0
3,@samcarew it's still on! On my way to Berlin f...,1
4,I soooooo don't wanna go to work today.,0


In [3]:
import tensorflow as tf
from tensorflow import keras

# Training pipeline: (text, target) pairs, shuffled, batched by 32, with one batch prefetched.
# The shuffle buffer spans the whole training frame: tf.data only shuffles
# uniformly when the buffer is at least as large as the dataset, and a fixed
# 100 000-element buffer is smaller than this training set, so it would only
# mix examples locally. Trade-off: the buffer now holds every example in memory.
train_set = (
    tf.data.Dataset.from_tensor_slices((df_train.text, df_train.target))
    .shuffle(buffer_size=max(len(df_train), 1))
    .batch(32)
    .prefetch(1)
)

# Validation pipeline: same batching, but no shuffling so evaluation order is stable.
val_set = (
    tf.data.Dataset.from_tensor_slices((df_val.text, df_val.target))
    .batch(32)
    .prefetch(1)
)

2021-09-16 14:53:34.021635: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2021-09-16 14:54:09.785178: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2021-09-16 14:54:09.792415: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2021-09-16 14:54:09.881397: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-09-16 14:54:09.881980: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:01:00.0 name: NVIDIA GeForce GTX 1050 Ti computeCapability: 6.1
coreClock: 1.62GHz coreCount: 6 deviceMemorySize: 3.95GiB deviceMemoryBandwidth: 104.43GiB/s
2021-09-16 14:54:09.882018: I tensorflow/stream_executor/platform/d

In [4]:
from collections import Counter

class TextVectorization(keras.layers.Layer):
    """Keras layer that turns raw strings into fixed-length sequences of token ids.

    Text is lower-cased, every character outside ``a-z`` is replaced by a
    space, and the result is split on whitespace. Each row is then padded with
    ``<pad>`` or truncated to ``output_sequence_length`` tokens and looked up
    in the vocabulary table built by :meth:`adapt`.

    Id layout produced by the table: ``0`` is ``<pad>``, the following ids are
    the most frequent tokens in descending frequency order, and tokens outside
    the vocabulary are hashed into ``n_oov_buckets`` ids placed after it.

    Args:
        output_sequence_length: Number of tokens per output row.
        max_vocab_size: Maximum number of tokens kept in the vocabulary
            (not counting ``<pad>``).
        n_oov_buckets: Number of hash buckets for out-of-vocabulary tokens.
        dtype: Layer dtype; defaults to ``tf.string`` because inputs are text.
        **kwargs: Forwarded to ``keras.layers.Layer``.
    """

    # Padding token; it can never collide with a real token because
    # `_tokenize` strips every character outside a-z.
    _PAD_TOKEN = b'<pad>'

    def __init__(self, output_sequence_length=64, max_vocab_size=50000, n_oov_buckets=1000, dtype=tf.string, **kwargs):
        super().__init__(dtype=dtype, **kwargs)
        self.table = None  # populated by adapt()
        self.output_sequence_length = output_sequence_length
        self.max_vocab_size = max_vocab_size
        self.n_oov_buckets = n_oov_buckets

    def _tokenize(self, input_data):
        """Lower-case, strip non-letters and split on whitespace; returns ragged tokens."""
        data = tf.strings.lower(input_data)
        data = tf.strings.regex_replace(data, '[^a-z]', ' ')
        return tf.strings.split(data)

    def _preprocess(self, input_data):
        """Tokenize and pad/truncate every row to ``output_sequence_length`` tokens."""
        tokens = self._tokenize(input_data)
        return tokens.to_tensor(default_value=self._PAD_TOKEN,
                                shape=(None, self.output_sequence_length))

    def _get_vocab(self, data_sample):
        """Return ``[<pad>]`` followed by the ``max_vocab_size`` most common tokens.

        Tokens are counted on the ragged tensor, truncated to the sequence
        length the layer will actually emit, rather than on a dense padded
        copy of the whole sample. The counts are the same, but no padding
        cells are materialised for a large corpus.
        """
        tokens = self._tokenize(data_sample)
        tokens = tokens[:, :self.output_sequence_length]
        counter = Counter(tokens.flat_values.numpy())
        counter.pop(self._PAD_TOKEN, None)

        return [self._PAD_TOKEN] + [token for token, _ in counter.most_common(self.max_vocab_size)]

    def adapt(self, data_sample):
        """Build the lookup table from ``data_sample`` and return ``self`` for chaining."""
        vocab = self._get_vocab(data_sample)
        indices = tf.range(len(vocab), dtype=tf.int64)
        table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
        self.table = tf.lookup.StaticVocabularyTable(table_init, self.n_oov_buckets)

        return self

    def call(self, input_data):
        """Map a batch of strings to a ``(batch, output_sequence_length)`` tensor of ids.

        Raises:
            RuntimeError: If :meth:`adapt` has not been called yet.
        """
        if self.table is None:
            raise RuntimeError('TextVectorization.adapt() must be called before the layer is used.')

        return self.table.lookup(self._preprocess(input_data))

    def compute_output_shape(self, batch_input_shape):
        """Append the sequence-length axis to the input shape."""
        # Wrapping in TensorShape accepts tuples/lists as well as TensorShape objects.
        input_dims = tf.TensorShape(batch_input_shape).as_list()
        return tf.TensorShape(input_dims + [self.output_sequence_length])

    def get_config(self):
        """Return the constructor arguments; the fitted lookup table is not included."""
        base_config = super().get_config()
        return {**base_config,
                'output_sequence_length': self.output_sequence_length,
                'max_vocab_size': self.max_vocab_size,
                'n_oov_buckets': self.n_oov_buckets
               }

In [5]:
# Disabled GRU variant of the model built in the next cell (kept for reference only).
# max_vocab_size = 10000
# oov_buckets = 500

# text_vectorizer = TextVectorization(input_shape=(), max_vocab_size=max_vocab_size,
#                                     n_oov_buckets=oov_buckets).adapt(df_train.text)
# model = keras.models.Sequential([
#     text_vectorizer,
#     keras.layers.Embedding(input_dim=max_vocab_size+oov_buckets+1, output_dim=200, mask_zero=True),
#     keras.layers.Bidirectional(keras.layers.GRU(units=100, return_sequences=True)),
#     keras.layers.Bidirectional(keras.layers.GRU(units=100, return_sequences=True, dropout=0.2)),
#     keras.layers.GRU(units=200, dropout=0.2),
#     keras.layers.Dropout(0.2),
#     keras.layers.Dense(1, activation='sigmoid')
# ])

In [6]:
max_vocab_size = 10000
oov_buckets = 500

# Fit the vectorizer's vocabulary on the training text; adapt() returns the layer itself.
text_vectorizer = TextVectorization(
    input_shape=(),
    max_vocab_size=max_vocab_size,
    n_oov_buckets=oov_buckets,
)
text_vectorizer = text_vectorizer.adapt(df_train.text)

# Embedding rows: vocabulary tokens + OOV buckets + 1 for the padding id 0.
embedding_input_dim = max_vocab_size + oov_buckets + 1

# Two stacked bidirectional LSTMs feed a final LSTM and a sigmoid output unit.
model_layers = [
    text_vectorizer,
    keras.layers.Embedding(input_dim=embedding_input_dim, output_dim=200, mask_zero=True),
    keras.layers.Bidirectional(keras.layers.LSTM(units=100, return_sequences=True)),
    keras.layers.Bidirectional(keras.layers.LSTM(units=100, return_sequences=True, dropout=0.2)),
    keras.layers.LSTM(units=200, dropout=0.2),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(1, activation='sigmoid'),
]
model = keras.models.Sequential(model_layers)

2021-09-16 14:54:21.645814: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 520805352 exceeds 10% of free system memory.
2021-09-16 14:54:23.197324: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 2426880000 exceeds 10% of free system memory.


In [7]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 64)                0         
_________________________________________________________________
embedding (Embedding)        (None, 64, 200)           2100200   
_________________________________________________________________
bidirectional (Bidirectional (None, 64, 200)           240800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 64, 200)           240800    
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout (Dropout)            (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 2

In [8]:
import os

# Root directory under which every training run gets its own log sub-directory.
root_logdir = os.path.join(os.curdir, "logs")


def get_run_logdir(suffix="LSTM"):
    """Return a fresh, timestamped log directory path under ``root_logdir``.

    The directory name has the form ``run_YYYY_MM_DD-HH_MM_SS_<suffix>``,
    using the local time at the moment of the call.

    Args:
        suffix: Label appended to the run name (e.g. the model variant).
            Defaults to ``"LSTM"``; an empty value omits the suffix and its
            underscore.

    Returns:
        The path of the run directory as a string (the directory is not created here).
    """
    import time

    # The suffix is appended after formatting so that a "%" in it is never
    # interpreted as a strftime directive.
    run_id = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    if suffix:
        run_id = f"{run_id}_{suffix}"
    return os.path.join(root_logdir, run_id)


# Binary cross-entropy with the Nadam optimizer; accuracy is tracked as the metric.
model.compile(optimizer='nadam', loss='binary_crossentropy', metrics=['accuracy'])

# TensorBoard logging into a fresh run directory: histograms every epoch,
# profiling restricted to batches 400-500.
tensorboard_cb = tf.keras.callbacks.TensorBoard(
    log_dir=get_run_logdir(),
    histogram_freq=1,
    profile_batch='400,500',
)
# Stop after 3 epochs without improvement and roll back to the best weights.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)

callbacks = [tensorboard_cb, early_stopping_cb]

2021-09-16 14:55:02.441204: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-16 14:55:02.441366: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.
2021-09-16 14:55:02.441473: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1365] Profiler found 1 GPUs
2021-09-16 14:55:02.454967: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcupti.so.11.0
2021-09-16 14:55:02.642371: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-16 14:55:02.642542: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed


In [9]:
# Train for at most 100 epochs, validating on val_set after each epoch; the
# EarlyStopping entry in `callbacks` can end training sooner.
model.fit(train_set, epochs=100, callbacks=callbacks, validation_data=val_set)

Epoch 1/100


2021-09-16 14:55:18.547913: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2021-09-16 14:55:18.629239: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2199995000 Hz
2021-09-16 14:55:20.697957: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11
2021-09-16 14:55:21.010239: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11
2021-09-16 14:55:21.361334: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8


  403/49375 [..............................] - ETA: 32:56 - loss: 0.6095 - accuracy: 0.6594

2021-09-16 14:55:42.952754: I tensorflow/core/profiler/lib/profiler_session.cc:136] Profiler session initializing.
2021-09-16 14:55:42.952796: I tensorflow/core/profiler/lib/profiler_session.cc:155] Profiler session started.


  505/49375 [..............................] - ETA: 1:05:45 - loss: 0.5989 - accuracy: 0.6708

2021-09-16 14:55:48.802271: I tensorflow/core/profiler/lib/profiler_session.cc:71] Profiler session collecting data.
2021-09-16 14:55:48.874278: I tensorflow/core/profiler/internal/gpu/cupti_tracer.cc:1487] CUPTI activity buffer flushed
2021-09-16 14:55:49.975810: I tensorflow/core/profiler/internal/gpu/cupti_collector.cc:228]  GpuTracer has collected 208090 callback api events and 207280 activity events. 
2021-09-16 14:55:54.429560: I tensorflow/core/profiler/lib/profiler_session.cc:172] Profiler session tear down.
2021-09-16 14:55:59.791937: I tensorflow/core/profiler/rpc/client/save_profile.cc:137] Creating directory: ./logs/run_2021_09_16-14_55_02_LSTM/train/plugins/profile/2021_09_16_14_55_54
2021-09-16 14:56:03.933825: I tensorflow/core/profiler/rpc/client/save_profile.cc:143] Dumped gzipped tool data for trace.json.gz to ./logs/run_2021_09_16-14_55_02_LSTM/train/plugins/profile/2021_09_16_14_55_54/ahmedessam-G5-5587.trace.json.gz
2021-09-16 14:56:06.782407: I tensorflow/core/pro

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


<tensorflow.python.keras.callbacks.History at 0x7fc6db737ca0>