In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
from keras.models import model_from_yaml
from keras.models import model_from_json

In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
cols = ['sentiment','text']
train_data = pd.read_csv(
    "./train_fix.csv",
    header=None,
    names=cols,
    engine="python",
    encoding="latin1"
)

In [4]:
train_data.head()

Unnamed: 0,sentiment,text
0,1,This sound track was beautiful! It paints the ...
1,1,I'm reading a lot of reviews saying that this ...
2,1,This soundtrack is my favorite music of all ti...
3,1,I truly like this soundtrack and I enjoy video...
4,1,"If you've played the game, you know how divine..."


In [5]:
train_data.count()

sentiment    1048576
text         1048576
dtype: int64

In [6]:
data = train_data

In [7]:
def clean_tweet(tweet):
    tweet2 = BeautifulSoup(tweet, "lxml").get_text()
    tweet2 = re.sub(r"@[A-Za-z0-9]+", ' ', tweet2)
    tweet2 = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet2)
    tweet2 = re.sub(r"[^a-zA-Z.!?']", ' ', tweet2)
    tweet2 = re.sub(r" +", ' ', tweet2)
    return tweet2

In [8]:
data_clean = [clean_tweet(tweet) for tweet in data.text]

In [9]:
data_labels = data.sentiment.values
data_labels[data_labels == 4] = 1

In [10]:
set(data_labels)

{0, 1}

In [11]:
tokenizer = tfds.deprecated.text.SubwordTextEncoder.load_from_file("Tokenizer")

In [12]:
data_inputs = [tokenizer.encode(sentence) for sentence in data_clean]

In [13]:
MAX_LEN = max([len(sentence) for sentence in data_inputs])
print(MAX_LEN)
MAX_LEN=512

416


In [14]:
data_inputs = tf.keras.preprocessing.sequence.pad_sequences(data_inputs,
                                                            value=0,
                                                            padding="post",
                                                            maxlen=MAX_LEN)

In [15]:
test_inputs=data_inputs[:10000]
test_labels=data_labels[:10000]
train_inputs=data_inputs[10000:]
train_labels=data_labels[10000:]

In [16]:
print(test_inputs[1].shape)
print(test_labels.shape)
print(train_inputs.shape)
print(train_labels.shape)

(512,)
(10000,)
(1038576, 512)
(1038576,)


In [17]:
class DCNN(tf.keras.Model):

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        
        self.pool_3 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

In [18]:
VOCAB_SIZE = tokenizer.vocab_size

EMB_DIM = 512
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(data_labels))

DROPOUT_RATE = 0.5

BATCH_SIZE = 265
NB_EPOCHS = 3

In [19]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

2021-11-06 17:45:46.924709: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 17:45:46.941292: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 17:45:46.942011: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-11-06 17:45:46.943901: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [20]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [21]:
Dcnn.fit(train_inputs,
         train_labels,
         batch_size=BATCH_SIZE,
         validation_split=0.2,
         shuffle=True,
         verbose=1,
         epochs=NB_EPOCHS)

2021-11-06 17:45:47.485287: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1701601280 exceeds 10% of free system memory.
2021-11-06 17:45:48.356136: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/3


2021-11-06 17:45:49.853453: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204


  60/3136 [..............................] - ETA: 22:50 - loss: 0.5905 - accuracy: 0.6684

2021-11-06 17:46:21.869282: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.40MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


 326/3136 [==>...........................] - ETA: 20:53 - loss: 0.3533 - accuracy: 0.8341

2021-11-06 17:48:20.612639: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.40MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


 699/3136 [=====>........................] - ETA: 18:08 - loss: 0.2970 - accuracy: 0.8690

2021-11-06 17:51:07.454844: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.59MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-11-06 17:51:07.455297: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.79MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




2021-11-06 18:09:05.700237: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.79MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 2/3

2021-11-06 18:23:35.965668: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.40MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




2021-11-06 18:31:31.842294: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.79MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2021-11-06 18:31:31.842736: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.59MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 3/3

2021-11-06 18:51:57.906368: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 265.59MiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.




<keras.callbacks.History at 0x7ffa69979550>

In [22]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  33694720  
_________________________________________________________________
conv1d (Conv1D)              multiple                  102500    
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  153700    
_________________________________________________________________
global_max_pooling1d_1 (Glob multiple                  0         
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  204900    
_________________________________________________________________
global_max_pooling1d_2 (Glob multiple                  0      

In [23]:
Dcnn.save("./Model/")

2021-11-06 19:00:40.155963: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./Model/assets


In [24]:
results = Dcnn.evaluate(test_inputs,test_labels)



In [25]:
model_yaml = Dcnn.to_json()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
model.save_weights("model.h5")

NotImplementedError: 