In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
from keras.models import model_from_yaml
from keras.models import model_from_json
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
try:
    %tensorflow_version 2.x
except Exception:
    pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds

In [3]:
df = pd.read_csv("../Datasets/Stem-Cuvinte-Eliminate/train-punct-stop-stem-200.csv")
df=df.dropna()
df=df.sample(3000000)
#df=df.drop(["news_title","url","reddit_title"],axis=1)

In [4]:
df.head()

Unnamed: 0,sentiment,text
2633442,0,second review content somewher present horribl...
887608,2,easi guy everyon hate pari obvious go look fau...
1273233,2,josh groban fan not enjoy cd romant song beaut...
1126739,2,without doubt best album ever compar gilmour r...
2107033,0,got vacuum januari 2008 chose vacuum great rev...


In [5]:
df.count()

sentiment    3000000
text         3000000
dtype: int64

In [6]:
data_clean = df.text

In [7]:
data_labels = df.sentiment.values

In [8]:
data_clean

2633442    second review content somewher present horribl...
887608     easi guy everyon hate pari obvious go look fau...
1273233    josh groban fan not enjoy cd romant song beaut...
1126739    without doubt best album ever compar gilmour r...
2107033    got vacuum januari 2008 chose vacuum great rev...
                                 ...                        
2153686    happen sound qualiti horribl lyric could pen a...
3107329    subject excel feel realism good mani point see...
1550473    ive use book sinc bought instal window xp ive ...
2407267    limp bizkit worst rock rap group ever use term...
731857     big fan talk head david byrn everyth els heard...
Name: text, Length: 3000000, dtype: object

In [9]:
data_labels




array([0, 2, 2, ..., 2, 0, 2])

In [10]:
data_labels_good=[]
for i in data_labels:
    if i==2:
        data_labels_good.append(1)
    else:
        data_labels_good.append(0)

data_labels_good=np.array(data_labels_good)

In [11]:
data_labels_good

array([0, 1, 1, ..., 1, 0, 1])

In [12]:
set(data_labels_good)

{0, 1}

In [13]:
import pickle
with open('tokenizer-200.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [14]:
max_phrase_len=128

In [15]:
X_train = tokenizer.texts_to_sequences(data_clean)
X_train = pad_sequences(X_train, maxlen = max_phrase_len)

In [16]:
len(X_train[1])

128

In [20]:
class DCNN(tf.keras.Model):

    def __init__(self,
                 vocab_size,
                 emb_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        self.embedding = layers.Embedding(vocab_size,
                                          emb_dim)
        self.bigram = layers.Conv1D(filters=nb_filters,
                                    kernel_size=2,
                                    padding="valid",
                                    activation="relu")
        self.pool_1 = layers.GlobalMaxPool1D()
        self.trigram = layers.Conv1D(filters=nb_filters,
                                     kernel_size=3,
                                     padding="valid",
                                     activation="relu")
        self.pool_2 = layers.GlobalMaxPool1D()
        self.fourgram = layers.Conv1D(filters=nb_filters,
                                      kernel_size=4,
                                      padding="valid",
                                      activation="relu")
        
        self.pool_3 = layers.GlobalMaxPool1D()
        self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        x = self.embedding(inputs)
        x_1 = self.bigram(x)
        x_1 = self.pool_1(x_1)
        x_2 = self.trigram(x)
        x_2 = self.pool_2(x_2)
        x_3 = self.fourgram(x)
        x_3 = self.pool_3(x_3)

        merged = tf.concat([x_1, x_2, x_3], axis=-1) # (batch_size, 3 * nb_filters)
        merged = self.dense_1(merged)
        merged = self.dropout(merged, training)
        output = self.last_dense(merged)

        return output

In [21]:
#VOCAB_SIZE = 16384
VOCAB_SIZE = 8192
EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES = len(set(data_labels_good))

DROPOUT_RATE = 0.5

BATCH_SIZE = 265
NB_EPOCHS = 3

In [22]:
Dcnn = DCNN(vocab_size=VOCAB_SIZE,
            emb_dim=EMB_DIM,
            nb_filters=NB_FILTERS,
            FFN_units=FFN_UNITS,
            nb_classes=NB_CLASSES,
            dropout_rate=DROPOUT_RATE)

In [23]:
if NB_CLASSES == 2:
    Dcnn.compile(loss="binary_crossentropy",
                 optimizer="adam",
                 metrics=["accuracy"])
else:
    Dcnn.compile(loss="sparse_categorical_crossentropy",
                 optimizer="adam",
                 metrics=["sparse_categorical_accuracy"])

In [24]:
Dcnn.fit(X_train,
         data_labels_good,
         batch_size=BATCH_SIZE,
         validation_split=0.2,
         shuffle=True,
         verbose=1,
         epochs=NB_EPOCHS)

2022-02-18 18:04:28.224932: W tensorflow/core/framework/cpu_allocator_impl.cc:80] Allocation of 1228800000 exceeds 10% of free system memory.
2022-02-18 18:04:28.935271: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/3


2022-02-18 18:04:30.509288: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8204


Epoch 2/3
   3/9057 [..............................] - ETA: 8:20 - loss: 0.2409 - accuracy: 0.9044

2022-02-18 18:13:33.557290: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.49GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-02-18 18:13:33.569845: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.49GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-02-18 18:13:33.581925: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.49GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.


Epoch 3/3


<keras.callbacks.History at 0x7f62e2e7bb80>

In [22]:
Dcnn.summary()

Model: "dcnn"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  1638400   
_________________________________________________________________
conv1d (Conv1D)              multiple                  40100     
_________________________________________________________________
global_max_pooling1d (Global multiple                  0         
_________________________________________________________________
conv1d_1 (Conv1D)            multiple                  60100     
_________________________________________________________________
global_max_pooling1d_1 (Glob multiple                  0         
_________________________________________________________________
conv1d_2 (Conv1D)            multiple                  80100     
_________________________________________________________________
global_max_pooling1d_2 (Glob multiple                  0      

In [25]:
Dcnn.save("./Model-200-stem/")

2022-02-18 18:45:43.520023: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: ./Model-200-stem/assets


In [24]:
model_yaml = Dcnn.to_json()
with open("model.yaml", "w") as yaml_file:
    yaml_file.write(model_yaml)
model.save_weights("model.h5")

NotImplementedError: 