In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, Embedding, Layer, MultiHeadAttention, Add, LayerNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


2023-02-17 17:01:30.078654: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# # print Tensorflow and CUDA information
# print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# print("Num CPUs Available: ", len(tf.config.experimental.list_physical_devices('CPU')))
# print(f"Tensorflow version: {tf.__version__}")
# print(f"Keras version: {keras.__version__}")
 
# if tf.test.gpu_device_name():
#     gpu_devices = tf.config.list_physical_devices('GPU')
#     details = tf.config.experimental.get_device_details(gpu_devices[0])
#     name = details.get('device_name', 'Unknown GPU')
    
#     print(f"Using {name}")
# else:
#     print("No GPU found")

In [3]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for i, line in enumerate(infile):
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            values = np.array([float(x) for x in items[1:]], 'f')
            rows.append(values)
    
    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

In [4]:
%%time
numberbatch_embeddings = load_embeddings("embeddings/numberbatch-en-17.04b.txt")
numberbatch_embeddings.shape

EMBED_SIZE = numberbatch_embeddings.shape[1]

CPU times: user 27.8 s, sys: 725 ms, total: 28.5 s
Wall time: 28.6 s


In [5]:
labeled_content = pd.read_csv('data/labeled_content.csv')
labeled_content = labeled_content[labeled_content['score'] != 0]

#concatenate title and first 256 words in content in one column
labeled_content['title_content'] = labeled_content['title'].astype(str) + ' ' + labeled_content['content'].astype(str).str[:256]


X = labeled_content['title_content'].astype(str).values
y = labeled_content['score'].values
                                #.astype('float32')

In [6]:
NUM_TOP_WORDS = None # use entire vocabulary!
MAX_TITLE_LEN = 256 # maximum and minimum number of words
NUM_CLASSES = 2

tokenizer = Tokenizer(
                    num_words=NUM_TOP_WORDS,
                    filters = '—!"“”#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t…\'‘’'
                     )

tokenizer.fit_on_texts(X)

sequences = tokenizer.texts_to_sequences(X)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
top_words = min((len(word_index),NUM_TOP_WORDS))

X = pad_sequences(sequences, maxlen=MAX_TITLE_LEN)

y = np.where(y == -1, 0, 1)

y = keras.utils.to_categorical(y, num_classes=NUM_CLASSES)

print(f"Found {len(word_index):,} unique tokens. Distilled to {top_words:,} top words.")



# now fill in the matrix, using the ordering from the
# keras word tokenizer from before
found_words = 0
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))

for word, i in word_index.items():
    
    try:
        embedding_vector = numberbatch_embeddings.loc[word]
        # words not found in embedding index will be ALL-ZEROS
        embedding_matrix[i] = embedding_vector
        found_words = found_words+1
    except:
        #print(word)
        pass


print(f"Embedding Shape: {embedding_matrix.shape}")
print(f"Total words found: {found_words:,}")
print(f"Percentage: {round(100 * found_words / embedding_matrix.shape[0], 2)}")

Found 25,980 unique tokens. Distilled to 25,980 top words.
Embedding Shape: (25981, 300)
Total words found: 22,546
Percentage: 86.78


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=402)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (6963, 256)
X_test shape: (1741, 256)
y_train shape: (6963, 2)
y_test shape: (1741, 2)


In [8]:
print(X_test)

[[    0     0     0 ...  1234     1 17599]
 [    0     0     0 ...    23     5   276]
 [    0     0     0 ...    27     1  2017]
 ...
 [    0     0     0 ...     4  3415     7]
 [    0     0     0 ...   103     4   398]
 [    0     0     0 ...   122    63    60]]


In [9]:
embedding_layer = Embedding(len(word_index) + 1,
                            EMBED_SIZE,
                            weights=[embedding_matrix],# here is the embedding getting saved
                            input_length=MAX_TITLE_LEN,
                            trainable=False)

class PositionalEncoding(Layer):
    def __init__(self, max_len, embedding_dim, **kwargs):
        super(PositionalEncoding, self).__init__(**kwargs)
        self.max_len = max_len
        self.embedding_dim = embedding_dim

    def call(self, inputs):
        positions = tf.range(start=0, limit=self.max_len, delta=1)
        positions = tf.cast(positions, tf.float32)
        positions = tf.expand_dims(positions, axis=-1)
        pos_encoding = inputs + (positions / 10000 ** (2 * (positions // 2) / self.embedding_dim))
        return pos_encoding

In [10]:
class TransformerBlock(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [11]:
embed_dim = EMBED_SIZE  # Embedding size for each token
num_heads = 64  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer
num_trans = 3

inputs = Input(shape=(MAX_TITLE_LEN,), dtype=tf.int32)
embedding = embedding_layer(inputs)
pos_encoding = PositionalEncoding(MAX_TITLE_LEN,embed_dim)
x = pos_encoding(embedding)

transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)

for i in range(num_trans): x = transformer_block(x) 
x = GlobalAveragePooling1D()(x)
x = Dropout(0.1)(x)
x = Dense(20, activation="relu")(x)
x = Dropout(0.1)(x)
outputs = Dense(2, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

2023-02-17 17:02:04.034950: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2023-02-17 17:02:04.036527: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2023-02-17 17:02:04.119479: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:16:00.0 name: Tesla V100-SXM2-32GB computeCapability: 7.0
coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 31.75GiB deviceMemoryBandwidth: 836.37GiB/s
2023-02-17 17:02:04.119518: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2023-02-17 17:02:04.122632: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10
2023-02-17 17:02:04.122701: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.10
2

In [12]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 256, 300)     7794300     input_1[0][0]                    
__________________________________________________________________________________________________
positional_encoding (Positional (None, 256, 300)     0           embedding[0][0]                  
__________________________________________________________________________________________________
transformer_block (TransformerB (None, 256, 300)     23118632    positional_encoding[0][0]        
                                                                 transformer_block[0][0]      

In [14]:
# Compile the model with mean squared error loss and Adam optimizer
model.compile(optimizer=Adam(lr=0.0001), loss="binary_crossentropy", metrics=["accuracy"])
 
# Train the model
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=64, epochs=5)
# TODO: see how big we can make the batch size
# probably very large when using the V100's

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2aab2bc90c90>

In [None]:
def plot_history(history, name="Results"):
    # plot f1 score, loss, and accuracy for training and validation together
    plt.figure(figsize=(20, 5))
 
    plt.subplot(1,2,1)
    plt.plot(history['loss'])
    plt.plot(history['val_loss'])
    plt.ylabel('Loss')
    plt.xlabel('epochs')
    plt.title('Loss')
    plt.legend(['Train', 'Test'], loc='best')
 
    plt.subplot(1,2,2)
    plt.plot(history['accuracy'])
    plt.plot(history['val_accuracy'])
    plt.ylabel('Accuracy')
    plt.xlabel('epochs')
    plt.title('Accuracy')
    plt.legend(['Train', 'Test'], loc='best')
 
    # set the title for the whole figure
    plt.suptitle(name, fontsize=16)
 
    plt.show()

In [None]:
plot_history(model.history.history)

In [15]:
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)
y_test = np.argmax(y_test, axis=1)
 

In [17]:
for i in range(1500):
    #rand = np.random.randint(0,len(X_test))
    print(f"Predicted: {y_pred[i]}")
    print(f"Actual: {y_test[i]}")
    
    

Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 0
Actual: 0
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 1
Predicted: 0
Actual: 0
Predicted: 

In [18]:
print(sum(y_pred))

0
