In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Print TensorFlow / Keras versions and the compute devices TensorFlow can see.
# Each device list is queried once through tf.config.list_physical_devices (the
# same call this cell already used for the GPU details) and then reused, instead
# of mixing it with the tf.config.experimental alias and a separate
# tf.test.gpu_device_name() probe.
gpu_devices = tf.config.list_physical_devices('GPU')
cpu_devices = tf.config.list_physical_devices('CPU')

print("Num GPUs Available: ", len(gpu_devices))
print("Num CPUs Available: ", len(cpu_devices))
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

if gpu_devices:
    # Only the first GPU is reported; 'device_name' may be missing from the
    # details dict, hence the fallback label.
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    name = details.get('device_name', 'Unknown GPU')

    print(f"Using {name}")
else:
    print("No GPU found")

In [6]:
# Read the two raw CSV inputs (headlines first, then scores) into DataFrames.
headlines, scores = (
    pd.read_csv(csv_path)
    for csv_path in ('data/headlines.csv', 'data/scores.csv')
)

In [7]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.

    Each data line is ``<label> <v1> <v2> ...`` separated by single spaces.

    Parameters
    ----------
    filename : str or path-like
        Path to a UTF-8 encoded embeddings text file.

    Returns
    -------
    pandas.DataFrame
        One float32 row per label, indexed by label, one column per dimension.

    Raises
    ------
    ValueError
        If the file contains no embedding rows, if a value cannot be parsed
        as a float, or if rows have inconsistent lengths.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            if not line.strip():
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no vector; skipping them avoids appending an empty row
                # that would make np.vstack fail on mismatched lengths.
                continue
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        # np.vstack([]) raises an opaque ValueError; keep the exception type
        # but say which file was the problem.
        raise ValueError(f"No embedding vectors found in {filename!r}")

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

In [None]:
%%time
numberbatch_embeddings = load_embeddings("embeddings/numberbatch-en-17.04b.txt")
numberbatch_embeddings.shape

EMBED_SIZE = numberbatch_embeddings.shape[1]

CPU times: user 27.5 s, sys: 663 ms, total: 28.2 s
Wall time: 28.3 s


(418081, 300)

In [11]:
# Read the labeled headlines and pull out the raw model inputs and targets.
labeled_headlines = pd.read_csv('data/labeled_headlines.csv')

title_column = labeled_headlines['title']
score_column = labeled_headlines['score']

# Titles are forced to str before taking the underlying values.
X, y = title_column.astype(str).values, score_column.values

In [None]:
NUM_TOP_WORDS = None # use entire vocabulary!
MAX_TITLE_LEN = 32 # every title is padded/truncated to exactly this many tokens
NUM_CLASSES = 2

# Tokenize from the raw titles rather than from X: this cell rebinds X to the
# padded integer sequences below, so reading X here would make the cell fail
# (or silently misbehave) when it is re-run on its own.
titles = labeled_headlines['title'].astype(str).values

tokenizer = Tokenizer(num_words=NUM_TOP_WORDS)
tokenizer.fit_on_texts(titles)

sequences = tokenizer.texts_to_sequences(titles)

word_index = tokenizer.word_index
NUM_TOP_WORDS = len(word_index) if NUM_TOP_WORDS is None else NUM_TOP_WORDS
top_words = min(len(word_index), NUM_TOP_WORDS)

X = pad_sequences(sequences, maxlen=MAX_TITLE_LEN)

# One-hot encode the targets using the class count declared above.
y_ohe = keras.utils.to_categorical(y, num_classes=NUM_CLASSES)

print(f"Found {len(word_index):,} unique tokens. Distilled to {top_words:,} top words.")



# now fill in the matrix, using the ordering from the
# keras word tokenizer from before
found_words = 0
# Row 0 is never assigned: word_index values start at 1, leaving 0 for padding.
embedding_matrix = np.zeros((len(word_index) + 1, EMBED_SIZE))
known_words = numberbatch_embeddings.index

for word, i in word_index.items():
    if word not in known_words:
        # words not found in embedding index will be ALL-ZEROS
        # (.loc[word] would raise KeyError here, so test membership first)
        continue
    embedding_matrix[i] = numberbatch_embeddings.loc[word].values
    found_words += 1

print(f"Embedding Shape: {embedding_matrix.shape}")
print(f"Total words found: {found_words:,}")
# Coverage is measured against the vocabulary size, not the matrix row count
# (which includes the extra padding row); max() guards an empty vocabulary.
coverage = 100 * found_words / max(len(word_index), 1)
print(f"Percentage: {round(coverage, 2)}")

In [None]:
# Hold out 20% of the data for evaluation, stratified on the integer labels y
# so both splits keep the same class balance. random_state pins the shuffle so
# a Restart & Run All reproduces the same split.
# NOTE(review): train_test_split is not imported in the cells visible here —
# presumably sklearn.model_selection.train_test_split; confirm it is imported.
X_train, X_test, y_train_ohe, y_test_ohe = train_test_split(
    X, y_ohe, test_size=0.2, stratify=y, random_state=42
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train_ohe shape: {y_train_ohe.shape}")
print(f"y_test_ohe shape: {y_test_ohe.shape}")

In [None]:
# Frozen embedding lookup: one row per tokenizer index plus one extra row,
# initialised from the precomputed embedding_matrix and excluded from training.
vocab_rows = len(word_index) + 1
embedding_layer = Embedding(
    input_dim=vocab_rows,
    output_dim=EMBED_SIZE,
    weights=[embedding_matrix],  # initial weights come from embedding_matrix
    input_length=MAX_TITLE_LEN,
    trainable=False,
)

class PositionalEncoding(tf.keras.layers.Layer):
    """Add a fixed sinusoidal positional encoding to a batch of embeddings.

    For position ``pos`` and feature column ``j`` the encoding is
    ``sin(pos / 10000 ** (2 * (j // 2) / embedding_dim))`` for even ``j`` and
    the cosine of the same angle for odd ``j``.

    Parameters
    ----------
    max_len : int
        Sequence length the encoding is built for.
    embedding_dim : int
        Size of the last axis of the inputs.
    **kwargs
        Forwarded to the base Keras ``Layer``.
    """

    def __init__(self, max_len, embedding_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.embedding_dim = embedding_dim

    def call(self, inputs):
        """Return ``inputs`` plus the encoding.

        ``inputs`` must be broadcast-compatible with a
        ``(max_len, embedding_dim)`` table, e.g.
        ``(batch, max_len, embedding_dim)``.
        """
        # Everything is computed in float32 so no int/float64 tensors get
        # mixed into the division below.
        positions = tf.cast(tf.range(self.max_len), tf.float32)[:, tf.newaxis]
        columns = tf.range(self.embedding_dim)
        column_idx = tf.cast(columns, tf.float32)[tf.newaxis, :]

        # Columns 2i and 2i+1 share one frequency: exponent = 2i / embedding_dim.
        exponent = 2.0 * tf.floor(column_idx / 2.0) / float(self.embedding_dim)
        angles = positions / tf.pow(10000.0, exponent)  # (max_len, embedding_dim)

        # Sine on even columns, cosine on odd columns, so the table has the
        # same width as the inputs and can be added directly.
        is_even = tf.equal(tf.math.floormod(columns, 2), 0)[tf.newaxis, :]
        encoding = tf.where(is_even, tf.sin(angles), tf.cos(angles))

        return inputs + tf.cast(encoding, inputs.dtype)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({"max_len": self.max_len, "embedding_dim": self.embedding_dim})
        return config

In [None]:
 # Define the hyperparameters
max_len = MAX_TITLE_LEN  # Maximum sequence length for padding
# The targets are one-hot encoded with NUM_CLASSES columns, so the softmax
# output must have the same width.
num_classes = NUM_CLASSES  # Number of classes for classification
# Must equal the width produced by embedding_layer (EMBED_SIZE): the residual
# additions below require both operands to share the same last dimension.
embedding_dim = EMBED_SIZE  # Dimension of the word embedding vectors
num_heads = 2  # Number of attention heads in each transformer layer
num_transformer_layers = 2  # Number of transformer layers in the model
hidden_units = 64  # Number of hidden units in the feedforward layers

inputs = Input(shape=(max_len,), dtype=tf.int32)

embedding = embedding_layer(inputs)

# Position information is added once, before the first transformer layer.
pos_encoding = PositionalEncoding(max_len, embedding_dim)(embedding)

# Define the transformer layers. Each layer consumes the previous layer's
# output (starting from the position-encoded embeddings) so the layers stack
# instead of all reading the raw embeddings.
transformer_output = pos_encoding
for i in range(num_transformer_layers):
    block_input = transformer_output

    # Multi-head self-attention layer with residual connection + layer norm
    multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
    attention_output = multi_head_attention(block_input, block_input)
    attention_output = tf.keras.layers.Dropout(0.1)(attention_output)
    attention_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + block_input)

    # Feedforward layer; projects back to embedding_dim for the residual add
    feedforward = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(embedding_dim)
    ])

    feedforward_output = feedforward(attention_output)
    feedforward_output = tf.keras.layers.Dropout(0.1)(feedforward_output)
    transformer_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + feedforward_output)

# Global average pooling layer: collapse the sequence axis to one vector
pooling_layer = GlobalAveragePooling1D()
pooling_output = pooling_layer(transformer_output)

# Dense layer with dropout
dense_layer = Dense(64, activation='relu')(pooling_output)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
outputs = Dense(num_classes, activation='softmax')(dropout_layer)

# Create the model
model = Model(inputs=inputs, outputs=outputs)

In [None]:
# Compile the model
# The targets passed to fit() are one-hot encoded (y_train_ohe / y_test_ohe),
# which is what categorical_crossentropy expects; the sparse variant expects
# integer class ids. `learning_rate` replaces the deprecated `lr` argument.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_ohe, validation_data=(X_test, y_test_ohe), batch_size=256, epochs=10)
# TODO: see how big we can make the batch size
# probably very large when using the V100's