In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [None]:
# Print TensorFlow version and device information.
#
# Each device list is queried exactly once through the stable
# `tf.config.list_physical_devices` API, and the GPU branch is guarded by that
# same list. Previously the guard used `tf.test.gpu_device_name()` while the
# indexed list came from a separate call, so the two could disagree.
gpu_devices = tf.config.list_physical_devices('GPU')
cpu_devices = tf.config.list_physical_devices('CPU')

print("Num GPUs Available: ", len(gpu_devices))
print("Num CPUs Available: ", len(cpu_devices))
print(f"Tensorflow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

if gpu_devices:
    # The details mapping may not contain 'device_name', hence the fallback.
    details = tf.config.experimental.get_device_details(gpu_devices[0])
    name = details.get('device_name', 'Unknown GPU')

    print(f"Using {name}")
else:
    print("No GPU found")

In [6]:
# Read the headline and score tables from CSV files under data/ into DataFrames.
# NOTE(review): neither `headlines` nor `scores` is referenced again in the
# visible cells — confirm they are still needed.
headlines = pd.read_csv('data/headlines.csv')
scores = pd.read_csv('data/scores.csv')

In [7]:
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.

    Each data line is ``label v1 v2 ... vN`` separated by single spaces.
    Lines with exactly two fields are treated as the optional shape header
    and skipped; blank lines (e.g. a trailing empty line) are ignored.

    Parameters
    ----------
    filename : str or path-like
        Path to the UTF-8 encoded embeddings text file.

    Returns
    -------
    pandas.DataFrame
        One row per label (used as the index), one float32 column per
        vector component.

    Raises
    ------
    ValueError
        If the file contains no embedding rows, if a component cannot be
        parsed as a float, or if rows have inconsistent lengths.
    """
    labels = []
    rows = []
    with open(filename, encoding='utf-8') as infile:
        for line in infile:
            stripped = line.rstrip()
            if not stripped:
                # A blank line would otherwise become a zero-length vector
                # and make np.vstack fail on mismatched row widths.
                continue
            items = stripped.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))

    if not rows:
        # np.vstack([]) raises an opaque ValueError; keep the exception type
        # but say what actually went wrong.
        raise ValueError(f"no embedding rows found in {filename!r}")

    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')

In [10]:
%%time
# Parse the Numberbatch text file into a DataFrame via load_embeddings().
# The cell is timed because the load is slow (the recorded run took ~28 s wall time).
numberbatch_embeddings = load_embeddings("embeddings/numberbatch-en-17.04b.txt")
# Trailing expression so the notebook displays the (rows, columns) shape.
numberbatch_embeddings.shape

CPU times: user 27.5 s, sys: 663 ms, total: 28.2 s
Wall time: 28.3 s


(418081, 300)

In [11]:
# Read the labelled headlines table from CSV into a DataFrame.
# NOTE(review): `labeled_headlines` is not referenced by the visible model
# cell; presumably the training arrays are meant to be derived from it — confirm.
labeled_headlines = pd.read_csv('data/labeled_headlines.csv')

In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, GlobalAveragePooling1D, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
# Define the hyperparameters for the transformer classifier built below.
# NOTE(review): embedding_dim (100) differs from the 300-column Numberbatch
# matrix loaded earlier — presumably those vectors are not used to initialise
# this model; confirm.
max_len = 500  # Maximum sequence length for padding
vocab_size = 5000  # Maximum number of words to keep based on word frequency
num_classes = 10  # Number of classes for classification; width of the softmax output layer (TODO confirm it matches the labels)
embedding_dim = 100  # Dimension of the word embedding vectors; also passed as MultiHeadAttention key_dim and used as the feedforward output width
num_heads = 2  # Number of attention heads in each transformer layer
num_transformer_layers = 2  # Number of transformer layers in the model
hidden_units = 64  # Number of hidden units in the feedforward layers
 


# Model input and embedding lookup.
# These two tensors were referenced below (`embedding` in the transformer
# loop, `inputs` in Model(...)) but never defined, so the cell failed with a
# NameError on a fresh kernel.
inputs = Input(shape=(max_len,), dtype='int32')
embedding = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# Define the transformer layers.
# `transformer_output` carries the running sequence representation so that
# each layer consumes the previous layer's output. Previously every iteration
# read `embedding`, so only the last layer contributed to the model.
transformer_output = embedding
for _ in range(num_transformer_layers):
    # Multi-head self-attention layer (query and value are the same tensor)
    multi_head_attention = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)
    attention_output = multi_head_attention(transformer_output, transformer_output)
    attention_output = tf.keras.layers.Dropout(0.1)(attention_output)
    # NOTE(review): there is no residual connection or layer normalisation
    # around the attention sub-layer; only the feedforward sub-layer below has
    # one. Kept as in the original — confirm this is intended.

    # Feedforward layer; the last Dense projects back to embedding_dim so the
    # result can be summed with the attention output.
    feedforward = tf.keras.Sequential([
        tf.keras.layers.Dense(hidden_units, activation='relu'),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(embedding_dim)
    ])
    feedforward_output = feedforward(attention_output)
    feedforward_output = tf.keras.layers.Dropout(0.1)(feedforward_output)
    transformer_output = tf.keras.layers.LayerNormalization(epsilon=1e-6)(attention_output + feedforward_output)

# Global average pooling layer: collapses the sequence axis to one vector per example
pooling_layer = GlobalAveragePooling1D()
pooling_output = pooling_layer(transformer_output)

# Dense layer with dropout
dense_layer = Dense(64, activation='relu')(pooling_output)
dropout_layer = Dropout(0.5)(dense_layer)

# Output layer
outputs = Dense(num_classes, activation='softmax')(dropout_layer)

# Create the model
model = Model(inputs=inputs, outputs=outputs)

# Compile the model.
# `learning_rate` replaces the deprecated `lr` keyword, which newer Keras
# releases reject. The sparse loss expects integer class ids as targets.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# NOTE(review): x_train / y_train / x_test / y_test are not defined in the
# visible notebook. Presumably they should be integer token-id arrays padded
# to `max_len` plus integer labels in [0, num_classes) — define them before
# running this cell.
model.fit(x_train, y_train, validation_data=(x_test, y_test), batch_size=32, epochs=10)