In [19]:
import tensorflow as tf
from keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from keras.datasets import imdb
from keras.models import Sequential, Model
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

In [13]:
def transformer_block(embed_dim, num_heads, ff_dim, rate=0.1):
    att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
    ffn = Sequential(
        [Dense(ff_dim, activation="relu"),
         Dense(embed_dim), ]
    )
    layernorm1 = LayerNormalization(epsilon=1e-6)
    layernorm2 = LayerNormalization(epsilon=1e-6)
    dropout1 = Dropout(rate)
    dropout2 = Dropout(rate)
    
    def call(inputs, training):
        attn_output = att(inputs, inputs)
        attn_output = dropout1(attn_output, training=training)
        out1 = layernorm1(inputs + attn_output)
        ffn_output = ffn(out1)
        ffn_output = dropout2(ffn_output, training=training)
        return layernorm2(out1 + ffn_output)
    
    return call

In [14]:
def token_and_position_embedding(maxlen, vocab_size, embed_dim):
    token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
    pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)
    
    def call(x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = pos_emb(positions)
        x = token_emb(x)
        return x + positions
    
    return call

In [15]:
#Load training data
train_df = pd.read_csv('../data/cleaned_train.csv')

# Extract the columns you want to use as input features
columns = ['BERT_sentiment_score', 'normalised_word_count', 'pub_day', 'pub_hour']
# Extract input features from the dataframe
x_train = train_df[columns].dropna()

y_train = np.array(train_df['n_comments'])

In [16]:
#Load test data
test_df = pd.read_csv('../data/cleaned_test.csv')

x_test = test_df[columns].dropna()

In [17]:
def label_encode(raw_data):
    # Concatenate train and test data vertically to ensure consistent label encoding
    combined_df = pd.concat([train_df['topic'], test_df['topic']], axis=0)

    # Create an instance of LabelEncoder
    le = LabelEncoder()

    # Fit and transform the combined data using LabelEncoder
    combined_encoded = le.fit_transform(combined_df)
    
    return np.array(le.transform(raw_data.dropna()))

In [21]:
# Apply the LabelEncoder transformation on train data
train_topic_encoded = pd.DataFrame(label_encode(train_df['topic']), columns=['topic encoded'])
# Concatenate the 'topic_encoded' tensor with the other input features
x_train = pd.concat([x_train, train_topic_encoded], axis=1)

In [22]:
# Apply the LabelEncoder transformation on train data
test_topic_encoded = pd.DataFrame(label_encode(test_df['topic']), columns=['topic encoded'])
# Concatenate the 'topic_encoded' tensor with the other input features
x_test = pd.concat([x_test, test_topic_encoded], axis=1)

In [24]:
# Define model architecture
max_seq_length = x_train.shape[1]
vocab_size = np.max(x_train) + 1
embedding_dim = 32
num_heads = 2
ff_dim = 32

inputs = Input(shape=(max_seq_length,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_seq_length)(inputs)
transformer_block_fn = transformer_block(embed_dim=embedding_dim, num_heads=num_heads, ff_dim=ff_dim)
transformer_block = transformer_block_fn(embedding_layer, training=True)
pooling_layer = GlobalAveragePooling1D()(transformer_block)
dropout_layer = Dropout(rate=0.1)(pooling_layer)
outputs = Dense(units=1, activation='linear')(dropout_layer)  # Change activation to 'linear' for regression

model = Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer=Adam(), loss='mse', metrics=['mae'])  # Change loss function to 'mse' for regression

# Train the model
history = model.fit(x_train, y_train, batch_size=64, epochs=4, validation_split=0.2)

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().