# Multilayer Perceptron Regression Model
Predicts the number of comments an article will receive from its topic, tone (sentiment score), length (normalised word count), and the hour of the day and day of the week on which it was published.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [32]:
# Load the training data; the raw frame is kept as read so the filtering
# below can be re-run without touching the file again.
train_raw = pd.read_csv('../data/cleaned_train.csv')

# Numeric columns used as input features (the encoded topic is appended to
# x_train in a later cell)
columns = ['BERT_sentiment_score', 'normalised_word_count', 'pub_day', 'pub_hour']

# Drop incomplete rows ONCE, looking at features, topic and target together.
# Calling dropna() separately on the features and on the target removes
# different rows from each, which leaves x_train and y_train with different
# lengths or silently pairs features with the wrong target.
# The index is reset so that positional (0..n-1) frames built later line up
# with these rows.
train_df = train_raw.dropna(subset=columns + ['topic', 'n_comments']).reset_index(drop=True)

# Feature matrix and regression target, guaranteed to be row-aligned
x_train = train_df[columns].copy()
y_train = np.array(train_df['n_comments'])

In [33]:
# Load the test data; the raw frame is kept as read
test_raw = pd.read_csv('../data/cleaned_test.csv')

# Drop rows missing any numeric feature OR the topic in a single pass.
# Filtering the features and the topic independently would remove different
# rows from each and misalign them when they are joined column-wise later.
# The index is reset so positional (0..n-1) frames built later line up.
test_df = test_raw.dropna(subset=columns + ['topic']).reset_index(drop=True)

# Numeric test features (same columns as for training)
x_test = test_df[columns].copy()

In [34]:
def label_encode(raw_data):
    """Encode topic labels as integer codes shared by train and test.

    The encoder is fitted on the union of the `topic` columns of the
    module-level `train_df` and `test_df`, so a given topic maps to the same
    integer no matter which split it comes from.

    Args:
        raw_data: pandas Series of topic labels (anything exposing
            `.dropna()`). Each non-null label has to be present in
            `train_df['topic']` or `test_df['topic']`; scikit-learn's
            `LabelEncoder.transform` raises `ValueError` for unseen labels.

    Returns:
        1-D numpy array holding one integer code per NON-NULL entry of
        `raw_data`, in the original order. Null entries are dropped, so the
        result may be shorter than the input and carries no index - callers
        must re-align it with their own rows.
    """
    # Build the vocabulary from both splits. Missing topics are removed before
    # fitting so that NaN never becomes a class of its own (and a mix of
    # strings and NaN, which older scikit-learn versions can reject, is never
    # handed to the encoder). Codes of real labels are unaffected.
    known_topics = pd.concat([train_df['topic'], test_df['topic']], axis=0).dropna()

    # Only the fitted mapping is needed, so fit() is used rather than
    # fit_transform() whose result was previously thrown away.
    le = LabelEncoder()
    le.fit(known_topics)

    return np.array(le.transform(raw_data.dropna()))

In [35]:
# Encode the topic of exactly those training rows that are present in x_train
train_topics = train_df.loc[x_train.index, 'topic']
# label_encode() drops missing topics and returns a bare array, so the row
# labels of the surviving topics are re-attached here. Without an explicit
# index the frame would get a fresh 0..n-1 index and concat(axis=1), which
# aligns on the index, would pair topics with the wrong rows (and add NaN
# rows) whenever x_train lost rows to dropna().
train_topic_encoded = pd.DataFrame(label_encode(train_topics),
                                   columns=['topic encoded'],
                                   index=train_topics.dropna().index)
# Append the encoded topic as an extra feature column, aligned on the row index
x_train = pd.concat([x_train, train_topic_encoded], axis=1)

In [36]:
# Encode the topic of exactly those test rows that are present in x_test
test_topics = test_df.loc[x_test.index, 'topic']
# label_encode() drops missing topics and returns a bare array, so the row
# labels of the surviving topics are re-attached here. Without an explicit
# index the frame would get a fresh 0..n-1 index and concat(axis=1), which
# aligns on the index, would pair topics with the wrong rows (and add NaN
# rows) whenever x_test lost rows to dropna().
test_topic_encoded = pd.DataFrame(label_encode(test_topics),
                                  columns=['topic encoded'],
                                  index=test_topics.dropna().index)
# Append the encoded topic as an extra feature column, aligned on the row index
x_test = pd.concat([x_test, test_topic_encoded], axis=1)

In [37]:
# Hold out 20% of the training rows as a validation set.
# random_state is fixed so the same rows land in each part on every run.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [38]:
# Callback that ends training once the validation loss stops improving
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,                # epochs to wait for an improvement before stopping
    verbose=1,                  # report when training is stopped early
    restore_best_weights=True,  # go back to the weights of the best epoch
)

In [None]:
# Model hyperparameters
# Width of the input layer: the 4 numeric feature columns plus the encoded topic
num_features: int = 5
# Dropout rate used after every hidden block of the network
dropout: float = 0.2

In [39]:
# Seed TensorFlow's global random generator before any layer is created so
# that random operations such as weight initialisation and dropout start from
# the same state on every run. The train/validation split is already seeded;
# without this the trained model and its metrics still change between runs.
tf.random.set_seed(42)

# Define MLP model architecture: three Dense -> BatchNorm -> Dropout blocks of
# shrinking width, followed by a single linear output unit.
model = Sequential()
# Input layer: expects `num_features` values per sample
model.add(Dense(128, activation='relu', input_dim=num_features))
model.add(BatchNormalization())
model.add(Dropout(dropout))
# Hidden layer
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(dropout))
# Hidden layer
model.add(Dense(32, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(dropout))
# Output layer: one unbounded (linear) value per sample, the predicted
# number of comments
model.add(Dense(1, activation='linear'))

# Print the layer-by-layer overview with output shapes and parameter counts
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 128)               768       
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 batch_normalization_1 (Batc  (None, 64)               256       
 hNormalization)                                                 
                                                                 
 dropout_1 (Dropout)         (None, 64)               

In [40]:
# Compile the model. MSE is the training loss; it is listed again under
# `metrics`, together with MAE, so that evaluate() returns three values
# (loss, MSE, MAE).
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=['mean_squared_error', 'mean_absolute_error'])

# Train the model. The History object returned by fit() is kept so the
# per-epoch training/validation curves remain available (history.history)
# instead of being discarded and only echoed as a `<History at 0x...>` repr.
history = model.fit(x_train, y_train,
                    batch_size=32,
                    epochs=100,  # upper bound; the early-stopping callback may end training sooner
                    validation_data=(x_val, y_val),
                    callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 24: early stopping


<keras.callbacks.History at 0x22e960f0ca0>

In [43]:
# Save the trained model in HDF5 format.
# The target directory is created first: otherwise saving fails when
# '../models' does not exist yet, and that failure would only surface after
# the whole training run has finished.
model_path = Path('../models/MLP_regression_model.h5')
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))

In [41]:
# Score the trained model on the held-out validation split.
# evaluate() yields the loss followed by the compiled metrics (MSE, then MAE).
loss, mse, mae = model.evaluate(x=x_val, y=y_val)

# Report both error measures
print('Validation MSE:', mse)
print('Validation MAE:', mae)

Validation MSE: 231915.515625
Validation MAE: 273.0476379394531
