# Seq2Vec Sentiment Modeling in Tensorflow

## 1.0 - Import Packages

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import regularizers
import matplotlib.pyplot as plt
import tensorflow_hub as hub
import pandas as pd
from sklearn.metrics import mean_absolute_error
from keras import backend as K
from keras.layers import Dropout
import os
import numpy as np

In [2]:
# Resolve the project layout relative to the directory the notebook runs from;
# the project root is taken to be the parent of that directory.
CUR_DIR = os.path.abspath(os.curdir)
ROOT_DIR = os.path.dirname(CUR_DIR)

# Top-level project folders.
IMAGES_DIR, DATA_DIR, MODELS_DIR, EVAL_DIR = (
    os.path.join(ROOT_DIR, folder)
    for folder in ("images", "data", "models", "evaluation")
)

# Evaluation artefacts live under <root>/evaluation.
MODEL_PERF_DIR, GRAPHS_DIR = (
    os.path.join(EVAL_DIR, folder)
    for folder in ("model_performance", "graphs")
)

# CSV file that the performance summaries are written to.
writepath = os.path.join(MODEL_PERF_DIR, "performance.csv")

## 1.1 Define Functions

In [3]:
def plot_loss(history,model):
    """
    Plot the per-epoch training and validation MAE stored in ``history``.

    The figure is titled with ``model.name``, saved as
    ``<MODEL_PERF_DIR>/<model.name>.png``, shown, and the saved path is printed.
    """
    # (key in history.history, legend label) for each curve, drawn in this order.
    curves = (('mae', 'training'), ('val_mae', 'val'))
    for metric_key, curve_label in curves:
        plt.plot(history.history[metric_key], label=curve_label)

    plt.xlabel('epoch')
    plt.ylabel('mae')
    plt.title(f'Loss for {model.name}')
    plt.legend(loc='upper right')

    figure_path = os.path.join(MODEL_PERF_DIR, f'{model.name}.png')
    plt.savefig(figure_path)
    plt.show()
    print(figure_path)

In [4]:
def relu_advanced(x):
    """Output activation: a ReLU whose result is capped at 5, the highest star rating.

    The lower bound is the ordinary ReLU floor of 0 (not 1), so outputs lie in [0, 5].
    """
    capped = K.relu(x, max_value=5)
    return capped

In [5]:
def transpose_df(df,reset_index,prefix):
    """Summarise predictions per star rating as a single-row, wide DataFrame.

    Args:
        df: DataFrame with a ``star`` column (grouping key) and a numeric
            ``prediction`` column.
        reset_index: despite its name, this flag selects the statistic:
            truthy -> per-star skewness of ``prediction``,
            falsy  -> per-star mean of ``prediction``.
        prefix: text placed in front of each generated column name.

    Returns:
        A DataFrame with one row labelled ``'prediction'`` and one column per
        star value, named ``'<prefix> <star> Star'``.
    """
    per_star = df.groupby('star')['prediction']
    # Plain truthiness instead of `== False` / `== True`, so every flag value
    # selects a branch and the result is always defined.
    stats = per_star.skew() if reset_index else per_star.mean()

    # The grouped Series is indexed by star; transposing its frame gives the
    # single 'prediction' row directly, with no header row to strip afterwards.
    out_df = stats.to_frame().T
    out_df.columns = [f'{prefix} {int(star)} Star' for star in stats.index]
    return out_df

In [6]:
def write_performance(model,mae,writepath,eval_df):
    """Write a one-row performance summary for ``model`` to the CSV at ``writepath``.

    The row holds the model name, its MAE and, for every star rating present in
    ``eval_df``, the mean and the skewness of the predictions (both obtained
    from ``transpose_df``).

    Args:
        model: object exposing a ``name`` attribute, stored as ``model_name``.
        mae: mean absolute error to record.
        writepath: destination CSV. If it does not exist it is created with a
            header row; otherwise the row is appended without a header.
        eval_df: DataFrame with ``star`` and ``prediction`` columns.
    """
    data = {
        'model_name': model.name,
        'mae': mae,
    }
    avg_df = transpose_df(eval_df, False, 'Average Prediction for')
    skew_df = transpose_df(eval_df, True, 'Prediction Skewness for')

    # Each summary frame has a single row labelled 'prediction'. Read it by
    # position with .iloc: an integer key on a label-indexed Series relies on a
    # deprecated positional fallback.
    for summary_df in (avg_df, skew_df):
        for col in summary_df.columns:
            data[col] = summary_df[col].iloc[0]
    print(data)
    out_df = pd.DataFrame(data, index=[0])

    # Check for the file once so the write mode and the header flag cannot disagree.
    file_exists = os.path.exists(writepath)
    # Create the target folder on first use; skipped for a bare file name,
    # whose dirname is the empty string.
    out_dir = os.path.dirname(writepath)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    out_df.to_csv(writepath, mode='a' if file_exists else 'w',
                  index=False, header=not file_exists)
    # print message
    print("Data appended successfully.")

In [7]:
# df = pd.DataFrame(eval_df.groupby('star')['prediction'].skew()).reset_index().T#.pivot(columns='star',values='prediction')
# new_header = df.iloc[0] #grab the first row for the header
# new_header = [f'skewness for {int(i)} star' for i in new_header]
# df = df[1:] #take the data less the header row
# df.columns = new_header
# # df.index=1
# # df3=pd.DataFrame({'nice':1, 'yes':2})
# # df3.index=1
# # pd.concat([df,df3],axis=1,ignore_index=True)
# df#['skewness for 1 star'][0]

In [8]:
def plot_score_distribution_by_group(model,eval_df):
    """Draw one histogram of ``prediction`` per ``star`` group and save the figure.

    The histograms are laid out on a 2x3 grid, the figure is saved as
    ``<MODEL_PERF_DIR>/prediction_distribution_<model.name>.png`` and then shown.
    """
    fig, axes = plt.subplots(nrows=2, ncols=3,
                             constrained_layout=True, figsize=(15, 10))
    # The grid has six slots but only five colours are defined below, so the
    # bottom-right axes is removed.
    fig.delaxes(axes[1][2])
    figure_title = f"Model Prediction Distribution by Stars for model: {model.name}"
    plt.text(x=0.5, y=0.94, s=figure_title, fontsize=18, ha="center", transform=fig.transFigure)
    plt.subplots_adjust(top=0.9, wspace=0.3)

    # One colour per group, assigned in the order groupby yields the groups.
    palette = ['black', 'midnightblue', 'darkgreen','mediumpurple','darkred']
    panels = zip(axes.flatten(), eval_df.groupby('star'))
    for idx, (ax, (name, subdf)) in enumerate(panels):
        subdf.hist('prediction', ax=ax, rwidth=0.9, color=palette[idx])
        ax.set_title(name)
        ax.set_xlabel('prediction')
        ax.set_ylabel('count')

    output_path = os.path.join(MODEL_PERF_DIR, f'prediction_distribution_{model.name}.png')
    plt.savefig(output_path)
    plt.show()

In [9]:
def performance_evaluation(X_test, y_test, model):
    """Score ``model`` on the test data and record the results.

    Steps: predict on ``X_test``, print the prediction range and the MAE,
    assemble a per-example frame (inputs, ``star``, ``prediction``,
    ``absolute_error``), write it to ``<DATA_DIR>/output/scoring/<model.name>``,
    plot the per-star prediction distributions and write the summary row to
    ``writepath``.

    ``X_test`` and ``y_test`` are merged on their index, and the merged frame
    must contain a ``star`` column.
    """
    predictions = model.predict(X_test)
    print(f'The prediction values range between {min(predictions)} and {max(predictions)}')
    mae = mean_absolute_error(y_test, predictions)
    print(f'Mean Absolute Error: {mae}')

    # Per-example table: inputs and labels joined on index, plus model output.
    eval_df = pd.merge(X_test, y_test, left_index=True, right_index=True)
    eval_df['prediction'] = predictions.ravel()
    eval_df['absolute_error'] = eval_df['prediction'].sub(eval_df['star']).abs()

    scoring_path = os.path.join(DATA_DIR, 'output', 'scoring', f'{model.name}')
    eval_df.to_csv(scoring_path)

    plot_score_distribution_by_group(model, eval_df)
    write_performance(model, mae, writepath, eval_df)
    print('Done')

## 1.2 Import Data

In [10]:
# Load the pre-split review data.
validation_df = pd.read_csv(os.path.join(DATA_DIR,'raw','validation.csv'))
training_df = pd.read_csv(os.path.join(DATA_DIR,'raw','training.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR,'raw','test.csv'))

# Features are the raw review text; the target is the star rating.
X_train = training_df['review']
y_train = training_df['star']
X_val = validation_df['review']
y_val = validation_df['star']
X_test = test_df['review']
y_test = test_df['star']

# Convert to tensorflow datasets
# Only the training data is shuffled.
# NOTE(review): a 1024-element buffer only shuffles locally; if training.csv is
# ordered (e.g. by star), consider a buffer as large as the dataset.
train_ds = tf.data.Dataset.from_tensor_slices((X_train,y_train)).shuffle(buffer_size=1024).batch(128)
# Validation and test data keep their file order: shuffling adds nothing to
# evaluation, and an unshuffled test_ds keeps predict(test_ds) aligned with y_test.
test_ds = tf.data.Dataset.from_tensor_slices((X_test,y_test)).batch(128)
val_ds = tf.data.Dataset.from_tensor_slices((X_val,y_val)).batch(128)

2022-03-14 07:53:08.761558: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-03-14 07:53:08.762190: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-03-14 07:53:08.762341: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (42b10f96906e): /proc/driver/nvidia/version does not exist
2022-03-14 07:53:08.770803: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## 2.0 Create embedding layer & Build simple model

In [11]:
handle = 'https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1'

# The module turns a batch of raw strings into one 20-dimensional vector per
# string. Shapes are given without the batch axis: each example is a scalar
# string (input_shape=[]) and each embedding has shape [20].
emb_layer = hub.KerasLayer(handle = handle, output_shape=[20],
               input_shape=[],dtype=tf.string,trainable=True, name='embed')

# Stop training once validation MAE has gone 3 epochs without improving.
callback = tf.keras.callbacks.EarlyStopping(monitor='val_mae', patience=3)

2022-03-14 07:53:08.992301: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [12]:
# Sanity check: load the same TF-Hub module directly and embed two identical tokens.
embed = hub.load(handle)
embeddings = embed(["hello","hello"])
# Last expression of the cell, so the resulting tensor is displayed.
embeddings

<tf.Tensor: shape=(2, 20), dtype=float32, numpy=
array([[ 0.52211034, -0.5292976 ,  0.33719632,  0.0902942 , -0.6266122 ,
         0.45860922, -0.70826894,  0.55458295,  1.1077714 ,  0.40197402,
        -1.5735748 ,  0.35205668,  0.8920213 , -0.05832118, -0.29793447,
        -0.2865127 ,  0.7564523 ,  0.6519507 , -0.7274325 , -0.45961407],
       [ 0.52211034, -0.5292976 ,  0.33719632,  0.0902942 , -0.6266122 ,
         0.45860922, -0.70826894,  0.55458295,  1.1077714 ,  0.40197402,
        -1.5735748 ,  0.35205668,  0.8920213 , -0.05832118, -0.29793447,
        -0.2865127 ,  0.7564523 ,  0.6519507 , -0.7274325 , -0.45961407]],
      dtype=float32)>

## Baseline

In [13]:
# emb_layer = hub.KerasLayer(handle = handle, output_shape=(20,1),
#                input_shape=(None,),dtype=tf.string,trainable=True, name='embed')

# base_model = keras.Sequential([
#         layers.Input(shape=(), name="Input", dtype=tf.string),
#         emb_layer,
#         layers.Dense(10,activation='relu'),
#         layers.Dropout(0.1),
#         layers.Dense(1,activation='relu')])

# base_model.compile(loss=tf.keras.losses.MeanSquaredError(),
#                    optimizer=keras.optimizers.Adam(lr=0.00005,decay=1e-6),
#                    metrics=['mse','mae'])

# base_model.summary()
# tf.keras.utils.plot_model(base_model,show_shapes=True, show_dtype=True,
#     show_layer_names=True, rankdir='TB', expand_nested=True, dpi=96)

In [None]:
# Upper bound on training epochs; early stopping normally ends training sooner.
epochs = 100

# Baseline regressor: sentence embedding -> 10-unit ReLU layer -> one output
# passed through the capped ReLU defined in relu_advanced.
base_model = keras.Sequential([
        layers.Input(shape=(), name="Input", dtype=tf.string),
        emb_layer,
        layers.Dense(10,activation='relu', dtype=tf.float32),
        layers.Dense(1,activation=relu_advanced)],name='2.0-Baseline-No-Regularization')

# `learning_rate` is the supported argument name (`lr` is a deprecated alias)
# and matches how the other models in this notebook configure Adam.
base_model.compile(loss=tf.keras.losses.MeanSquaredError(),
                   optimizer=keras.optimizers.Adam(learning_rate=0.0001,decay=1e-6),
                   metrics=['mse','mae'])


base_history = base_model.fit(train_ds,
                    epochs = epochs,
                    validation_data=val_ds,
                    callbacks=[callback],
                    verbose=1)



Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

In [None]:
# Plot and save the baseline's training vs. validation MAE per epoch.
plot_loss(base_history,base_model)

In [None]:
# Score the baseline on the test set: writes per-review scores, the per-star
# distribution plot and a summary row in the performance CSV.
performance_evaluation(X_test, y_test, base_model)

In [None]:
# tf.keras.utils.plot_model(base_model,show_shapes=True, show_dtype=True,
#     show_layer_names=True, rankdir='TB', expand_nested=True, dpi=96)

In [None]:
# plot_performance(base_history,2.0,'No-Regularization','Baseline')

In [None]:
# plt.plot(base_history.history['mae'], label='training')
# plt.plot(base_history.history['val_mae'], label='val')
# plt.xlabel('epoch')
# plt.ylabel('mae')
# plt.title('Baseline No Regularization')
# plt.legend(loc='upper right')
# plt.savefig(os.path.join(IMAGES_DIR,'2.0_No-Regularizer-Baseline.png'))
# plt.show()

In [None]:
# NOTE(review): same call as an earlier cell. write_performance appends to the
# performance CSV when it already exists, so running both cells records the
# baseline twice — confirm this cell is still needed.
performance_evaluation(X_test, y_test, base_model)

In [None]:
# y_pred_base = base_model.predict(X_test)
# y_pred_base.ravel()

In [None]:
# print(min(y_pred_base),max(y_pred_base))

In [None]:
# mean_absolute_error(y_test, y_pred_base)

In [None]:
# Save the trained baseline under MODELS_DIR, using the model's name as the path.
base_model.save(os.path.join(MODELS_DIR,base_model.name))

In [None]:
# new_model = tf.keras.models.load_model(os.path.join(MODELS_DIR,'2.1-LSTM_TFIDF'))
# new_model.summary()

In [None]:
# eval_df = pd.merge(X_test, y_test, left_index=True, right_index=True)
# eval_df['prediction'] = base_model.predict(eval_df.review).ravel()
# eval_df['absolute_error'] = (eval_df['prediction'] - eval_df['star']).abs()
# eval_df

In [None]:
# eval_df.groupby('star',as_index=False)['prediction'].mean()

## RNN

In [None]:
# Bare reference to the loaded hub module; as it is not the cell's last
# expression it produces no output.
embed
# Embed every training review and show the shape of the resulting tensor.
embed(X_train).shape

In [None]:
# callback = tf.keras.callbacks.EarlyStopping(monitor='val_mae', patience=3)
# SimpleRNN regressor. Reshape((20, 1)) presents the 20-dimensional sentence
# embedding to the recurrent layer as 20 time steps of one feature each.
# NOTE(review): emb_layer is the same trainable layer object used by the other
# models in this notebook, so its weights carry over between them — confirm
# that this is intended.
rnn_model = tf.keras.Sequential([
    emb_layer,
    layers.Reshape(target_shape= (20,1)),
    # layers.Dense(20,activation='relu'),
    # layers.SimpleRNN(20,return_sequences=True,input_shape=(None, 20), activation='relu'),
    # layers.SimpleRNN(15,input_shape=(None, 20), activation='tanh',return_sequences=True),
    layers.SimpleRNN(2, activation='tanh',return_sequences=False),
    # layers.SimpleRNN(10,input_shape=(None, 15), activation='tanh',return_sequences=False),

    # layers.Dense(15,activation='tanh'),
    layers.Dense(1,activation=relu_advanced)
],name='2.0-RNN-No-Regularization')

# Train on MSE; MAE is tracked as a metric because early stopping monitors val_mae.
rnn_model.compile(loss=tf.keras.losses.MeanSquaredError(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['mse','mae'])

# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epochs=100
rnn_history=rnn_model.fit(train_ds, 
                          epochs=num_epochs, 
                          validation_data = val_ds, 
                          callbacks=[callback],
                          verbose=1)

In [None]:
# Print the layer-by-layer summary of the RNN model.
rnn_model.summary()

In [None]:
# Plot and save the RNN's training vs. validation MAE per epoch.
plot_loss(rnn_history,rnn_model)

In [None]:
# Score the RNN on the test set: writes per-review scores, the per-star
# distribution plot and a summary row in the performance CSV.
performance_evaluation(X_test, y_test, rnn_model)

In [None]:
# plt.plot(rnn_history.history['mae'], label='training')
# plt.plot(rnn_history.history['val_mae'], label='val')
# plt.xlabel('epoch')
# plt.ylabel('mae')
# plt.title('RNN No Regularization')
# plt.legend(loc='upper right')
# plt.savefig(os.path.join(IMAGES_DIR,'2.0_No-Regularizer-RNN.png'))
# plt.show()

In [None]:
# y_pred_rnn = rnn_model.predict(test_ds)
# y_pred_rnn

In [None]:
# Save the trained RNN under MODELS_DIR, using the model's name as the path.
rnn_model.save(os.path.join(MODELS_DIR,rnn_model.name))
# mean_absolute_error(y_test, y_pred_rnn)

In [None]:
# max(y_pred)

In [None]:
# min(y_pred)

In [None]:
# rnn_eval_df = pd.merge(X_test, y_test, left_index=True, right_index=True)
# rnn_eval_df['prediction'] = rnn_model.predict(rnn_eval_df.review).ravel()
# rnn_eval_df['absolute_error'] = (rnn_eval_df['prediction'] - rnn_eval_df['star']).abs()
# rnn_eval_df

In [None]:
# max(rnn_eval_df.star)

In [None]:
# rnn_eval_df.groupby('star',as_index=False)['prediction'].mean()

## GRU

In [None]:
# gru_emb_layer = emb_layer = hub.KerasLayer(handle = handle, output_shape=[20],
#                input_shape=(),dtype=tf.string,trainable=True)
# GRU regressor. Reshape((20, 1)) presents the 20-dimensional sentence
# embedding to the recurrent layer as 20 time steps of one feature each.
# NOTE(review): emb_layer is the same trainable layer object used by the other
# models in this notebook, so its weights carry over between them — confirm
# that this is intended.
gru_model = tf.keras.Sequential([
    emb_layer,
    layers.Reshape(target_shape= (20,1)),
    # layers.Dense(20,activation='relu'),
    # layers.SimpleRNN(20,return_sequences=True,input_shape=(None, 20), activation='relu'),
    # layers.GRU(15,input_shape=(None, 20), activation='tanh',return_sequences=True),
    # layers.GRU(10,input_shape=(None, 15), activation='tanh',return_sequences=False),
    layers.GRU(10, activation='tanh',return_sequences=False),
    layers.Dense(5,activation='tanh'),
    layers.Dense(1,activation=relu_advanced)
],name='2.0-GRU-No-Regularization')
# Train on MSE; MAE is tracked as a metric because early stopping monitors val_mae.
gru_model.compile(loss='mse',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['mse','mae'])

# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epochs=100
gru_history=gru_model.fit(train_ds,
                          epochs=num_epochs, 
                          validation_data = val_ds, 
                          callbacks=[callback],
                          verbose=1)

In [None]:
# Save the trained GRU under MODELS_DIR, using the model's name as the path.
gru_model.save(os.path.join(MODELS_DIR,gru_model.name))

In [None]:
# y_pred_gru = gru_model.predict(test_ds)
# y_pred_gru

In [None]:
# gru_eval_df = pd.merge(X_test, y_test, left_index=True, right_index=True)
# gru_eval_df['prediction'] = gru_model.predict(gru_eval_df.review).ravel()
# gru_eval_df['absolute_error'] = (gru_eval_df['prediction'] - gru_eval_df['star']).abs()
# gru_eval_df

In [None]:
# Plot and save the GRU's training vs. validation MAE per epoch.
plot_loss(gru_history,gru_model)

In [None]:
# Score the GRU on the test set: writes per-review scores, the per-star
# distribution plot and a summary row in the performance CSV.
performance_evaluation(X_test, y_test, gru_model)

In [None]:
# plt.plot(gru_history.history['mae'], label='training')
# plt.plot(gru_history.history['val_mae'], label='val')
# plt.xlabel('epoch')
# plt.ylabel('mae')
# plt.title('GRU No Regularization')
# plt.legend(loc='upper right')
# plt.savefig(os.path.join(IMAGES_DIR,'2.0_No-Regularizer-GRU.png'))
# plt.show()

In [None]:
# gru_eval_df.groupby('star',as_index=False)['prediction'].mean()

## LSTM

In [None]:
# lstm_emb_layer = hub.KerasLayer(handle = handle, output_shape=[20],
#                input_shape=(),dtype=tf.string,trainable=True)
# LSTM regressor. Reshape((20, 1)) presents the 20-dimensional sentence
# embedding to the recurrent layer as 20 time steps of one feature each.
# NOTE(review): emb_layer is the same trainable layer object used by the other
# models in this notebook, so its weights carry over between them — confirm
# that this is intended.
lstm_model = tf.keras.Sequential([
    emb_layer,
    layers.Reshape(target_shape= (20,1)),
    # layers.Dense(20,activation='relu'),
    # layers.SimpleRNN(20,return_sequences=True,input_shape=(None, 20), activation='relu'),
    # layers.GRU(15,input_shape=(None, 20), activation='tanh',return_sequences=True),
    # layers.GRU(10,input_shape=(None, 15), activation='tanh',return_sequences=False),
    layers.LSTM(10, activation='tanh',return_sequences=False),
    layers.Dense(5,activation='tanh'),
    layers.Dense(1,activation=relu_advanced)
],name='2.0-LSTM-No-Regularization')
# Train on MSE; MAE is tracked as a metric because early stopping monitors val_mae.
lstm_model.compile(loss='mse',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['mse','mae'])

# Upper bound on epochs; the early-stopping callback may end training sooner.
num_epochs=100
lstm_history=lstm_model.fit(train_ds,
                          epochs=num_epochs, 
                          validation_data = val_ds, 
                          callbacks=[callback],
                          verbose=1)

In [None]:
# Score the LSTM on the test set: writes per-review scores, the per-star
# distribution plot and a summary row in the performance CSV.
performance_evaluation(X_test, y_test, lstm_model)

In [None]:
# Plot and save the LSTM's training vs. validation MAE per epoch.
plot_loss(lstm_history,lstm_model)

In [None]:
# Save the trained LSTM under MODELS_DIR, using the model's name as the path.
lstm_model.save(os.path.join(MODELS_DIR,lstm_model.name))

In [None]:
# plt.plot(lstm_history.history['mae'], label='training')
# plt.plot(lstm_history.history['val_mae'], label='val')
# plt.xlabel('epoch')
# plt.ylabel('mae')
# plt.title('LSTM No Regularization')
# plt.legend(loc='upper right')
# plt.savefig(os.path.join(IMAGES_DIR,'2.0_No-Regularizer-LSTM.png'))
# plt.show()

In [None]:
# lstm_eval_df = pd.merge(X_test, y_test, left_index=True, right_index=True)
# lstm_eval_df['prediction'] = lstm_model.predict(lstm_eval_df.review).ravel()
# lstm_eval_df['absolute_error'] = (lstm_eval_df['prediction'] - lstm_eval_df['star']).abs()
# lstm_eval_df

In [None]:
# lstm_eval_df.groupby('star',as_index=False)['prediction'].mean()