In [1]:
# Packages used in the script
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import io
import gzip
import spacy
import re
from contractions import contractions_dict

In [None]:
df_spo = pd.read_csv (r'Path where the CSV file is stored\df_sports.csv')

In [204]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers

In [205]:
max_features = 1800
maxlen = 50

In [206]:
word_tokenizer = Tokenizer()

X = df_spo['sentences']
word_tokenizer.fit_on_texts(X)

X_encoded = word_tokenizer.texts_to_sequences(X) 

X_padded = pad_sequences(X_encoded, maxlen=maxlen, padding='post', truncating='post')

X = X_padded.astype('int')
X = np.array(X)

In [207]:
import itertools
tags = set(itertools.chain.from_iterable(df_spo.IOB))
print(tags)
n_tags = len(tags)

{'I_neg', 'O', 'I_pos', 'B_neg', 'B_pos'}


In [208]:
tag2idx = {t: i for i, t in enumerate(tags)}
tag2idx['O']

1

In [209]:
y = [[tag2idx[w] for w in s] for s in df_spo['IOB']]
y = pad_sequences(y, maxlen=maxlen, padding='post', truncating='post', value = tag2idx['O'])

In [210]:
y = to_categorical(y, dtype = 'int', num_classes= 5)

In [211]:
embedding_size  = 300  
vocabulary_size = len(word_tokenizer.word_index) + 1

In [212]:
embedding_weights = np.zeros((vocabulary_size, embedding_size))

In [213]:
# Total number of words
word2id = word_tokenizer.word_index

In [214]:
# Split in train and test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [215]:
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(4000, 50) (4000, 50, 5)
(1000, 50) (1000, 50, 5)


In [216]:
opt = optimizers.Adam(learning_rate=0.01, decay=1e-6)

In [217]:
tf.random.set_seed(1234)

In [218]:
lstm_model = keras.Sequential()
lstm_model.add(tf.keras.layers.Embedding(vocabulary_size, embedding_size, input_length = maxlen)) #The embedding layer
lstm_model.add(tf.keras.layers.LSTM(64, dropout=0.6, return_sequences=True)) #Our LSTM layer
lstm_model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5, activation='softmax')))
lstm_model.summary()

lstm_model.compile(opt, "categorical_crossentropy", metrics=["accuracy"])

history = lstm_model.fit(X_train, y_train, batch_size = 128, epochs=10) 

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 50, 300)           2224800   
_________________________________________________________________
lstm_14 (LSTM)               (None, 50, 64)            93440     
_________________________________________________________________
time_distributed_12 (TimeDis (None, 50, 5)             325       
Total params: 2,318,565
Trainable params: 2,318,565
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [219]:
loss, accuracy = lstm_model.evaluate(X_test, y_test, verbose = 1)
print('Loss: {0},\nAccuracy: {1}'.format(loss, accuracy))

Loss: 0.1022549420595169,
Accuracy: 0.9765400290489197


In [220]:
# Predict
y_pred_lstm = lstm_model.predict(X_test)

# Make hard classes - https://stackoverflow.com/questions/43672047/convert-probability-vector-into-target-vector-in-python
y_pred_lstm[y_pred_lstm > 0.5] = 1
y_pred_lstm[y_pred_lstm <= 0.5] = 0

y_pred_lstm = np.array(y_pred_lstm.astype('int'))

pred_lstm = np.array(tf.argmax(y_pred_lstm, axis = -1))
y_true = np.array(tf.argmax(y_test, axis = -1))

In [221]:
# Confusion matrix LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.multilabel_confusion_matrix.html
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_test.flatten(), y_pred_lstm.flatten())

array([[[ 48751,   1249],
        [  1119, 198881]],

       [[198881,   1119],
        [  1249,  48751]]])

In [222]:
# Classification report LSTM model
from sklearn.metrics import classification_report
print(classification_report(y_true.flatten(), pred_lstm.flatten()))

              precision    recall  f1-score   support

           0       0.01      0.05      0.01        21
           1       0.98      0.99      0.99     48780
           2       0.38      0.10      0.16       109
           3       0.63      0.15      0.24       162
           4       0.54      0.32      0.40       928

    accuracy                           0.98     50000
   macro avg       0.51      0.32      0.36     50000
weighted avg       0.97      0.98      0.97     50000



In [223]:
# Precision LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Precision
m = tf.keras.metrics.Precision()
m.update_state(y_test, y_pred_lstm)
precision = m.result().numpy()
print(precision)

0.97756165


In [224]:
# Recall LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Recall
m = tf.keras.metrics.Recall()
m.update_state(y_test, y_pred_lstm)
recall = m.result().numpy()
print(recall)

0.97502


In [225]:
# F1-score LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.976289180058759


In [226]:
#Bidirectional LSTM https://github.com/sergiovirahonda/TweetsSentimentAnalysis/blob/main/TweetsSentimentPredictions.ipynb
bilstm_model = keras.Sequential()
bilstm_model.add(tf.keras.layers.Embedding(vocabulary_size, embedding_size, input_length = maxlen))
bilstm_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,dropout=0.6, return_sequences=True)))
bilstm_model.add(tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(5, activation='softmax')))
bilstm_model.summary()

bilstm_model.compile(optimizer=opt,loss='categorical_crossentropy', metrics=["accuracy"])

history = bilstm_model.fit(X_train, y_train, batch_size = 128, epochs=10)

Model: "sequential_15"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 50, 300)           2224800   
_________________________________________________________________
bidirectional_6 (Bidirection (None, 50, 128)           186880    
_________________________________________________________________
time_distributed_13 (TimeDis (None, 50, 5)             645       
Total params: 2,412,325
Trainable params: 2,412,325
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [227]:
loss, accuracy = bilstm_model.evaluate(X_test, y_test, verbose = 1)
print("Loss: {0},\nAccuracy: {1}".format(loss, accuracy))

Loss: 0.09966423362493515,
Accuracy: 0.9751399755477905


In [228]:
# Predict
y_pred_bilstm = bilstm_model.predict(X_test)

# Make hard classes
y_pred_bilstm[y_pred_bilstm > 0.5] = 1
y_pred_bilstm[y_pred_bilstm <= 0.5] = 0

y_pred_bilstm = np.array(y_pred_bilstm.astype('int'))

pred_bilstm = np.array(tf.argmax(y_pred_bilstm, axis = -1))
y_true = np.array(tf.argmax(y_test, axis = -1))

In [229]:
# Confusion matrix LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.multilabel_confusion_matrix.html
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_test.flatten(), y_pred_bilstm.flatten())

array([[[ 48704,   1296],
        [  1181, 198819]],

       [[198819,   1181],
        [  1296,  48704]]])

In [230]:
# Classification report LSTM model
from sklearn.metrics import classification_report
print(classification_report(y_true.flatten(), pred_bilstm.flatten()))

              precision    recall  f1-score   support

           0       0.01      0.05      0.01        21
           1       0.99      0.99      0.99     48780
           2       0.35      0.17      0.22       109
           3       0.47      0.30      0.36       162
           4       0.50      0.50      0.50       928

    accuracy                           0.97     50000
   macro avg       0.46      0.40      0.42     50000
weighted avg       0.97      0.97      0.97     50000



In [231]:
# Precision LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Precision
m = tf.keras.metrics.Precision()
m.update_state(y_test, y_pred_bilstm)
precision = m.result().numpy()
print(precision)

0.9763256


In [232]:
# Recall LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Recall
m = tf.keras.metrics.Recall()
m.update_state(y_test, y_pred_bilstm)
recall = m.result().numpy()
print(recall)

0.97408


In [233]:
# F1-score LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.9752014951481518


In [234]:
#Add an attention-layer
from tensorflow.keras.layers import Dense, Lambda, Dot, Activation, Concatenate
from tensorflow.keras.layers import Layer

class Attention(Layer):

    def __init__(self, units=128, **kwargs):
        self.units = units
        super().__init__(**kwargs)

    def __call__(self, inputs):
        """
        Many-to-one attention mechanism for Keras.
        @param inputs: 3D tensor with shape (batch_size, time_steps, input_dim).
        @return: 2D tensor with shape (batch_size, 128)
        @author: felixhao28, philipperemy.
        """
        hidden_states = inputs
        hidden_size = int(hidden_states.shape[2])
        # Inside dense layer
        #              hidden_states            dot               W            =>           score_first_part
        # (batch_size, time_steps, hidden_size) dot (hidden_size, hidden_size) => (batch_size, time_steps, hidden_size)
        # W is the trainable weight matrix of attention Luong's multiplicative style score
        score_first_part = Dense(hidden_size, use_bias=False, name='attention_score_vec')(hidden_states)
        #            score_first_part           dot        last_hidden_state     => attention_weights
        # (batch_size, time_steps, hidden_size) dot   (batch_size, hidden_size)  => (batch_size, time_steps)
        h_t = Lambda(lambda x: x[:, -1, :], output_shape=(hidden_size,), name='last_hidden_state')(hidden_states)
        score = Dot(axes=[1, 2], name='attention_score')([h_t, score_first_part])
        attention_weights = Activation('softmax', name='attention_weight')(score)
        # (batch_size, time_steps, hidden_size) dot (batch_size, time_steps) => (batch_size, hidden_size)
        context_vector = Dot(axes=[1, 1], name='context_vector')([hidden_states, attention_weights])
        pre_activation = Concatenate(name='attention_output')([context_vector, h_t])
        attention_vector = Dense(self.units, use_bias=False, activation='tanh', name='attention_vector')(pre_activation)
        return attention_vector

    def get_config(self):
        return {'units': self.units}

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [235]:
# Attention-based LSTM model
import os
os.environ['TF_KERAS'] = '1'
from keras_self_attention import SeqSelfAttention

attentionlstm_model = keras.Sequential()
attentionlstm_model.add(tf.keras.layers.Embedding(vocabulary_size, embedding_size, input_length = maxlen))# weights = [embedding_weights], trainable = True))
attentionlstm_model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)))

attentionlstm_model.add(Layer(SeqSelfAttention(attention_activation='sigmoid')))

attentionlstm_model.add(tf.keras.layers.Dense(5, 'softmax'))

attentionlstm_model.summary()
attentionlstm_model.compile(optimizer=opt,loss='categorical_crossentropy', metrics=["accuracy"])

history = attentionlstm_model.fit(X_train, y_train, batch_size = 128, epochs=10)

Model: "sequential_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 50, 300)           2224800   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 50, 128)           186880    
_________________________________________________________________
layer_2 (Layer)              (None, 50, 128)           0         
_________________________________________________________________
dense_16 (Dense)             (None, 50, 5)             645       
Total params: 2,412,325
Trainable params: 2,412,325
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [236]:
loss, accuracy = attentionlstm_model.evaluate(X_test, y_test, verbose = 1)
print("Loss: {0},\nAccuracy: {1}".format(loss, accuracy))

Loss: 0.10252467542886734,
Accuracy: 0.9764800071716309


In [237]:
# Predict
y_pred_attentionlstm = attentionlstm_model.predict(X_test)

# Make hard classes
y_pred_attentionlstm[y_pred_attentionlstm > 0.5] = 1
y_pred_attentionlstm[y_pred_attentionlstm <= 0.5] = 0

y_pred_attentionlstm = np.array(y_pred_attentionlstm.astype('int'))

pred_attentionlstm = np.array(tf.argmax(y_pred_attentionlstm, axis = -1))
y_true = np.array(tf.argmax(y_test, axis = -1))

In [238]:
# Confusion matrix LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.multilabel_confusion_matrix.html
from sklearn.metrics import multilabel_confusion_matrix
multilabel_confusion_matrix(y_test.flatten(), y_pred_attentionlstm.flatten())

array([[[ 48760,   1240],
        [  1109, 198891]],

       [[198891,   1109],
        [  1240,  48760]]])

In [239]:
# Classification report LSTM model
from sklearn.metrics import classification_report
print(classification_report(y_true.flatten(), pred_attentionlstm.flatten()))

              precision    recall  f1-score   support

           0       0.02      0.10      0.03        21
           1       0.98      0.99      0.99     48780
           2       0.28      0.12      0.17       109
           3       0.56      0.15      0.23       162
           4       0.55      0.39      0.45       928

    accuracy                           0.98     50000
   macro avg       0.48      0.35      0.37     50000
weighted avg       0.97      0.98      0.97     50000



In [240]:
# Precision LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Precision
m = tf.keras.metrics.Precision()
m.update_state(y_test, y_pred_attentionlstm)
precision = m.result().numpy()
print(precision)

0.97776175


In [241]:
# Recall LSTM model - https://www.tensorflow.org/api_docs/python/tf/keras/metrics/Recall
m = tf.keras.metrics.Recall()
m.update_state(y_test, y_pred_attentionlstm)
recall = m.result().numpy()
print(recall)

0.9752


In [242]:
# F1-score LSTM model - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
F1 = 2 * (precision * recall) / (precision + recall)
print(F1)

0.9764792344086389
