# Machine Learning For Sentiment Analysis

This notebook provides instructions for preparing data and constructing a machine learning model to perform sentiment analysis. The model architecture consists of an RNN-CNN-GRU joint architecture with a pre-trained/customized embedding utilizing word2vec. The code is suitable for execution on Jupyter Notebook or Google Colab.

In [1]:
# pandas version 1.1.4
from time import strftime
from pathlib import Path
import shutil
import tensorflow.keras.metrics as ms
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, concatenate, Bidirectional, GRU, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
import tensorflow
from keras.utils.vis_utils import plot_model
from keras.callbacks import EarlyStopping
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from keras.optimizers import Adam
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, Conv1D
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, GRU, Bidirectional, Dropout, Flatten
from keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.preprocessing.text import Tokenizer
import pickle
import numpy as np
from sklearn import utils
import multiprocessing
from gensim.models.doc2vec import TaggedDocument
from gensim.models.word2vec import Word2Vec
import gensim
import os
import sys
from tqdm import tqdm
import re
import pandas as pd

pd.__version__

2023-03-12 17:06:30.711564: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-12 17:06:30.949836: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-12 17:06:30.949865: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-12 17:06:31.916791: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

'1.5.3'

In [2]:
print(gensim.__version__)

if gensim.__version__ < '4.3.1':
    !pip install --upgrade gensim

    # gensim-4.3.1

4.3.1


In [3]:
    print(tf.__version__)

2.11.0


## Mount Google Drive
To Mount your Google Drive, follow the steps. If you are running it on local machine, mounting will be skipped.



In [4]:
# Location of the files


is_colab = "google.colab" in sys.modules

if is_colab:
    from google.colab import drive
    drive.mount('/content/drive/')

In [5]:
cwd = os.getcwd()
print(cwd)

/mnt/g/My Drive/Colab Notebooks/ta-sa/sa


In [6]:
is_colab

False

In [7]:
NUM_CLASSES = 2
num_classes = NUM_CLASSES

## Setup the data, models and notebooks paths




In [8]:

if is_colab:
    data_file_location = '/content/drive/My Drive/Colab Notebooks/ta-sa/sa/data/'
    model_file_location = '/content/drive/My Drive/Colab Notebooks/ta-sa/sa/models/'
else:
    data_file_location = './data/'
    model_file_location = './models/'


tag = 'March-2023'

# Trianing data
training_data_covid_file = 'COVID19_Tweet_Main_annotated_V3_clean_training.csv'
training_data_covid_path = data_file_location + training_data_covid_file

training_data_oc_file = 'OCTranspo-all-Data_5classes_clean.csv'
training_data_oc_path = data_file_location + training_data_oc_file


training_data_quayside_file = 'Quayside-all-Data_5classes_clean.csv'
training_data_quayside_path = data_file_location + training_data_quayside_file


# Testing data
testing_data_file = 'COVID19_Tweet_Main_annotated_V3_clean_testing.csv'
testing_data_path = data_file_location + testing_data_file


# Embedding data
embedding_data_file = 'COVID19DATASET_APRIL_10_Embedding_clean.csv'
embedding_data_path = data_file_location + embedding_data_file


COIVD19_Best_Model_file = 'COVID19-Best-model-' + \
    str(NUM_CLASSES) + 'classes_' + tag + '.hdf5'
COIVD19_Best_Model_path = model_file_location + COIVD19_Best_Model_file


tokenizer_file = 'COVID19-tokenizer-' + \
    str(NUM_CLASSES) + 'classes_' + tag + '.pickle'
tokenizer_path = model_file_location + tokenizer_file

In [9]:

# Define the mapping dictionary
mapping_dict = {0: 0, 1: 0, 2: 1, 3: 2, 4: 2}

# if NUM_CLASSES == 3:
#     # Apply the mapping dictionary to the column
#     training_data_covid['Score 5-Classe'] = training_data_covid['Score 5-Classe'].map(
#         mapping_dict)


def classes_mapping(NUM_CLASSES, mapping_dict, df):
    if NUM_CLASSES == 3:
        # Apply the mapping dictionary to the column
        df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)
    if NUM_CLASSES == 2:
        # Remove rows where col_mapped is 1
        # df['Score 5-Classe'] = df['Score 5-Classe'].astype(str)
        df = df[df['Score 5-Classe'] != 2]
        mapping_dict = {0: 0, 1: 0, 3: 1, 4: 1}
        df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)

    return df

## Loading data

Change the paths to reflect the dataset locations


In [10]:
# Load the training data 1

training_data_covid = pd.read_csv(
    training_data_covid_path, encoding="ISO-8859-1", on_bad_lines='skip', usecols=[1, 5])
training_data_covid.head(10)

training_data_covid = classes_mapping(
    NUM_CLASSES, mapping_dict, training_data_covid)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)


In [11]:
training_data_covid.info()
training_data_covid.columns

<class 'pandas.core.frame.DataFrame'>
Int64Index: 306 entries, 0 to 373
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   clean_text      306 non-null    object
 1   Score 5-Classe  306 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 7.2+ KB


Index(['clean_text', 'Score 5-Classe'], dtype='object')

In [12]:
training_data_covid["Score 5-Classe"].value_counts()

1    154
0    152
Name: Score 5-Classe, dtype: int64

In [13]:
training_data_covid["Score 5-Classe"].value_counts()

1    154
0    152
Name: Score 5-Classe, dtype: int64

In [14]:
text_col = "Text"

sentiment_col = 'Sentiment'

In [15]:
training_data_covid.columns = [text_col, sentiment_col]
training_data_covid.head(6)

Unnamed: 0,Text,Sentiment
0,planning today guess will eat can bean til apo...,0
2,perplex why government need certain amount peo...,0
3,problem with exempt border closure unless purp...,0
4,example community contribution influence time ...,1
5,wage subsidy make zero sense most small busine...,0
6,need more close will italy spain know,0


In [16]:
# Load the training data 2

training_data_oc = pd.read_csv(
    training_data_oc_path, encoding="ISO-8859-1", on_bad_lines='skip', usecols=[1, 2])
training_data_oc.head(2)

Unnamed: 0,Score 5-Classe,clean_text
0,4,first experience with train fantastic arrive m...
1,0,witness agent completely careless person distr...


In [17]:


training_data_oc = classes_mapping(NUM_CLASSES, mapping_dict, training_data_oc)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)


In [18]:
training_data_oc["Score 5-Classe"].value_counts()

1    713
0    629
Name: Score 5-Classe, dtype: int64

In [19]:
# change the order of the coloumns
training_data_oc = training_data_oc[['clean_text', 'Score 5-Classe']]

In [20]:


training_data_oc.columns = [text_col, sentiment_col]
training_data_oc.head(2)

Unnamed: 0,Text,Sentiment
0,first experience with train fantastic arrive m...,1
1,witness agent completely careless person distr...,0


In [21]:
# Load the training data 3

training_data_quayside = pd.read_csv(training_data_quayside_path, encoding="ISO-8859-1",
                                     on_bad_lines='skip', usecols=[1, 2])  # error_bad_lines=False


training_data_quayside.head(2)

Unnamed: 0,Score 5-Classe,clean_text
0,0,waterfront toronto limited experience digital ...
1,1,data will collect google use advertising purpose


In [22]:
# if NUM_CLASSES == 3:
#     # Apply the mapping dictionary to the column
#     training_data_quayside['Score 5-Classe'] = training_data_quayside['Score 5-Classe'].map(
#         mapping_dict)


training_data_quayside = classes_mapping(
    NUM_CLASSES, mapping_dict, training_data_quayside)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)


In [23]:
training_data_quayside["Score 5-Classe"].value_counts()

0    463
1    357
Name: Score 5-Classe, dtype: int64

In [24]:

# change the order of the coloumns
training_data_quayside = training_data_quayside[[
    'clean_text', 'Score 5-Classe']]

In [25]:

training_data_quayside.columns = [text_col, sentiment_col]
training_data_quayside.head(10)

Unnamed: 0,Text,Sentiment
0,waterfront toronto limited experience digital ...,0
1,data will collect google use advertising purpose,0
3,not_know why government commit much one company,0
4,excite project future,1
5,must stop sidewalk toronto wast money,0
6,city full sensor well wrong with,0
7,what will collect data guess google aim use ga...,0
8,why not_enough information regard project,0
10,realy excite look forward see smartcity,1
11,not_know why people not_like smart city mean g...,1


In [26]:
# Load Testing Data

testing_data = pd.read_csv(testing_data_path, encoding="ISO-8859-1",
                           on_bad_lines='skip', usecols=[1, 5])  # error_bad_lines=False
testing_data.head(2)

Unnamed: 0,clean_text,Score 5-Classe
0,read main reason company still operating due p...,1
1,lol chill man test many people possible right ...,3


In [27]:
# if NUM_CLASSES == 3:
#     # Apply the mapping dictionary to the column
#     testing_data['Score 5-Classe'] = testing_data['Score 5-Classe'].map(
#         mapping_dict)

testing_data = classes_mapping(NUM_CLASSES, mapping_dict, testing_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Score 5-Classe'] = df['Score 5-Classe'].map(mapping_dict)


In [28]:
testing_data["Score 5-Classe"].value_counts()

1    156
0    129
Name: Score 5-Classe, dtype: int64

In [29]:
testing_data.columns = [text_col, sentiment_col]
testing_data.head(2)

Unnamed: 0,Text,Sentiment
0,read main reason company still operating due p...,0
1,lol chill man test many people possible right ...,1


In [30]:
testing_data.dropna(inplace=True)

In [31]:
# testing_data[text_col][100:]

In [32]:
# Load Embedding Data

embedding_data = pd.read_csv(
    embedding_data_path, encoding="ISO-8859-1", on_bad_lines='skip', usecols=[4])
embedding_data.head(2)

Unnamed: 0,clean_text
0,health official across ontario ottawa prepare ...
1,government think slow


In [33]:
# import numpy as np
# embedding_data['clean_text'][1] = np.nan

In [34]:
embedding_data.dropna(inplace=True)

In [35]:
embedding_data.columns = [text_col]
embedding_data.head()

Unnamed: 0,Text
0,health official across ontario ottawa prepare ...
1,government think slow
2,coronavirus continue spread death toll what no...
3,more people wonder why coronavirus release pub...
4,link ontario dedicate webpage fill with inform...


## Data Preparation

We already did data cleaning in a separate file which include several cleaning operations.

In [36]:
# Concatenate all training datasets

training_data = pd.concat(
    [training_data_oc, training_data_quayside, training_data_covid])  # training_data_covid,

# training_data = pd.concat([training_data1, training_data2])

training_data.dropna(inplace=True)

# X_train
x_train = training_data[text_col]
x_train_covid = training_data_covid[text_col]

# y_train
y_train = training_data[sentiment_col]
y_train_covid = training_data_covid[sentiment_col]

In [37]:
testing_data.dropna(inplace=True)

x_test = testing_data[text_col]

y_test = testing_data[sentiment_col]

In [38]:
# Concatenate the text of all datasets so we can use them for Word embedding (word2vec) and tokenization. 
# The testing data will not be used in training


x = embedding_data[text_col]

word_embedding_text = pd.concat(
    [x,  x_train, x_test], ignore_index=True)  # x_train_covid,

type(word_embedding_text)
word_embedding_text[0]

# Word_embedding_text = Word_embedding_text.drop([3240, 9224])

# Word_embedding_text.to_csv(data_file_location + 'Word_embedding_text.csv')

'health official across ontario ottawa prepare possible spread coronavirus ctv report'

In [39]:
# Word_embedding_text.drop([0, 1])

word_embedding_text[3240]

'more information federal government economic response plan available read more'

In [40]:
# Word_embedding_text.head()

### Word2Vec
We will train two Word2Vec models using Continuous Bag Of Words (CBOW) and Skip Gram models. For each model, each word has a vector size of 100. So, the vector representation of each word has a dimension of 200.

In [41]:
#!pip install -U gensim
# super_data.CleanText=super_data.CleanText.astype(str) # Prevent pandas form automatically converts input to float

In [42]:


def labelize_documents(docs):
    labelized_docs = []
    for ind, doc in zip(docs.index, docs):
        labelized_docs.append(TaggedDocument(
            doc.split(), ["doc_id" + '_%s' % ind]))
    return labelized_docs


# all_x = pd.concat([x_train, x_test])
text_doc_id = labelize_documents(word_embedding_text)

In [43]:
text_doc_id[0][0]

['health',
 'official',
 'across',
 'ontario',
 'ottawa',
 'prepare',
 'possible',
 'spread',
 'coronavirus',
 'ctv',
 'report']

### CBOW

In [44]:
# CBOW
cores = multiprocessing.cpu_count()
# Continuous Bag Of Words
model_cbow = Word2Vec(sg=0, vector_size=100, negative=20, window=5,
                      min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
# sg ({0, 1}, optional) – Training algorithm: 1 for skip-gram; otherwise CBOW.
# size/vector_size (int, optional) – Dimensionality of the word vectors.
# window (int, optional) – Maximum distance between the current and predicted word within a sentence.
# min_count (int, optional) – Ignores all words with total frequency lower than this.
# workers (int, optional) – Use these many worker threads to train the model (=faster training with multicore machines).
# negative (int, optional) – If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used..The paper says that selecting 5-20 words works well for smaller datasets, and you can get away with only 2-5 words for large datasets.
# alpha (float, optional) – The initial learning rate.
# min_alpha (float, optional) – Learning rate will linearly drop to min_alpha as training progresses.

# build_vocab: Build vocabulary from a sequence of sentences
model_cbow.build_vocab([x.words for x in tqdm(text_doc_id)])

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 16893/16893 [00:00<00:00, 2166469.27it/s]


In [45]:
cores

8

`train`: Update the model’s neural weights from a sequence of sentences.
 total_examples (int) – Count of sentences.
`total_words` (int) – Count of raw words in sentences.
`epochs` (int) – Number of iterations (epochs) over the corpus.
`start_alpha` (float, optional) – Initial learning rate. If supplied, replaces the starting alpha from the constructor, for this one call to`train()`. Use only if making multiple calls to train(), when you want to manage the alpha learning-rate yourself (not recommended).
`end_alpha` (float, optional) – Final learning rate. Drops linearly from start_alpha. If supplied, this replaces the final `min_alpha` from the constructor, for this one call to train(). Use only if making multiple calls to train(), when you want to manage the alpha learning-rate yourself (not recommended).
`word_count` (int, optional) – Count of words already trained. Set this to 0 for the usual case of training on all words in sentences.
`queue_factor` (int, optional) – Multiplier for size of queue (number of workers * queue_factor).
`report_delay` (float, optional) – Seconds to wait before reporting progress.

In [46]:
%%time
for iteration in range(15):
    model_cbow.train(utils.shuffle([x.words for x in text_doc_id]), total_examples=len(
        text_doc_id), epochs=1)  # tqdm(text_doc_id)
    model_cbow.alpha -= 0.002
    model_cbow.min_alpha = model_cbow.alpha

CPU times: user 17.6 s, sys: 545 ms, total: 18.1 s
Wall time: 3.71 s


### Skip Gram

In [47]:
# Skip Gram
model_skip_gram = Word2Vec(sg=1, vector_size=100, negative=20,
                           window=5, min_count=2, workers=cores, alpha=0.02)
model_skip_gram.build_vocab([x.words for x in tqdm(text_doc_id)])

100%|█████████████████████████████████████████████████████████████████| 16893/16893 [00:00<00:00, 1706553.08it/s]


In [48]:
%%time
for epoch in range(15):
    model_skip_gram.train(utils.shuffle([x.words for x in
                                         text_doc_id]), total_examples=len(text_doc_id), epochs=1)  # tqdm(text_doc_id)
    model_skip_gram.alpha -= 0.002
    # print(model_ug_sg.alpha)
    model_skip_gram.min_alpha = model_skip_gram.alpha

CPU times: user 53.8 s, sys: 230 ms, total: 54 s
Wall time: 8.41 s


### Embeddings with CBOW and Skip-Gram

In [49]:

embeddings_index = {}
for w in model_cbow.wv.key_to_index.keys():
    embeddings_index[w] = np.append(model_cbow.wv[w], model_skip_gram.wv[w])
print('Found %s word vectors.' % len(embeddings_index))
# embeddings_index

Found 6833 word vectors.


### Tokenizer

We build the the tokenizer and save it to be used later.


In [50]:
#!pip install tensorflow
#!pip install keras


MAX_WORDS = 60000
# num_words = MAX_WORDS ; num_words = None
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(word_embedding_text)

x_train_seq = tokenizer.texts_to_sequences(x_train)
x_train_covid_seq = tokenizer.texts_to_sequences(x_train_covid)

x_test_seq = tokenizer.texts_to_sequences(x_test)

# saving tokenizer
with open(tokenizer_path, 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Encoding the data

We use one-hot encoding, also called categorical encoding. It is widely used for categorical data.


In [51]:

y_train_labels = to_categorical(y_train)
y_train_covid_labels = to_categorical(y_train_covid)

y_test_labels = to_categorical(y_test)

In [52]:
y_train_labels[2]

array([1., 0.], dtype=float32)

### Sequence Padding
``pad_sequences`` is used to ensure that all sequences in a list have the same length. By default (``maxlen=None``) this is done by padding 0 in the beginning of each sequence until each sequence has the same length as the longest sequence.

In [53]:
# from keras.preprocessing.sequence import pad_sequences

# pad_sequences = ""
# from keras.utils import Sequence
MAX_LENGTH = 40   # the maximum length of words per tweet.

x_train_padded = pad_sequences(x_train_seq, maxlen=MAX_LENGTH)  # maxlen=MAXLEN
y_train_covid_padded = pad_sequences(
    x_train_covid_seq, maxlen=MAX_LENGTH)  # maxlen=MAXLEN

x_test_padded = pad_sequences(x_test_seq, maxlen=MAX_LENGTH)


# len(padded_test[0])
# padded_test[0]

In [54]:
embed_size = 200

# maximum number of words kept after tokenization based on their word frequency
embed_matrix = np.zeros((MAX_WORDS, embed_size))
for word, i in tokenizer.word_index.items():
    if i >= MAX_WORDS:
        continue
    embed_vector = embeddings_index.get(word)
    if embed_vector is not None:
        embed_matrix[i] = embed_vector

In [55]:


my_metrics = [ms.Precision(thresholds=0.5),
              ms.Recall(thresholds=0.5), ms.CategoricalAccuracy(), ms.AUC()]


def get_model(my_metrics):
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    layer = Embedding(MAX_WORDS, 200, weights=[
                      embed_matrix], input_length=MAX_LENGTH, trainable=True)(inpt)
    # Spatial dropout layer I:
    layer = SpatialDropout1D(0.3)(layer)
    # Bidirectional RNN (GRU) layer:
    layer = Bidirectional(GRU(100, return_sequences=True))(layer)
    # Convolutional layer:
    layer = Conv1D(128, kernel_size=2, padding="valid",
                   kernel_initializer="he_uniform")(layer)
    #  Pooling layers:
    avg_pool = GlobalAveragePooling1D()(layer)
    max_pool = GlobalMaxPooling1D()(layer)
    conc = concatenate([avg_pool, max_pool])
    # Dropout layer II:
    conc = Dropout(0.3)(conc)
    # DNN layer:
    den_layer = Dense(64, activation='relu')(conc)
    outp = Dense(num_classes, activation="softmax")(
        den_layer)  # sigmoid; softmax
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimozer, and metrics
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(
        learning_rate=0.001), metrics=my_metrics)  # Adam, Nadam, SGD, Adamax
    return model

2023-03-12 17:06:48.011394: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2023-03-12 17:06:48.011453: W tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:265] failed call to cuInit: UNKNOWN ERROR (303)
2023-03-12 17:06:48.011487: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (IoT): /proc/driver/nvidia/version does not exist
2023-03-12 17:06:48.011725: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [56]:


def get_model_b():
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    layer = Embedding(MAX_WORDS, 200, weights=[
                      embed_matrix], input_length=MAX_LENGTH, trainable=True)(inpt)

    # Convolutional layer:
    layer = Conv1D(128, kernel_size=2, padding="valid",
                   kernel_initializer="he_uniform")(layer)
    # layer = Flatten()(layer)
    # Spatial dropout layer I:
    layer = Dropout(0.3)(layer)

    # Bidirectional RNN (GRU) layer:
    layer = Bidirectional(GRU(100, return_sequences=True))(layer)
    #  Pooling layers:
    avg_pool = GlobalAveragePooling1D()(layer)
    max_pool = GlobalMaxPooling1D()(layer)
    conc = concatenate([avg_pool, max_pool])
    # Dropout layer II:
    conc = Dropout(0.3)(conc)
    # DNN layer:
    den_layer = Dense(64, activation='relu')(conc)
    outp = Dense(num_classes, activation="softmax")(
        den_layer)  # sigmoid; softmax
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimozer, and metrics
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Nadam(
        learning_rate=0.001), metrics=[tf.keras.metrics.Precision(thresholds=0.4),
                                       tf.keras.metrics.Recall(thresholds=0.4), tf.keras.metrics.CategoricalAccuracy(), tf.keras.metrics.AUC()])  # Adam, Nadam, SGD,  ['categorical_accuracy']
    return model

In [57]:


def get_model4():
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    layer = Embedding(MAX_WORDS, 200, weights=[
                      embed_matrix], input_length=MAX_LENGTH, trainable=True)(inpt)

    # Bidirectional RNN (GRU) layer:
    layer = Bidirectional(GRU(100, return_sequences=True))(layer)
    # Convolutional layer:
    layer = Conv1D(128, kernel_size=2, padding="valid",
                   kernel_initializer="he_uniform")(layer)
    #  Pooling layers:
    avg_pool = GlobalAveragePooling1D()(layer)
    # max_pool = GlobalMaxPooling1D()(layer)
   # conc = concatenate([avg_pool, max_pool])

    # DNN layer:
    # den_layer = Dense(64, activation='relu')(conc)
    outp = Dense(num_classes, activation="softmax")(
        avg_pool)  # sigmoid; softmax
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimozer, and metrics
    model.compile(loss='categorical_crossentropy', optimizer=Adam(
        lr=0.001), metrics=['categorical_accuracy'])  # categorical_crossentropy
    return model

In [58]:
def get_model1():
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    # layer = Embedding(MAX_WORDS, 200, weights=[embed_matrix], input_length=MAXLEN, trainable=True)(inpt)

    outp = Dense(num_classes, activation="softmax")(inpt)  # sigmoid; softmax
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimozer, and metrics
    model.compile(loss='categorical_crossentropy', optimizer=Adam(
        lr=0.001), metrics=['categorical_accuracy'])  # categorical_crossentropy
    return model

In [59]:
def get_model2():
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    layer = Embedding(MAX_WORDS, 200, weights=[
                      embed_matrix], input_length=MAX_LENGTH, trainable=True)(inpt)
    # Spatial dropout layer I:
    # layer = SpatialDropout1D(0.3)(layer)
    # Bidirectional RNN (GRU) layer:
    # layer = Bidirectional(GRU(100, return_sequences=True))(layer)
    # Convolutional layer:
    layer = Conv1D(128, kernel_size=2, padding="valid",
                   kernel_initializer="he_uniform")(layer)
    #  Pooling layers:
    avg_pool = GlobalAveragePooling1D()(layer)
    # max_pool = GlobalMaxPooling1D()(layer)
    # conc = concatenate([avg_pool, max_pool])
    # Spatial dropout layer II:
    # conc = Dropout(0.3)(conc)
    # DNN layer:
    # den_layer = Dense(64, activation='relu')(conc)
    outp = Dense(num_classes, activation="softmax")(
        avg_pool)  # sigmoid; softmax
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimozer, and metrics
    model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(
        lr=0.001), metrics=['categorical_accuracy'])  # categorical_crossentropy
    return model

In [60]:

# used_model = get_model_b()
# used_model = get_LSTM_model()
# used_model = get_CNN_LSTM_model()
filepath = COIVD19_Best_Model_path


# /usr/local/lib/python3.8/dist-packages/keras/optimizers/optimizer_v2/adam.py:117: UserWarning: The `lr` argument is deprecated, use `learning_rate` instead.

In [61]:
if "google.colab" in sys.modules:
  !pip install -q -U tensorboard-plugin-profile

In [62]:
    shutil.rmtree("my_logs", ignore_errors=True)

In [63]:


def get_run_logdir(root_logdir="my_logs"):
    return Path(root_logdir) / strftime("run_%Y_%m_%d_%H_%M_%S")

# run_logdir = get_run_logdir()

In [64]:

# tensorboard_cb = tf.keras.callbacks.TensorBoard(run_logdir, profile_batch=(100, 200))
checkpoint_path = os.path.join(model_file_location, 'checkpoints',
                               f'COVID19-best-model-{num_classes}-classes'+'.weights.epoch-{epoch:02d}-val_loss-{val_loss:.2f}-val_categorical_accuracy-{val_categorical_accuracy:.2f}.hdf5')

# I want to stop using checkpoint_path to save space in gDrive
checkpoint_path = COIVD19_Best_Model_path

model_checkpoint = ModelCheckpoint(
    checkpoint_path, monitor='val_categorical_accuracy', verbose=1, save_best_only=True, mode='max')


early_stop_callback_1 = EarlyStopping(
    monitor='val_loss', mode='min', min_delta=0.001, patience=5, verbose=1)

early_stop_callback_2 = EarlyStopping(
    monitor='val_categorical_accuracy', mode='max', patience=5, verbose=1)


my_callbacks = [model_checkpoint, early_stop_callback_1,
                early_stop_callback_2]  # , tensorboard_cb]

In [65]:

used_model = get_model(my_metrics)
history = used_model.fit(x=x_train_padded, y=y_train_labels, validation_data=(x_test_padded, y_test_labels), batch_size=128,
                         callbacks=my_callbacks, epochs=20, verbose=1)

Epoch 1/20
Epoch 1: val_categorical_accuracy improved from -inf to 0.68772, saving model to ./models/COVID19-Best-model-2classes_March-2023.hdf5
Epoch 2/20
Epoch 2: val_categorical_accuracy improved from 0.68772 to 0.78596, saving model to ./models/COVID19-Best-model-2classes_March-2023.hdf5
Epoch 3/20
Epoch 3: val_categorical_accuracy improved from 0.78596 to 0.80702, saving model to ./models/COVID19-Best-model-2classes_March-2023.hdf5
Epoch 4/20
Epoch 4: val_categorical_accuracy did not improve from 0.80702
Epoch 5/20
Epoch 5: val_categorical_accuracy did not improve from 0.80702
Epoch 6/20
Epoch 6: val_categorical_accuracy improved from 0.80702 to 0.82807, saving model to ./models/COVID19-Best-model-2classes_March-2023.hdf5
Epoch 7/20
Epoch 7: val_categorical_accuracy did not improve from 0.82807
Epoch 8/20
Epoch 8: val_categorical_accuracy did not improve from 0.82807
Epoch 8: early stopping


In [66]:
# %load_ext tensorboard
# %tensorboard --logdir=./my_logs

In [67]:
# extra code

if "google.colab" in sys.modules:
    from google.colab import output

    output.serve_kernel_port_as_window(6006)
else:
    from IPython.display import display, HTML

    display(HTML('<a href="http://localhost:6006/">http://localhost:6006/</a>'))

In [68]:
# freez layers
for layer in used_model.layers[0:-1]:
    layer.trainable = False

In [69]:

for layer in used_model.layers[0:-1]:
    layer.trainable = False

used_model.compile(loss='categorical_crossentropy',
                   optimizer=tf.keras.optimizers.Adam(
                       learning_rate=0.001), metrics=my_metrics)  # Adam, Nadam, SGD, Adamax

history = used_model.fit(x=y_train_covid_padded, y=y_train_covid_labels,
                         validation_data=(
                             x_test_padded, y_test_labels), batch_size=128,
                         callbacks=my_callbacks, epochs=20, verbose=1)

Epoch 1/20
Epoch 1: val_categorical_accuracy did not improve from 0.82807
Epoch 2/20
Epoch 2: val_categorical_accuracy did not improve from 0.82807
Epoch 3/20
Epoch 3: val_categorical_accuracy did not improve from 0.82807
Epoch 4/20
Epoch 4: val_categorical_accuracy did not improve from 0.82807
Epoch 5/20
Epoch 5: val_categorical_accuracy did not improve from 0.82807
Epoch 6/20
Epoch 6: val_categorical_accuracy did not improve from 0.82807
Epoch 7/20
Epoch 7: val_categorical_accuracy did not improve from 0.82807
Epoch 8/20
Epoch 8: val_categorical_accuracy did not improve from 0.82807
Epoch 8: early stopping


In [70]:
# history = used_model.fit(x=padded_train, y=y_train_labels, validation_data=(padded_test, y_test_labels), batch_size=128,
#     callbacks=my_callbacks, epochs=2, verbose=1)

In [71]:
# from sklearn.metrics import accuracy_score

# # Predict class probabilities for test set
# y_prob = used_model.predict(padded_test)

# # Convert probabilities to class labels
# y_pred = np.argmax(y_prob, axis=1)

# # Calculate accuracy for each class
# class_acc = []
# for c in range(num_classes):
#     idx = np.where(y_test_labels.argmax(axis=1) == c)[0]
#     acc = accuracy_score(y_test_labels[idx].argmax(axis=1), y_pred[idx])
#     class_acc.append(acc)

# # Calculate weighted accuracy
# weighted_acc = np.average(class_acc, weights=np.bincount(y_test_labels.argmax(axis=1)))

# weighted_acc

In [72]:
used_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 40)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 40, 200)      12000000    ['input_1[0][0]']                
                                                                                                  
 spatial_dropout1d (SpatialDrop  (None, 40, 200)     0           ['embedding[0][0]']              
 out1D)                                                                                           
                                                                                                  
 bidirectional (Bidirectional)  (None, 40, 200)      181200      ['spatial_dropout1d[0][0]']  

In [73]:

plot_model(used_model, to_file='model_plot.png',
           show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [74]:
#!pip install pydot

In [75]:
# used_model = get_model_s()


# history = used_model.fit(x=padded_train, y=y_train_labels, validation_data=(padded_test, y_test_labels), batch_size=128,
#     callbacks=my_callbacks, epochs=20, verbose=1)

In [76]:
# used_model.add(Dense(3, activation="softmax"))

# last_layer = used_model.get_layer(index=-1)
# last_layer.units = 1

# last_layer.output_shape

In [77]:
def create_model(num_layers=1, num_units=100, activation='relu', optimizer='nadam', kernel_initializer='glorot_uniform'):
    # Input layer:
    inpt = Input(shape=(MAX_LENGTH, ))
    # Emnedding Layer:
    layer = Embedding(MAX_WORDS, 200, weights=[
                      embed_matrix], input_length=MAX_LENGTH, trainable=True)(inpt)
    # Spatial dropout layer I:
    layer = SpatialDropout1D(0.3)(layer)
    # Bidirectional RNN (GRU) layer:
    for i in range(num_layers):
        layer = Bidirectional(GRU(num_units, return_sequences=True))(layer)
    # Convolutional layer:
    layer = Conv1D(128, kernel_size=2, padding="valid",
                   kernel_initializer=kernel_initializer)(layer)
    #  Pooling layers:
    avg_pool = GlobalAveragePooling1D()(layer)
    max_pool = GlobalMaxPooling1D()(layer)
    conc = concatenate([avg_pool, max_pool])
    # Dropout layer II:
    conc = Dropout(0.3)(conc)
    # DNN layer:
    den_layer = Dense(64, activation=activation)(conc)
    outp = Dense(num_classes, activation="softmax")(den_layer)
    # Complete model
    model = Model(inputs=inpt, outputs=outp)
    # Defining the loss function, optimizer, and metrics
    if optimizer == 'adam':
        opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    elif optimizer == 'nadam':
        opt = tf.keras.optimizers.Nadam(learning_rate=0.001)
    elif optimizer == 'sgd':
        opt = tf.keras.optimizers.SGD(learning_rate=0.001)
    elif optimizer == 'adamax':
        opt = tf.keras.optimizers.Adamax(learning_rate=0.001)
    else:
        raise ValueError('Invalid optimizer')
    model.compile(loss='categorical_crossentropy', optimizer=opt,
                  metrics=['categorical_accuracy'])
    return model

In [78]:
# from scikeras.wrappers import KerasClassifier
# !pip install scikeras[tensorflow]

In [79]:


# Define the parameter grid

def hyperparameters_tuning():

    param_grid = {
        'num_layers': [1],
        'num_units': [100],
        'activation': ['relu', 'selu', 'elu'],
        'optimizer': ['adam', 'nadam', 'adamax'],
        'kernel_initializer': ['he_uniform', 'glorot_uniform', 'uniform']
    }

    # Create the Keras model
    model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32)

    # Create the grid search object
    grid_search = GridSearchCV(
        estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)

    # Run the grid search
    grid_result = grid_search.fit(
        x_train_padded, y_train_labels, validation_data=(x_test_padded, y_test_labels))

    # Print the best results
    print('Best score:', grid_result.best_score_)
    print('Best parameters:', grid_result.best_params_)

    # Save the best model
    best_model = grid_result.best_estimator_.model
    best_model.save('best_model_GridSearchCV.h5')
    # Best parameters:  {'activation': 'relu', 'kernel_initializer': 'he_uniform', 'num_layers': 1, 'num_units': 100, 'optimizer': 'adam'}

    return best_model, grid_result

In [80]:
# Print the best parameters and score

if __name__ == '__main__':
    print("hyperparameters tuning ... ")
    # best_model, grid_result = hyperparameters_tuning()
    # print("Best parameters: ", grid_result.best_params_)
    # print("Best score: ", grid_result.best_score_)
    # with open('best_model_hyperparameters.txt', 'w') as f:
    #     f.write(str(grid_result.best_params_))

hyperparameters tuning ... 
