In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

from tensorflow.keras.losses import binary_crossentropy

2021-11-14 21:39:07.842452: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:39:07.842495: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def save(name, words, vectors, path='.'):
    with open(f'{path}/{name}.txt', 'w+', encoding='utf-8') as doc:
        for word, vector in zip(words, vectors):
            doc.write(word + ' ' + ' '.join(str(value) for value in vector))
            doc.write('\n')

In [3]:
import os
import pickle

def load_embeddings(file_name, vocabulary):
    """
    Loads word embeddings from the file with the given name.
    :param file_name: name of the file containing word embeddings
    :type file_name: str
    :param vocabulary: captions vocabulary
    :type vocabulary: numpy.array
    :return: word embeddings
    :rtype: dict
    """
    embeddings = dict()
    with open(file_name, 'r', encoding='utf-8') as doc:
        line = doc.readline()
        while line != '':
            line = line.rstrip('\n').lower()
            parts = line.split(' ')
            vals = np.array(parts[1:], dtype=np.float)
            if parts[0] in vocabulary:
                embeddings[parts[0]] = vals
            line = doc.readline()
    return embeddings


def load_embedding_weights(vocabulary, embedding_size, embedding_type, path='.'):
    print("local")
    """
    Creates and loads embedding weights.
    :param vocabulary: vocabulary
    :type vocabulary: numpy.array
    :param embedding_size: embedding size
    :type embedding_size: int
    :param embedding_type: type of the pre-trained embeddings
    :type embedding_type: string
    :return: embedding weights
    :rtype: numpy.array
    """
    if os.path.exists(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl'):
        with open(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl', 'rb') as f:
            embedding_matrix = pickle.load(f)
    else:
        print('Creating embedding weights...')
        if embedding_type == 'glove':
            embeddings = load_embeddings(f'{path}/glove.6B.{embedding_size}d.txt', vocabulary)
        else:
          embeddings = load_embeddings(f'{path}/word2vecSG.iSarcasamEval.{embedding_size}d.txt', vocabulary)
        embedding_matrix = np.zeros((len(vocabulary), embedding_size))
        for i in range(len(vocabulary)):
            if vocabulary[i] in embeddings.keys():
                embedding_matrix[i] = embeddings[vocabulary[i]]
            else:
                embedding_matrix[i] = np.random.standard_normal(embedding_size)
        with open(f'{path}/embedding_matrix_{embedding_type}_{embedding_size}.pkl', 'wb') as f:
            pickle.dump(embedding_matrix, f)
    return embedding_matrix

from tensorflow.keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [4]:
def create_vocabulary(sentence_tokens):
    vocabulary = set()
    for tokens in sentence_tokens:
        vocabulary.update(tokens)

    vocabulary = list(vocabulary)
    word_to_id = {word: index for word, index in zip(vocabulary, range(len(vocabulary)))}
    return vocabulary, word_to_id

# Load Dataset

In [5]:
df = pd.read_csv('data/trial.csv',delimiter="	")
df.columns = ['file_name', 'misogynous', 'shaming', 'stereotype', 'objectification', 'violence', 'text_transcription']
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,text_transcription
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...


# Create Embeddings

In [6]:
from nltk import word_tokenize
from nltk.corpus import stopwords
ENGLISH_STOPWORDS = stopwords.words('english')

def remove_punctuation(token:str)->str:
    punctuation_regex = '!"#$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return ' '.join(word.strip(punctuation_regex) for word in token.split())

def nlp_pipeline(token:str) -> str:
    token = remove_punctuation(token)
    tokens = word_tokenize(token.lower())
    tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
    return tokens

df['text_transcription_tokens'] = df['text_transcription'].apply(lambda x: nlp_pipeline(x))
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,text_transcription,text_transcription_tokens
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it...","[dad, burn, jon, snow, stop, dad, know, happen..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...,"[may, mixcommunication, decorator, happy, birt..."
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat,"[n't, sold, boat]"
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad","[bitches, like, fault, made, mad]"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...,"[find, picture, 4, girls, together, fb, make, ..."


## Word2Vec

In [7]:
sentences = df['text_transcription_tokens'].values

def build_embeddings(model_name, sentences, size, path):
    if model_name == 'word2vec':
        model = Word2Vec(sentences, vector_size=size, min_count=1, window=5, sg=1)
        vectors = model.wv.vectors      
        words = model.wv.index_to_key

        save(f'{model_name}SG.iSarcasamEval.{size}d', words, vectors,path=path)

for size in [10, 50, 100]:
    build_embeddings('word2vec', sentences, size=size, path='.')

# Task 1

## Tokenization and vocabulary creation

In [8]:
sentences = df['text_transcription_tokens'].values

vocabulary, word_to_id = create_vocabulary(sentences)
vocabulary_size = len(vocabulary)
max_length = max(map(lambda x: len(x), word_to_id.keys()))
X = df['text_transcription_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x])).values

X_pad = pad_sequences(X, maxlen=max_length, padding='post')
y = df['misogynous']

## Pre-trained Weights

In [9]:
embeddings = {
    50: load_embedding_weights(vocabulary, 50, 'word2vecSG', ".")}

local


In [10]:
glove_50 = load_embedding_weights(vocabulary, 50, 'glove', "/mnt/d/Downloads")

local


## Define Models

In [11]:
import wandb
from wandb.keras import WandbCallback

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.1)

### Without initialized weights

In [13]:
model = Sequential(name="without_initialized_weights.task1")
model.add(Embedding(input_dim = vocabulary_size, output_dim=100))
model.add(LSTM(units=100, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate = 0.01), loss=binary_crossentropy, metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()

2021-11-14 21:39:31.099441: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-11-14 21:39:31.099482: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-11-14 21:39:31.099525: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-11-14 21:39:31.099834: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "without_initialized_weights.task1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 100)         47000     
                                                                 
 lstm (LSTM)                 (None, None, 100)         80400     
                                                                 
 dense (Dense)               (None, None, 1)           101       
                                                                 
Total params: 127,501
Trainable params: 127,501
Non-trainable params: 0
_________________________________________________________________


In [14]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=100, callbacks=[WandbCallback()])

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maleksandar1932[0m (use `wandb login --relogin` to force relogin)
  warn("The `IPython.html` package has been deprecated since IPython 4.0. "
2021-11-14 21:39:36.637687: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:39:36.637762: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f3d44353af0>

In [15]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

run.finish()

Test loss: 6.9088
Test accuracy: 0.5030
Test f1_score: 0.6600
Test precision: 0.3350
Test recall: 22.0000


0,1
accuracy,▁▄██████████████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
f1_m,▁▇██████████████████████████████████████
loss,█▇▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
precision_m,▁███████████████████████████████████████
recall_m,▁▄██████████████████████████████████████

0,1
accuracy,0.99862
epoch,99.0
f1_m,1.93687
loss,0.00219
precision_m,0.99773
recall_m,32.97917


### Word2Vec 50

In [16]:
model = Sequential(name="word2vec.50d.task1")

model.add(Embedding(input_dim = vocabulary_size, weights=[embeddings[50]], output_dim=50))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate = 0.01), loss=binary_crossentropy, metrics=['accuracy', f1_m,precision_m, recall_m])
model.summary()

Model: "word2vec.50d.task1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          23500     
                                                                 
 lstm_1 (LSTM)               (None, None, 64)          29440     
                                                                 
 lstm_2 (LSTM)               (None, None, 128)         98816     
                                                                 
 dense_1 (Dense)             (None, None, 1)           129       
                                                                 
Total params: 151,885
Trainable params: 151,885
Non-trainable params: 0
_________________________________________________________________


In [17]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=50, callbacks=[WandbCallback()])

2021-11-14 21:40:39.145566: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:40:39.145613: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f3ca5cd3550>

In [18]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

run.finish()

Test loss: 6.2160
Test accuracy: 0.5697
Test f1_score: 0.7051
Test precision: 0.3595
Test recall: 18.3333


0,1
accuracy,▁▁▂▇████████████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
f1_m,▂▁▂█████████████████████████████████████
loss,███▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
precision_m,▂▁▃█████████████████████████████████████
recall_m,▃▁▁▇████████████████████████████████████

0,1
accuracy,0.99862
epoch,49.0
f1_m,1.93869
loss,0.00237
precision_m,0.99873
recall_m,32.94074


### GloVe 50

In [19]:
model = Sequential(name="glove.50d.task1")

model.add(Embedding(input_dim = vocabulary_size, output_dim=50))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate = 0.05), loss=binary_crossentropy, metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()

Model: "glove.50d.task1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, None, 50)          23500     
                                                                 
 lstm_3 (LSTM)               (None, None, 128)         91648     
                                                                 
 dense_2 (Dense)             (None, None, 1)           129       
                                                                 
Total params: 115,277
Trainable params: 115,277
Non-trainable params: 0
_________________________________________________________________


In [20]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=50, callbacks=[WandbCallback()])

2021-11-14 21:41:16.595611: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:41:16.595653: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f3ca5b8c4c0>

In [22]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

Test loss: 3.0158
Test accuracy: 0.4727
Test f1_score: 0.7174
Test precision: 0.3626
Test recall: 33.0000


In [23]:
run.finish()

0,1
accuracy,▂▁▁▂▁▂▁▁▂▂▄▃▇▆▅▇▆███████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
f1_m,▁▂▂▁▃▁▂▃▃▃▅▅▇█▇▇▇▇██████████████████████
loss,██▇▇▇█▇▇▆▆▆▆▅▄▄▄▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
precision_m,▁▄▂▃▂▅▄▂▆▃▅▇▇█▇▇▇▇██████████████████████
recall_m,▄▃█▁█▁▃█▁█▆▃█▅▅█▆███████████████████████

0,1
accuracy,0.99862
epoch,49.0
f1_m,1.94001
loss,0.00415
precision_m,0.99944
recall_m,32.93704


## Evaluate Models

WANDB was used to track the training process along with the evaluation, all of the results are available at the [link](https://wandb.ai/aleksandar1932/NLP_2021-Laboratory%20Exercises_2?workspace=user-aleksandar1932).

*Runs are postfixed with `.task1`*.

# Task 2

## Tokenization and vocabulary creation

In [24]:
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,text_transcription,text_transcription_tokens
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it...","[dad, burn, jon, snow, stop, dad, know, happen..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...,"[may, mixcommunication, decorator, happy, birt..."
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat,"[n't, sold, boat]"
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad","[bitches, like, fault, made, mad]"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...,"[find, picture, 4, girls, together, fb, make, ..."


In [62]:
sentences = df['text_transcription_tokens'].values

In [25]:
vocabulary, word_to_id = create_vocabulary(sentences)
vocabulary_size = len(vocabulary)
max_length = max(map(lambda x: len(x), word_to_id.keys()))
X = df['text_transcription_tokens'].apply(lambda x: np.array([word_to_id[i] for i in x])).values
X_pad = pad_sequences(X, maxlen=max_length, padding='post')

In [26]:
y = df.iloc[:,2:6].values.astype('float').reshape((-1,4))

## Pre-trained Weights

## Define Models

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.1)

### Without Pre-Trained Weights

In [28]:
from tensorflow.keras.losses import categorical_crossentropy, binary_crossentropy

In [41]:
model = Sequential(name="withoutInitializedWeights.misogenyType")
model.add(Embedding(input_dim=vocabulary_size, output_dim=50))
model.add(LSTM(units=128))
model.add(Dense(4,activation='softmax'))

model.compile(optimizer=Adam(learning_rate = 0.05), loss=categorical_crossentropy,  metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()
model.output_shape

Model: "withoutInitializedWeights.misogenyType"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, None, 50)          23500     
                                                                 
 lstm_11 (LSTM)              (None, 128)               91648     
                                                                 
 dense_10 (Dense)            (None, 4)                 516       
                                                                 
Total params: 115,664
Trainable params: 115,664
Non-trainable params: 0
_________________________________________________________________


(None, 4)

In [42]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=200, batch_size=10, callbacks=[WandbCallback()])

2021-11-14 21:50:00.262228: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:50:00.262265: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f3bd04484c0>

In [43]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

run.finish()

Test loss: 0.3715
Test accuracy: 0.5000
Test f1_score: 0.6250
Test precision: 0.5000
Test recall: 0.8333


0,1
accuracy,▁▇████▇█▅█▄▅▅█▃█▄██▅▅▂▇▄▇████▂▅████▇▄██▄
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
f1_m,▁▃▆▆▇▅▂▄▃█▃▄▃█▄█▃▃▄▅▂▃▄▅▇▆▆▆▅▄▇█▆▆▆▃▃▅▅▂
loss,█▁▂▂▁▂▄▂▂▂▅▄▂▂▄▁▃▂▂▃▂▃▂▆▂▂▂▂▂▅▁▂▄▁▂▂▂▂▁▃
precision_m,▁▃▆▆▇▅▂▅▄█▃▅▃█▄█▃▂▄▆▃▃▄▅▇▆▆▆▅▄▇█▆▇▆▄▃▅▅▂
recall_m,▁▃▆▆▆▄▂▄▃▇▃▃▂█▄█▃▄▃▅▁▃▃▄▇▅▆▆▅▃▇▇▅▆▆▃▃▆▄▁

0,1
accuracy,0.26136
epoch,199.0
f1_m,0.22568
loss,0.38463
precision_m,0.16944
recall_m,0.36561


### Word2Vec 50

In [44]:
model = Sequential(name="word2vec.50d.misogenyType")
model.add(Embedding(input_dim=len(vocabulary), output_dim=50, weights=[embeddings[50]]))
model.add(LSTM(units=64, activation='relu'))
model.add(Dense(4,activation='softmax'))

model.compile(optimizer=Adam(learning_rate = 0.05), loss=categorical_crossentropy,  metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()
model.output_shape

Model: "word2vec.50d.misogenyType"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_11 (Embedding)    (None, None, 50)          23500     
                                                                 
 lstm_12 (LSTM)              (None, 64)                29440     
                                                                 
 dense_11 (Dense)            (None, 4)                 260       
                                                                 
Total params: 53,200
Trainable params: 53,200
Non-trainable params: 0
_________________________________________________________________


(None, 4)

In [45]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=200, callbacks=[WandbCallback()])

2021-11-14 21:51:15.527647: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:51:15.527690: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f3bf0106b20>

In [46]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

run.finish()

Test loss: nan
Test accuracy: 0.5000
Test f1_score: nan
Test precision: nan
Test recall: nan


0,1
accuracy,▃▁██████████████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
f1_m,▁█
loss,▁█
precision_m,▁█
recall_m,▁█

0,1
accuracy,0.59091
epoch,199.0
f1_m,
loss,
precision_m,
recall_m,


### GloVe 50

In [47]:
model = Sequential(name="word2vec.50d.misogenyType")
model.add(Embedding(input_dim=len(vocabulary), output_dim=50, weights=[embeddings[50]]))
model.add(LSTM(units=64, activation='relu'))
model.add(Dense(4,activation='softmax'))

model.compile(optimizer=Adam(learning_rate = 0.05), loss=categorical_crossentropy,  metrics=['accuracy',f1_m,precision_m, recall_m])
model.summary()
model.output_shape

Model: "word2vec.50d.misogenyType"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, None, 50)          23500     
                                                                 
 lstm_13 (LSTM)              (None, 64)                29440     
                                                                 
 dense_12 (Dense)            (None, 4)                 260       
                                                                 
Total params: 53,200
Trainable params: 53,200
Non-trainable params: 0
_________________________________________________________________


(None, 4)

In [48]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=200, callbacks=[WandbCallback()])

2021-11-14 21:51:50.906277: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-14 21:51:50.906329: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f3ca4e64520>

In [49]:
loss, accuracy, f1_score, precision, recall = model.evaluate(X_test, y_test, verbose=0)
print(f'Test loss: {loss:.4f}')
print(f'Test accuracy: {accuracy:.4f}')
print(f'Test f1_score: {f1_score:.4f}')
print(f'Test precision: {precision:.4f}')
print(f'Test recall: {recall:.4f}')

run.finish()

Test loss: nan
Test accuracy: 0.5000
Test f1_score: nan
Test precision: nan
Test recall: nan


0,1
accuracy,▁▁██████████████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
f1_m,▁█
loss,▁█
precision_m,▁█
recall_m,▁█

0,1
accuracy,0.59091
epoch,199.0
f1_m,
loss,
precision_m,
recall_m,


## Evaluate Models

WANDB was used to track the training process along with the evaluation, all of the results are available at the [link](https://wandb.ai/aleksandar1932/NLP_2021-Laboratory%20Exercises_2?workspace=user-aleksandar1932).

*Runs are postfixed with `.misogenyType`*.