In [82]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from tensorflow.python.keras.preprocessing.text import Tokenizer

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

from scripts.word_embeddings import load_embedding_weights

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

In [83]:
def save(name, words, vectors, path='.'):
    with open(f'{path}/{name}.txt', 'w+', encoding='utf-8') as doc:
        for word, vector in zip(words, vectors):
            doc.write(word + ' ' + ' '.join(str(value) for value in vector))
            doc.write('\n')

# Load Dataset

In [84]:
df = pd.read_csv('data/trial.csv',delimiter="	")
df.columns = ['file_name', 'misogynous', 'shaming', 'stereotype', 'objectification', 'violence', 'text_transcription']
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,text_transcription
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...


# Create Embeddings

In [85]:
from nltk import word_tokenize
from nltk.corpus import stopwords
ENGLISH_STOPWORDS = stopwords.words('english')

def remove_punctuation(token:str)->str:
    punctuation_regex = '!"#$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return ' '.join(word.strip(punctuation_regex) for word in token.split())

def nlp_pipeline(token:str) -> str:
    token = remove_punctuation(token)
    tokens = word_tokenize(token.lower())
    tokens = [token for token in tokens if token not in ENGLISH_STOPWORDS]
    return tokens

df['text_transcription_tokens'] = df['text_transcription'].apply(lambda x: nlp_pipeline(x))
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,text_transcription,text_transcription_tokens
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it...","[dad, burn, jon, snow, stop, dad, know, happen..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...,"[may, mixcommunication, decorator, happy, birt..."
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat,"[n't, sold, boat]"
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad","[bitches, like, fault, made, mad]"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...,"[find, picture, 4, girls, together, fb, make, ..."


## Word2Vec

In [86]:
sentences = df['text_transcription_tokens'].values

def build_embeddings(model_name, sentences, size, path):
    if model_name == 'word2vec':
        model = Word2Vec(sentences, vector_size=size, min_count=1, window=5, sg=1)
        vectors = model.wv.vectors      
        words = model.wv.index_to_key

        save(f'{model_name}SG.iSarcasamEval.{size}d', words, vectors,path=path)

for size in [10, 50, 100]:
    build_embeddings('word2vec', sentences, size=size, path='.')

# Task 1

## Tokenization and vocabulary creation

In [87]:
sentences = df['text_transcription_tokens'].values
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

In [88]:
vocabulary = list(set(tokenizer.word_index.keys()))
vocabulary_size = len(tokenizer.word_index)
max_length = max(map(lambda x: len(x), tokenizer.word_index.keys()))

X = tokenizer.texts_to_sequences(sentences)
X_pad = pad_sequences(X, maxlen=max_length, padding='post')

In [89]:
y = df['misogynous']

## Load pre-trained weights

In [111]:
embeddings = {
    10: load_embedding_weights(vocabulary, 10, 'word2vecSG', "."),
    50: load_embedding_weights(vocabulary, 50, 'word2vecSG', "."),
    100: load_embedding_weights(vocabulary, 100, 'word2vecSG', ".")
}

In [114]:
glove_50 = load_embedding_weights(vocabulary, 50, 'glove', "/mnt/d/Downloads")

Creating embedding weights...


## Define Models

In [112]:
import wandb
from wandb.keras import WandbCallback

In [92]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.1)

### Without initialized weights

In [109]:
model = Sequential(name="withoutInitializedWeights")
model.add(Embedding(input_dim = vocabulary_size + 1, output_dim=100))
model.add(LSTM(units=100, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer=Adam(learning_rate = 0.01), loss=binary_crossentropy, metrics=['accuracy'])
model.summary()

Model: "withoutInitializedWeights"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_27 (Embedding)    (None, None, 100)         47100     
                                                                 
 lstm_9 (LSTM)               (None, None, 100)         80400     
                                                                 
 dense_22 (Dense)            (None, None, 1)           101       
                                                                 
Total params: 127,601
Trainable params: 127,601
Non-trainable params: 0
_________________________________________________________________


In [110]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=100, callbacks=[WandbCallback()])
run.finish()

2021-11-13 17:32:23.992145: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-13 17:32:23.992188: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

0,1
accuracy,▁▃██████████████████████████████████████
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
loss,██▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
accuracy,0.99759
epoch,99.0
loss,0.00316


### Word2Vec 50

In [104]:
model = Sequential(name="word2vec.50d")

model.add(Embedding(input_dim = vocabulary_size + 1, weights=[embeddings[100]], output_dim=100))
model.add(LSTM(units=64, return_sequences=True))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate = 0.01), loss=binary_crossentropy, metrics=['accuracy'])
model.summary()

Model: "word2vec.50d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, None, 100)         47100     
                                                                 
 lstm_6 (LSTM)               (None, None, 64)          42240     
                                                                 
 lstm_7 (LSTM)               (None, None, 128)         98816     
                                                                 
 dense_20 (Dense)            (None, None, 1)           129       
                                                                 
Total params: 188,285
Trainable params: 188,285
Non-trainable params: 0
_________________________________________________________________


In [106]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=50, callbacks=[WandbCallback()])
run.finish()

2021-11-13 17:31:28.069684: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-13 17:31:28.069729: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


0,1
accuracy,▁██▅████████▅█████████████▄█▅██▅██▅███▅█
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,██▇█▆▆▆▆▆▆▆▆▅▅▅▄▄▄▄▃▄▄▄▃▃▂▄▄▃▂▁▂▂▂▂▂▁▁▂▁

0,1
accuracy,0.99828
epoch,49.0
loss,0.00301


### GloVe 50

In [124]:
model = Sequential(name="glove.50d")

model.add(Embedding(input_dim = vocabulary_size+1, output_dim=50))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate = 0.05), loss=binary_crossentropy, metrics=['accuracy'])
model.summary()

Model: "glove.50d"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_33 (Embedding)    (None, None, 50)          23550     
                                                                 
 lstm_14 (LSTM)              (None, None, 128)         91648     
                                                                 
 dense_26 (Dense)            (None, None, 1)           129       
                                                                 
Total params: 115,327
Trainable params: 115,327
Non-trainable params: 0
_________________________________________________________________


In [126]:
run = wandb.init(reinit=True, name=model.name)
model.fit(X_train, y_train, epochs=50, callbacks=[WandbCallback()])
run.finish()

2021-11-13 18:01:33.792602: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-13 18:01:33.792646: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


0,1
accuracy,████████████▁█▄█▄▄██▅███▁▄▅▅██████████▄█
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
loss,██▇▇▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▂▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁

0,1
accuracy,0.99828
epoch,49.0
loss,0.00303


WANDB was used to track the runs, all of the results are available at the [link](https://wandb.ai/aleksandar1932/NLP_2021-Laboratory%20Exercises_2?workspace=user-aleksandar1932).