# Utility Functions

In [47]:
WANDB_PROJECT_NAME = os.getenv("WANDB_PROJECT_NAME") or "[NLP] lab-04 | misogyny classification"

import pandas as pd
import os
import numpy as np
from tqdm import tqdm

from transformers import BertTokenizerFast, AlbertTokenizerFast, RobertaTokenizerFast, DistilBertTokenizerFast

In [3]:
def load_data() -> pd.DataFrame:
    """
    Loads the data from the csv file.
    :return: DataFrame
    """
    data_dir = os.getenv("DATA_DIR") or "/home/aleksandar/projects/NLP_2021/Laboratory Exercises/4/data"
    data_file = os.getenv("DATA_FILE") or "trial.csv"
    return pd.read_csv(os.path.join(data_dir, data_file), delimiter="	")

In [49]:
def tokenize(X, tokenizer=None):
    if not tokenizer:
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
    
    input_ids, attention_masks = [], []
    for sentence in tqdm(X):
        sentence_tokens = tokenizer.encode_plus(sentence, max_length=10, padding='max_length', truncation=True)
        input_ids.append(sentence_tokens["input_ids"])
        attention_masks.append(sentence_tokens['attention_mask'])

    return input_ids, attention_masks

In [61]:
from tensorflow.keras import backend as K
# Custom metrics to calculate recall, precision and f1 score

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

# Load Data

In [4]:
df = load_data()
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...


In [5]:
df.describe()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
count,98.0,98.0,98.0,98.0,98.0
mean,0.44898,0.0,0.346939,0.020408,0.091837
std,0.499947,0.0,0.478443,0.142119,0.29028
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0,0.0
max,1.0,0.0,1.0,1.0,1.0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   file_name           98 non-null     object
 1   misogynous          98 non-null     int64 
 2   shaming             98 non-null     int64 
 3   stereotype          98 non-null     int64 
 4   objectification     98 non-null     int64 
 5   violence            98 non-null     int64 
 6   Text Transcription  98 non-null     object
dtypes: int64(5), object(2)
memory usage: 5.5+ KB


# Data-preprocessing

In [7]:
df = df.drop(columns=["file_name"])
df.head()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,0,0,0,0,0,there may have been a mixcommunication with th...
2,0,0,0,0,0,i shouldn't have sold my boat
3,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,0,0,0,0,0,find a picture of 4 girls together on FB make ...


## Train-Test Split

In [30]:
X = df['Text Transcription']
y_task1 = df['misogynous']
y_task2 = df[["shaming", "stereotype", "objectification", "violence"]]

In [82]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train_task1, y_test_task1 = train_test_split(X, y_task1, test_size=0.2, random_state=42)
_, _, y_train_task2, y_test_task2 = train_test_split(X, y_task2, test_size=0.2, random_state=42)


## Tokenization

In [41]:
train_input_ids, train_attention_masks = tokenize(X_train)
test_input_ids, test_attention_masks = tokenize(X_test)

100%|██████████| 78/78 [00:00<00:00, 3057.96it/s]
100%|██████████| 20/20 [00:00<00:00, 3330.00it/s]


# Define Models

In [78]:
from wandb.keras import WandbCallback
from transformers import TFBertForSequenceClassification, TFAlbertForSequenceClassification, TFRobertaForSequenceClassification, TFDistilBertForSequenceClassification

## Task 1

Weights and Biases is going to be used for logging model training and hyperparameter tuning. The project is available at [[NLP] lab-04 | misogyny classification](https://wandb.ai/aleksandar1932/[NLP]%20lab-04%20%7C%20misogyny%20classification?workspace=user-aleksandar1932).

### Bert Model

In [72]:
from wandb import wandb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

run = wandb.init(project=WANDB_PROJECT_NAME, job_type="training")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
2021-12-28 21:49:14.201105: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-12-28 21:49:14.201162: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [48]:
model, train_input_ids, train_attention_masks, test_input_ids, test_attention_masks = get_model_and_data(TFBertForSequenceClassification, 4, X_train, X_test)

Creating TFBertForSequenceClassification-bert-base-cased with 4 labels
Tokenizing data with BertTokenizerFast


100%|██████████| 78/78 [00:00<00:00, 3256.67it/s]
100%|██████████| 20/20 [00:00<00:00, 2811.86it/s]


Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

2021-12-28 21:37:48.732458: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (100)
2021-12-28 21:37:48.732585: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-12-28 21:37:48.733082: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for pr

In [73]:
model.summary()
model.compile(optimizer=Adam(learning_rate=0.01),
                                  loss=binary_crossentropy,
                                  metrics=['accuracy', f1_m,precision_m, recall_m])

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3076      
                                                                 
Total params: 108,313,348
Trainable params: 108,313,348
Non-trainable params: 0
_________________________________________________________________


In [74]:
model.fit([np.array(train_input_ids), np.array(train_attention_masks)],
          np.array(y_train_task1), batch_size=70, epochs=2, verbose=2,
          callbacks=[WandbCallback()])


Epoch 1/2
2/2 - 15s - loss: 8.0156 - accuracy: 0.0000e+00 - f1_m: 0.7760 - precision_m: 0.4304 - recall_m: 4.0000 - 15s/epoch - 8s/step
Epoch 2/2
2/2 - 3s - loss: 8.0156 - accuracy: 0.0000e+00 - f1_m: 1.0303 - precision_m: 0.5964 - recall_m: 4.0000 - 3s/epoch - 1s/step


<keras.callbacks.History at 0x7f322d812640>

In [75]:
model.evaluate([np.array(test_input_ids), np.array(
    test_attention_masks)], np.array(y_test_task1), batch_size=70, verbose=2)
run.finish()


1/1 - 3s - loss: 9.9120 - accuracy: 0.0000e+00 - f1_m: 0.6437 - precision_m: 0.3500 - recall_m: 4.0000 - 3s/epoch - 3s/step


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▁
epoch,▁█
f1_m,▁█
loss,▁▁
precision_m,▁█
recall_m,▁▁

0,1
accuracy,0.0
epoch,1.0
f1_m,1.03029
loss,8.01563
precision_m,0.59643
recall_m,4.0


### Albert Model

## Task 2

In [108]:
num_classes = 4

### Bert Model

In [113]:
run = wandb.init(project=WANDB_PROJECT_NAME, job_type="training")
model, train_input_ids, train_attention_masks, test_input_ids, test_attention_masks = get_model_and_data(
    TFBertForSequenceClassification, num_classes, X_train, X_test)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Problem at: /tmp/ipykernel_694/66653580.py 1 <module>


KeyboardInterrupt: 

In [110]:
model.compile(optimizer=Adam(learning_rate=0.01), loss="categorical_crossentropy",
              metrics=["accuracy", f1_m, precision_m, recall_m])


In [111]:
model.fit([np.array(train_input_ids), np.array(train_attention_masks)],
          np.array(y_train_task2), batch_size=70, epochs=1, verbose=2,
          callbacks=[WandbCallback()])


2/2 - 23s - loss: 0.9684 - accuracy: 0.1026 - f1_m: 0.3185 - precision_m: 0.2163 - recall_m: 0.6048 - 23s/epoch - 12s/step


<keras.callbacks.History at 0x7f31ba1ea850>

In [112]:
model.evaluate([np.array(test_input_ids), np.array(
    test_attention_masks)], np.array(y_test_task2), batch_size=70, verbose=2)
run.finish()


1/1 - 4s - loss: 4.8602 - accuracy: 0.3000 - f1_m: 0.4444 - precision_m: 0.3000 - recall_m: 0.8571 - 4s/epoch - 4s/step


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁
epoch,▁
f1_m,▁
loss,▁
precision_m,▁
recall_m,▁

0,1
accuracy,0.10256
epoch,0.0
f1_m,0.31851
loss,0.96842
precision_m,0.21635
recall_m,0.60476
