# Utility Functions

In [2]:
import os
import numpy as np

from sklearn.model_selection import train_test_split

from scripts.utils import load_data
from scripts.model import recall_m, precision_m, f1_m, get_model_and_data

WANDB_PROJECT_NAME = os.getenv("WANDB_PROJECT_NAME") or "[NLP] lab-04 | misogyny classification"

# Load Data

In [3]:
df = load_data()
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...


In [None]:
df.describe()

In [None]:
df.info()

# Data-preprocessing

In [4]:
df = df.drop(columns=["file_name"])
df.head()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,0,0,0,0,0,there may have been a mixcommunication with th...
2,0,0,0,0,0,i shouldn't have sold my boat
3,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,0,0,0,0,0,find a picture of 4 girls together on FB make ...


## Train-Test Split

In [5]:
X = df['Text Transcription']
y_task1 = df['misogynous']
y_task2 = df[["shaming", "stereotype", "objectification", "violence"]]

In [6]:

X_train, X_test, y_train_task1, y_test_task1 = train_test_split(X, y_task1, test_size=0.2, random_state=42)
_, _, y_train_task2, y_test_task2 = train_test_split(X, y_task2, test_size=0.2, random_state=42)


# Define Models

In [7]:
from wandb.keras import WandbCallback
from transformers import TFBertForSequenceClassification, TFAlbertForSequenceClassification, TFRobertaForSequenceClassification, TFDistilBertForSequenceClassification

## Task 1

Weights and Biases is going to be used for logging model training and hyperparameter tuning. The project is available at [[NLP] lab-04 | misogyny classification](https://wandb.ai/aleksandar1932/[NLP]%20lab-04%20%7C%20misogyny%20classification?workspace=user-aleksandar1932).

### Bert Model

In [None]:
from wandb import wandb
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

run = wandb.init(project=WANDB_PROJECT_NAME, job_type="training")

In [8]:
model, train_input_ids, train_attention_masks, test_input_ids, test_attention_masks = get_model_and_data(TFBertForSequenceClassification, 4, X_train, X_test)

Creating TFBertForSequenceClassification-bert-base-cased with 4 labels
Tokenizing data with BertTokenizerFast


100%|██████████| 78/78 [00:00<00:00, 3015.18it/s]
100%|██████████| 20/20 [00:00<00:00, 3630.96it/s]
2021-12-28 22:31:16.537762: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: UNKNOWN ERROR (100)
2021-12-28 22:31:16.537891: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (legion-y540): /proc/driver/nvidia/version does not exist
2021-12-28 22:31:16.538248: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: [

In [None]:
model.summary()
model.compile(optimizer=Adam(learning_rate=0.01),
                                  loss=binary_crossentropy,
                                  metrics=['accuracy', f1_m,precision_m, recall_m])

In [None]:
model.fit([np.array(train_input_ids), np.array(train_attention_masks)],
          np.array(y_train_task1), batch_size=70, epochs=2, verbose=2,
          callbacks=[WandbCallback()])


In [None]:
model.evaluate([np.array(test_input_ids), np.array(
    test_attention_masks)], np.array(y_test_task1), batch_size=70, verbose=2)
run.finish()


### Albert Model

In [None]:
# TODO

### Roberta Model

In [9]:
# TODO

### DistilBert Model

In [10]:
# TODO

## Task 2

In [None]:
num_classes = 4

### Bert Model

In [None]:
run = wandb.init(project=WANDB_PROJECT_NAME, job_type="training")
model, train_input_ids, train_attention_masks, test_input_ids, test_attention_masks = get_model_and_data(
    TFBertForSequenceClassification, num_classes, X_train, X_test)

In [None]:
model.compile(optimizer=Adam(learning_rate=0.01), loss="categorical_crossentropy",
              metrics=["accuracy", f1_m, precision_m, recall_m])


In [None]:
model.fit([np.array(train_input_ids), np.array(train_attention_masks)],
          np.array(y_train_task2), batch_size=70, epochs=1, verbose=2,
          callbacks=[WandbCallback()])


In [None]:
model.evaluate([np.array(test_input_ids), np.array(
    test_attention_masks)], np.array(y_test_task2), batch_size=70, verbose=2)
run.finish()


### Albert Model

In [None]:
# TODO

### Roberta Model

In [None]:
# TODO

### DistilBert Model

In [None]:
# TODO