# Utility Functions

In [149]:
import pandas as pd
import os
import numpy as np

from transformers import BertTokenizerFast

In [17]:
def load_data() -> pd.DataFrame:
    """
    Loads the data from the csv file.
    :return: DataFrame
    """
    data_dir = os.getenv("DATA_DIR") or "/home/aleksandar/projects/NLP_2021/Laboratory Exercises/4/data"
    data_file = os.getenv("DATA_FILE") or "trial.csv"
    return pd.read_csv(os.path.join(data_dir, data_file), delimiter="	")

In [151]:
def tokenize(X, tokenizer=None):
    if not tokenizer:
        tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
    
    input_ids, attention_masks = [], []
    for sentence in X:
        sentence_tokens = tokenizer.encode_plus(sentence, max_length=10, pad_to_max_length=True, truncation=True)
        input_ids.append(sentence_tokens["input_ids"])
        attention_masks.append(sentence_tokens['attention_mask'])

    return input_ids, attention_masks

# Load Data

In [144]:
df = load_data()
df.head()

Unnamed: 0,file_name,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,28.jpg,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,30.jpg,0,0,0,0,0,there may have been a mixcommunication with th...
2,33.jpg,0,0,0,0,0,i shouldn't have sold my boat
3,58.jpg,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,89.jpg,0,0,0,0,0,find a picture of 4 girls together on FB make ...


In [21]:
df.describe()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence
count,98.0,98.0,98.0,98.0,98.0
mean,0.44898,0.0,0.346939,0.020408,0.091837
std,0.499947,0.0,0.478443,0.142119,0.29028
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0,0.0
max,1.0,0.0,1.0,1.0,1.0


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 98 entries, 0 to 97
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   file_name           98 non-null     object
 1   misogynous          98 non-null     int64 
 2   shaming             98 non-null     int64 
 3   stereotype          98 non-null     int64 
 4   objectification     98 non-null     int64 
 5   violence            98 non-null     int64 
 6   Text Transcription  98 non-null     object
dtypes: int64(5), object(2)
memory usage: 5.5+ KB


# Data-preprocessing

In [145]:
df = df.drop(columns=["file_name"])
df.head()

Unnamed: 0,misogynous,shaming,stereotype,objectification,violence,Text Transcription
0,0,0,0,0,0,"not now, dad. We should burn Jon Snow. stop it..."
1,0,0,0,0,0,there may have been a mixcommunication with th...
2,0,0,0,0,0,i shouldn't have sold my boat
3,1,0,0,0,1,"Bitches be like, It was my fault i made him mad"
4,0,0,0,0,0,find a picture of 4 girls together on FB make ...


## Train-Test Split

In [158]:
X = df['Text Transcription']
y_task1 = df['misogynous']
y_task2 = df[["shaming", "stereotype", "objectification", "violence"]]

In [160]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train_task1, y_test_task1 = train_test_split(X, y_task1, test_size=0.2, random_state=42)

## Tokenization

In [159]:
train_input_ids, train_attention_masks = tokenize(X_train)
train_input_ids, train_attention_masks = tokenize(X_train)




## Define Models

## Task 1

In [180]:
from transformers import TFBertForSequenceClassification, TFAlbertForSequenceClassification, TFRobertaForSequenceClassification, TFDistilBertForSequenceClassification
models = [TFBertForSequenceClassification, TFAlbertForSequenceClassification,
          TFRobertaForSequenceClassification, TFDistilBertForSequenceClassification]

import logging
import sys
from logging import getLogger
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

task1_logger = getLogger("Task1")

for model_cls in models:
    task1_logger.info(f"Performing classification with {model_cls.__name__}")
    model = model_cls.from_pretrained('bert-base-uncased', num_labels = len(set(y_task1)))
    model.summary()

INFO:Task1:Performing classification with TFBertForSequenceClassification
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/bert-base-uncased HTTP/1.1" 200 1093
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tf_model.h5 HTTP/1.1" 302 0


All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_470 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
INFO:Task1:Performing classification with TFAlbertForSequenceClassification
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/bert-base-uncased HTTP/1.1" 200 1093
DEBUG:urll

You are using a model of type bert to instantiate a model of type albert. This is not supported for all configurations of models and can yield errors.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tf_model.h5 HTTP/1.1" 302 0


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFAlbertForSequenceClassification: ['mlm___cls', 'nsp___cls', 'bert']
- This IS expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFAlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFAlbertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['albert', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_albert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 albert (TFAlbertMainLayer)  multiple                  11750400  
                                                                 
 dropout_475 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 11,751,938
Trainable params: 11,751,938
Non-trainable params: 0
_________________________________________________________________
INFO:Task1:Performing classification with TFRobertaForSequenceClassification
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/bert-base-uncased HTTP/1.1" 200 1093
DEBUG:url

You are using a model of type bert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tf_model.h5 HTTP/1.1" 302 0


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFRobertaForSequenceClassification: ['mlm___cls', 'nsp___cls', 'bert']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['roberta', 'classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_roberta_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 roberta (TFRobertaMainLayer  multiple                 108891648 
 )                                                               
                                                                 
 classifier (TFRobertaClassi  multiple                 592130    
 ficationHead)                                                   
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________
INFO:Task1:Performing classification with TFDistilBertForSequenceClassification
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/bert-base-uncased HTTP/1.1" 200 1093
DEB

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /bert-base-uncased/resolve/main/tf_model.h5 HTTP/1.1" 302 0


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['mlm___cls', 'nsp___cls', 'bert']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier', 'distilbert', 'dropout_551', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_distil_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 108890112 
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_551 (Dropout)       multiple                  0         
                                                                 
Total params: 109,482,242
Trainable params: 109,482,242
Non-trainable params: 0
_________________________________________________________________


In [163]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import binary_crossentropy

bert_classification_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = len(set(y_task1)))

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [185]:
bert_classification_model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels = len(set(y_task1)))
bert_classification_model.compile(optimizer=Adam(learning_rate=0.01),
                                  loss=binary_crossentropy,
                                  metrics=['accuracy'])

bert_classification_model.fit([np.array(train_input_ids), np.array(train_attention_masks)], np.array(y_task1), batch_size=50, epochs=1, verbose=2)

DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "GET /api/models/albert-base-v2 HTTP/1.1" 200 1069
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /albert-base-v2/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:filelock:Attempting to acquire lock 140634166562720 on /home/aleksandar/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88.lock
DEBUG:filelock:Lock 140634166562720 acquired on /home/aleksandar/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface

Downloading:   0%|          | 0.00/684 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140634166562720 on /home/aleksandar/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88.lock
DEBUG:filelock:Lock 140634166562720 released on /home/aleksandar/.cache/huggingface/transformers/e48be00f755a5f765e36a32885e8d6a573081df3321c9e19428d12abadf7dba2.b8f28145885741cf994c0e8a97b724f6c974460c297002145e48e511d2496e88.lock
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /albert-base-v2/resolve/main/tf_model.h5 HTTP/1.1" 302 0
DEBUG:filelock:Attempting to acquire lock 140634855550064 on /home/aleksandar/.cache/huggingface/transformers/3628e126c6415ff328335045fb48a3128858e7d555500c1787a6f2491d514092.befe87e8464523e3c0ff0bcdb17c367b1f258ff3136e3ec69e951a8efeb458f1.h5.lock
DEBUG:filelock:Lock 140634855550064 acquired on /home/aleksandar/.cache/h

Downloading:   0%|          | 0.00/60.1M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140634855550064 on /home/aleksandar/.cache/huggingface/transformers/3628e126c6415ff328335045fb48a3128858e7d555500c1787a6f2491d514092.befe87e8464523e3c0ff0bcdb17c367b1f258ff3136e3ec69e951a8efeb458f1.h5.lock
DEBUG:filelock:Lock 140634855550064 released on /home/aleksandar/.cache/huggingface/transformers/3628e126c6415ff328335045fb48a3128858e7d555500c1787a6f2491d514092.befe87e8464523e3c0ff0bcdb17c367b1f258ff3136e3ec69e951a8efeb458f1.h5.lock


KeyboardInterrupt: 