In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# Install the notebook's extra dependencies. transformers is pinned to 4.15.0;
# ekphrasis and emoji are installed at whatever version pip resolves.
# NOTE(review): consider pinning ekphrasis and emoji too for reproducibility.
!pip install transformers==4.15.0
!pip install ekphrasis
!pip install emoji

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset archives are read from
# /content/drive/MyDrive/mami/ in the following cells.
drive.mount('/content/drive')

In [None]:
import transformers
from transformers import ViTFeatureExtractor, TFViTModel, TFViTForImageClassification, BertTokenizerFast, TFBertModel
from PIL import Image

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from collections import Counter
from tensorflow.keras.callbacks import ModelCheckpoint

import pandas as pd
import numpy as np
import os

transformers.__version__

In [5]:
import zipfile

# Unpack the training archive from Drive into /content/images.
# makedirs(exist_ok=True) keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError the second time the cell is executed in the same session.
os.makedirs('images', exist_ok=True)
with zipfile.ZipFile('/content/drive/MyDrive/mami/training.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/images')

In [6]:
import zipfile

# Unpack the test archive from Drive into /content/images_test.
# makedirs(exist_ok=True) keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError the second time the cell is executed in the same session.
os.makedirs('images_test', exist_ok=True)
with zipfile.ZipFile('/content/drive/MyDrive/mami/test.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/images_test')

In [None]:
# Read every line of the file as ONE string column (column 0): the separator
# 'delimiter' is presumably absent from the data, and the real tab-separated
# fields are split out in a later cell. skiprows=1 drops the first line.
# A multi-character separator is treated as a regex, which only the python
# parser engine supports; naming the engine explicitly avoids the
# "falling back to the 'python' engine" ParserWarning.
df = pd.read_csv('/content/training.csv', sep='delimiter', header=None, skiprows=1, engine='python')

In [None]:
# Print a concise summary (index range, column dtypes, non-null counts) of the raw frame.
df.info()

In [None]:
# Split each raw line on tabs into an array of string fields, then stack the
# per-row arrays into one 2-D string array (rows x fields).
train_data = np.asarray(
    df[0].apply(lambda raw_line: np.array(raw_line.split('\t'), dtype='str'))
)
train_array_ = np.stack(train_data)
print(train_array_.shape)

In [None]:
# Wrap the 2-D string array in a DataFrame (integer column labels 0..n-1)
# and preview the first rows.
df2 = pd.DataFrame(train_array_)
df2.head()

In [14]:
# Keep only the rows whose column 1 holds the string '1'. reset_index()
# renumbers the rows from 0 and keeps the previous row labels in a new
# 'index' column, so the integer column labels 0..6 stay valid.
train_array = df2[df2[1] == '1'].reset_index()

In [None]:
# Print a concise summary of the filtered frame (row count, dtypes, non-null counts).
train_array.info()

In [None]:
# Pull the model inputs out of the filtered frame.
# Column 0 is used as the image file name and column 6 as the text.
filenames = np.array(train_array[0])
text = np.array(train_array[6])

# Columns 2-5 hold the four labels as strings; convert each to a float32 vector.
Y_shaming = np.array(train_array[2]).astype(np.float32)
Y_stereotype = np.array(train_array[3]).astype(np.float32)
Y_objectification = np.array(train_array[4]).astype(np.float32)
Y_violence = np.array(train_array[5]).astype(np.float32)

# Class balance of every label.
for label_vector in (Y_shaming, Y_stereotype, Y_objectification, Y_violence):
    print(Counter(label_vector))

<h6>Train/validation split</h6>

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
# Hold out 5% of the samples for validation. All six arrays are split with the
# same shuffled indices (fixed by random_state) so they stay aligned row by row.
(
    filenames_train, filenames_val,
    text_train, text_val,
    Y_shaming_train, Y_shaming_val,
    Y_stereotype_train, Y_stereotype_val,
    Y_objectification_train, Y_objectification_val,
    Y_violence_train, Y_violence_val,
) = train_test_split(
    filenames,
    text,
    Y_shaming,
    Y_stereotype,
    Y_objectification,
    Y_violence,
    test_size=0.05,
    random_state=3,
)

In [None]:
# Combine the four training label vectors into one (n_samples, 4) matrix.
Y_train = np.column_stack(
    (
        Y_shaming_train,
        Y_stereotype_train,
        Y_objectification_train,
        Y_violence_train,
    )
)
print(Y_train.shape)

In [None]:
# Sanity check: shapes (and, for the training split, types) of every array
# produced by the train/validation split.
print("Train")
for split_part in (
    filenames_train,
    text_train,
    Y_shaming_train,
    Y_stereotype_train,
    Y_objectification_train,
    Y_violence_train,
):
    print(np.shape(split_part), type(split_part))

print("Val")
for split_part in (
    filenames_val,
    text_val,
    Y_shaming_val,
    Y_stereotype_val,
    Y_objectification_val,
    Y_violence_val,
):
    print(np.shape(split_part))

<h6>Text pre-processing</h6>

In [22]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
# ekphrasis pre-processor used to clean the text before tokenisation.
# It is applied through ekphrasis_pipe(), which joins the returned tokens
# back into a single space-separated string.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice below; the second entry looks redundant.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

In [24]:
def print_text(texts,i,j):
    """Print texts[i] .. texts[j-1], each followed by one blank line.

    Items are fetched one at a time with ``texts[position]``, so any container
    that supports integer lookup for those positions can be passed.
    """
    for position in range(i, j):
        print(texts[position], end='\n\n')

In [None]:
# Preview the first five texts of each split before any cleaning is applied.
print_text(text_train,0,5)
print("##############################################################################################################")
print_text(text_val,0,5)

In [26]:
# Functions for chat word conversion
# Load the slang dictionary: every non-empty line of slang.txt is expected to
# look like "ABBREVIATION=expansion". The file is opened in a `with` block so
# the handle is closed as soon as the content is read (it used to stay open).
with open("slang.txt", "r") as f:
    chat_words_str = f.read()

chat_words_map_dict = {}
chat_words_list = []

for line in chat_words_str.split("\n"):
    if line != "":
        fields = line.split("=")
        cw = fields[0]           # abbreviation (lookup key)
        cw_expanded = fields[1]  # text it is replaced with
        chat_words_list.append(cw)
        chat_words_map_dict[cw] = cw_expanded
# A set gives constant-time membership tests in chat_words_conversion().
chat_words_list = set(chat_words_list)

def chat_words_conversion(text):
    """Replace known chat abbreviations in `text` with their expansions.

    The text is split on whitespace; a word whose upper-cased form is in
    `chat_words_list` is replaced by its `chat_words_map_dict` entry, every
    other word is kept as is. The words are re-joined with single spaces.
    """
    def expand(word):
        key = word.upper()
        return chat_words_map_dict[key] if key in chat_words_list else word

    return " ".join(expand(word) for word in text.split())

In [27]:
# Wrap the text arrays in pandas Series so the cleaning steps below can use Series.apply.
text_train = pd.Series(text_train)
text_val = pd.Series(text_val)

In [None]:
# Chat word conversion: expand slang abbreviations in both splits and preview
# the first five converted texts of each.

# Training set
text_train = text_train.apply(chat_words_conversion)
print_text(text_train,0,5)

print("********************************************************************************")

# Validation set
text_val = text_val.apply(chat_words_conversion)
print_text(text_val,0,5)

In [29]:
def ekphrasis_pipe(sentence):
    """Run `sentence` through `text_processor` and return the tokens joined by spaces."""
    tokens = text_processor.pre_process_doc(sentence)
    return " ".join(tokens)

In [None]:
# Clean both splits with the ekphrasis pipeline.
# Training set
text_train = text_train.apply(ekphrasis_pipe)
print("Training set completed.......")
# Validation set
text_val = text_val.apply(ekphrasis_pipe)
print("Validation set completed.......")

In [None]:
def u(text):
    """Return the number of pieces obtained by splitting `text` on single spaces."""
    return len(text.split(" "))

# Length of every cleaned training text; the 500 largest values are printed
# to help choose a maximum sequence length for the tokenizer.
sentence_lengths = [u(x) for x in text_train]
print(sorted(sentence_lengths)[-500:])
print(len(sentence_lengths))

<h6>Text processing complete</h6>

In [32]:
from transformers import RobertaTokenizerFast, TFRobertaModel, MPNetTokenizerFast, TFMPNetModel, ElectraTokenizerFast, TFElectraModel, XLNetTokenizerFast, TFXLNetModel, AlbertTokenizerFast, TFAlbertModel

In [None]:
# Load the fast tokenizer for the 'bert-base-uncased' checkpoint (the same
# checkpoint name the model-building cell loads for the text branch).
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenise both splits to a fixed length of 80 tokens: longer texts are
# truncated, shorter ones padded, and the result is returned as TF tensors.
train_encodings = tokenizer(
    list(text_train),
    max_length=80,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
)
print(np.shape(train_encodings["input_ids"]))

val_encodings = tokenizer(
    list(text_val),
    max_length=80,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
)
print(np.shape(val_encodings["input_ids"]))

In [35]:
def image_processing(filename, image_dir='/content/images/TRAINING/'):
  """Load one JPEG file and convert it to a channel-first, normalised tensor.

  Args:
    filename: Name of the image file inside `image_dir` (a Python string or a
      scalar string tensor, as supplied by `Dataset.map`).
    image_dir: Directory prefix that is concatenated in front of `filename`;
      it must end with a path separator. Defaults to the training image
      folder, so existing single-argument calls behave exactly as before.

  Returns:
    A float32 tensor of shape (3, 224, 224) with values scaled to [-1, 1].
  """
  # Read Image
  image_string = tf.io.read_file(image_dir + filename)
  img = tf.io.decode_jpeg(image_string, channels=3)
  # Resize image
  img = tf.image.resize(img, [224,224], method='bilinear')
  # Normalise image: [0, 255] -> [0, 1] -> [-0.5, 0.5] -> [-1, 1]
  img = tf.cast(img, tf.float32)
  img = tf.math.divide(img, 255.0)
  img = tf.math.subtract(img, 0.5)
  img = tf.math.divide(img, 0.5)
  # Move channel axis: (height, width, channels) -> (channels, height, width)
  img = tf.experimental.numpy.moveaxis(img, -1, 0)

  return img

In [36]:
batch_size = 16 # for tpu 128
def configure_for_performance(ds):
  """Shuffle `ds` with a 1000-element buffer, batch it by the module-level
  `batch_size`, and prefetch batches with an auto-tuned buffer.

  Caching is intentionally not applied (it was disabled in the original cell).
  """
  return (
      ds.shuffle(buffer_size=1000)
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.AUTOTUNE)
  )

In [37]:
def get_dataset(filenames, encodings, Y_shame, Y_stereotype, Y_objectification, Y_violence):
  """Build an unbatched tf.data pipeline of (features, labels) pairs.

  Each element is ((image, input_ids, attention_mask),
  (shame, stereotype, objectification, violence)). Images are loaded lazily
  from `filenames` through `image_processing`; the token ids and masks come
  from `encodings["input_ids"]` and `encodings["attention_mask"]`.
  """
  slices = Dataset.from_tensor_slices

  # Feature side: decoded image plus the two text tensors.
  features = Dataset.zip((
      slices(filenames).map(image_processing),
      slices(encodings["input_ids"]),
      slices(encodings["attention_mask"]),
  ))

  # Label side: one scalar per task, in the order of the model outputs.
  final_labels = Dataset.zip(tuple(
      slices(task_labels)
      for task_labels in (Y_shame, Y_stereotype, Y_objectification, Y_violence)
  ))

  return Dataset.zip((features, final_labels))

In [38]:
# Training pipeline: build the (features, labels) dataset, then shuffle,
# batch and prefetch it.
dataset_train = configure_for_performance(
    get_dataset(
        filenames_train,
        train_encodings,
        Y_shaming_train,
        Y_stereotype_train,
        Y_objectification_train,
        Y_violence_train,
    )
)

In [39]:
# Pre-load every validation image into memory for the evaluation callback.
# The buffer is sized from the validation split itself rather than a
# hard-coded 250, so a different dataset size or test_size neither raises
# IndexError nor leaves the images out of step with val_encodings.
# float32 matches both the dtype produced by image_processing and the model's
# image input, and halves the memory of the default float64 buffer.
images_features_val = np.zeros((len(filenames_val), 3, 224, 224), dtype=np.float32)
for i, filename in enumerate(filenames_val):
  images_features_val[i] = image_processing(filename)

In [40]:
# Attention Model base models
# For concatenation models simply replace tf.keras.layers.Attention() with tf.keras.layers.Concatenate()
def test(input_shape):
    """Build the multi-task model that fuses ViT image features with BERT text features.

    Args:
        input_shape: Shape (without the batch axis) of the token-id and
            attention-mask inputs, e.g. ``(80,)``.

    Returns:
        An uncompiled ``keras.Model`` with inputs ``[image, token_ids, mask]``
        and four sigmoid outputs in the order shame, stereotype,
        objectification, violence.
    """
    layers = keras.layers

    # Model inputs: channel-first image, token ids, attention mask.
    img_input = keras.Input(shape=(3, 224, 224), dtype='float32')
    txt_input = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')

    # Text: first layer of the pretrained BERT model; element 0 of its output
    # is used as the text sequence.
    bert_layer = TFBertModel.from_pretrained("bert-base-uncased").layers[0]
    text_seq = bert_layer([txt_input, input_masks])[0]

    # Images: first layer of the pretrained ViT model; element 0 of its output
    # is used as the image sequence.
    vit_layer = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k').layers[0]
    img_seq = vit_layer(img_input)[0]

    def build_head():
        """One task-specific branch: attention with the image sequence as query
        over the text sequence, then Conv1D -> Flatten -> Dense(32) -> BatchNorm."""
        fused = layers.Attention()([img_seq, text_seq])
        fused = layers.Conv1D(32, 30, 15)(fused)
        fused = layers.Flatten()(fused)
        fused = layers.Dense(32, activation='relu')(fused)
        return layers.BatchNormalization()(fused)

    # All four branches are created first and the output layers afterwards,
    # keeping the layer creation order (and thus the auto-generated layer
    # names) the same as in the unrolled version of this function.
    # Order: shame, stereotype, objectification, violence.
    heads = [build_head() for _ in range(4)]
    outputs = [layers.Dense(1, activation='sigmoid')(head) for head in heads]

    return keras.Model(inputs=[img_input, txt_input, input_masks], outputs=outputs)

In [41]:
class EvaluationMetric(keras.callbacks.Callback):
    """Keras callback that prints a per-task classification report after every epoch.

    Args:
        val_img_features: Validation images, already preprocessed for the model.
        val_encodings: Validation token ids.
        val_masks: Validation attention masks.
        Y_val: Array of shape (n_samples, 4) whose columns follow the order of
            the model outputs: shame, stereotype, objectification, violence.
    """

    # Report headers, in the same order as the model outputs / Y_val columns.
    _TASK_NAMES = ("Shame", "Stereotype", "Objectification", "Violence")

    def __init__(self, val_img_features, val_encodings, val_masks, Y_val):
        super(EvaluationMetric, self).__init__()
        self.val_encodings = val_encodings
        self.val_masks = val_masks
        self.Y_val = Y_val
        self.val_img_features = val_img_features

    # `logs` defaults to None rather than a shared mutable dict.
    def on_epoch_begin(self, epoch, logs=None):
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        from sklearn.metrics import classification_report

        print("\nEvaluating...")
        val_prediction = self.model.predict([self.val_img_features, self.val_encodings, self.val_masks])

        # predict() yields one (n_samples, 1) array per task; rounding the
        # sigmoid outputs thresholds them at 0.5.
        pred = np.round(val_prediction)

        for task_index, task_name in enumerate(self._TASK_NAMES):
            print(task_name)
            print(classification_report(self.Y_val[:, task_index], pred[task_index][:, 0], digits=3))
            print("##################################################################")

# Evaluation callback over the held-out split; the four validation label
# vectors are stacked column-wise into the (n_samples, 4) matrix it expects.
evaluation_metric = EvaluationMetric(
    images_features_val,
    val_encodings["input_ids"],
    val_encodings["attention_mask"],
    np.column_stack(
        (Y_shaming_val, Y_stereotype_val, Y_objectification_val, Y_violence_val)
    ),
)

In [42]:
# Save the model weights at the end of every epoch; the epoch number is
# formatted into the file name. `epoch=1` is not a ModelCheckpoint argument
# (it is ignored or rejected depending on the Keras version); the supported
# way to request once-per-epoch saving is save_freq='epoch'.
checkpoint = ModelCheckpoint(filepath='/content/vit-bert.{epoch:03d}.h5',
                             verbose=0,
                             save_weights_only=True,
                             save_freq='epoch')

In [None]:
# Class balance of each label within the training split.
for train_labels in (
    Y_shaming_train,
    Y_stereotype_train,
    Y_objectification_train,
    Y_violence_train,
):
    print(Counter(train_labels))

In [None]:
# Training configuration: Adam at 4e-5, and binary cross-entropy on
# probabilities (the model outputs are sigmoid activations, hence
# from_logits=False).
optimizer = keras.optimizers.Adam(learning_rate=4e-5)
loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metric = [
    tf.metrics.BinaryAccuracy(),
    tf.metrics.Precision(),
    tf.metrics.Recall(),
]

# (80,) matches the max_length used when tokenising the text.
model = test((80,))
model.compile(
    optimizer=optimizer,
    loss=loss_fun,
    metrics=metric,
)

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# Render the model graph as a diagram.
tf.keras.utils.plot_model(model)

In [47]:
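# NOTE(review): training is disabled here, and no fit() or load_weights() call
# is visible elsewhere in this notebook. Unless weights are restored some other
# way, the predictions made further down come from a model whose task-specific
# layers are still randomly initialised. Re-enable this cell (or load a saved
# checkpoint) before running the test-set cells.
# history = model.fit(
#     dataset_train,
#     callbacks = [evaluation_metric, checkpoint],
#     epochs=10
# )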

In [48]:
# import gc
# del images_features_val
# gc.collect()

<h6>Test Set</h6>

In [49]:
# Read every line of the test file as ONE string column (column 0): the
# separator 'delimiter' is presumably absent from the data, and the real
# tab-separated fields are split out in a later cell. skiprows=1 drops the
# first line. A multi-character separator is treated as a regex, which only
# the python parser engine supports; naming the engine explicitly avoids the
# "falling back to the 'python' engine" ParserWarning.
df_test = pd.read_csv('/content/test.csv', sep='delimiter', header=None, skiprows=1, engine='python')

  return func(*args, **kwargs)


In [None]:
# Print a concise summary (index range, column dtypes, non-null counts) of the raw test frame.
df_test.info()

In [None]:
# Split each raw test line on tabs into an array of string fields, then stack
# the per-row arrays into one 2-D string array (rows x fields).
test_data = np.asarray(
    df_test[0].apply(lambda raw_line: np.array(raw_line.split('\t'), dtype='str'))
)
test_array = np.stack(test_data)
print(test_array.shape)

In [52]:
# Column 0 of the test array is used as the image file name, column 1 as the text.
filenames_test = test_array[:,0]
text_test = test_array[:,1]

In [53]:
# Apply the same two cleaning steps used for the training text:
# slang expansion first, then the ekphrasis pipeline.
text_test = (
    pd.Series(text_test)
    .apply(chat_words_conversion)
    .apply(ekphrasis_pipe)
)

In [54]:
def image_processing_test(filename):
  """Load one test JPEG and convert it to a channel-first, normalised tensor.

  The file is read from '/content/images_test/test/', decoded with three
  channels, resized to 224x224 (bilinear), scaled from [0, 255] to [-1, 1]
  and returned as a float32 tensor of shape (3, 224, 224).
  """
  raw_bytes = tf.io.read_file('/content/images_test/test/' + filename)
  pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
  pixels = tf.image.resize(pixels, [224,224], method='bilinear')
  pixels = tf.cast(pixels, tf.float32)
  # (x / 255 - 0.5) / 0.5, written as one nested expression
  pixels = tf.math.divide(
      tf.math.subtract(tf.math.divide(pixels, 255.0), 0.5),
      0.5,
  )
  # (height, width, channels) -> (channels, height, width)
  return tf.experimental.numpy.moveaxis(pixels, -1, 0)

In [55]:
# Pre-load every test image into memory for prediction.
# The buffer is sized from filenames_test rather than a hard-coded 1000, so a
# test file with a different number of rows neither raises IndexError nor
# leaves images unprocessed. float32 matches both the dtype produced by
# image_processing_test and the model's image input, and halves the memory of
# the default float64 buffer.
images_features_test = np.zeros((len(filenames_test), 3, 224, 224), dtype=np.float32)
for i, filename in enumerate(filenames_test):
  images_features_test[i] = image_processing_test(filename)

In [None]:
# Tokenise the cleaned test text with the same settings as the training text
# (fixed length 80, truncation, padding, TF tensors).
test_encodings = tokenizer(
    list(text_test),
    max_length=80,
    truncation=True,
    padding="max_length",
    return_tensors='tf',
)
print(np.shape(test_encodings["input_ids"]))

In [58]:
# Run inference on the whole test set. The model has four outputs, and the
# result is indexed per task (answer[0] .. answer[3]) in get_answer_list.
answer_test = model.predict([images_features_test, test_encodings["input_ids"], test_encodings["attention_mask"]])

In [None]:
def get_answer_list(answer):
  """Threshold the per-task probabilities at 0.5 and print the label counts.

  Args:
    answer: Sequence of four arrays (shame, stereotype, objectification,
      violence), each of shape (n_samples, 1), holding sigmoid outputs.

  Returns:
    Integer array of shape (4, n_samples, 1) with the rounded 0/1 labels.
  """
  print(np.shape(answer))
  # The builtin int replaces the np.int alias, which was removed in NumPy 1.24
  # (it raised AttributeError there); both select the same integer dtype.
  final = np.round(answer).astype(int)
  for task_index, task_name in enumerate(("Shame", "Stereotype", "Objectify", "Violence")):
    print(task_name)
    print(Counter(final[task_index][:,0]))
  return final

In [59]:
# Round the test predictions to 0/1 labels and print the per-task label counts.
final_answer = get_answer_list(answer_test)

In [60]:
# Label distribution of the full labelled set, for comparison with the
# predicted distribution printed above.
for reference_labels in (Y_shaming, Y_stereotype, Y_objectification, Y_violence):
    print(Counter(reference_labels))

In [61]:
# Names used by the submission-writing cell: the rounded predictions and the
# test file names as a plain Python list.
answer_list = final_answer
answer_names = list(filenames_test)

In [None]:
# Write the submission file: one tab-separated line per test image holding the
# file name, a literal '0' second column, and the four predicted labels
# (shame, stereotype, objectification, violence). Lines are joined with '\n'
# so the file has no trailing newline. The row count comes from answer_names
# instead of the hard-coded 999/1000 indices, so any test-set size works.
with open('answer.txt', 'w') as outf:
  rows = []
  for i, name in enumerate(answer_names):
    predictions = [str(answer_list[task][i,0]) for task in range(4)]
    rows.append('\t'.join([name, '0'] + predictions))
  outf.write('\n'.join(rows))

In [None]:
import zipfile

# Package the submission. The archive is opened in a `with` block so it is
# closed deterministically and the zip central directory is written; the
# one-liner left the ZipFile object open.
with zipfile.ZipFile('answer.zip', mode='w') as archive:
  archive.write("answer.txt")