In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
# 4.15.0
!pip install transformers==4.15.0
!pip install ekphrasis
!pip install emoji

In [None]:
# Mount Google Drive at /content/drive; the zip archives read later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import transformers
from transformers import ViTFeatureExtractor, TFViTModel, TFViTForImageClassification, BertTokenizerFast, TFBertModel
from PIL import Image

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from collections import Counter
from tensorflow.keras.callbacks import ModelCheckpoint

import pandas as pd
import numpy as np
import os

transformers.__version__

In [5]:
import zipfile

# Unpack the training images from Drive into /content/images.
# makedirs(exist_ok=True) keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError when the directory is already there from an earlier run.
os.makedirs('images', exist_ok=True)
with zipfile.ZipFile('/content/drive/MyDrive/mami/training.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/images')

In [None]:
# Read every line of the file as ONE string column: the separator 'delimiter'
# is presumably a token that never occurs in the data, so pandas splits
# nothing here and the tab-separated fields are split by hand in a later cell.
# Multi-character separators are only supported by the Python parser engine;
# requesting it explicitly avoids the ParserWarning about falling back from
# the C engine. skiprows=1 drops the first line (presumably the header row).
df = pd.read_csv('/content/training.csv', sep='delimiter', header=None, skiprows=1, engine='python')

In [None]:
# Summarise the raw frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Column 0 of df holds one raw line per sample. Split each line on tabs into
# an array of string fields, then stack those per-row arrays into a single
# 2-D string array (rows = samples, columns = fields).
train_data = np.asarray(
    df[0].apply(lambda line: np.array(line.split('\t'), dtype='str'))
)
train_array = np.stack(train_data)
print(train_array.shape)

In [None]:
# Pick the fields used by this notebook out of each parsed row
# (meanings inferred from the variable names): column 0 = image file name,
# column 6 = text, column 1 = misogyny label.
filenames = train_array[:,0]
text = train_array[:,6]
# The labels are parsed as strings; convert them to integers. The builtin
# `int` is used because the `np.int` alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24 (it was only ever an alias for the builtin).
Y_misogyny = train_array[:,1].astype(int)
# Show the class balance as the cell output.
Counter(Y_misogyny)

<h6>Train/validation split</h6>

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 5% of the samples for validation. File names, texts and labels are
# split together so they stay aligned; the fixed random_state makes the split
# repeatable.
(
    filenames_train, filenames_val,
    text_train, text_val,
    Y_train_misogyny, Y_val_misogyny,
) = train_test_split(
    filenames,
    text,
    Y_misogyny,
    test_size=0.05,
    random_state=3,
)

In [None]:
# Sanity-check the split: shape (and container type) of each training array,
# then the shape of each validation array.
print("Train")
for train_part in (filenames_train, text_train, Y_train_misogyny):
    print(np.shape(train_part), type(train_part))
print("Val")
for val_part in (filenames_val, text_val, Y_val_misogyny):
    print(np.shape(val_part))

<h6>Text pre-processing</h6>

In [13]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
# Configure the ekphrasis pre-processor that ekphrasis_pipe() applies to every
# text: it normalises the listed entity types, annotates the listed phenomena,
# unpacks hashtags/contractions and tokenises with a lowercasing SocialTokenizer.
text_processor = TextPreProcessor(
    # terms that will be normalized
    # NOTE(review): 'url' is listed twice below — confirm whether the
    # duplicate entry is intentional.
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

In [15]:
def print_text(texts,i,j):
    """Print texts[i] through texts[j-1], each followed by one blank line."""
    for position in range(i, j):
        # end="\n\n" terminates the entry and emits the blank separator line.
        print(texts[position], end="\n\n")

In [None]:
# Preview the first five training texts, a separator line, then the first
# five validation texts.
print_text(text_train,0,5)
print("##############################################################################################################")
print_text(text_val,0,5)

In [17]:
# Functions for chat word conversion
# Build the slang lookup from slang.txt, where each non-empty line has the
# form ABBREVIATION=expansion.
# The `with` block guarantees the file handle is closed after reading.
with open("slang.txt", "r") as f:
    chat_words_str = f.read()

# Maps each abbreviation to its expansion.
chat_words_map_dict = {}
for line in chat_words_str.split("\n"):
    fields = line.split("=")
    # Blank lines and lines without an '=' carry no mapping; skip them rather
    # than failing with an IndexError.
    if len(fields) < 2:
        continue
    cw, cw_expanded = fields[0], fields[1]
    chat_words_map_dict[cw] = cw_expanded

# Set of all known abbreviations, used for fast membership tests.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Expand chat abbreviations in `text`.

    The text is split on whitespace; every token whose upper-cased form is in
    `chat_words_list` is replaced by its entry in `chat_words_map_dict`, all
    other tokens are kept as they are. The tokens are re-joined with single
    spaces.
    """
    return " ".join(
        chat_words_map_dict[token.upper()] if token.upper() in chat_words_list else token
        for token in text.split()
    )

In [18]:
# Wrap the text arrays in pandas Series so the cleaning steps that follow can
# be applied with Series.apply.
text_train = pd.Series(text_train)
text_val = pd.Series(text_val)

In [None]:
# Chat word conversion
# Expand chat abbreviations in every training text and preview the first five.
text_train = text_train.apply(chat_words_conversion)
print_text(text_train,0,5)

print("********************************************************************************")

# Same treatment for the validation texts.
text_val = text_val.apply(chat_words_conversion)
print_text(text_val,0,5)

# Test set
# text_test = text_test.apply(lambda text: chat_words_conversion(text))
# print_text(text_test,0,10)

In [20]:
def ekphrasis_pipe(sentence):
    """Pre-process `sentence` with the global `text_processor` and return the
    resulting tokens joined by single spaces."""
    tokens = text_processor.pre_process_doc(sentence)
    return " ".join(tokens)

In [None]:
# Run the ekphrasis cleaning over the training texts...
text_train = text_train.apply(ekphrasis_pipe)
print("Training set completed.......")
# ...and over the validation texts.
text_val = text_val.apply(ekphrasis_pipe)
print("Validation set completed.......")
#Test set
# text_test = text_test.apply(lambda text: ekphrasis_pipe(text))
# print("Test set completed.......")

In [None]:
# Number of space-separated tokens in each cleaned training text.
sentence_lengths = [len(sentence.split(" ")) for sentence in text_train]
# Show the 500 largest lengths, then the number of texts.
print(sorted(sentence_lengths)[-500:])
print(len(sentence_lengths))

<h6>Text processing complete</h6>

In [32]:
from transformers import RobertaTokenizerFast, TFBertModel, TFRobertaModel, MPNetTokenizerFast, TFMPNetModel, ElectraTokenizerFast, TFElectraModel, XLNetTokenizerFast, TFXLNetModel, AlbertTokenizerFast, TFAlbertModel, DebertaTokenizer, TFDebertaModel

In [None]:
# Use the tokenizer as per the model being used
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [None]:
# Tokenise both splits with identical settings: every text is truncated or
# padded to exactly 80 tokens and the result is returned as TensorFlow tensors.
encode_options = dict(max_length=80, truncation=True, padding="max_length", return_tensors='tf')

train_encodings = tokenizer(list(text_train), **encode_options)
print(np.shape(train_encodings["input_ids"]))

val_encodings = tokenizer(list(text_val), **encode_options)
print(np.shape(val_encodings["input_ids"]))

<h6>Image features pipeline</h6>

In [26]:
def image_processing(filename, image_dir='/content/images/TRAINING/'):
  """Load one JPEG and convert it to the channels-first tensor the model consumes.

  Args:
    filename: Image file name, relative to `image_dir`.
    image_dir: Directory prefix (including the trailing slash) prepended to
      `filename`. Defaults to the training image folder; pass another folder
      to reuse the same preprocessing for a different split.

  Returns:
    A float32 tensor of shape (3, 224, 224) with pixel values scaled to [-1, 1].
  """
  # Read Image (Change path as per the directory structure)
  image_string = tf.io.read_file(image_dir + filename)
  img = tf.io.decode_jpeg(image_string, channels=3)
  # Resize image
  img = tf.image.resize(img, [224,224], method='bilinear')
  # Normalise image: [0, 255] -> [0, 1] -> [-0.5, 0.5] -> [-1, 1]
  img = tf.cast(img, tf.float32)
  img = tf.math.divide(img, 255.0)
  img = tf.math.subtract(img, 0.5)
  img = tf.math.divide(img, 0.5)
  # Move channel axis from last to first: (224, 224, 3) -> (3, 224, 224)
  img = tf.experimental.numpy.moveaxis(img, -1, 0)

  return img

In [27]:
batch_size = 16 # for tpu 128
def configure_for_performance(ds):
  """Return `ds` shuffled (buffer of 1000 elements), batched by the global
  `batch_size`, and prefetched with an auto-tuned buffer. Caching is left
  disabled."""
  shuffled = ds.shuffle(buffer_size=1000)
  batched = shuffled.batch(batch_size)
  return batched.prefetch(buffer_size=tf.data.AUTOTUNE)

In [28]:
def get_dataset(filenames, encodings, Y_misogyny):
  """Assemble a tf.data dataset of ((image, input_ids, attention_mask), label).

  Args:
    filenames: Image file names; each is loaded through image_processing().
    encodings: Tokenizer output providing "input_ids" and "attention_mask".
    Y_misogyny: Labels, aligned element-wise with the other inputs.

  Returns:
    An unbatched Dataset whose elements are (features, label), where features
    is the tuple (image tensor, input ids, attention mask).
  """
  from_slices = Dataset.from_tensor_slices

  # Images are decoded lazily, one file name at a time.
  image_features = from_slices(filenames).map(image_processing)
  text_features = from_slices(encodings["input_ids"])
  text_masks = from_slices(encodings["attention_mask"])
  labels_misogyny = from_slices(Y_misogyny)

  features = Dataset.zip((image_features, text_features, text_masks))
  return Dataset.zip((features, labels_misogyny))

In [29]:
# Build the training dataset and apply shuffling, batching and prefetching.
dataset_train = configure_for_performance(
    get_dataset(filenames_train, train_encodings, Y_train_misogyny)
)

In [30]:
# Pre-compute the image tensors for the whole validation split.
# The buffer is sized from the split itself instead of a hard-coded 500, so it
# always matches the validation labels and encodings, and it is stored as
# float32 (the dtype image_processing produces) to halve memory compared with
# NumPy's default float64.
n_val = len(filenames_val)
images_features_val = np.zeros((n_val, 3, 224, 224), dtype=np.float32)
for i in range(n_val):
  images_features_val[i] = image_processing(filenames_val[i])

<h6>Simple Concatenation based model</h6>

In [31]:
# # Concat Model
# def mami_concat(input_shape):
#     img_input = keras.Input(shape=(3, 224, 224), dtype='float32')
#     txt_input = keras.Input(shape=input_shape, dtype='int32')
#     input_masks = keras.Input(shape=input_shape, dtype='int32')

#     # Text
#     model_txt = TFMPNetModel.from_pretrained("microsoft/mpnet-base")
#     layer_txt = model_txt.layers[0]
#     embeddings_txt = layer_txt([txt_input, input_masks])[0][:,0,:]
#     embeddings_txt = keras.layers.Dense(256,activation='relu')(embeddings_txt)

#     # Images
#     model_images = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
#     embeddings_img = model_images(img_input)[0][:,0,:]
#     embeddings_img = keras.layers.Dense(256,activation='relu')(embeddings_img)
    
#     X =  keras.layers.Concatenate()([embeddings_txt, embeddings_img])

#     X = keras.layers.Dense(128,activation='relu')(X)

#     X = keras.layers.BatchNormalization()(X)

#     X = keras.layers.Dense(64,activation='relu')(X)
    
#     X = keras.layers.Dense(1,activation='sigmoid')(X)
    
#     model = keras.Model(inputs=[img_input, txt_input, input_masks], outputs=[X])    
#     return model

<h6>Attention based model</h6>

In [37]:
# Attention Model base models
def mami_attention(input_shape):
    """Build the image/text attention classifier.

    Args:
        input_shape: Per-sample shape of the token-id and attention-mask
            inputs (the notebook calls this with (80,)).

    Returns:
        A keras.Model taking [image, token ids, attention mask] and producing
        a single sigmoid output per sample.
    """
    # Three inputs: channels-first image, token ids, attention mask.
    img_input = keras.Input(shape=(3, 224, 224), dtype='float32')
    txt_input = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')

    # Text
    model_txt = TFBertModel.from_pretrained("bert-base-uncased") # Change model definition as per requirement
    # Call the model's first layer directly on the Keras inputs
    # (presumably the BERT main layer — TODO confirm).
    layer_txt = model_txt.layers[0]
    # [0] takes the first output; presumably the per-token hidden states — TODO confirm.
    text_seq = layer_txt([txt_input, input_masks])[0]

    # Images
    model_images = TFViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
    layer_img = model_images.layers[0]
    # As above, [0] is presumably the per-patch hidden-state sequence — TODO confirm.
    img_seq = layer_img(img_input)[0]
    
    # Attention with the image sequence as query and the text sequence as value.
    joint_features = tf.keras.layers.Attention()([img_seq, text_seq])

    # 1-D convolution over the attended sequence: 32 filters, kernel size 30, stride 15.
    joint_features = tf.keras.layers.Conv1D(32, 30, 15)(joint_features)
    
    joint_features = tf.keras.layers.Flatten()(joint_features)

    X = keras.layers.BatchNormalization()(joint_features)

    X = keras.layers.Dense(64,activation='relu')(X)
    
    # Single sigmoid unit: the binary prediction.
    X = keras.layers.Dense(1,activation='sigmoid')(X)
    
    model = keras.Model(inputs=[img_input, txt_input, input_masks], outputs=[X])    
    return model

In [38]:
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt

class EvaluationMetric(keras.callbacks.Callback):
    """Keras callback that reports validation metrics after every epoch.

    After each epoch the model predicts on the stored validation inputs and
    two classification reports are printed: one using a threshold derived
    from the ROC curve and one using the fixed threshold 0.5.

    Args:
        val_img_features: Validation image tensors.
        val_encodings: Validation token ids.
        val_masks: Validation attention masks.
        Y_val: Ground-truth validation labels.
    """

    def __init__(self, val_img_features, val_encodings, val_masks, Y_val):
        super(EvaluationMetric, self).__init__()
        self.val_encodings = val_encodings
        self.val_masks = val_masks
        self.Y_val = Y_val
        self.val_img_features = val_img_features

    def on_epoch_begin(self, epoch, logs=None):
        """Announce the start of a training epoch."""
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the validation set and print both classification reports."""
        from sklearn.metrics import classification_report

        print("\nEvaluating...")
        val_prediction = self.model.predict([self.val_img_features, self.val_encodings, self.val_masks])
        # Flatten to one score per sample so it can be thresholded in one step.
        scores = np.asarray(val_prediction).reshape(-1)

        # Choose the ROC operating point where fpr + tpr is closest to 1,
        # i.e. where the false-positive rate equals the false-negative rate.
        fpr, tpr, threshold = roc_curve(self.Y_val, scores)
        minima = np.argmin(np.abs(fpr + tpr - 1))
        threshold_final = threshold[minima]
        print("Threshold is:", threshold_final)

        # Labels at the ROC-derived threshold. roc_curve counts a sample as
        # positive when score >= threshold, so the same comparison is used here.
        pred = (scores >= threshold_final).astype(int)
        # Labels at the fixed 0.5 cut-off.
        pred_ = (scores > 0.5).astype(int)

        print("With Threshold")
        print(classification_report(self.Y_val, pred, digits=3))
        print("Without Threshold")
        print(classification_report(self.Y_val, pred_, digits=3))

evaluation_metric = EvaluationMetric(images_features_val, val_encodings["input_ids"], val_encodings["attention_mask"], Y_val_misogyny)

In [None]:
# Build the attention model for 80-token text inputs (the same length the
# tokenizer pads and truncates to).
model = mami_attention((80,))
# Binary-classification training set-up: Adam optimizer, binary cross-entropy
# on probabilities (the model ends in a sigmoid, hence from_logits=False),
# tracking accuracy, precision and recall.
optimizer = keras.optimizers.Adam(learning_rate=4e-5)
loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
metric = [tf.metrics.BinaryAccuracy(), tf.metrics.Precision(), tf.metrics.Recall()]
model.compile(optimizer=optimizer, loss=loss_fun, metrics=metric)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Render the model's layer graph as a diagram.
tf.keras.utils.plot_model(
    model)

In [42]:
# Save the model weights (not the full model) once per epoch; the epoch
# number is formatted into the file name, e.g. /content/bert.001.h5.
# `save_freq='epoch'` is the supported ModelCheckpoint argument for
# per-epoch saving; an `epoch=` keyword is not part of its signature.
checkpoint = ModelCheckpoint(filepath='/content/bert.{epoch:03d}.h5',
                             verbose=0,
                             save_weights_only=True,
                             save_freq='epoch')

In [None]:
# ATTN BERT
history = model.fit(
    dataset_train,
    callbacks = [evaluation_metric, checkpoint],
    epochs=1
)

In [None]:
# model.load_weights("/content/att-vit-bert.003.h5")
# # model.save("/content/att-vit-bert-model.002.h5")

In [44]:
# import gc
# gc.collect()

<h6>Test Set Predictions</h6>

In [67]:
# Restore the weights from /content/bert.001.h5, a file name matching the
# checkpoint callback's pattern for the first epoch.
model.load_weights('/content/bert.001.h5')

In [45]:
import zipfile

# Unpack the test images from Drive into /content/images_test.
# makedirs(exist_ok=True) keeps the cell re-runnable: a plain os.mkdir raises
# FileExistsError when the directory is already there from an earlier run.
os.makedirs('images_test', exist_ok=True)
with zipfile.ZipFile('/content/drive/MyDrive/mami/test.zip', 'r') as zip_ref:
    zip_ref.extractall('/content/images_test')

In [None]:
# Read every line of the test file as ONE string column: the separator
# 'delimiter' is presumably a token that never occurs in the data, so the
# tab-separated fields are split by hand in a later cell. Multi-character
# separators need the Python parser engine; requesting it explicitly avoids
# the ParserWarning about falling back from the C engine. skiprows=1 drops
# the first line (presumably the header row).
df_test = pd.read_csv('/content/test.csv', sep='delimiter', header=None, skiprows=1, engine='python')

In [None]:
# Summarise the raw test frame: column dtypes, non-null counts and memory usage.
df_test.info()

In [None]:
# Column 0 of df_test holds one raw line per sample. Split each line on tabs
# into an array of string fields, then stack the per-row arrays into a single
# 2-D string array.
test_data = np.asarray(
    df_test[0].apply(lambda line: np.array(line.split('\t'), dtype='str'))
)
test_array = np.stack(test_data)
print(test_array.shape)

In [49]:
# Test rows (meanings inferred from the variable names): column 0 = image
# file name, column 1 = text.
filenames_test = test_array[:,0]
text_test = test_array[:,1]

In [50]:
# Clean the test texts with the same two steps used for the training and
# validation texts: chat-abbreviation expansion, then ekphrasis pre-processing.
text_test = (
    pd.Series(text_test)
    .apply(chat_words_conversion)
    .apply(ekphrasis_pipe)
)

In [51]:
def image_processing_test(filename):
  """Load a test JPEG from /content/images_test/test/ and return it as a
  float32 tensor of shape (3, 224, 224) with values scaled to [-1, 1]."""
  raw_bytes = tf.io.read_file('/content/images_test/test/' + filename)
  pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
  pixels = tf.image.resize(pixels, [224,224], method='bilinear')
  pixels = tf.cast(pixels, tf.float32)
  # Scale [0, 255] -> [0, 1], shift to [-0.5, 0.5], then stretch to [-1, 1].
  unit_range = tf.math.divide(pixels, 255.0)
  centred = tf.math.subtract(unit_range, 0.5)
  scaled = tf.math.divide(centred, 0.5)
  # Channels last -> channels first.
  return tf.experimental.numpy.moveaxis(scaled, -1, 0)

In [52]:
# Pre-compute the image tensors for every test sample.
# The buffer is sized from the file-name list instead of a hard-coded 1000,
# and stored as float32 (the dtype image_processing_test produces) to halve
# memory compared with NumPy's default float64.
n_test = len(filenames_test)
images_features_test = np.zeros((n_test, 3, 224, 224), dtype=np.float32)
for i in range(n_test):
  images_features_test[i] = image_processing_test(filenames_test[i])

In [None]:
# Tokenise the test texts: truncate or pad to 80 tokens and return TensorFlow
# tensors, then print the shape of the token-id tensor.
test_encodings = tokenizer(list(text_test), max_length=80, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(test_encodings["input_ids"]))

In [54]:
# Run the model on the test set; inputs are [image tensors, token ids, attention masks].
answer_test = model.predict([images_features_test, test_encodings["input_ids"], test_encodings["attention_mask"]])

In [68]:
def get_answer_list(answer):
  """Convert predicted probabilities into hard 0/1 labels.

  Args:
    answer: 2-D array-like of scores; only the first column is used.

  Returns:
    1-D integer array with 1 where the score is greater than 0.5, else 0.
  """
  # Threshold the argument itself, not the notebook-level `answer_test`.
  answer = np.asarray(answer)
  print(np.shape(answer))
  # Builtin `int` is used: the `np.int` alias was removed in NumPy 1.24.
  final = np.where(answer > 0.5, 1, 0).astype(int)
  answer_list = final[:,0]
  print(Counter(answer_list))
  return answer_list

In [None]:
# Turn the test-set predictions into 0/1 labels.
final_answer = get_answer_list(answer_test)

In [None]:
# Show how many test samples received each predicted label.
Counter(final_answer)

In [61]:
# Labels and the matching image file names, in the same order, for the answer file.
answer_list = final_answer
answer_names = list(filenames_test)

In [62]:
# Write one "<file name>\t<label>" line per test image, with no newline after
# the last line. Joining the lines avoids a hard-coded final index (999), so
# the cell works for any number of predictions.
with open('answer.txt', 'w') as outf:
  outf.write('\n'.join(
      name + '\t' + str(label)
      for name, label in zip(answer_names, answer_list)
  ))

In [64]:
import zipfile

# Package answer.txt into bert.zip. The context manager closes the archive
# explicitly, which is what writes the zip's final records to disk.
with zipfile.ZipFile('bert.zip', mode='w') as archive:
  archive.write("answer.txt")