# Introduction


We wanted to develop a model to identify harmful multimodal content. This content combines different modalities, such as text and images, making it difficult for machines to understand.

When viewing a meme, for example, we don’t consider the words and the photo independently of each other; we understand their combined meaning. Hence, we could not simply perform classification on the text or the image alone.


In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
from tensorflow.keras.utils import load_img
import re
import string 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import Sequential
from keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Activation, Dropout
from tensorflow.keras.layers import Conv1D, Embedding, GlobalAveragePooling1D 
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.preprocessing import image

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications import VGG16

from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer



from keras.applications import VGG16
from keras import models
from keras import layers
from keras.applications.imagenet_utils import preprocess_input
from keras import optimizers
from keras import metrics
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True


### Data Sources


We used the Memotion dataset, which contains 7K annotated memes with human-annotated tags: the overall sentiment and the type of humour (sarcastic, humorous, or offensive).

In [None]:
# Load the Memotion 7k labels CSV into a DataFrame and preview its first rows.
df = pd.read_csv('/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/labels.csv')
df.head()

In [None]:
# Drop the 'Unnamed: 0' column (presumably a saved index - TODO confirm) and the
# humour / sarcasm / offensive / motivational annotation columns; every other
# column of `df` is kept in `data`.
data = df.drop(['Unnamed: 0', 'humour', 'sarcasm', 'offensive', 'motivational'],axis = 1)
data.head()

In [None]:
# Show how many rows carry each overall_sentiment value.
data.overall_sentiment.value_counts()

In [None]:
# Collapse the five sentiment annotations into a three-class target:
#   0 = neutral
#   1 = positive / very_positive
#   2 = negative / very_negative
task_a_labels = dict(
    neutral=0,
    positive=1,
    very_positive=1,
    negative=2,
    very_negative=2,
)

# Values of overall_sentiment that are not keys of the mapping become NaN.
data['target'] = data['overall_sentiment'].map(task_a_labels)
data['target'].value_counts()

In [None]:
# Discard the text_ocr column, then remove every row that still contains a
# missing value in any remaining column.
data = data.drop(columns=['text_ocr']).dropna()
data

In [None]:
# Lazily-filled cache of the NLTK resources shared by every preprocess_text() call.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the cached (stop-word set, stemmer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['stemmer'] = PorterStemmer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['stemmer']


def preprocess_text(text):
    """Normalise one caption into a space-separated string of stemmed tokens.

    Steps, in order: lowercase, strip http(s) URLs, strip HTML markup,
    remove every character that is not an ASCII letter or whitespace,
    tokenise, drop English stop words, Porter-stem the remaining tokens.

    Args:
        text: The raw caption; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The cleaned caption as a single string; empty if nothing survives.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs and hyperlinks
    text = re.sub(r'https?:\/\/[^\s]+', '', text)

    # Remove HTML tags and special characters
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # The stop-word list and the stemmer do not depend on `text`; reuse the
    # cached instances instead of rebuilding them for every caption.
    stop_words, stemmer = _text_resources()

    # Tokenize, filter stop words, stem, and join back into a single string
    return ' '.join(
        stemmer.stem(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Clean every caption of the text_corrected column.
preprocessed_texts = list(map(preprocess_text, data.text_corrected))

# Fit the tokenizer vocabulary on the cleaned captions (num_words=30000).
tokenizer = Tokenizer(num_words=30000)
tokenizer.fit_on_texts(preprocessed_texts)

# Encode each caption as a list of word indices.
sequences = tokenizer.texts_to_sequences(preprocessed_texts)

# word -> index mapping learned by the tokenizer.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to a fixed length of 100.
padded_sequences = pad_sequences(sequences, maxlen=100)
print('Shape of data tensor:', padded_sequences.shape)



In [None]:
# Store the cleaned captions next to the originals; the data generator reads
# this 'clean_text' column when building text batches.
data['clean_text'] = preprocessed_texts
data.head()

In [None]:
# Parse the GloVe text file into {word: float32 vector}. Each line is split on
# whitespace: the first field is the word, the remaining fields its coefficients.
embeddings_index = {}
# `with` closes the handle even if a line fails to parse, and the explicit
# encoding keeps decoding independent of the platform's default locale.
with open('/kaggle/input/glove6b300dtxt/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [None]:
# Build the (vocabulary size + 1, 300) weight matrix for the embedding layer.
# Row r holds the pretrained vector of the word whose tokenizer index is r;
# words missing from embeddings_index keep an all-zero row.
embedding_dim = 300
embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
for token, row in word_index.items():
    vector = embeddings_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Build each meme's absolute file path by prefixing the image directory to
# its file name.
data['image_path'] = '/kaggle/input/memotion-dataset-7k/memotion_dataset_7k/images/' + data['image_name']
data.head()

In [None]:
# Files to leave out of the dataset (presumably unreadable or otherwise
# problematic images - TODO confirm the reason).
image_names = [
    'image_121.jpg',
    'image_4802.png',
    'image_6786.jpg',
    'image_6790.jpg',
    'image_6792.jpg',
]
excluded = data['image_name'].isin(image_names)
data = data[~excluded]
data.shape

In [None]:
# Preview the first rows of `data` after the exclusion above.
data.head()

In [None]:
# Store the class ids as strings.
# NOTE(review): the data generator hands these values straight to
# to_categorical, which needs them to parse as integers - confirm the column
# holds '0'/'1'/'2' and not float-formatted strings such as '2.0'.
data['target'] = data['target'].astype(str)

In [None]:
# Class balance of the target column.
data.target.value_counts()

In [None]:
import pandas as pd
import numpy as np
import keras
from keras.utils import to_categorical
from tensorflow.keras.utils import load_img, img_to_array
from keras.preprocessing.text import Tokenizer

class CustomDataGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` that yields ``([images, texts], labels)`` batches.

    For every row of the batch:
      * the file at ``image_path`` is loaded, resized to ``image_size`` and
        scaled by 1/255;
      * ``clean_text`` is encoded with ``tokenizer`` and post-padded to
        ``max_text_len``;
      * ``target`` is one-hot encoded over ``num_classes`` columns.

    Args:
        dataframe: Frame with 'image_path', 'clean_text' and 'target' columns.
        image_size: Target size passed to ``load_img``.
        batch_size: Maximum number of rows per batch (the last may be smaller).
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_text_len: Length every encoded caption is padded/truncated to.

    NOTE: ``num_classes`` is the number of distinct ``target`` values found in
    ``dataframe``; a split that misses a class yields narrower label arrays.
    """

    def __init__(self, dataframe, image_size, batch_size, tokenizer, max_text_len):
        # Let the Keras base class initialise its own state; recent Keras
        # versions warn when a subclass skips this call.
        super().__init__()
        self.df = dataframe
        self.image_size = image_size
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        self.max_text_len = max_text_len
        self.num_classes = len(self.df.target.unique())
        # Positional row order used to slice batches out of the frame.
        self.indexes = np.arange(len(self.df))

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.df) / self.batch_size))

    def _load_image(self, path):
        """Load one image, resize it to ``image_size`` and scale it by 1/255."""
        pixels = img_to_array(load_img(path, target_size=self.image_size))
        return pixels / 255.0

    def __getitem__(self, index):
        """Build batch number ``index``.

        Returns:
            ``([images, texts], labels)``: stacked image arrays, padded token
            id arrays, and one-hot labels for the rows of the batch.
        """
        start = index * self.batch_size
        batch_df = self.df.iloc[self.indexes[start:start + self.batch_size]]

        images = np.array([self._load_image(path) for path in batch_df['image_path']])

        # Encode and pad the whole batch in one call instead of row by row.
        sequences = self.tokenizer.texts_to_sequences(batch_df['clean_text'].tolist())
        texts = pad_sequences(sequences, maxlen=self.max_text_len, padding='post')

        labels = to_categorical(batch_df['target'].tolist(), num_classes=self.num_classes)

        return [images, texts], labels


In [None]:
# Display the frame that is about to be split into train and validation sets.
data

In [None]:
# Settings shared by both generators.
IMAGE_SIZE, BATCH_SIZE, MAX_TEXT_LEN = (150, 150), 32, 100

# Seeded 80/20 split: sample the training rows, then keep every row whose
# index label was not sampled as the validation set.
train_df = data.sample(frac=0.8, random_state=42)
val_df = data[~data.index.isin(train_df.index)]

# One generator per split, built with identical settings.
train_generator, val_generator = (
    CustomDataGenerator(frame, IMAGE_SIZE, BATCH_SIZE, tokenizer, MAX_TEXT_LEN)
    for frame in (train_df, val_df)
)


In [None]:
# Human-readable name for each class id
class_dict = {0: 'neutral', 1: 'positive', 2: 'negative'}

# Pick one training batch at random; x_batch is [images, texts].
x_batch, y_batch = train_generator[np.random.randint(0, len(train_generator))]
batch_images = x_batch[0]

# The final batch of the generator can be shorter than the others, so never
# index past what was actually returned.
n_show = min(20, len(batch_images))

# Display the images and their labels on a 5x4 grid
plt.figure(figsize=(20,20))
for i in range(n_show):
    plt.subplot(5,4,i+1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(batch_images[i], cmap=plt.cm.binary)
    # y_batch rows are one-hot, so argmax recovers the class id.
    plt.xlabel(class_dict[int(np.argmax(y_batch[i]))])
plt.show()

In [None]:
from keras.optimizers import Adam
from keras import backend as K

def recall(y_true, y_pred):
    """Batch recall: rounded true positives over rounded actual positives.

    Both tensors are clipped to [0, 1] and rounded before summing, and the
    denominator is offset by ``K.epsilon()`` so it is never zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual + K.epsilon())


def precision(y_true, y_pred):
    """Batch precision: rounded true positives over rounded predicted positives.

    Both tensors are clipped to [0, 1] and rounded before summing, and the
    denominator is offset by ``K.epsilon()`` so it is never zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (predicted + K.epsilon())


def f1(y_true, y_pred):
    """Batch F1 score: harmonic mean of ``precision`` and ``recall``.

    Args:
        y_true: Ground-truth label tensor.
        y_pred: Predicted score tensor of the same shape.

    Returns:
        Scalar tensor ``2 * p * r / (p + r)``.
    """
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    # Without the epsilon a batch with no true positives gives 0 / 0 = NaN,
    # which would also poison the per-epoch average of this metric.
    return 2 * ((p * r) / (p + r + K.epsilon()))

In [None]:
from keras.callbacks import ModelCheckpoint

# Save to 'best_weights.h5' whenever the monitored 'val_f1' value reaches a new
# maximum (mode='max', save_best_only=True); verbose=1 logs each save.
# NOTE(review): a callback only runs if it is passed to the training call via
# `callbacks=[checkpoint]`, and 'val_f1' only exists if a metric named `f1` is
# compiled into the model and validation data is supplied.
checkpoint = ModelCheckpoint(filepath='best_weights.h5',
                             monitor='val_f1',
                             save_best_only=True,
                             mode='max',
                             verbose=1)

In [None]:
########################################################## - img CNN
# Image branch: frozen ImageNet VGG16 convolutional base followed by two
# trainable dense layers.
input_img = Input(shape=(150,150,3))
# Kept under its own name so the final two-input `model` below does not
# overwrite (and get confused with) the backbone.
vgg_base = VGG16(weights='imagenet', include_top=False)
for layer in vgg_base.layers:
    layer.trainable = False
x = vgg_base(input_img)
flatten = Flatten()(x)
flatten = Dense(1024, activation='relu')(flatten)
flatten = Dense(512, activation='relu')(flatten)

########################################################## - text CNN
# Text branch: frozen pretrained embeddings -> three 1-D convolutions ->
# three stacked bidirectional LSTMs -> dense projection.
input_txt = Input(shape=(100,), dtype='int32')
print(input_txt)
# NOTE(review): Masking is applied to the integer ids before the Embedding
# layer and the Conv1D layers that follow do not propagate masks - confirm
# this masking has the intended effect.
txt = layers.Masking(mask_value=0)(input_txt)
txt = layers.Embedding(len(word_index) + 1,
                            300,
                            weights=[embedding_matrix],
                            input_length=100,
                            trainable=False)(txt)

txt = layers.Conv1D(32, 5)(txt)
txt = layers.Conv1D(60, 4)(txt)
txt = layers.Conv1D(100, 3)(txt)
text_lstm = layers.Bidirectional(layers.LSTM(30,return_sequences=True))(txt)
text_lstm = layers.Bidirectional(layers.LSTM(30,return_sequences=True))(text_lstm)
text_lstm = layers.Bidirectional(layers.LSTM(30,return_sequences=False))(text_lstm)
text_lstm = Dense(512, activation='relu')(text_lstm)
# Fuse the 512-d text and 512-d image representations.
merged = keras.layers.concatenate([text_lstm,flatten], axis=1)

################################################################# - final bimodal combination
# Classification head over the fused features; 3-way softmax output.
dense = Dense(1024, activation='relu')(merged)
dense = Dropout(0.1)(dense)
dense = Dense(512, activation='relu')(dense)
dense = Dense(256, activation='relu')(dense)
dense = Dense(128, activation='relu')(dense)
dense = Dense(3, activation='softmax')(dense)
model = Model(inputs=(input_img,input_txt), outputs=dense)
model.summary()
# `learning_rate` is the supported spelling; `lr` is a deprecated alias.
model.compile(loss='categorical_crossentropy',
              optimizer=keras.optimizers.Adam(learning_rate=2e-5),
              metrics=["accuracy",f1,recall,precision])

# `fit` accepts Sequence generators directly (`fit_generator` is deprecated).
# The checkpoint callback has to be passed here: otherwise 'best_weights.h5'
# is never written and loading it after training fails.
history = model.fit(
    train_generator,
    steps_per_epoch=20,
    epochs=20,
    validation_data=val_generator,
    validation_steps=23,
    callbacks=[checkpoint])


In [None]:
# Per-epoch curves recorded during training.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))

# One row per panel: axis, training curve, validation curve, legend labels,
# title and y-axis label (left = accuracy, right = loss).
panels = [
    (ax1, acc, val_acc, 'Training acc', 'Validation acc',
     'Training and validation accuracy', 'Accuracy'),
    (ax2, loss, val_loss, 'Training loss', 'Validation loss',
     'Training and validation loss', 'Loss'),
]
for axis, train_curve, val_curve, train_label, val_label, title, y_label in panels:
    axis.plot(epochs, train_curve, 'b', label=train_label)
    axis.plot(epochs, val_curve, 'g', label=val_label)
    axis.set_title(title)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(y_label)
    axis.legend()

plt.show()

In [None]:
# Per-epoch recall and F1 curves recorded during training.
train_recall = history.history['recall']
val_recall = history.history['val_recall']
train_f1 = history.history['f1']
val_f1 = history.history['val_f1']

epochs = range(1, len(train_recall) + 1)

# Recall on the left panel, F1 score on the right panel.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))

panels = [
    (ax1, train_recall, val_recall, 'Training Recall', 'Validation Recall',
     'Training and Validation Recall', 'Recall'),
    (ax2, train_f1, val_f1, 'Training F1 Score', 'Validation F1 Score',
     'Training and Validation F1 Score', 'F1 Score'),
]
for axis, train_curve, val_curve, train_label, val_label, title, y_label in panels:
    axis.plot(epochs, train_curve, 'b', label=train_label)
    axis.plot(epochs, val_curve, 'r', label=val_label)
    axis.set_title(title)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(y_label)
    axis.legend()

plt.show()


In [None]:
# Number of validation batches as reported by the generator.
n_val_batches = len(val_generator)
# Sample slots across those batches; an upper bound on the real row count,
# since the last batch may be partial.
total_samples = n_val_batches * BATCH_SIZE
# Dividing the slots by the batch size gives back the batch count.
steps = total_samples // BATCH_SIZE
print(steps)

In [None]:
# Restore the weights stored in 'best_weights.h5', the path the ModelCheckpoint
# callback is configured to write; the file must exist before this runs.
model.load_weights('best_weights.h5')

In [None]:
# Evaluate on every validation batch. `evaluate` accepts Sequence generators
# directly (`evaluate_generator` is deprecated), and taking the step count
# from the generator keeps it correct if the split or batch size changes.
a = model.evaluate(val_generator, steps=len(val_generator))
print(a)

In [None]:
# Pair every metric name with its evaluation result and print each pair.
for metric_name, metric_value in zip(model.metrics_names, a):
    print((metric_name, metric_value))

In [None]:
# Human-readable name for each class id
class_dict = {0: 'neutral', 1: 'positive', 2: 'negative'}

# Get predicted labels (argmax over the softmax outputs)
y_pred = model.predict(val_generator)
y_pred_labels = np.argmax(y_pred, axis=1)

# Get true labels; the target column is stored as strings, so cast back to
# int to make it comparable with the predictions and usable as a dict key.
y_true_labels = val_df['target'].astype(int).values

# Get image paths and text for display
image_paths = val_df['image_path'].values
texts = val_df['clean_text'].values

# Display some examples (at most 10, fewer if the validation set is smaller)
for i in range(min(10, len(image_paths))):
    # Load image
    image = load_img(image_paths[i], target_size=IMAGE_SIZE)

    # Translate predicted and true class ids into their names
    pred_label = class_dict[int(y_pred_labels[i])]
    true_label = class_dict[int(y_true_labels[i])]

    # Get text
    text = texts[i]

    # Print predicted and true labels, and text
    print('Image:', image_paths[i])
    print('Predicted label:', pred_label)
    print('True label:', true_label)
    print('Text:', text)
    print('')

    # Show image
    plt.imshow(image)
    plt.axis('off')
    plt.show()
