In [1]:
#Importing all the required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")



In [2]:
#Downloading NLTK libraries
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Use cuda if present
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cpu


In [4]:
#Root folder for data files (for storage and retrieval)
GDRIVE_PROJECT_FOLDER = '/content/gdrive/MyDrive/NLP_Project/'

In [5]:
#Mount the google drive to access data files
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
#Path variables for the train and test data files
train_data = GDRIVE_PROJECT_FOLDER+'train_data_processed1.csv'
test_data = GDRIVE_PROJECT_FOLDER+'test_data_processed1.csv'
test_true = GDRIVE_PROJECT_FOLDER+'Test_Actual_Final.csv'

In [7]:
#Dataset containing the meme ground truth 
true_df = pd.read_csv(test_true)
true_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100
1,1,dr_evil_NDBB96K.png,1_0100_0200
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000


In [8]:
#Extracting the first digit (1, 0 , -1) from Labels 
true_df['Sentiment'] = true_df['Labels'].str.split('_').str[0]
true_df['Sentiment'] = true_df['Sentiment'].astype(int)
true_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels,Sentiment
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100,1
1,1,dr_evil_NDBB96K.png,1_0100_0200,1
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120,1
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121,0
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000,0


In [9]:
#Dataset containing the Train data
train_df = pd.read_csv(train_data, converters={'pre_tokens': eval, 'processed': eval})
train_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"[look, friend, lightyear, sohalikut, trend, pl...","['look', 'there', 'my', 'friend', 'lightyear',...","['look', '', '', 'friend', 'lightyear', '', ''...","['look', 'friend', 'lightyear', 'sohalikut', '...","[look, friend, lightyear, sohalikut, trend, pl..."
1,1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"[best, yearchallenge, complete, less, year, ku...","['the', 'best', 'of', 'yearchallenge', 'comple...","['', 'best', '', 'yearchallenge', 'completed',...","['best', 'yearchallenge', 'completed', 'years'...","[best, yearchalleng, complet, year, kudu, nare..."
2,2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"[sam, thorne, strippin, follow, follow, saw, e...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","[sam, thorn, strippin, follow, follow, saw, po..."
3,3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"[year, challenge, sweet, dee, edition]","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","[year, challeng, sweet, dee, edit]"
4,4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"[year, challenge, filter, hilarious, year, cha...","['year', 'challenge', 'with', 'no', 'filter', ...","['year', 'challenge', '', '', 'filter', 'hilar...","['year', 'challenge', 'filter', 'hilarious', '...","[year, challeng, filter, hilari, year, challen..."


In [10]:
#Checking the class labels balance in training dataset for task 1 (identifying meme as positive/negative/neutral - (1/-1/0))
train_df['overall_sentiment'].value_counts()

 1    4058
 0    2157
-1     615
Name: overall_sentiment, dtype: int64

In [11]:
#Dataset containing the processed text of test data
test_df = pd.read_csv(test_data, converters={'pre_tokens': eval, 'processed': eval})
test_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Image_name,Image_URL,OCR_extracted_text,corrected_text,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,Some magicians can walk on water Chuck Norris...,"[magician, walk, water, chuck, norris, swim, l...","['some', 'magicians', 'can', 'walk', 'on', 'wa...","['', 'magicians', '', 'walk', '', 'water', 'ch...","['magicians', 'walk', 'water', 'chuck', 'norri...","[magician, walk, water, chuck, norri, swim, land]"
1,1,1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,ONE MILLION DOLLARS made on imgur,"[one, million, dollar, make, imgur]","['one', 'million', 'dollars', 'made', 'on', 'i...","['', 'million', 'dollars', '', '', 'imgur']","['million', 'dollars', 'imgur']","[million, dollar, imgur]"
2,2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,Me: Mom can my friend sleep over? Mom: That's ...,"[mom, friend, sleep, mom, fine, boy, growingup...","['me', 'mom', 'can', 'my', 'friend', 'sleep', ...","['', 'mom', '', '', 'friend', 'sleep', '', 'mo...","['mom', 'friend', 'sleep', 'mom', 'fine', 'boy...","[mom, friend, sleep, mom, fine, boi, growingup..."
3,3,3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,"[guy, inherit, mess, whine, foxed, thing, guy,...","['this', 'guy', 'inherited', 'mess', 'did', 'h...","['', 'guy', 'inherited', 'mess', '', '', 'whin...","['guy', 'inherited', 'mess', 'whine', 'foxed',...","[gui, inherit, mess, whine, fox, thing, gui, f..."
4,4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,THREAT: Kim Jong Un allegedly working on multi...,"[threat, kim, jong, un, allegedly, work, multi...","['threat', 'kim', 'jong', 'un', 'allegedly', '...","['threat', 'kim', 'jong', '', 'allegedly', 'wo...","['threat', 'kim', 'jong', 'allegedly', 'workin...","[threat, kim, jong, allegedli, work, multipl, ..."


In [12]:
#Creating a duplicate copy of train dataframe, so that modifications can be done in copy df if needed
train_df_sub = train_df
train_df_sub.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"[look, friend, lightyear, sohalikut, trend, pl...","['look', 'there', 'my', 'friend', 'lightyear',...","['look', '', '', 'friend', 'lightyear', '', ''...","['look', 'friend', 'lightyear', 'sohalikut', '...","[look, friend, lightyear, sohalikut, trend, pl..."
1,1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"[best, yearchallenge, complete, less, year, ku...","['the', 'best', 'of', 'yearchallenge', 'comple...","['', 'best', '', 'yearchallenge', 'completed',...","['best', 'yearchallenge', 'completed', 'years'...","[best, yearchalleng, complet, year, kudu, nare..."
2,2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"[sam, thorne, strippin, follow, follow, saw, e...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","[sam, thorn, strippin, follow, follow, saw, po..."
3,3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"[year, challenge, sweet, dee, edition]","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","[year, challeng, sweet, dee, edit]"
4,4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"[year, challenge, filter, hilarious, year, cha...","['year', 'challenge', 'with', 'no', 'filter', ...","['year', 'challenge', '', '', 'filter', 'hilar...","['year', 'challenge', 'filter', 'hilarious', '...","[year, challeng, filter, hilari, year, challen..."


## Multi class classification with CNN

In [13]:
class CNN(nn.Module):
    '''
      pretrained_embedding - None or pretrained_embedding
                              if none, embedding layer is used create a lookup table to store word embeddings
                              else pretrained word embeddings are used in further layers
      freeze_embedding - set to true when pretrained embeddings are used to freeze training of word embeddings
      vocab_size - size of vocabulary
      filter_sizes - set of filters with size (typically represent n-gram)
      num_classes - the number of output labels
      dropout - for regularization during training phase
    '''
    def __init__(self, pretrained_embedding, freeze_embedding, vocab_size, embed_dim, filter_sizes, num_filters, num_classes, dropout):
        
        super(CNN, self).__init__() 
        
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding, padding_idx=0)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=0, max_norm=5.0)
        
        # list of 1 dimensional convolutionlayer for each of filters with input size of embedding dimension
        # and output size as number of filters of considered filter size
        self.conv1d_list = nn.ModuleList([
            nn.Conv1d(in_channels=self.embed_dim, out_channels=num_filters[i], kernel_size=filter_sizes[i])
              for i in range(len(filter_sizes))
        ])
        
        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)

    '''
      word embeddings of a sentence are identified from embedding layer
      convolution is performed on the embeddings and features and patterns are identified with the help of kernel
      max_pooling is used to identify the max feature from every resultant filter
      all the concatenated max features are fed into the linear layer and softmax activation is applied on output
    '''
    def forward(self, input_ids):
        x_embed = self.embedding(input_ids).float()
        x_reshaped = x_embed.permute(0, 2, 1)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list],
                         dim=1)
        
        x = self.fc(self.dropout(x_fc))
        x = F.softmax(x, dim = 1)
        return x

In [14]:
# Function to get the output tensor
def make_target(label):
    if label == -1:
        return torch.tensor([2], dtype=torch.long, device=device)
    elif label == 0:
        return torch.tensor([0], dtype=torch.long, device=device)
    else:
        return torch.tensor([1], dtype=torch.long, device=device)

In [15]:
'''
  create_word2idx() creates a vocabulary assigning unique integer to every word in vocabulary
'''
def create_word2idx(sentences):
    max_len = 0
    word2idx = {}
    word2idx['<pad>'] = 0
    word2idx['<unk>'] = 1

    idx = 2
    for tokenized_sent in sentences:
        for token in tokenized_sent:
            if token not in word2idx:
                word2idx[token] = idx
                idx += 1

        max_len = max(max_len, len(tokenized_sent))

    return word2idx, max_len

'''
  encode() converts a tokenized sentence in list of indices according to word2idx
'''
def encode(tokenized_sentences, word2idx, max_len):
    
    input_sentences = []
    for tokenized_sent in tokenized_sentences:
        #to make all sentences of equal length, remaining length in sentence from max length is padded
        tokenized_sent += ['<pad>'] * (max_len - len(tokenized_sent))

        input_sent = [word2idx.get(token) for token in tokenized_sent]
        input_sentences.append(input_sent)
    
    return np.array(input_sentences)

Using word2idx and encode methods, the training and test sentences are encoded

In [16]:
labels_task_1 = ['Neutral', 'Positive', 'Negative']
test_df_sub = pd.merge(test_df[['Unnamed: 0','pre_tokens']], true_df[['Unnamed: 0','Sentiment']], on='Unnamed: 0')
test_df_sub.rename(columns = {"Sentiment": "sentiment"}, inplace=True)

print("Building Word --> indices")

train_sentences_1 = train_df_sub['pre_tokens'].to_list()
train_labels_1 = train_df_sub['overall_sentiment'].to_list()
train_labels_1 = [2 if label==-1 else label for label in train_labels_1]
print(len(train_sentences_1), len(train_labels_1))

test_sentences_1 = list(test_df_sub['pre_tokens'])
test_labels_1 = test_df_sub['sentiment'].to_list()
test_labels_1 = [2 if label==-1 else label for label in test_labels_1]

word2idx_1, max_len = create_word2idx(train_sentences_1)

print("\nEncoding sentences")
train_input_sentences_1 = encode(train_sentences_1, word2idx_1, max_len)

print(train_input_sentences_1)

Building Word --> indices
6830 6830

Encoding sentences
[[   2    3    4 ...    0    0    0]
 [  13   14   15 ...    0    0    0]
 [  22   23   24 ...    0    0    0]
 ...
 [ 252  554 9297 ...    0    0    0]
 [ 168 6542  168 ...    0    0    0]
 [ 241 1479  177 ...    0    0    0]]


Using Word2Vec to create pretrained word embedding with window size 5 and embedding dimension as 300

In [17]:
sentence_tokens = train_df_sub['pre_tokens']

# Initializing the train model
import gensim
from gensim.models import word2vec
print("Word2Vec Training model....")
w2v_model = gensim.models.Word2Vec(sentence_tokens, size=300, window=5, min_count=2, sg = 1,
                                      hs = 0, negative = 10, workers= 32, seed = 34) 


Word2Vec Training model....


In [18]:
def create_embedding_vectors(embedding, vocab):
  vocab_size = len(vocab) + 1
  embedding_vectors = np.zeros((vocab_size, 300))

  for word, i in vocab.items():
    if word in embedding.wv.vocab:
      embedding_vectors[i] = embedding.wv[word]
    else:
      embedding_vectors[i] = np.zeros(300)
  return embedding_vectors

embedding_vectors = create_embedding_vectors(w2v_model, word2idx_1)

Dataloader method is used to prepare batch dataset for training and validation.
Each batch is of size 32 i.e 32 instances in each batch

In [19]:
def data_loader(train_inputs, val_inputs, train_labels, val_labels, batch_size=50):
    
    # Convert data type to torch.Tensor
    train_inputs, val_inputs, train_labels, val_labels = tuple(torch.tensor(data) for data in [train_inputs, val_inputs, train_labels, val_labels])

    # Create DataLoader for training data
    train_data = TensorDataset(train_inputs, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(val_inputs, val_labels)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

In [20]:
# preparing batch dataloader
train_input_sentences_1, val_input_sentences_1, train_labels_1, val_labels_1 = train_test_split(train_input_sentences_1, train_labels_1, test_size=0.1, random_state=42)

train_dataloader_1, val_dataloader_1 = data_loader(train_input_sentences_1, val_input_sentences_1, train_labels_1, val_labels_1, batch_size=32)

Object of CNN class is initialized with 


1.   pretrained_embedding to None - Using embedding layer for word embeddings
2.   freeze_embedding to false - since pretrained_embedding is not used
3.   vocab_size to size of word2idx vocabulary
4.   embed_dim to 100 - each word embedding dimension
5.   filter_sizes - size of kernels
6.   num_classes - 3 (positive, negative, neutral)
7.   dropout - regularization to 0.5

Adadelta optimizer is used with learning rate 0.25 and Cross entropy loss is used for this multi class classification



In [21]:
#pretrained_embedding = torch.FloatTensor(embedding_vectors)
pretrained_embedding = None
epochs = 10
 
cnn_model_1 = CNN(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=False,
                        vocab_size=len(word2idx_1),
                        embed_dim=100,
                        filter_sizes=[2, 4, 6], # bigram, 4-gram, 6-gram
                        num_filters=[100, 100, 100],
                        num_classes=3,
                        dropout=0.5)
    
cnn_model_1.to(device)

optimizer_1 = optim.Adadelta(cnn_model_1.parameters(), lr=0.25, rho=0.95)
loss_fn_1 = nn.CrossEntropyLoss()
#loss_fn_1 = nn.BCELoss()

In [22]:
for epoch in range(epochs):
  cnn_model_1.train()

  train_loss = 0
  train_accuracy = []

  #for every batch
  for i, batch in enumerate(train_dataloader_1):
      sentences, y_original = tuple(sent.to(device) for sent in batch)
      cnn_model_1.zero_grad()

      # feed forward sentence to get the class probabilities
      y_predicted = cnn_model_1(sentences)

      #y_original = y_original.float()

      # loss function is calculated with original labels against the predicted labes
      loss = loss_fn_1(y_predicted, y_original)
      train_loss += loss.item()

      # to find accuracy in each batch
      y_pred_label = torch.argmax(y_predicted, dim=1).flatten()
      correct = 0
      for i in range(len(y_original)):
        if(y_original[i] == y_pred_label[i]):
          correct +=1

      train_accuracy.append(correct/len(y_original))

      # backpropagation of loss
      loss.backward()
      optimizer_1.step()

  print("\nTraining: Epoch {} --> Loss {} | Accuracy {}".format(epoch, train_loss/len(train_dataloader_1), np.mean(train_accuracy)))

  # validation is carried out on the validation dataset
  if val_dataloader_1 is not None:

    cnn_model_1.eval()
    val_accuracy = []
    f1_val_nn = []

    for batch in val_dataloader_1:
        sentences, y_original = tuple(sent.to(device) for sent in batch)

        with torch.no_grad():
            y_predicted = cnn_model_1(sentences)

        y_pred_label = torch.argmax(y_predicted, dim=1).flatten()

        correct = 0
        for i in range(len(y_original)):
          if(y_original[i] == y_pred_label[i]):
            correct +=1

        val_accuracy.append(correct/len(y_original))
        f1_val_nn.append(f1_score(y_original, y_pred_label, average = 'macro'))

    print("Validation: Epoch {} --> Accuracy {} | F1 Score: {}".format(epoch, np.mean(val_accuracy), np.mean(f1_val_nn)))


Training: Epoch 0 --> Loss 0.9717246374318019 | Accuracy 0.5874352331606217
Validation: Epoch 0 --> Accuracy 0.6140237603305785 | F1 Score: 0.25845184498429224

Training: Epoch 1 --> Loss 0.9557980459588797 | Accuracy 0.5942357512953368
Validation: Epoch 1 --> Accuracy 0.6140237603305785 | F1 Score: 0.25845184498429224

Training: Epoch 2 --> Loss 0.9508428826850931 | Accuracy 0.5933182210708117
Validation: Epoch 2 --> Accuracy 0.6140237603305785 | F1 Score: 0.25845184498429224

Training: Epoch 3 --> Loss 0.9436038760323598 | Accuracy 0.5925626079447324
Validation: Epoch 3 --> Accuracy 0.6055010330578512 | F1 Score: 0.2623534682183835

Training: Epoch 4 --> Loss 0.9302946372353351 | Accuracy 0.6094559585492227
Validation: Epoch 4 --> Accuracy 0.6126033057851239 | F1 Score: 0.2580560503027832

Training: Epoch 5 --> Loss 0.9146329260243036 | Accuracy 0.6285082037996547
Validation: Epoch 5 --> Accuracy 0.5854855371900827 | F1 Score: 0.2940398850603484

Training: Epoch 6 --> Loss 0.8962883

In [23]:
# predict() takes the sentence and label as parameters, use the model to predict the label of the sentence
# and returns 1 if it predicts correctly else 0

def predict_1(text, label):
    max_len = 100

    # Tokenize, pad and encode text
    padded_tokens = text + ['<pad>'] * (max_len - len(text))
    input_id = [word2idx_1.get(token) if word2idx_1.get(token)!=None else 1 for token in padded_tokens]

    test_sentence = torch.tensor(input_id).unsqueeze(dim=0)

    y_predicted = cnn_model_1.forward(test_sentence)
    preds = torch.argmax(y_predicted, dim=1).flatten()

    if preds == label:
      pred = 1
    else:
      pred = 0

    return pred

For each of the sentence in test dataset, prediction is carried out and performance measures are displayed

In [24]:
test_accuracy = 0

correct_one = 0
total_one = 0
correct_two = 0
total_two = 0
correct_zero= 0
total_zero = 0
predicted_labels = []
for i in range(len(test_sentences_1)):
  pred = predict_1(test_sentences_1[i], test_labels_1[i])
  predicted_labels.append(pred)
  test_accuracy += pred

  if test_labels_1[i] == 0:
    total_zero += 1
    if pred == 1:
      correct_zero += 1

  if test_labels_1[i] == 1:
    total_one += 1
    if pred == 1:
      correct_one += 1
  
  if test_labels_1[i] == 2:
    total_two += 1
    if pred == 1:
      correct_two += 1

f1 = f1_score(test_labels_1, predicted_labels, average='macro')
print("\nTotal Test sentences: {}".format(len(test_sentences_1)))
print("\nTotal neutral test sentences: {}".format(total_zero))
print("Nuetral test sentences predicted correctly: {}".format(correct_zero))

print("\nTotal positive test sentences: {}".format(total_one))
print("Positive test sentences predicted correctly: {}".format(correct_one))

print("\nTotal negative test sentences: {}".format(total_two))
print("Negative test sentences predicted correctly: {}".format(correct_two))
print("\nTest Accuracy: {}".format(test_accuracy/len(test_sentences_1)))
print("Test F1 Score: {} \n".format(f1))

print(classification_report(test_labels_1, predicted_labels, target_names = labels_task_1))


Total Test sentences: 1840

Total neutral test sentences: 580
Nuetral test sentences predicted correctly: 26

Total positive test sentences: 1089
Positive test sentences predicted correctly: 1019

Total negative test sentences: 171
Negative test sentences predicted correctly: 0

Test Accuracy: 0.5679347826086957
Test F1 Score: 0.5869440799750079 

              precision    recall  f1-score   support

     Neutral       0.70      0.96      0.81       580
    Positive       0.98      0.94      0.96      1089
    Negative       0.00      0.00      0.00       171

    accuracy                           0.85      1840
   macro avg       0.56      0.63      0.59      1840
weighted avg       0.80      0.85      0.82      1840



# Multi Label classification with CNN

In [25]:
train_data = '/content/gdrive/MyDrive/NLP_Project/train_data_processed1.csv'
train_df=pd.read_csv(train_data)
train_df.info()           # explore data frame information 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6830 entries, 0 to 6829
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         6830 non-null   int64 
 1   Unnamed: 0.1       6830 non-null   int64 
 2   image_name         6830 non-null   object
 3   text_ocr           6830 non-null   object
 4   text_corrected     6830 non-null   object
 5   humour             6830 non-null   int64 
 6   sarcasm            6830 non-null   int64 
 7   offensive          6830 non-null   int64 
 8   motivational       6830 non-null   int64 
 9   overall_sentiment  6830 non-null   int64 
 10  processed          6830 non-null   object
 11  tokenized_text     6830 non-null   object
 12  stop_tokens        6830 non-null   object
 13  rem_punct_tokens   6830 non-null   object
 14  pre_tokens         6830 non-null   object
dtypes: int64(7), object(8)
memory usage: 800.5+ KB


In [26]:
# dropping unnecessary columns
train_df = train_df.drop(['Unnamed: 0','image_name','text_ocr','overall_sentiment'], axis=1)  # Drop some features
train_df.head()

Unnamed: 0,Unnamed: 0.1,text_corrected,humour,sarcasm,offensive,motivational,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,"['look', 'friend', 'lightyear', 'sohalikut', '...","['look', 'there', 'my', 'friend', 'lightyear',...","['look', '', '', 'friend', 'lightyear', '', ''...","['look', 'friend', 'lightyear', 'sohalikut', '...","['look', 'friend', 'lightyear', 'sohalikut', '..."
1,1,The best of #10 YearChallenge! Completed in le...,0,1,0,1,"['best', 'yearchallenge', 'complete', 'less', ...","['the', 'best', 'of', 'yearchallenge', 'comple...","['', 'best', '', 'yearchallenge', 'completed',...","['best', 'yearchallenge', 'completed', 'years'...","['best', 'yearchalleng', 'complet', 'year', 'k..."
2,2,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,"['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorn', 'strippin', 'follow', 'follow..."
3,3,10 Year Challenge - Sweet Dee Edition,1,1,1,1,"['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challeng', 'sweet', 'dee', 'edit']"
4,4,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,"['year', 'challenge', 'filter', 'hilarious', '...","['year', 'challenge', 'with', 'no', 'filter', ...","['year', 'challenge', '', '', 'filter', 'hilar...","['year', 'challenge', 'filter', 'hilarious', '...","['year', 'challeng', 'filter', 'hilari', 'year..."


In [27]:
'''train_df.humour[train_df['humour']!='not_funny']=1
train_df.humour[train_df['humour']=='not_funny']=0
train_df.sarcasm[train_df['sarcasm']!='not_sarcastic']=1
train_df.sarcasm[train_df['sarcasm']=='not_sarcastic']=0
train_df.offensive[train_df['offensive']!='not_offensive']=1
train_df.offensive[train_df['offensive']=='not_offensive']=0
train_df.motivational[train_df['motivational']!='not_motivational']=1
train_df.motivational[train_df['motivational']=='not_motivational']=0

train_df'''

"train_df.humour[train_df['humour']!='not_funny']=1\ntrain_df.humour[train_df['humour']=='not_funny']=0\ntrain_df.sarcasm[train_df['sarcasm']!='not_sarcastic']=1\ntrain_df.sarcasm[train_df['sarcasm']=='not_sarcastic']=0\ntrain_df.offensive[train_df['offensive']!='not_offensive']=1\ntrain_df.offensive[train_df['offensive']=='not_offensive']=0\ntrain_df.motivational[train_df['motivational']!='not_motivational']=1\ntrain_df.motivational[train_df['motivational']=='not_motivational']=0\n\ntrain_df"

In [28]:
print(train_df['humour'].value_counts())
print(train_df['sarcasm'].value_counts())
print(train_df['offensive'].value_counts())
print(train_df['motivational'].value_counts())

1    5212
0    1618
Name: humour, dtype: int64
1    5314
0    1516
Name: sarcasm, dtype: int64
1    4173
0    2657
Name: offensive, dtype: int64
0    4421
1    2409
Name: motivational, dtype: int64


In [29]:
class CNN_2(nn.Module):
    '''
      pretrained_embedding - None or pretrained_embedding
                              if none, embedding layer is used create a lookup table to store word embeddings
                              else pretrained word embeddings are used in further layers
      freeze_embedding - set to true when pretrained embeddings are used to freeze training of word embeddings
      vocab_size - size of vocabulary
      filter_sizes - set of filters with size (typically represent n-gram)
      num_classes - the number of output labels
      dropout - for regularization during training phase
    '''
    def __init__(self, pretrained_embedding, freeze_embedding, vocab_size, embed_dim, filter_sizes, num_filters, num_classes, dropout):
        
        super(CNN_2, self).__init__()
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding, padding_idx=0)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=0, max_norm=5.0)
        
        # list of 1 dimensional convolutionlayer for each of filters with input size of embedding dimension
        # and output size as number of filters of considered filter size
        self.conv1d_list = nn.ModuleList([
              nn.Conv1d(in_channels=self.embed_dim,
                        out_channels=num_filters[i],
                        kernel_size=filter_sizes[i])
            for i in range(len(filter_sizes))
        ])

        self.fc = nn.Linear(np.sum(num_filters), num_classes)
        self.dropout = nn.Dropout(p=dropout)
        self.sigmoid = nn.Sigmoid()

    '''
      word embeddings of a sentence are identified from embedding layer
      convolution is performed on the embeddings and features and patterns are identified with the help of kernel
      max_pooling is used to identify the max feature from every resultant filter
      all the concatenated max features are fed into the linear layer and sigmoid activation is applied on output
    '''
    def forward(self, input_ids):
        x_embed = self.embedding(input_ids).float()
        x_reshaped = x_embed.permute(0, 2, 1)
        x_conv_list = [F.relu(conv1d(x_reshaped)) for conv1d in self.conv1d_list]
        x_pool_list = [F.max_pool1d(x_conv, kernel_size=x_conv.shape[2]) for x_conv in x_conv_list]
        
        x_fc = torch.cat([x_pool.squeeze(dim=2) for x_pool in x_pool_list], dim=1)
        x = self.fc(self.dropout(x_fc))
        x = self.sigmoid(x)

        return x

In [30]:
labels_task_2 = ['Humour', 'Sarcasm', 'Offensive', 'Motivational']
test_df_sub = pd.merge(test_df[['Unnamed: 0','pre_tokens']], true_df[['Unnamed: 0','Labels']], on='Unnamed: 0')

test_df_sub['Individual Labels'] = test_df_sub['Labels'].apply(lambda label: label.split('_')[1])
test_df_sub = test_df_sub.drop(columns = ['Unnamed: 0', 'Labels'])

separate_labels = test_df_sub['Individual Labels'].apply(lambda x: pd.Series(list(x)))

labels = ['humour', 'sarcasm', 'offensive', 'motivational']

for i in range(4):
  test_df_sub[labels[i]] = separate_labels[i]

test_sentences_2 = test_df_sub['pre_tokens'].to_list()
test_labels_2 = test_df_sub[['humour', 'sarcasm', 'offensive', 'motivational']].to_numpy()
print(len(test_sentences_2), len(test_labels_2))

1840 1840


Using word2idx and encode methods, the training and test sentences are encoded

In [31]:
print("Building Word --> indices")

train_sentences_2 = train_df_sub['pre_tokens'].to_list()
train_labels_2 = train_df[['humour', 'sarcasm', 'offensive', 'motivational']].to_numpy()
print(len(train_sentences_2), len(train_labels_2))

word2idx_2, max_len = create_word2idx(train_sentences_2)

print("\nEncoding sentences")
train_input_sentences_2 = encode(train_sentences_2, word2idx_2, max_len)

print(train_input_sentences_2)

print(train_labels_2)

Building Word --> indices
6830 6830

Encoding sentences
[[   2    3    4 ...    0    0    0]
 [  13   14   15 ...    0    0    0]
 [  22   23   24 ...    0    0    0]
 ...
 [ 252  554 9297 ...    0    0    0]
 [ 168 6542  168 ...    0    0    0]
 [ 241 1479  177 ...    0    0    0]]
[[1 1 0 0]
 [0 1 0 1]
 [1 0 0 0]
 ...
 [1 1 1 0]
 [0 1 0 1]
 [0 0 0 1]]


Object of CNN_2 class is initialized with 


1.   pretrained_embedding to None - Using embedding layer for word embeddings
2.   freeze_embedding to false - since pretrained_embedding is not used
3.   vocab_size to size of word2idx vocabulary
4.   embed_dim to 100 - each word embedding dimension
5.   filter_sizes - size of kernels
6.   num_classes - 4 (humour, sarcasm, offensive and motivational)
7.   dropout - regularization to 0.5

Adam optimizer is used with learning rate 0.01 and Binary Cross entropy loss is used for this multi label classification

In [32]:
#pretrained_embedding = torch.FloatTensor(embedding_vectors)
pretrained_embedding = None
epochs = 10

cnn_model_2 = CNN_2(pretrained_embedding=pretrained_embedding,
                        freeze_embedding=False,
                        vocab_size=len(word2idx_2),
                        embed_dim=100,
                        filter_sizes=[2, 4, 6],
                        num_filters=[50, 50, 50],
                        num_classes=4,
                        dropout=0.5)
    
cnn_model_2.to(device)

optimizer_2 = optim.Adam(cnn_model_2.parameters(), lr=0.01)
loss_fn_2 = nn.BCELoss()

Training labels are converted to tensors and then
Dataloader method is used to prepare batch dataset for training and validation.
Each batch is of size 32 i.e 32 instances in each batch

In [33]:
train_labels_tensor_2 = []
for labels in train_labels_2:
  labels_tensor = []
  for label in labels:
    label_tensor = torch.tensor(label)
    labels_tensor.append(label_tensor)
  train_labels_tensor_2.append(labels_tensor)

train_input_sentences_2, val_input_sentences_2, train_labels_2, val_labels_2 = train_test_split(train_input_sentences_2, train_labels_tensor_2, test_size=0.1, random_state=42)

train_dataloader_2, val_dataloader_2 = data_loader(train_input_sentences_2, val_input_sentences_2, train_labels_2, val_labels_2, batch_size=32)

In [34]:
print("\nValidation accuracy here is displayed as")
print("only the total number of instances where the model predicts all the labels correctly")

for epoch in range(epochs):
  cnn_model_2.train()

  train_loss = 0
  validation_loss = 0
  train_accuracy = []

  # for every batch
  for i, batch in enumerate(train_dataloader_2):
      sentences, y_original = tuple(sent.to(device) for sent in batch)
      y_original = y_original.float()
      cnn_model_2.zero_grad()

      # feed forward sentence to calculate probabilities of all labels
      y_predicted = cnn_model_2(sentences)

      # loss function is calculated with original labels against the predicted labes
      loss = loss_fn_2(y_predicted, y_original)
      train_loss += loss.item()

      correct = 0
      for i in range(len(y_original)):
        y_correct = torch.count_nonzero(y_original[i] == y_predicted[i])
        y_correct_int = torch.IntTensor.item(y_correct)
        if y_correct_int == 4:
          correct +=1

      train_accuracy.append(correct/len(y_original))

      # loss is backpropagated
      loss.backward()
      optimizer_2.step()

  print("\nTraining: Epoch {} --> Loss {}".format(epoch, train_loss/len(train_dataloader_2)))

  # validation is carried out on validation dataset
  if val_dataloader_2 is not None:

    cnn_model_2.eval()
    val_accuracy = []
    f1_val_nn2 = []

    for batch in val_dataloader_2:
        sentences, y_original = tuple(sent.to(device) for sent in batch)
        y_original = y_original.float()

        with torch.no_grad():
            y_predicted = cnn_model_2(sentences)

        for i in range(len(y_predicted)):
          y_predicted[i] = (y_predicted[i]>0.5).float()
        validation_loss += loss_fn_2(y_predicted, y_original).item()

        correct = 0
        for i in range(len(y_original)):
          y_correct = torch.count_nonzero(y_original[i] == y_predicted[i])
          y_correct_int = torch.IntTensor.item(y_correct)
          if y_correct_int == 4:
            correct +=1

        val_accuracy.append(correct/len(y_original))
        f1_val_nn2.append(f1_score(y_original, y_predicted, average = 'macro'))

    print("Validation: Epoch {} --> Accuracy {} | F1 Score: {}".format(epoch, np.mean(val_accuracy), np.mean(f1_val_nn2)))
    #print("\nTraining: Epoch {} --> Loss {}".format(epoch, validation_loss/len(val_dataloader_2)))


Validation accuracy here is displayed as
only the total number of instances where the model predicts all the labels correctly

Training: Epoch 0 --> Loss 0.6557005822967371
Validation: Epoch 0 --> Accuracy 0.2168130165289256 | F1 Score: 0.6209893646365395

Training: Epoch 1 --> Loss 0.6240985461467289
Validation: Epoch 1 --> Accuracy 0.2055785123966942 | F1 Score: 0.6100827070425178

Training: Epoch 2 --> Loss 0.5388907726873388
Validation: Epoch 2 --> Accuracy 0.1958935950413223 | F1 Score: 0.6280365951475714

Training: Epoch 3 --> Loss 0.4394507664472946
Validation: Epoch 3 --> Accuracy 0.18879132231404958 | F1 Score: 0.6532169939918597

Training: Epoch 4 --> Loss 0.35521297774475474
Validation: Epoch 4 --> Accuracy 0.15896177685950413 | F1 Score: 0.5983081012314552

Training: Epoch 5 --> Loss 0.31134707901453107
Validation: Epoch 5 --> Accuracy 0.1518595041322314 | F1 Score: 0.6303541440682076

Training: Epoch 6 --> Loss 0.2834907699897499
Validation: Epoch 6 --> Accuracy 0.1676136

In [35]:
'''
  predict() takes the sentence and label as parameters, use the model to predict the probabilites of each label
  since sigmoid function converts the value between 0 and 1, if the predicted probability is greater than 0.5,
  the label is associated to the sentence else not
'''
def predict_2(text, label):

    max_len = 100
    padded_tokens = text + ['<pad>'] * (max_len - len(text))
    sentence = [word2idx_2.get(token) if word2idx_2.get(token)!=None else 1 for token in padded_tokens]

    sentence = torch.tensor(sentence).unsqueeze(dim=0)

    y_predicted = cnn_model_2.forward(sentence)
    y_predicted = (y_predicted>0.5).float()
    
    label = torch.tensor(label)
    y_correct = torch.count_nonzero(label == y_predicted)
    y_correct_bool = label == y_predicted
    y_correct_int = torch.IntTensor.item(y_correct)

    return y_predicted, y_correct_bool, y_correct, y_correct_int

Prediction is carried out for all the test sentences and the performance measures are displayed

In [36]:
test_full_accuracy = 0
one_correct = 0
two_correct = 0
three_correct = 0
test_full_incorrect = 0
humour = 0
sarcasm = 0
offensive = 0
motivational = 0
predicted_labels_2 = []
test_labels_int_2 = []

for i in range(len(test_sentences_2)):
  test_labels_int_2.append(list(map(int, test_labels_2[i])))
  test_labels_tensor = []
  for label in test_labels_2[i]:
    test_labels_tensor.append(torch.tensor(int(label)))
  y_predicted, correct_bool, correct_tensor, correct_int = predict_2(test_sentences_2[i], test_labels_tensor)

  predicted_labels_2.append(y_predicted[0])
  if correct_int == 4:
    test_full_accuracy += 1
  elif correct_int == 3:
    three_correct += 1
  elif correct_int == 2:
    two_correct += 1
  elif correct_int == 1:
    one_correct += 1
  else:
    test_full_incorrect += 1

  if correct_bool[0][0]:
    humour += 1
  if correct_bool[0][1]:
    sarcasm += 1
  if correct_bool[0][2]:
    offensive += 1
  if correct_bool[0][3]:
    motivational += 1

pred_labels_2 = []
for pred in predicted_labels_2:
  pred_labels_2.append(pred.cpu().detach().numpy().tolist())

f1_2 = f1_score(test_labels_int_2, pred_labels_2, average='macro')
print("\nTotal test instances: {}".format(len(test_sentences_2)))
print("All correct: {}".format(test_full_accuracy))
print("Utmost 3 correct: {}".format(three_correct))
print("Utmost 2 correct: {}".format(two_correct))
print("Only 1 correct: {}".format(one_correct))
print("All incorrect: {}".format(test_full_incorrect))

print("\nAccuracies Label Wise")
print("humour: {}".format(humour/len(test_sentences_2)))
print("Sarcasm: {}".format(sarcasm/len(test_sentences_2)))
print("Offensive: {}".format(offensive/len(test_sentences_2)))
print("Motivational: {} \n".format(motivational/len(test_sentences_2)))
print(classification_report(test_labels_int_2, pred_labels_2, target_names=labels_task_2))
print("\nF1 Score with macro average: {}".format(f1_2))


Total test instances: 1840
All correct: 287
Utmost 3 correct: 646
Utmost 2 correct: 552
Only 1 correct: 281
All incorrect: 74

Accuracies Label Wise
humour: 0.6657608695652174
Sarcasm: 0.7146739130434783
Offensive: 0.5451086956521739
Motivational: 0.5043478260869565 

              precision    recall  f1-score   support

      Humour       0.76      0.82      0.79      1402
     Sarcasm       0.77      0.89      0.83      1424
   Offensive       0.61      0.74      0.67      1146
Motivational       0.37      0.50      0.42       678

   micro avg       0.66      0.77      0.71      4650
   macro avg       0.63      0.74      0.68      4650
weighted avg       0.67      0.77      0.72      4650
 samples avg       0.66      0.74      0.66      4650


F1 Score with macro average: 0.6778615200702238
