In [1]:
#Importing all the required libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)

from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score

import warnings
warnings.filterwarnings("ignore")



In [2]:
#Downloading NLTK libraries
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# Use cuda if present
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device available for running: ")
print(device)

Device available for running: 
cpu


In [4]:
#Root folder for data files (for storage and retrieval)
GDRIVE_PROJECT_FOLDER = '/content/gdrive/MyDrive/NLP_Project/'

In [5]:
#Mount the google drive to access data files
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [6]:
#Path variables for the train and test data files
train_data = GDRIVE_PROJECT_FOLDER+'train_data_processed.csv'
test_data = GDRIVE_PROJECT_FOLDER+'test_data_processed.csv'
test_true = GDRIVE_PROJECT_FOLDER+'Test_Actual_Final.csv'

In [7]:
#Dataset containing the meme ground truth 
true_df = pd.read_csv(test_true)
true_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100
1,1,dr_evil_NDBB96K.png,1_0100_0200
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000


In [8]:
#Extracting the first digit (1, 0 , -1) from Labels 
true_df['Sentiment'] = true_df['Labels'].str.split('_').str[0]
true_df['Sentiment'] = true_df['Sentiment'].astype(int)
true_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels,Sentiment
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100,1
1,1,dr_evil_NDBB96K.png,1_0100_0200,1
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120,1
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121,0
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000,0


In [9]:
#Dataset containing the Train data
train_df = pd.read_csv(train_data, converters={'pre_tokens': eval, 'processed': eval})
train_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"[look, friend, lightyear, sohalikut, trend, pl...","['look', 'there', 'my', 'friend', 'lightyear',...","['look', '', '', 'friend', 'lightyear', '', ''...","['look', 'friend', 'lightyear', 'sohalikut', '...","[look, friend, lightyear, sohalikut, trend, pl..."
1,1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"[best, yearchallenge, complete, less, year, ku...","['the', 'best', 'of', 'yearchallenge', 'comple...","['', 'best', '', 'yearchallenge', 'completed',...","['best', 'yearchallenge', 'completed', 'years'...","[best, yearchalleng, complet, year, kudu, nare..."
2,2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"[sam, thorne, strippin, follow, follow, saw, e...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","[sam, thorn, strippin, follow, follow, saw, po..."
3,3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"[year, challenge, sweet, dee, edition]","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","[year, challeng, sweet, dee, edit]"
4,4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"[year, challenge, filter, hilarious, year, cha...","['year', 'challenge', 'with', 'no', 'filter', ...","['year', 'challenge', '', '', 'filter', 'hilar...","['year', 'challenge', 'filter', 'hilarious', '...","[year, challeng, filter, hilari, year, challen..."


In [10]:
#Checking the class labels balance in training dataset for task 1 (identifying meme as positive/negative/neutral - (1/-1/0))
train_df['overall_sentiment'].value_counts()

 1    4058
 0    2157
-1     615
Name: overall_sentiment, dtype: int64

In [11]:
#Dataset containing the processed text of test data
test_df = pd.read_csv(test_data, converters={'pre_tokens': eval, 'processed': eval})
test_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Image_name,Image_URL,OCR_extracted_text,corrected_text,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,Some magicians can walk on water Chuck Norris...,"[magician, walk, water, chuck, norris, swim, l...","['some', 'magicians', 'can', 'walk', 'on', 'wa...","['', 'magicians', '', 'walk', '', 'water', 'ch...","['magicians', 'walk', 'water', 'chuck', 'norri...","[magician, walk, water, chuck, norri, swim, land]"
1,1,1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,ONE MILLION DOLLARS made on imgur,"[one, million, dollar, make, imgur]","['one', 'million', 'dollars', 'made', 'on', 'i...","['', 'million', 'dollars', '', '', 'imgur']","['million', 'dollars', 'imgur']","[million, dollar, imgur]"
2,2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,Me: Mom can my friend sleep over? Mom: That's ...,"[mom, friend, sleep, mom, fine, boy, growingup...","['me', 'mom', 'can', 'my', 'friend', 'sleep', ...","['', 'mom', '', '', 'friend', 'sleep', '', 'mo...","['mom', 'friend', 'sleep', 'mom', 'fine', 'boy...","[mom, friend, sleep, mom, fine, boi, growingup..."
3,3,3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,"[guy, inherit, mess, whine, foxed, thing, guy,...","['this', 'guy', 'inherited', 'mess', 'did', 'h...","['', 'guy', 'inherited', 'mess', '', '', 'whin...","['guy', 'inherited', 'mess', 'whine', 'foxed',...","[gui, inherit, mess, whine, fox, thing, gui, f..."
4,4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,THREAT: Kim Jong Un allegedly working on multi...,"[threat, kim, jong, un, allegedly, work, multi...","['threat', 'kim', 'jong', 'un', 'allegedly', '...","['threat', 'kim', 'jong', '', 'allegedly', 'wo...","['threat', 'kim', 'jong', 'allegedly', 'workin...","[threat, kim, jong, allegedli, work, multipl, ..."


In [12]:
#Creating a duplicate copy of train dataframe, so that modifications can be done in copy df if needed
train_df_sub = train_df
train_df_sub.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed,tokenized_text,stop_tokens,rem_punct_tokens,pre_tokens
0,0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"[look, friend, lightyear, sohalikut, trend, pl...","['look', 'there', 'my', 'friend', 'lightyear',...","['look', '', '', 'friend', 'lightyear', '', ''...","['look', 'friend', 'lightyear', 'sohalikut', '...","[look, friend, lightyear, sohalikut, trend, pl..."
1,1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"[best, yearchallenge, complete, less, year, ku...","['the', 'best', 'of', 'yearchallenge', 'comple...","['', 'best', '', 'yearchallenge', 'completed',...","['best', 'yearchallenge', 'completed', 'years'...","[best, yearchalleng, complet, year, kudu, nare..."
2,2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"[sam, thorne, strippin, follow, follow, saw, e...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","['sam', 'thorne', 'strippin', 'follow', 'follo...","[sam, thorn, strippin, follow, follow, saw, po..."
3,3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"[year, challenge, sweet, dee, edition]","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","['year', 'challenge', 'sweet', 'dee', 'edition']","[year, challeng, sweet, dee, edit]"
4,4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"[year, challenge, filter, hilarious, year, cha...","['year', 'challenge', 'with', 'no', 'filter', ...","['year', 'challenge', '', '', 'filter', 'hilar...","['year', 'challenge', 'filter', 'hilarious', '...","[year, challeng, filter, hilari, year, challen..."


# Handling class imbalance using Random Oversampling and SMOTE techniques
# with Logistic Regression model

In [13]:
'''dummy_param takes a word as parameter and returns the same word
   This function is useful when the sentences are already tokenized
   and these tokenized sentences are given as input to 
   CountVectorizer() and TfidfVectorizer() for creating BOW and TF-IDF'''
def dummy_param(doc):
    return doc

'''since the labels and sentences of test set are in different files (dataframes)
   both these dataframes are merged to form test dataset'''
test_df_sub = pd.merge(test_df[['Unnamed: 0','processed']], true_df[['Unnamed: 0','Sentiment']], on='Unnamed: 0')
test_df_sub.rename(columns = {"Sentiment": "sentiment"}, inplace=True)

'''converting text to word count vectors - BOW
   tokenizer and preprocessor - are set to callable function which takes a word and return the same word
                                since the sentences are already tokenized, parameters  
   ngrame_range - (1,3) considers unigram, bigram and trigram'''
cv = CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3), tokenizer = dummy_param, analyzer="word", preprocessor=dummy_param)
cv_train = cv.fit_transform(train_df_sub['processed'])
cv_test = cv.transform(test_df_sub['processed'])

'''Convert a collection of raw documents to a matrix of TF-IDF features
   tokenizer and preprocessor - are set to callable function which takes a word and return the same word
                                since the sentences are already tokenized, parameters  
   ngrame_range - (1,3) considers unigram, bigram and trigram'''
training_text = pd.Series([sent for sent in train_df_sub['pre_tokens']])
training_target = pd.Series([label for label in train_df_sub['overall_sentiment']])
tv = TfidfVectorizer(stop_words=None, max_features=100000, ngram_range=(1, 3), analyzer='word', tokenizer=dummy_param, preprocessor=dummy_param)
training_tfidf = tv.fit_transform(training_text)

#preparing testing dataset from the test dataframe
testing_text = pd.Series([sent for sent in test_df_sub['processed']])
testing_target = pd.Series([label for label in test_df_sub['sentiment']])
testing_tfidf = tv.transform(testing_text)

#creating an object for RandomOverSampler for random oversampling
ros = RandomOverSampler(random_state=777)

#creating an object for SMOTE for SMOTE oversampling
smt = SMOTE(random_state=777, k_neighbors=5)


In [14]:
samplings = {'No Sampling':'no_sampling', 'ROS':ros, 'SMOTE':smt}
representations = {'TFIDF':training_tfidf, 'BOW':cv_train}

#this loop runs for 3 times for each of - No Sampling, ROS, SMOTE
for sampling_key in samplings:
  sampling = samplings[sampling_key]

  #this loop runs twice for each of the above sampling technique with tfidf features and bow separately
  for representation_key in representations:
    representation = representations[representation_key]

    #stratified k fold is used to have balanced number of labels in each of the folds
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []

    X = representation
    Y = training_target

    average_method = 'macro'
    model = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

    #for each of k folds, model is trained with train fold and tested with test fold
    #precision, recall, f1 and accuracy are performance measures
    for train, test in kfold.split(X, Y):

        #to decide whether to perform oversampling for train dataset or not
        if sampling =='no_sampling':
          X_train = representation
          y_train = training_target
        else:
          X_train, y_train = sampling.fit_sample(X[train], Y[train])
          
        model.fit(X_train, y_train)
        prediction = model.predict(X[test])
        scores = model.score(X[test],Y[test])
        
        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average=average_method)*100)
        recall.append(recall_score(Y[test], prediction, average=average_method)*100)
        f1.append(f1_score(Y[test], prediction, average=average_method)*100)

    print("Representation: {} | Sampling: {}".format(representation_key, sampling_key))
    print("\nTrain dataset")
    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

    #to select the feature extraction method for test set
    print("\nTest dataset")
    if representation_key == 'TFIDF':
      X_test = testing_tfidf
    else:
      X_test = cv_test
    y_test = testing_target

    prediction = model.predict(X_test)
    scores = model.score(X_test,y_test)
    print("Accuracy: %.2f%%" % (scores * 100))
    print("precision: %.2f%%" % (precision_score(y_test, prediction, average=average_method)*100))
    print("recall: %.2f%%" % (recall_score(y_test, prediction, average=average_method)*100))
    print("f1 score: %.2f%%" % (f1_score(y_test, prediction, average=average_method)*100))
    print("-"*100)

Representation: TFIDF | Sampling: No Sampling

Train dataset
accuracy: 83.56% (+/- 0.33%)
precision: 59.21% (+/- 0.15%)
recall: 58.86% (+/- 0.37%)
f1 score: 58.13% (+/- 0.31%)

Test dataset
Accuracy: 57.17%
precision: 30.19%
recall: 33.33%
f1 score: 28.02%
----------------------------------------------------------------------------------------------------
Representation: BOW | Sampling: No Sampling

Train dataset
accuracy: 95.29% (+/- 0.78%)
precision: 97.55% (+/- 0.37%)
recall: 90.57% (+/- 1.37%)
f1 score: 93.60% (+/- 1.00%)

Test dataset
Accuracy: 57.93%
precision: 35.72%
recall: 33.28%
f1 score: 26.61%
----------------------------------------------------------------------------------------------------
Representation: TFIDF | Sampling: ROS

Train dataset
accuracy: 54.66% (+/- 0.90%)
precision: 36.12% (+/- 3.01%)
recall: 34.45% (+/- 0.82%)
f1 score: 32.77% (+/- 1.30%)

Test dataset
Accuracy: 48.53%
precision: 32.78%
recall: 32.85%
f1 score: 32.48%
-------------------------------------