# Preparing the datasets for BERT

In [None]:
#Load the libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from textblob import Word
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix,classification_report
from sklearn.metrics import recall_score,precision_score,precision_recall_fscore_support

In [None]:
# Mount Google Drive inside the Colab runtime so the CSV files referenced
# below (all under /content/gdrive) can be read.
# force_remount=True is passed so the mount is redone even when the cell is re-run.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
# CSV loading helper plus the Drive locations of the three input files
def read_corpus(file):
  """Read the CSV at `file` (a path or file-like object) into a DataFrame."""
  corpus = pd.read_csv(file)
  return corpus

# Pre-processed training text, pre-processed test text, and test ground truth
train_data = '/content/gdrive/MyDrive/NLP_Project/train_data_processed.csv'
test_data = '/content/gdrive/MyDrive/NLP_Project/test_data_processed.csv'
test_true = '/content/gdrive/MyDrive/NLP_Project/Test_Actual_Final.csv'

In [None]:
# Ground-truth labels for the test memes
true_df = read_corpus(test_true)

# `Labels` is an underscore-separated string (e.g. '1_1100_1100'); the piece
# before the first '_' is the overall sentiment (1, 0 or -1). Store it as an
# integer column.
sentiment_prefix = true_df['Labels'].str.split('_').str[0]
true_df['Sentiment'] = sentiment_prefix.astype(int)
true_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels,Sentiment
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100,1
1,1,dr_evil_NDBB96K.png,1_0100_0200,1
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120,1
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121,0
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000,0


In [None]:
#Dataset containing the Train data
# Read from the CSV path stored in `train_data`; head() previews the first rows.
train_df = read_corpus(train_data)

train_df.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive,"['look', 'friend', 'lightyear', 'sohalikut', '..."
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive,"['best', 'yearchallenge', 'complete', 'less', ..."
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive,"['sam', 'thorne', 'strippin', 'follow', 'follo..."
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive,"['year', 'challenge', 'sweet', 'dee', 'edition']"
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral,"['year', 'challenge', 'filter', 'hilarious', '..."


In [None]:
# Class distribution of the raw `overall_sentiment` labels (still strings at this point)
train_df['overall_sentiment'].value_counts()

positive         3057
neutral          2157
very_positive    1001
negative          469
very_negative     146
Name: overall_sentiment, dtype: int64

In [None]:
#converting categorical to numerical
# Collapse the five sentiment levels into three integer classes:
# negative (-1), neutral (0) and positive (1).
SENTIMENT_TO_INT = {'very_negative': -1, 'negative': -1, 'neutral': 0, 'positive': 1, 'very_positive': 1}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: in-place mutation through `df[col]` is chained assignment,
# which pandas deprecates and which does not update the frame under Copy-on-Write.
train_df['overall_sentiment'] = train_df['overall_sentiment'].replace(SENTIMENT_TO_INT)
train_df['overall_sentiment'].value_counts()

 1    4058
 0    2157
-1     615
Name: overall_sentiment, dtype: int64

In [None]:
# Class distribution of each auxiliary label column, printed one after another
for aux_column in ('humour', 'sarcasm', 'offensive', 'motivational'):
  print(train_df[aux_column].value_counts())

funny         2394
very_funny    2176
not_funny     1618
hilarious      642
Name: humour, dtype: int64
general            3430
not_sarcastic      1516
twisted_meaning    1499
very_twisted        385
Name: sarcasm, dtype: int64
not_offensive        2657
slight               2536
very_offensive       1424
hateful_offensive     213
Name: offensive, dtype: int64
not_motivational    4421
motivational        2409
Name: motivational, dtype: int64


In [None]:
# Binarise the four auxiliary labels: the "not_*" class becomes 0 and every
# other intensity level becomes 1.
NEGATIVE_CLASS = {
    'humour': 'not_funny',
    'sarcasm': 'not_sarcastic',
    'offensive': 'not_offensive',
    'motivational': 'not_motivational',
}

for aux_column, negative_label in NEGATIVE_CLASS.items():
  # Whole-column assignment replaces the chained `df.col[mask] = value` form,
  # which raises SettingWithCopyWarning and is not guaranteed to write through.
  # Treating an existing 0 as negative keeps the cell idempotent: re-running it
  # on already-binarised data leaves the values unchanged instead of turning
  # every row into 1.
  is_negative = train_df[aux_column].isin([negative_label, 0])
  train_df[aux_column] = (~is_negative).astype(int)

In [None]:
# Preview the frame after the label columns were converted to integers
train_df.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"['look', 'friend', 'lightyear', 'sohalikut', '..."
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"['best', 'yearchallenge', 'complete', 'less', ..."
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"['sam', 'thorne', 'strippin', 'follow', 'follo..."
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"['year', 'challenge', 'sweet', 'dee', 'edition']"
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"['year', 'challenge', 'filter', 'hilarious', '..."


In [None]:
# Class balance of each auxiliary label after binarisation
for aux_column in ('humour', 'sarcasm', 'offensive', 'motivational'):
  print(train_df[aux_column].value_counts())

1    5212
0    1618
Name: humour, dtype: int64
1    5314
0    1516
Name: sarcasm, dtype: int64
1    4173
0    2657
Name: offensive, dtype: int64
0    4421
1    2409
Name: motivational, dtype: int64


In [None]:
#Dataset containing the processed text of test data
# Read from the CSV path stored in `test_data`; head() previews the first rows.

test_df = read_corpus(test_data)
test_df.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Image_URL,OCR_extracted_text,corrected_text,processed
0,0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,Some magicians can walk on water Chuck Norris...,"['magician', 'walk', 'water', 'chuck', 'norris..."
1,1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,ONE MILLION DOLLARS made on imgur,"['one', 'million', 'dollar', 'make', 'imgur']"
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,Me: Mom can my friend sleep over? Mom: That's ...,"['mom', 'friend', 'sleep', 'mom', 'fine', 'boy..."
3,3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,"['guy', 'inherit', 'mess', 'whine', 'foxed', '..."
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,THREAT: Kim Jong Un allegedly working on multi...,"['threat', 'kim', 'jong', 'un', 'allegedly', '..."


# Task 1: overall sentiment classification (negative / neutral / positive) using BERT

In [None]:
pip install transformers

Collecting transformers
  Downloading transformers-4.9.1-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.3 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 57.7 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 55.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.5 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf
import pandas as pd
import numpy as np

In [None]:
# Work on an independent copy: a plain `train_df_bert = train_df` is only an
# alias, so the BERT-specific relabelling of the sentiment column further down
# would also rewrite `train_df`.
train_df_bert = train_df.copy()
train_df_bert.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment,processed
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,1,1,0,0,1,"['look', 'friend', 'lightyear', 'sohalikut', '..."
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,0,1,0,1,1,"['best', 'yearchallenge', 'complete', 'less', ..."
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,1,0,0,0,1,"['sam', 'thorne', 'strippin', 'follow', 'follo..."
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,1,1,1,1,1,"['year', 'challenge', 'sweet', 'dee', 'edition']"
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,1,1,1,0,0,"['year', 'challenge', 'filter', 'hilarious', '..."


In [None]:
# Sentiment class distribution before remapping the -1 class for BERT
train_df_bert['overall_sentiment'].value_counts()

 1    4058
 0    2157
-1     615
Name: overall_sentiment, dtype: int64

In [None]:
# The classifier head expects class ids in 0..num_labels-1, so the negative
# class (-1) is re-encoded as 2. Predictions with id 2 must be mapped back to
# -1 before they are compared with the ground truth.
# Assign the result back rather than using replace(..., inplace=True) on the
# column selection, which is chained in-place mutation and does not update the
# frame under pandas Copy-on-Write.
train_df_bert['overall_sentiment'] = train_df_bert['overall_sentiment'].replace({-1: 2})
train_df_bert['overall_sentiment'].value_counts()

1    4058
0    2157
2     615
Name: overall_sentiment, dtype: int64

In [None]:
# NOTE: plain assignment, so `test_df_bert` is another name for the same
# DataFrame object as `test_df` (no copy is made).
test_df_bert = test_df
test_df_bert.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Image_URL,OCR_extracted_text,corrected_text,processed
0,0,chuck_chuck_norris_meme_10.jpg,https://gtmemes.com/wp-content/uploads/2019/03...,Some magicians can walk on water Chuck Norris...,Some magicians can walk on water Chuck Norris...,"['magician', 'walk', 'water', 'chuck', 'norris..."
1,1,dr_evil_NDBB96K.png,https://i.imgur.com/NDBB96K.png,ONE MILLION DOLLARS made on imgur,ONE MILLION DOLLARS made on imgur,"['one', 'million', 'dollar', 'make', 'imgur']"
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,https://media0ch-a.akamaihd.net/83/96/9e457d63...,Me: Mom can my friend sleep over? Mom: That's ...,Me: Mom can my friend sleep over? Mom: That's ...,"['mom', 'friend', 'sleep', 'mom', 'fine', 'boy..."
3,3,obama_2691536739_469698809820026_263513986_n.jpg,http://politicalmemes.com/wp-content/uploads/2...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,THIS GUY INHERITED A MESS. DID HE WHINE ABOUT ...,"['guy', 'inherit', 'mess', 'whine', 'foxed', '..."
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,https://pics.me.me/threat-kim-jong-un-allegedl...,THREAT: Kim Jong Un allegedly working on multi...,THREAT: Kim Jong Un allegedly working on multi...,"['threat', 'kim', 'jong', 'un', 'allegedly', '..."


In [None]:
# NOTE: plain assignment, so `true_df_bert` is another name for the same
# DataFrame object as `true_df` (no copy is made).
true_df_bert = true_df
true_df_bert.head()

Unnamed: 0.1,Unnamed: 0,Image_name,Labels,Sentiment
0,0,chuck_chuck_norris_meme_10.jpg,1_1100_1100,1
1,1,dr_evil_NDBB96K.png,1_0100_0200,1
2,2,misog_2109e457d636565e2e06dce39874c5231e1.jpg,1_1110_1120,1
3,3,obama_2691536739_469698809820026_263513986_n.jpg,0_1111_1121,0
4,4,kim_threat-kim-jong-un-allegedly-working-on-mu...,0_0000_0000,0


In [None]:
# Raw OCR text is the model input; the 3-class sentiment id is the target.
text = train_df_bert['text_ocr'].values.tolist()
labels = train_df_bert['overall_sentiment'].tolist()

from sklearn.model_selection import train_test_split
# Hold out 10% for validation.
# random_state makes the split (and every number reported below) reproducible;
# stratify keeps the heavily imbalanced class proportions identical in both
# parts so the small negative class is present in the validation set.
training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(
    text, labels, test_size=0.1, random_state=42, stratify=labels)

In [None]:
# Load the pretrained WordPiece tokenizer that matches the "bert-base-uncased" checkpoint

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Alternative (disabled): DistilBERT tokenizer
#tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




In [None]:
# Sanity check: encode one training sentence and inspect the ids / masks
sample_batch = [training_sentences[0]]
tokenizer(sample_batch, truncation=True, padding=True, max_length=128)

{'input_ids': [[101, 1045, 2179, 3599, 2028, 1042, 1008, 1008, 1008, 2009, 2003, 2026, 5592, 2000, 2017, 1012, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# Tokenise both splits (truncating over-long inputs, padding each split to its
# longest sequence) and wrap the encodings with their labels as tf.data datasets.
train_encodings = tokenizer(training_sentences, truncation=True, padding=True)
val_encodings = tokenizer(validation_sentences, truncation=True, padding=True)

train_features = dict(train_encodings)
val_features = dict(val_encodings)

train_dataset = tf.data.Dataset.from_tensor_slices((train_features, training_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, validation_labels))

In [None]:
# BERT encoder with a freshly initialised 3-way classification head
# (num_labels=3 matches the three sentiment class ids 0, 1, 2).
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Alternative (disabled): DistilBERT variant
#model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased',num_labels=3)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#training
# Fine-tune the 3-class sentiment model.

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

# - The shuffle buffer spans the whole training set; a buffer of 100 only mixes
#   neighbouring examples.
# - `batch_size` is not passed to fit(): the datasets are already batched and
#   Keras expects batching to be done on the Dataset itself.
# - The validation set is not shuffled; order does not affect the metrics.
# - The History object is kept in a variable instead of being echoed as output.
history = model.fit(train_dataset.shuffle(len(training_labels)).batch(16),
                    epochs=10,
                    validation_data=val_dataset.batch(16))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f287b8d3410>

In [None]:
# Persist the fine-tuned weights/config to ./sentiment, then load them back
# into a separate model object that is used for inference below.
model.save_pretrained("./sentiment")
loaded_model = TFBertForSequenceClassification.from_pretrained("./sentiment")

Some layers from the model checkpoint at ./sentiment were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at ./sentiment.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [None]:
# OCR text of every test meme, in row order.
# fillna('') guards against rows with missing OCR text: a NaN (float) entry
# would make tokenizer.encode() fail; with no missing values it is a no-op.
test_text = test_df_bert['OCR_extracted_text'].fillna('').values.tolist()
# Preview a handful of entries instead of dumping the whole list into the output
test_text[:5]


In [None]:
# Predict the overall sentiment of every test sentence.
predictions = []
# Maps the model's class id (list position) back to the sentiment value used in
# the ground truth. Training re-encoded sentiment -1 as class id 2, so id 2
# must be reported as -1; leaving it as 2 would score every "negative"
# prediction as an unknown label.
labels = [0, 1, -1]
for test_sentence in test_text:
  predict_input = tokenizer.encode(test_sentence,
                                  truncation=True,
                                  padding=True,
                                  return_tensors="tf")

  # predict() returns the model output; element 0 holds the logits (1 x 3)
  tf_output = loaded_model.predict(predict_input)[0]
  # softmax is monotonic, so taking argmax of the logits picks the same class
  class_id = int(tf.argmax(tf_output, axis=1)[0])
  predictions.append(labels[class_id])




In [None]:
# Score the Task 1 predictions against the ground-truth sentiment column
actual = true_df_bert['Sentiment'].tolist()

print(classification_report(actual, predictions))
print(' Test accuracy is {}'.format(accuracy_score(actual, predictions) * 100))
# Macro-averaged scores, reported as percentages
macro_f1 = f1_score(actual, predictions, average='macro') * 100
macro_precision = precision_score(actual, predictions, average='macro') * 100
macro_recall = recall_score(actual, predictions, average='macro') * 100
print(" F1 Score: {:.2f}".format(macro_f1))
print(" Precision Score: {:.2f}".format(macro_precision))
print(" Recall Score: {:.2f}".format(macro_recall))
print("\n")

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00       172
           0       0.00      0.00      0.00       586
           1       0.59      1.00      0.74      1101

    accuracy                           0.59      1859
   macro avg       0.20      0.33      0.25      1859
weighted avg       0.35      0.59      0.44      1859

 Test accuracy is 59.225389994620755
 F1 Score: 24.80
 Precision Score: 19.74
 Recall Score: 33.33




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Task 2: binary humour / sarcasm / offensive / motivational classification using BERT

In [None]:
# Tokenizer and a binary (num_labels=2) BERT classifier for the per-category
# yes/no labels used in Task 2.
tokenizer_2 = BertTokenizer.from_pretrained("bert-base-uncased")

model_2 = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#training
# Task 2: train and evaluate one binary classifier per category.

text = train_df_bert['text_ocr'].values.tolist()
# fillna('') guards against missing OCR text, which the tokenizer cannot encode
test_text = test_df_bert['OCR_extracted_text'].fillna('').values.tolist()

# Category name -> position of its digit inside the second '_'-separated
# segment of `Labels` (e.g. '1_0100_0200' -> '0100').
categories={'humour':0,'sarcasm':1,'offensive':2,'motivational':3}

# Split the label strings once instead of re-splitting the whole column for
# every test row; restrict to the test frame's rows, as the per-row lookup did.
task2_flags = true_df['Labels'].str.split('_').str[1].loc[test_df_bert.index]

for category, flag_position in categories.items():
  print('******Processing {} category...******'.format(category))

  # Start every category from the pretrained checkpoint with a fresh optimizer.
  # Reusing one model across the loop would fine-tune e.g. the sarcasm
  # classifier on top of the weights already trained for humour.
  model_2 = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
  optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5, epsilon=1e-08)

  category_labels = train_df_bert[category].tolist()
  # Fixed seed for a reproducible split; stratify keeps the class ratio in both parts
  training_sentences, validation_sentences, training_labels, validation_labels = train_test_split(
      text, category_labels, test_size=0.1, random_state=42, stratify=category_labels)

  train_encodings = tokenizer_2(training_sentences,
                              truncation=True,
                              padding=True)
  val_encodings = tokenizer_2(validation_sentences,
                              truncation=True,
                              padding=True)
  train_dataset = tf.data.Dataset.from_tensor_slices((
                              dict(train_encodings),
                              training_labels
                              ))
  val_dataset = tf.data.Dataset.from_tensor_slices((
                              dict(val_encodings),
                              validation_labels
                              ))

  model_2.compile(optimizer=optimizer, loss=model_2.compute_loss, metrics=['accuracy'])
  # The datasets are already batched, so `batch_size` is not passed to fit().
  # The shuffle buffer spans the whole training set; validation is not shuffled.
  model_2.fit(train_dataset.shuffle(len(training_labels)).batch(16),
              epochs=3,
              validation_data=val_dataset.batch(16))

  #testing
  predictions = []
  for test_sentence in test_text:
    predict_input = tokenizer_2.encode(test_sentence,
                                    truncation=True,
                                    padding=True,
                                    return_tensors="tf")

    # Element 0 of the model output holds the logits; the argmax is the
    # predicted class id (0 or 1), which is already the label value.
    tf_output = model_2.predict(predict_input)[0]
    predictions.append(int(tf.argmax(tf_output, axis=1)[0]))

  # Ground truth for this category: its digit within the Task 2 flag string
  actual = task2_flags.str[flag_position].astype(int).tolist()

  print(classification_report(actual, predictions))
  print(' Test accuracy is {}'.format(accuracy_score(actual, predictions) * 100))
  print(" F1 Score: {:.2f}".format(f1_score(actual, predictions, average='macro') * 100))
  print(" Precision Score: {:.2f}".format(precision_score(actual, predictions, average='macro') * 100))
  print(" Recall Score: {:.2f}".format(recall_score(actual, predictions, average='macro') * 100))
  print("\n")


******Processing humour category...******
Epoch 1/3
Epoch 2/3
Epoch 3/3


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00       443
           1       0.76      1.00      0.86      1416

    accuracy                           0.76      1859
   macro avg       0.38      0.50      0.43      1859
weighted avg       0.58      0.76      0.66      1859

 Test accuracy is 76.16998386229156
 F1 Score: 43.24
 Precision Score: 38.08
 Recall Score: 50.00


******Processing sarcasm category...******
Epoch 1/3
Epoch 2/3
Epoch 3/3
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       418
           1       0.78      1.00      0.87      1441

    accuracy                           0.78      1859
   macro avg       0.39      0.50      0.44      1859
weighted avg       0.60      0.78      0.68      1859

 Test accuracy is 77.51479289940828
 F1 Score: 43.67
 Precision Score: 38.76
 Recall Score: 50.00


******Processing offensive category...******
Epoch 1/3
Epoch 2/3
Epoch 3/