In [34]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm import tqdm

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, Bidirectional, InputLayer, Lambda, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import nltk # import package for tokenization

In [3]:
# One-time NLTK resource downloads; uncomment and run if the resources are missing.
# nltk.download('punkt') # download all supporting functions/files for the NLTK tokenizer
# nltk.download('stopwords') # download stopwords
# nltk.download('tagsets')
# nltk.help.upenn_tagset() # tagset documentation
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger') # POS tagger whose tags drive the lemmatization step


In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict #Default Dictionary is imported from collections
from nltk.corpus import wordnet as wn #the corpus reader wordnet is imported.
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 
from sklearn.preprocessing import scale

In [5]:
# Load the raw tweet dataset from a CSV in the working directory.
csv_path = "sarcasm_dataset.csv"
tweets = pd.read_csv(csv_path)

tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Drop the first column by position.
# NOTE(review): presumably this is a leftover index column written by an earlier
# to_csv() call rather than a NaN column -- confirm against the CSV header.
# Caution: `tweets` is reassigned from itself, so re-running this cell removes
# one more column each time.
tweets = tweets.drop(tweets.columns[0], axis=1)

tweets.tail()

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
3463,The population spike in Chicago in 9 months is...,0,,,,,,
3464,You'd think in the second to last English clas...,0,,,,,,
3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,,
3466,Couldn't be prouder today. Well done to every ...,0,,,,,,
3467,Overheard as my 13 year old games with a frien...,0,,,,,,


In [7]:
# Binary classification only needs the tweet text and the `sarcastic` label,
# so the fine-grained label columns are removed and incomplete rows discarded.
fine_grained_labels = ['sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question']
tweets1 = tweets.drop(columns=fine_grained_labels).dropna(how='any')
tweets1.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [8]:
# Convert label from float to int
def transform_float_to_int(value):
    """Return `value` converted with the built-in int() (truncates toward zero)."""
    as_integer = int(value)
    return as_integer

# Element-wise conversion of the label column to Python ints.
tweets1['sarcastic'] = tweets1['sarcastic'].map(transform_float_to_int)

In [9]:
# Remove every digit character from the tweet text.
import re  # regular-expression support (also used by later cleaning cells)

# Raw string: '\d' in a normal string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+). Compiled once, reused per tweet.
digit_pattern = re.compile(r'\d')
tweets1['tweet'] = [digit_pattern.sub('', text) for text in tweets1['tweet']]
tweets1.head(10)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
5,"@jimrossignol I choose to interpret it as ""XD""...",1
6,Why would Alexa's recipe for Yorkshire pudding...,1
7,someone hit me w a horse tranquilizer istg ive...,1
8,Loving season of trump does America. Funniest...,1
9,Holly Arnold ??? Who #ImACeleb #MBE nope not ...,1


In [10]:
# Replace each character listed in string.punctuation with a single space.
# NOTE(review): typographic quotes such as “ ” ’ still appear in the displayed
# output after this step -- confirm whether they should be stripped as well.
import string
punctuation_class = '[%s]' % re.escape(string.punctuation)
tweets1['tweet'] = [re.sub(punctuation_class, ' ', text) for text in tweets1['tweet']]
tweets1.head(10)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop pop told me I was not “forced” to...,1
4,VolphanCarol littlewhitty mysticalmanatee I...,1
5,jimrossignol I choose to interpret it as XD ...,1
6,Why would Alexa s recipe for Yorkshire pudding...,1
7,someone hit me w a horse tranquilizer istg ive...,1
8,Loving season of trump does America Funniest...,1
9,Holly Arnold Who ImACeleb MBE nope not ...,1


In [11]:
# Convert the tweet text to lower case
tweets1['tweet'] = list(map(str.lower, tweets1['tweet']))

In [12]:
# Split every cleaned tweet into word tokens with NLTK's word_tokenize.
tokenized_tweets = []
for text in tweets1['tweet']:
    tokenized_tweets.append(word_tokenize(text))
tweets1['tweet_wt'] = tokenized_tweets
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ..."
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ..."


In [13]:
# Build the English stop-word set once; set membership keeps the filter fast.
stop_words = set(stopwords.words('english'))

# Remove every token that appears in the stop-word list.
# NOTE(review): the displayed output shows "not" being removed as well --
# confirm this is acceptable for sarcasm detection.
tweets1['tweet_SW'] = [
    [token for token in tokens if token not in stop_words]
    for tokens in tweets1['tweet_wt']
]
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt,tweet_SW
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ...","[thing, got, college, caffeine, addiction]"
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ...","[love, professors, draw, big, question, mark, ..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani...","[remember, hundred, emails, companies, covid, ..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “...","[today, pop, pop, told, “, forced, ”, go, coll..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ..."


In [14]:
# WordNetLemmatizer needs a WordNet part of speech for each word.
# tag_map is keyed by the first letter of a POS tag; any letter not listed
# below falls back to NOUN via the defaultdict factory.
tag_map = defaultdict(lambda: wn.NOUN)
tag_map.update({
    'J': wn.ADJ,   # adjective tags
    'V': wn.VERB,  # verb tags
    'R': wn.ADV,   # adverb tags
})

In [15]:
lemmatizer = WordNetLemmatizer()

# POS-tag each stop-word-filtered tweet, then lemmatize every token using the
# WordNet category selected by the first letter of its tag.
lemmatized_tweets = []
for tokens in tweets1['tweet_SW']:
    lemmatized_tweets.append(
        [lemmatizer.lemmatize(token, tag_map[pos[0]]) for token, pos in pos_tag(tokens)]
    )
tweets1['tweet_lemma'] = lemmatized_tweets
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt,tweet_SW,tweet_lemma
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ...","[thing, got, college, caffeine, addiction]","[thing, get, college, caffeine, addiction]"
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ...","[love, professors, draw, big, question, mark, ...","[love, professor, draw, big, question, mark, n..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani...","[remember, hundred, emails, companies, covid, ...","[remember, hundred, email, company, covid, sta..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “...","[today, pop, pop, told, “, forced, ”, go, coll...","[today, pop, pop, tell, “, force, ”, go, colle..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ..."


In [16]:
 # Re-assemble each lemma list into one space-separated string.
 tweets1['tweet_clean'] = tweets1['tweet_lemma'].map(' '.join)

In [17]:
# Peek at the last five cleaned tweets.
tweets1['tweet_clean'].iloc[-5:]

3463            population spike chicago month ridiculous
3464    think second last english class year prof woul...
3465    ’ finally surface holiday scotland difficult d...
3466    prouder today well do every student get gcse m...
3467    overheard year old game friend smell like tart...
Name: tweet_clean, dtype: object

# Word Embedding Using ELMo

In [17]:
import os

# Set TFHUB_DOWNLOAD_PROGRESS before tensorflow_hub is imported and the model is loaded.
os.environ.update(TFHUB_DOWNLOAD_PROGRESS="1")

In [18]:
import tensorflow_hub as hub
import tensorflow as tf

# Load the ELMo module from TensorFlow Hub.
ELMO_MODULE_URL = "https://tfhub.dev/google/elmo/3"
elmo = hub.load(ELMO_MODULE_URL)

In [19]:
def elmo_vectors(x):
    """Embed a batch of strings with ELMo and average the result over axis 1.

    Parameters
    ----------
    x : anything `tf.convert_to_tensor` accepts as a batch of strings
        (the notebook passes slices of a pandas Series).

    Returns
    -------
    The module's 'elmo' output reduced with `tf.reduce_mean` along axis 1.

    NOTE(review): assumes the 'elmo' output is (batch, tokens, dim), so axis 1
    is the token axis -- TODO confirm. If shorter texts are padded to the
    longest one in the batch, those padded positions are part of this average.
    """
    batch = tf.convert_to_tensor(x)
    outputs = elmo.signatures['default'](batch)
    return tf.reduce_mean(outputs['elmo'], axis=1)

In [22]:
# Features: the cleaned tweet text; labels: the binary `sarcastic` flag.
X, Y = tweets1['tweet_clean'], tweets1['sarcastic']

In [21]:
# Split X into consecutive slices of at most 100 rows.
# X must still be the text Series here: a later cell rebinds X to a list,
# after which `X.shape` no longer exists.
X_list = []
for start in range(0, X.shape[0], 100):
    X_list.append(X[start:start + 100])

In [22]:
# Embed each slice of tweets with ELMo.
# Caution: this rebinds X from the text Series to a list of per-slice results.
X = list(map(elmo_vectors, X_list))

In [23]:
# Stack the per-slice results row-wise into one array (axis 0 is the default).
elmo_X = np.concatenate(X)

In [24]:
# Sanity check on the stacked array's dimensions.
elmo_X.shape

(3467, 1024)

#### Save these arrays: computing the ELMo vectors took a long time, so we cache them as pickle files

In [19]:
import pickle

In [26]:
# Save elmo_X to disk so the ELMo vectors do not have to be recomputed.
# The `with` block closes the file even if pickle.dump raises.
with open("elmo_X_03032019.pickle", "wb") as pickle_out:
    pickle.dump(elmo_X, pickle_out)

In [20]:
# Reload the cached ELMo vectors; the `with` block ensures the file handle is
# closed (the handle was previously left open).
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open("elmo_X_03032019.pickle", "rb") as pickle_in:
    elmo_X = pickle.load(pickle_in)

# Perceptron ML

In [23]:
# 75/25 train/test split of the ELMo vectors with a fixed seed.
# Y is expected to be the label Series assigned from tweets1['sarcastic'].
X_train, X_test, Y_train, Y_test = train_test_split(
    elmo_X,
    Y,
    test_size=0.25,
    random_state=342,
)

In [24]:
# Applying Perceptron Classifier
# fit() returns the estimator, so construction and training are chained.
model = Perceptron(random_state=0, tol=1e-2).fit(X_train, Y_train)
Y_pred = model.predict(X_test)
report = classification_report(Y_test, Y_pred, zero_division=0)
print(report)

              precision    recall  f1-score   support

           0       0.77      0.56      0.65       648
           1       0.28      0.51      0.36       219

    accuracy                           0.55       867
   macro avg       0.52      0.53      0.50       867
weighted avg       0.65      0.55      0.58       867



# BiLSTM with 0.2 Dropout and 2 layers


In [25]:
# Fit a text vectorizer on the cleaned tweets (vocabulary capped at 20000
# tokens, every sequence padded or truncated to 1024 ids).
# NOTE(review): 1024 equals the embedding dimension used elsewhere in this
# notebook; as a sequence length it looks far longer than a tweet -- confirm it is intended.
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=1024)
corpus = tweets1['tweet_clean'].to_numpy()
vectorizer.adapt(corpus)
vocab = vectorizer.get_vocabulary()

In [26]:
# Map each vocabulary token to its position in the vocabulary list.
voc = vectorizer.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [27]:
# Turn on NumPy-style behaviour for TensorFlow tensors.
# Presumably required because the embedding-matrix cell calls `.reshape(...)`
# directly on the tensor returned by ELMo -- confirm before removing.
from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

In [28]:
# Embedding-matrix dimensions and lookup counters.
embedding_dim = 1024            # width of one embedding vector
num_tokens = len(voc) + 2       # vocabulary size plus two spare rows
hits = misses = 0               # tallies updated while filling the matrix

In [None]:
# Prepare embedding matrix: one ELMo vector per vocabulary token.
# Rows that are never assigned stay all-zero: the two spare rows beyond the
# vocabulary (num_tokens = len(voc) + 2) and any token that fails below.
# NOTE(review): every vocabulary entry -- presumably including the padding and
# OOV placeholders -- is sent to ELMo like a normal word; confirm that is intended.
embedding_matrix = np.zeros((num_tokens, embedding_dim))

# Reset the tallies here so re-running this cell does not accumulate old counts.
hits = 0
misses = 0
failed_words = []  # (word, error) pairs kept for inspection instead of being silently dropped

for word, i in word_index.items():
    try:
        token_vectors = elmo.signatures['default'](tf.convert_to_tensor([word]))['elmo']
        # reshape raises unless exactly `embedding_dim` values came back; such
        # a word is then counted as a miss and its row stays zero.
        embedding_matrix[i] = np.asarray(token_vectors).reshape(embedding_dim)
        hits += 1
    except Exception as err:
        # `Exception` rather than a bare `except:` so Ctrl-C / kernel interrupt
        # (KeyboardInterrupt) can still stop this long-running loop.
        failed_words.append((word, repr(err)))
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))
if failed_words:
    print("First failures:", failed_words[:5])

In [None]:
# Save the ELMo embedding matrix so it does not have to be rebuilt.
# The `with` block closes the file even if pickle.dump raises.
with open("elmo_embedding_matrix.pickle", "wb") as pickle_out:
    pickle.dump(embedding_matrix, pickle_out)

In [29]:
# Reload the cached embedding matrix; the `with` block ensures the file handle
# is closed (the handle was previously left open).
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open("elmo_embedding_matrix.pickle", "rb") as pickle_in:
    embedding_matrix = pickle.load(pickle_in)

In [30]:
# 75/25 train/test split of the cleaned text and labels with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    tweets1['tweet_clean'],
    tweets1['sarcastic'],
    test_size=0.25,
    random_state=342,
)

In [31]:
# Wrap every text in its own one-element row, run the fitted vectorizer over
# the batch, and convert the result back to a NumPy array.
# Caution: X_train / X_test are overwritten, so this cell is not re-runnable
# without re-running the split cell first.
train_texts = np.array([[text] for text in X_train])
test_texts = np.array([[text] for text in X_test])
X_train = vectorizer(train_texts).numpy()
X_test = vectorizer(test_texts).numpy()

In [35]:
# Single-layer SimpleRNN classifier with 0.2 dropout on top of the frozen
# ELMo-derived embedding matrix.
# NOTE(review): the original comment called this a BiLSTM, but the recurrent
# layer used is SimpleRNN -- confirm which architecture is intended.
model = Sequential()
model.add(InputLayer(input_shape=(None,)))  # variable-length sequences of token ids
model.add(Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,  # keep the precomputed vectors fixed during training
))
model.add(SimpleRNN(embedding_dim))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))  # one sigmoid unit for the binary label
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# No explicit build() call: the InputLayer already defines the input shape, and
# build() expects a shape, not the training data that was passed before.
# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which emitted a stray "None" line).
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 1024)        9350144   
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 1024)              2098176   
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1025      
Total params: 11,449,345
Trainable params: 2,099,201
Non-trainable params: 9,350,144
_________________________________________________________________
None


In [None]:
# Train for a single epoch in mini-batches of 32.
# NOTE(review): the test split doubles as validation data here -- confirm that is intended.
batch_size = 32

model.fit(
    x=X_train,
    y=Y_train,
    batch_size=batch_size,
    epochs=1,
    validation_data=(X_test, Y_test),
)



In [None]:
# Threshold the model outputs at 0.5 to get hard 0/1 labels, then report
# per-class metrics.
# assumes preds holds one score per test row (single sigmoid unit) -- TODO confirm
preds = model.predict(X_test, verbose=0)
predictions = [1 if score > 0.5 else 0 for score in preds.ravel()]
report = classification_report(Y_test, predictions, zero_division=0)
print(report)