In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from gensim.models import KeyedVectors

from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import LSTM, GRU, Dense, Embedding, Dropout, Bidirectional, InputLayer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import nltk # import package for tokenization

In [2]:
# nltk.download('punkt') # download all spporting function /files for NLTK package
# nltk.download('stopwords') #download Stopwords
# nltk.download('tagsets')
# nltk.help.upenn_tagset()# tagset documentation
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')#lemmatization


In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict #Default Dictionary is imported from collections
from nltk.corpus import wordnet as wn #the corpus reader wordnet is imported.
from nltk.tag import pos_tag
from nltk.stem import WordNetLemmatizer 
from sklearn.preprocessing import scale

In [4]:
tweets = pd.read_csv("sarcasm_dataset.csv")

tweets.head()

Unnamed: 0.1,Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,0,The only thing I got from college is a caffein...,1,0.0,1.0,0.0,0.0,0.0,0.0
1,1,I love it when professors draw a big question ...,1,1.0,0.0,0.0,0.0,0.0,0.0
2,2,Remember the hundred emails from companies whe...,1,0.0,1.0,0.0,0.0,0.0,0.0
3,3,Today my pop-pop told me I was not “forced” to...,1,1.0,0.0,0.0,0.0,0.0,0.0
4,4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1,1.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# dop NaN columns
tweets = tweets.drop(tweets.columns[0], axis=1)

tweets.tail()

Unnamed: 0,tweet,sarcastic,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
3463,The population spike in Chicago in 9 months is...,0,,,,,,
3464,You'd think in the second to last English clas...,0,,,,,,
3465,I’m finally surfacing after a holiday to Scotl...,0,,,,,,
3466,Couldn't be prouder today. Well done to every ...,0,,,,,,
3467,Overheard as my 13 year old games with a frien...,0,,,,,,


In [6]:
# dropping unnecessary columns for binary classification
tweets1 = tweets.drop(['sarcasm', 'irony', 'satire', 'understatement', 'overstatement', 'rhetorical_question'], axis=1)
tweets1.dropna(how='any', inplace=True)
tweets1.head()

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [7]:
# Convert label from float to int
def transform_float_to_int(value):
    return int(value)

tweets1['sarcastic'] = tweets1.sarcastic.apply(transform_float_to_int)

In [8]:
#Remove number
import re # import all Regular expression functions
tweets1['tweet']=[re.sub('\d','', i)for i in tweets1['tweet']]
tweets1.head(10)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1
5,"@jimrossignol I choose to interpret it as ""XD""...",1
6,Why would Alexa's recipe for Yorkshire pudding...,1
7,someone hit me w a horse tranquilizer istg ive...,1
8,Loving season of trump does America. Funniest...,1
9,Holly Arnold ??? Who #ImACeleb #MBE nope not ...,1


In [9]:
# Replace punctuations with a white space
import string
tweets1['tweet']=[re.sub('[%s]' % re.escape(string.punctuation), ' ', i) for i in tweets1['tweet']]
tweets1.head(10)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop pop told me I was not “forced” to...,1
4,VolphanCarol littlewhitty mysticalmanatee I...,1
5,jimrossignol I choose to interpret it as XD ...,1
6,Why would Alexa s recipe for Yorkshire pudding...,1
7,someone hit me w a horse tranquilizer istg ive...,1
8,Loving season of trump does America Funniest...,1
9,Holly Arnold Who ImACeleb MBE nope not ...,1


In [10]:
#Convert into in lower case
tweets1['tweet']=[i.lower() for i in tweets1['tweet']]

In [11]:
tweets1['tweet_wt'] = [word_tokenize(i) for i in tweets1['tweet']]
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ..."
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ..."


In [12]:
#To show the stop words

stop_words = set(stopwords.words('english'))
#Remove All Stop Word
tweets1['tweet_SW'] = [[i for i in j if not i in stop_words] for j in tweets1['tweet_wt']]# remove the word which is aviable in stopword libr
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt,tweet_SW
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ...","[thing, got, college, caffeine, addiction]"
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ...","[love, professors, draw, big, question, mark, ..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani...","[remember, hundred, emails, companies, covid, ..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “...","[today, pop, pop, told, “, forced, ”, go, coll..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ..."


In [13]:
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. 
#By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN) #Dictionary is created where pos_tag (first letter) are the key values 
tag_map['J'] = wn.ADJ                   #whose values are mapped with the value 
tag_map['V'] = wn.VERB                  #from wordnet dictionary. We have taken the only first letter as we will use it later in the loop.
tag_map['R'] = wn.ADV

In [14]:
lemmatizer = WordNetLemmatizer()

tweets1['tweet_lemma']=[[lemmatizer.lemmatize(word,tag_map[tag[0]]) for word ,tag in pos_tag(i)] for i in tweets1['tweet_SW']] 
tweets1.head()

Unnamed: 0,tweet,sarcastic,tweet_wt,tweet_SW,tweet_lemma
0,the only thing i got from college is a caffein...,1,"[the, only, thing, i, got, from, college, is, ...","[thing, got, college, caffeine, addiction]","[thing, get, college, caffeine, addiction]"
1,i love it when professors draw a big question ...,1,"[i, love, it, when, professors, draw, a, big, ...","[love, professors, draw, big, question, mark, ...","[love, professor, draw, big, question, mark, n..."
2,remember the hundred emails from companies whe...,1,"[remember, the, hundred, emails, from, compani...","[remember, hundred, emails, companies, covid, ...","[remember, hundred, email, company, covid, sta..."
3,today my pop pop told me i was not “forced” to...,1,"[today, my, pop, pop, told, me, i, was, not, “...","[today, pop, pop, told, “, forced, ”, go, coll...","[today, pop, pop, tell, “, force, ”, go, colle..."
4,volphancarol littlewhitty mysticalmanatee i...,1,"[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ...","[volphancarol, littlewhitty, mysticalmanatee, ..."


In [15]:
 tweets1['tweet_clean']= tweets1['tweet_lemma'].apply(lambda x: ' '.join(x))

In [16]:
tweets1['tweet_clean'].tail()

3463            population spike chicago month ridiculous
3464    think second last english class year prof woul...
3465    ’ finally surface holiday scotland difficult d...
3466    prouder today well do every student get gcse m...
3467    overheard year old game friend smell like tart...
Name: tweet_clean, dtype: object

# Word Embedding Using Glove

In [23]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile

# glove_file = 'glove.6B.300d.txt'
# tmp_file = get_tmpfile("./glove_word2vec.txt")

# _ = glove2word2vec('glove.6B.300d.txt', tmp_file)

glove_embed = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False, no_header=True)


#### Next we have to build word vectors for input text in order to average the value of all word vectors using the following function:

In [37]:
#Build word vector set by using the average value of all word vectors , then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size)) #As word vectors are of zero length size value(i.e 300) 
    count = 0 # no. of words with a valid vector in the tweet
    for word in text: #for each word in a tweet
        try:
            vec += glove_embed[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

In [38]:
Size = 300

vecs = np.concatenate([buildWordVector(z, Size) for z in tweets1['tweet_lemma']])
# print("Before Scaling:",vecs[1:2])
vecs = scale(vecs)
# print("After Scaling:",vecs[1:2])

In [39]:
print("Dimension of vector :",vecs.shape)

Dimension of vector : (3467, 300)


# Preceptron ML

In [41]:
X_train, X_test, Y_train, Y_test = train_test_split(vecs,tweets1['sarcastic'],test_size=0.25,random_state=342)

In [42]:
# Applying Preceptron Classifier
model = Perceptron(tol=1e-2, random_state=0, )
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
print(classification_report(Y_test, Y_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.78      0.84      0.81       648
           1       0.39      0.30      0.34       219

    accuracy                           0.70       867
   macro avg       0.58      0.57      0.57       867
weighted avg       0.68      0.70      0.69       867



# BiLSTM with 0.2 Dropout and 2 layers

In [43]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=300)
vectorizer.adapt(tweets1['tweet_clean'].to_numpy())
vocab = vectorizer.get_vocabulary()

In [44]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [46]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    try:
        embedding_vector = glove_embed.get_vector(word)
        if embedding_vector is not None:
            # Words not found in embedding index will be all-zeros.
            # This includes the representation for "padding" and "OOV"
            embedding_matrix[i] = embedding_vector
            hits += 1
        else:
            misses += 1
    except KeyError:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 6981 words (2148 misses)


In [47]:
X_train, X_test, Y_train, Y_test = train_test_split(tweets1['tweet_clean'],tweets1['sarcastic'],test_size=0.25,random_state=342)

In [48]:
X_train = vectorizer(np.array([[s] for s in X_train])).numpy()
X_test = vectorizer(np.array([[s] for s in X_test])).numpy()

In [52]:
# LSTM Model with two layer and 0.2 dropout rate
model = Sequential()
model.add(InputLayer(input_shape=(None,)))
model.add(Embedding(num_tokens, embedding_dim, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), trainable=False))
model.add(Bidirectional(LSTM(300, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(300)))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.build(X_train)
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         2739300   
_________________________________________________________________
bidirectional_2 (Bidirection (None, 600)               1442400   
_________________________________________________________________
dropout_2 (Dropout)          (None, 600)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 601       
Total params: 4,182,301
Trainable params: 1,443,001
Non-trainable params: 2,739,300
_________________________________________________________________
None


In [53]:
batch_size = 32

model.fit(X_train, Y_train, batch_size=batch_size, epochs=1,
          validation_data=(X_test, Y_test))



<tensorflow.python.keras.callbacks.History at 0x7fc8acc753d0>

In [54]:
preds = model.predict(X_test, verbose=0)
predictions = [1 if x > 0.5 else 0 for x in preds]
print(classification_report(Y_test, predictions, zero_division=0))

              precision    recall  f1-score   support

           0       0.75      1.00      0.86       648
           1       0.00      0.00      0.00       219

    accuracy                           0.75       867
   macro avg       0.37      0.50      0.43       867
weighted avg       0.56      0.75      0.64       867

