<a href="https://colab.research.google.com/github/Aayush360/Natural_langauge_processing/blob/main/Detecting_Sarcasm_CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries

In [2]:
import pandas as pd
import numpy as np
import json
import re
import gensim
import math
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer


from gensim.models import KeyedVectors
import keras

from keras.models import Sequential,Model
from keras import layers
from keras.layers import Dense,Dropout,Conv1D,GlobalMaxPooling1D

import h5py

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
def parse_data(file):
  """Yield one decoded JSON object per non-blank line of a JSON-lines file.

  Parameters
  ----------
  file : path of the file to read; every non-blank line must hold one
         complete JSON document.

  Yields
  ------
  The Python object produced by ``json.loads`` for each line, in file order.

  Raises
  ------
  json.JSONDecodeError if a non-blank line is not valid JSON.
  """
  # The context manager closes the handle even if the consumer stops
  # iterating early or a line fails to parse.
  with open(file, 'r') as handle:
    for line in handle:
      # Blank lines (e.g. a trailing newline at the end of the file) carry no record.
      if not line.strip():
        continue
      yield json.loads(line)

data = list(parse_data('Sarcasm_Headlines_Dataset_v2.json'))
df = pd.DataFrame(data)

In [4]:
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [5]:
# This is primarily data from news headlines sourced from The Onion and the Huffington Post.

In [6]:
# article link is not much used in our analysis so we can remove it

In [7]:
# Guarded so the cell can be re-run: a bare df.pop raises KeyError once the
# column has already been removed.
if 'article_link' in df.columns:
    df.pop('article_link')
df.head()

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...


In [8]:
len(df)

28619

In [12]:
df['is_sarcastic'].unique() # so there are only 2 classes

array([1, 0])

In [9]:
# the dataset contains 28619 instances

In [13]:
# Data Preprocessing

In [10]:
# let us clean our data:
# remove special characters
# keep only alphanumeric data
# remove stop words
# lemmatize our data
# perform case-folding on the data

In [14]:
def text_clean(corpus):
    """Lower-case every document and blank out non-alphanumeric characters.

    Each document is split on whitespace; within every word, any character
    outside ``[a-zA-Z0-9]`` is replaced by a single space and the word is
    lower-cased. The words are then re-joined with single spaces, so the
    result may contain runs of spaces where punctuation used to be.

    Parameters
    ----------
    corpus : iterable of str

    Returns
    -------
    pandas.Series (dtype object) holding one cleaned string per input
    document, in input order, with a default 0..n-1 index.
    """
    non_alnum = re.compile('[^a-zA-Z0-9]')
    # Build all rows first and construct the Series once: Series.append was
    # removed in pandas 2.0 and appending row by row copies the data each time.
    cleaned_rows = [
        ' '.join(non_alnum.sub(' ', word).lower() for word in row.split())
        for row in corpus
    ]
    # Explicit dtype avoids the empty-Series dtype warning when corpus is empty.
    return pd.Series(cleaned_rows, dtype=object)

In [15]:
def stopwords_removal(corpus):
    """Split each document on whitespace and drop NLTK English stop words.

    Returns a list with one list of remaining tokens per input document.
    """
    english_stopwords = set(stopwords.words('english'))
    filtered_docs = []
    for document in corpus:
        kept_tokens = [token for token in document.split() if token not in english_stopwords]
        filtered_docs.append(kept_tokens)
    return filtered_docs

In [16]:
def lemmatize(corpus):
    """Lemmatize every token of every tokenised document as a verb (pos='v').

    Expects an iterable of token lists and returns a list of token lists of
    the same shape.
    """
    lemmatizer = WordNetLemmatizer()
    lemmatized_docs = []
    for tokens in corpus:
        lemmatized_docs.append([lemmatizer.lemmatize(token, pos = 'v') for token in tokens])
    return lemmatized_docs

In [18]:
def stem(corpus, stem_type = None):
    """Stem every token of every tokenised document.

    stem_type == 'snowball' selects the English Snowball stemmer; any other
    value (including the default None) selects the Porter stemmer.
    Returns a list of token lists of the same shape as the input.
    """
    # Pick the stemmer once; the per-token work is the same for both.
    if stem_type == 'snowball':
        stemmer = SnowballStemmer(language = 'english')
    else:
        stemmer = PorterStemmer()
    return [[stemmer.stem(token) for token in tokens] for tokens in corpus]

In [19]:
def preprocess(corpus, cleaning = True, stemming = False, stem_type = None, lemmatization = False, remove_stopwords = True):
    '''
    Purpose : Run the text pre-processing pipeline (cleaning, stop-word removal, lemmatization,
              stemming) over a corpus, in that order.

    Input :
    'corpus' - Iterable of strings on which the pre-processing tasks will be performed

    'cleaning', 'stemming', 'lemmatization', 'remove_stopwords' - Flags indicating whether a particular
                                                                  task should be performed or not.
                                                                  They are evaluated by truthiness.
    'stem_type' - Choose between Porter stemmer or Snowball(Porter2) stemmer. Default is "None", which
                  corresponds to Porter Stemmer. 'snowball' corresponds to Snowball Stemmer.
                  Only used when 'stemming' is set.

    Note : Either stemming or lemmatization should be used. There's no benefit of using both of them together

    Output : Returns a list with one processed string per input document; tokens are joined by single spaces

    '''
    if cleaning:
        corpus = text_clean(corpus)

    # Both branches leave `corpus` as a list of token lists for the steps below.
    if remove_stopwords:
        corpus = stopwords_removal(corpus)
    else:
        corpus = [document.split() for document in corpus]

    if lemmatization:
        corpus = lemmatize(corpus)

    if stemming:
        corpus = stem(corpus, stem_type)

    # Re-assemble each token list into a single whitespace-separated string.
    return [' '.join(tokens) for tokens in corpus]

In [20]:
# text we want to clean is :
df['headline']

0        thirtysomething scientists unveil doomsday clo...
1        dem rep. totally nails why congress is falling...
2        eat your veggies: 9 deliciously different recipes
3        inclement weather prevents liar from getting t...
4        mother comes pretty close to using word 'strea...
                               ...                        
28614         jews to celebrate rosh hashasha or something
28615    internal affairs investigator disappointed con...
28616    the most beautiful acceptance speech this week...
28617    mars probe destroyed by orbiting spielberg-gat...
28618                   dad clarifies this not a food stop
Name: headline, Length: 28619, dtype: object

In [21]:
headlines = preprocess(df['headline'], lemmatization=True, remove_stopwords=True)

  This is separate from the ipykernel package so we can avoid doing imports until


In [22]:
headlines[:4]

['thirtysomething scientists unveil doomsday clock hair loss',
 'dem rep totally nail congress fall short gender racial equality',
 'eat veggies 9 deliciously different recipes',
 'inclement weather prevent liar get work']

In [24]:
# now that we have performed cleaning of the corpus we are ready to train our model
# this is our pretrained Word2Vec model

In [27]:
EMBEDDING_FILE = '/root/input/GoogleNews-vectors-negative300.bin.gz'
!wget -P /root/input/ -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"


--2021-04-29 13:58:13--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.171.104
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.171.104|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘/root/input/GoogleNews-vectors-negative300.bin.gz’


2021-04-29 13:58:35 (72.2 MB/s) - ‘/root/input/GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [28]:
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [29]:
# now we will be sending the word vectors, each of size 300, to the CNN model
# each headline must be standardized to exactly 10 word vectors: longer headlines are truncated, shorter ones are padded with all-zero vectors of length 300

In [30]:
MAX_LENGTH=10
VECTOR_SIZE=300

In [37]:
def vectorize_data(data, keyed_vectors=None, max_length=None, vector_size=None):
  """Turn each document into exactly ``max_length`` word vectors.

  Only the first ``max_length`` whitespace-separated tokens of a document are
  looked at. Tokens missing from ``keyed_vectors`` are dropped (they still
  count towards the ``max_length`` limit), and the remaining slots are filled
  with all-zero vectors of length ``vector_size``.

  Parameters
  ----------
  data : iterable of str
  keyed_vectors : object supporting ``token in kv`` and ``kv[token]``;
      defaults to the notebook-level ``model``.
  max_length : number of vectors per document; defaults to ``MAX_LENGTH``.
  vector_size : length of the zero padding vector; defaults to ``VECTOR_SIZE``.

  Returns
  -------
  list with one entry per document, each a list of ``max_length`` vectors.
  """
  # Defaults are resolved at call time so the notebook-level settings apply
  # when the function is called with the corpus only.
  if keyed_vectors is None:
    keyed_vectors = model
  if max_length is None:
    max_length = MAX_LENGTH
  if vector_size is None:
    vector_size = VECTOR_SIZE

  # ndarray padding so every element of the result exposes the same
  # interface (e.g. ``.size``) as a real word vector.
  padding_vector = np.zeros(vector_size)

  vectors = []
  for data_point in data:
    # Membership and lookup go through the vectors object itself rather than
    # the deprecated ``.wv`` / ``.vocab`` attributes.
    data_point_vectors = [
      keyed_vectors[token]
      for token in data_point.split()[:max_length]
      if token in keyed_vectors
    ]
    missing = max_length - len(data_point_vectors)
    data_point_vectors.extend([padding_vector] * missing)
    vectors.append(data_point_vectors)
  return vectors


In [36]:

# Index the KeyedVectors object directly; the `.wv` indirection is deprecated.
model['hello'].size # array/vector of length 300

  


300

In [38]:
headlines[:4]

['thirtysomething scientists unveil doomsday clock hair loss',
 'dem rep totally nail congress fall short gender racial equality',
 'eat veggies 9 deliciously different recipes',
 'inclement weather prevent liar get work']

In [39]:
vectorized_headlines = vectorize_data(headlines)

  del sys.path[0]
  


In [43]:
len(vectorized_headlines[0])

10

In [57]:
vectorized_headlines[0][6].size

300

In [58]:
# validation to ensure that exactly 10 vectors are present for each headline

In [59]:
# Print the position of every headline whose number of vectors differs from
# MAX_LENGTH; no output means every headline has the expected length.
mismatched_positions = [
  position
  for position, headline_vectors in enumerate(vectorized_headlines)
  if len(headline_vectors) != MAX_LENGTH
]
for position in mismatched_positions:
  print(position)



In [64]:
len(vectorized_headlines)

28619

In [65]:
# splitting the data into train and test sets

In [66]:
train_div = math.floor(0.7*len(vectorized_headlines))
train_div

20033

In [67]:
# Sequential split (no shuffling): everything before train_div is used for
# training, the remainder for testing. Labels are sliced at the same position.
labels = df['is_sarcastic']
X_train, X_test = vectorized_headlines[:train_div], vectorized_headlines[train_div:]
y_train, y_test = labels[:train_div], labels[train_div:]

In [69]:
print('training data size is ',len(X_train))
print('test data size is',len(X_test))
print('trian label size is', len(y_train))
print('test label size is', len(y_test))

training data size is  20033
test data size is 8586
trian label size is 20033
test label size is 8586


In [70]:
# reshaping the data to feed into CNN model

In [71]:
# Each sample becomes a (MAX_LENGTH, VECTOR_SIZE) matrix, giving the
# (samples, MAX_LENGTH, VECTOR_SIZE) layout declared as the Conv1D input shape.
sample_shape = (MAX_LENGTH, VECTOR_SIZE)
X_train = np.asarray(X_train).reshape((len(X_train),) + sample_shape)
X_test = np.asarray(X_test).reshape((len(X_test),) + sample_shape)

# Labels as plain NumPy arrays.
y_train, y_test = np.array(y_train), np.array(y_test)

In [72]:
## Building the model

In [73]:
# defining hyperparameters

In [74]:
FILTERS = 8
KERNEL_SIZE = 3
HIDDEN1_NODES =10
HIDDEN2_NODES = 5
DROPOUT = 0.35
NUM_EPOCHS=10
BATCH_SIZE = 50

In [75]:
# NOTE: up to this cell `model` referred to the word2vec KeyedVectors loaded
# earlier; this assignment rebinds the name to the Keras network, so
# vectorize_data (which reads the global `model` when called as in this
# notebook) must not be re-run after this cell without reloading the vectors.
model = Sequential()
# One 1-D convolution over the token axis: FILTERS kernels, each spanning
# KERNEL_SIZE consecutive word vectors; 'same' padding keeps MAX_LENGTH steps.
model.add(Conv1D(FILTERS,KERNEL_SIZE,strides=1,padding='same',activation='relu',input_shape=(MAX_LENGTH,VECTOR_SIZE)))
# Collapse the sequence axis by keeping each filter's maximum activation.
model.add(GlobalMaxPooling1D())

In [76]:
# we use 1-dimensional convolutions because text is a one-dimensional sequence: the filters slide along the word axis only

In [77]:
# Two small fully connected ReLU layers, each followed by dropout at rate DROPOUT.
model.add(Dense(HIDDEN1_NODES,activation='relu'))
model.add(Dropout(DROPOUT))
model.add(Dense(HIDDEN2_NODES,activation='relu'))
model.add(Dropout(DROPOUT))

# Single sigmoid unit: one output in (0, 1) for the binary is_sarcastic label.
model.add(Dense(1,activation='sigmoid'))

In [78]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 10, 8)             7208      
_________________________________________________________________
global_max_pooling1d (Global (None, 8)                 0         
_________________________________________________________________
dense (Dense)                (None, 10)                90        
_________________________________________________________________
dropout (Dropout)            (None, 10)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 55        
_________________________________________________________________
dropout_1 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 6

In [79]:
#We have 7,359 trainable parameters in our model

In [80]:
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['accuracy'])

In [81]:
training_history = model.fit(X_train,y_train,epochs=NUM_EPOCHS,batch_size=BATCH_SIZE)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
# evaluating and saving our model

In [83]:
# Evaluate on the held-out test split without a progress bar.
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
# '{:.4f}' rounds to four decimals; the previous '{:4f}' only set a minimum
# field width and left the default six decimals in place.
print('test acc is: {:.4f}'.format(accuracy))

test acc is: 0.755183


In [85]:
# You can fine-tune various parameters and add/delete layers to obtain other results