In [32]:
from random import shuffle
from statistics import mean

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K

def recall_m(y_true, y_pred):
    """Recall of `y_pred` against `y_true` on the tensors passed in.

    Both the element-wise product `y_true * y_pred` and `y_true` itself are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as either 0 or 1.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        Sum of rounded hits divided by the sum of rounded positives; the
        backend epsilon is added to the denominator to avoid dividing by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    positive_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(positive_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of `y_pred` against `y_true` on the tensors passed in.

    Both the element-wise product `y_true * y_pred` and `y_pred` itself are
    clipped to [0, 1] and rounded before being summed, so each element counts
    as either 0 or 1.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        Sum of rounded hits divided by the sum of rounded predictions; the
        backend epsilon is added to the denominator to avoid dividing by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(predicted_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from `precision_m` and `recall_m` on the same tensors.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores.

    Returns:
        2 * (p * r) / (p + r + epsilon), where p and r are the values returned
        by `precision_m` and `recall_m`; the backend epsilon keeps the
        denominator non-zero when both are 0.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    product = p * r
    total = p + r + K.epsilon()
    return 2*(product/total)

In [35]:
# Read the training CSV, keep only the text and label columns, drop rows with
# missing values and renumber the index from 0 so positions and labels agree.
dataset = (
    pd.read_csv("/content/drive/MyDrive/iSarcasm/train.En.csv")[["tweet", "sarcastic"]]
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Schema overview plus a spot check of one arbitrary row.
dataset.info()
print(dataset.iloc[1062])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3467 entries, 0 to 3466
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      3467 non-null   object
 1   sarcastic  3467 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 54.3+ KB
tweet        Vaccine dose 1. Thank you, science.
sarcastic                                      0
Name: 1062, dtype: object


In [36]:
# Class balance of the label column.
dataset["sarcastic"].value_counts()

0    2600
1     867
Name: sarcastic, dtype: int64

In [37]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [38]:
# Raw tweet text (model input) and the binary sarcasm label (target).
X_data, Y_data = dataset["tweet"], dataset["sarcastic"]

In [39]:
# Text-vectorisation settings shared by the tokenizer and the embedding layer.
vocab_size, embedding_dim, max_length = 10000, 16, 150
trunc_type = 'post'

# Fit the word index on the tweets, then encode every tweet as a list of word ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

# Bring every encoded tweet to exactly `max_length` ids.
padded = pad_sequences(
    sequences,
    maxlen=max_length,
    truncating=trunc_type,
)

# Model inputs and targets used by the training cells.
X, Y = padded, Y_data

In [45]:
# Assemble the bidirectional-LSTM classifier layer by layer and print its
# layer / parameter summary.
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model_lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)))
model_lstm.add(tf.keras.layers.Dense(6, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(1, activation='sigmoid'))

model_lstm.summary()

Model: "sequential_24"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_24 (Embedding)    (None, 150, 16)           160000    
                                                                 
 bidirectional_24 (Bidirecti  (None, 64)               12544     
 onal)                                                           
                                                                 
 dense_48 (Dense)            (None, 6)                 390       
                                                                 
 dense_49 (Dense)            (None, 1)                 7         
                                                                 
Total params: 172,941
Trainable params: 172,941
Non-trainable params: 0
_________________________________________________________________


In [46]:
# Stratified k-fold cross-validation of the bidirectional-LSTM classifier.
#
# For every fold a fresh model is trained on the training part and evaluated on
# the held-out part. Per-fold training histories and an exact held-out F1 score
# are collected so the folds can be averaged afterwards. After the loop,
# `model_lstm` and `history` still refer to the last fold.
N_SPLITS = 10
EPOCHS = 5
BATCH_SIZE = 32
SEED = 42

# Seed NumPy and TensorFlow so fold assignment and training can be reproduced
# from a fresh kernel.
np.random.seed(SEED)
tf.random.set_seed(SEED)

# The splitter yields positional indices; indexing a pandas Series with a list
# of ints is label-based, so work on a plain NumPy array of the labels instead.
labels = np.asarray(Y)

# Loop-invariant: errors on the (rarer) positive class weigh three times as much.
class_weights = {1: 3, 0: 1}

# Stratification keeps the class ratio the same in every fold.
kfold = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

fold_histories = []   # one `history.history` dict per fold
fold_f1_scores = []   # exact F1 on each held-out fold

fold_no = 1
for train, test in kfold.split(X, labels):
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # Release the previous fold's model state before building a new model.
  tf.keras.backend.clear_session()

  model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', f1_m])

  # `shuffle=True` already reshuffles the training rows every epoch, so the
  # index arrays do not need to be shuffled by hand.
  history = model_lstm.fit(
    X[train], labels[train],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X[test], labels[test]),
    class_weight=class_weights,
    shuffle=True,
  )
  fold_histories.append(history.history)

  # `f1_m` is evaluated per batch and then averaged; score the whole held-out
  # fold at once to get the exact F1 at a 0.5 threshold.
  val_pred = (model_lstm.predict(X[test], verbose=0) > 0.5).astype(int).ravel()
  fold_f1_scores.append(float(f1_score(labels[test], val_pred)))

  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print(f'Mean final val_accuracy over {len(fold_histories)} folds: '
      f'{mean(h["val_accuracy"][-1] for h in fold_histories):.4f}')
print(f'Mean held-out F1 over {len(fold_f1_scores)} folds: {mean(fold_f1_scores):.4f}')

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
E

In [49]:
# `history` is reassigned on every fold of the cross-validation loop, so this
# shows the per-epoch metrics of the last completed fold only.
history.history

{'accuracy': [0.2566485106945038,
  0.31592437624931335,
  0.5901954770088196,
  0.784043550491333,
  0.9045177698135376],
 'f1_m': [0.3986417353153229,
  0.41430947184562683,
  0.5412946939468384,
  0.6958963871002197,
  0.8106317520141602],
 'loss': [1.0464783906936646,
  1.035431981086731,
  0.9223549365997314,
  0.9901016354560852,
  0.6275005340576172],
 'val_accuracy': [0.21098266541957855,
  0.34682080149650574,
  0.5462427735328674,
  0.6705202460289001,
  0.6560693383216858],
 'val_f1_m': [0.3416599929332733,
  0.3660137355327606,
  0.36833885312080383,
  0.35634878277778625,
  0.36536893248558044],
 'val_loss': [0.7001060843467712,
  0.7149655818939209,
  0.8182501196861267,
  0.730193555355072,
  0.7769067287445068]}