In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import RMSprop
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from random import shuffle
from statistics import mean

def recall_m(y_true, y_pred):
    """Recall of the given tensors: true positives / (actual positives + epsilon).

    Both counts are obtained by clipping to [0, 1], rounding and summing over
    every element, so the result is a single scalar for whatever tensors are
    passed in. ``K.epsilon()`` in the denominator avoids division by zero when
    ``y_true`` contains no positives.

    Args:
        y_true: ground-truth labels (presumably 0/1 -- confirm against caller).
        y_pred: model outputs (presumably probabilities in [0, 1]).

    Returns:
        Scalar backend tensor holding the recall.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    return K.sum(hit_mask) / (K.sum(actual_mask) + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision of the given tensors: true positives / (predicted positives + epsilon).

    Both counts are obtained by clipping to [0, 1], rounding and summing over
    every element, so the result is a single scalar for whatever tensors are
    passed in. ``K.epsilon()`` in the denominator avoids division by zero when
    nothing is predicted positive.

    Args:
        y_true: ground-truth labels (presumably 0/1 -- confirm against caller).
        y_pred: model outputs (presumably probabilities in [0, 1]).

    Returns:
        Scalar backend tensor holding the precision.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged_mask = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hit_mask) / (K.sum(flagged_mask) + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score built from ``precision_m`` and ``recall_m``.

    Computes ``2 * (p * r) / (p + r + epsilon)``; the epsilon term keeps the
    division defined when both precision and recall are zero.

    Args:
        y_true: ground-truth labels, forwarded unchanged to both helpers.
        y_pred: model outputs, forwarded unchanged to both helpers.

    Returns:
        Scalar backend tensor holding the F1 score.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    ratio = (p * r) / (p + r + K.epsilon())
    return 2 * ratio

In [None]:
# Load only the tweet text and its sarcasm label, discard rows with a missing
# value in either column, and renumber the surviving rows from 0.
dataset = (
    pd.read_csv("/content/drive/MyDrive/isarcasm/train.En.csv")[["tweet", "sarcastic"]]
    .dropna(axis=0)
    .reset_index(drop=True)
)

# Quick sanity checks: frame schema plus one arbitrary row.
dataset.info()
print(dataset.iloc[1062])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3467 entries, 0 to 3466
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet      3467 non-null   object
 1   sarcastic  3467 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 54.3+ KB
tweet        Vaccine dose 1. Thank you, science.
sarcastic                                      0
Name: 1062, dtype: object


In [None]:
# Class balance of the target column.
dataset["sarcastic"].value_counts()

0    2600
1     867
Name: sarcastic, dtype: int64

In [None]:
# Preview the first five rows.
dataset.head(n=5)

Unnamed: 0,tweet,sarcastic
0,The only thing I got from college is a caffein...,1
1,I love it when professors draw a big question ...,1
2,Remember the hundred emails from companies whe...,1
3,Today my pop-pop told me I was not “forced” to...,1
4,@VolphanCarol @littlewhitty @mysticalmanatee I...,1


In [None]:
# Raw model input (tweet text) and binary target (sarcasm flag).
X_data = dataset["tweet"]
Y_data = dataset["sarcastic"]

In [None]:
# Text-vectorisation settings shared by the tokenizer and the models below.
vocab_size, embedding_dim = 10000, 16
max_length, trunc_type = 150, 'post'

# Build the word index from every tweet, then encode each tweet as integer ids.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X_data)
sequences = tokenizer.texts_to_sequences(X_data)

# Bring every sequence to exactly max_length ids: over-long ones are cut
# according to trunc_type, shorter ones are padded on pad_sequences' default side.
padded = pad_sequences(sequences, truncating=trunc_type, maxlen=max_length)

# Final feature matrix and label vector used by the training cells.
X, Y = padded, Y_data

# LSTM

In [None]:
# Bidirectional-LSTM binary classifier, assembled one layer at a time:
# embedding -> BiLSTM(16) -> Dense(6, relu) -> Dense(1, sigmoid).
model_lstm = tf.keras.Sequential()
model_lstm.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length))
model_lstm.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)))
model_lstm.add(tf.keras.layers.Dense(6, activation='relu'))
model_lstm.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Print the layer table and parameter counts.
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 16)           160000    
_________________________________________________________________
bidirectional (Bidirectional (None, 32)                4224      
_________________________________________________________________
dense (Dense)                (None, 6)                 198       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 164,429
Trainable params: 164,429
Non-trainable params: 0
_________________________________________________________________


In [None]:
from sklearn.model_selection import KFold

# An integer random_state makes every kfold.split(X) call -- in this cell and in
# any later cell that reuses `kfold` -- return the same five partitions. Without
# it each call reshuffles, so runs are not repeatable and models trained in
# different cells are scored on different folds.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Loop-invariant per-class loss weights (class 1 -> 2, class 0 -> 0.67).
class_weights = {1:2, 0:0.67}

# Exploratory run: train a fresh model per fold for 5 epochs while monitoring
# loss / accuracy / F1 on the held-out fold after every epoch.
for fold_no, (train, test) in enumerate(kfold.split(X), start=1):
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # A new model per fold so no weights leak from one fold into the next.
  model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', f1_m])

  # fit(shuffle=True) already reshuffles the training rows every epoch, so the
  # fold indices are used as returned by KFold. The validation fold is scored
  # as a single batch: f1_m is computed per batch and Keras averages those
  # per-batch values, which is not the F1 of the fold unless there is one batch.
  history = model_lstm.fit(
    X[train], Y[train],
    batch_size=32,
    epochs=5,
    validation_data=(X[test], Y[test]),
    validation_batch_size=len(test),
    class_weight=class_weights,
    shuffle=True,
  )
  
  

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
------------------------------------------------------------------------
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# 5-fold evaluation of the BiLSTM trained WITH class weights; the held-out F1
# of every fold is collected and averaged at the end.
model_with_weights_f1s = list()

# Loop-invariant per-class loss weights (class 1 -> 2, class 0 -> 0.67).
class_weights = {1:2, 0:0.67}

for fold_no, (train, test) in enumerate(kfold.split(X), start=1):
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # A new model per fold so no weights leak from one fold into the next.
  model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', f1_m])

  # fit(shuffle=True) reshuffles the training rows every epoch, so no manual
  # shuffling of the fold indices is needed.
  history = model_lstm.fit(X[train], Y[train], batch_size=32, epochs=10, class_weight=class_weights,shuffle=True)

  # Score the whole held-out fold as ONE batch. f1_m is computed per batch and
  # Keras reports the mean of those per-batch values; with the default batch
  # size that mean is not the fold's F1 and varies with sample order. Loss and
  # accuracy are per-sample means and are unaffected by the batch size.
  scores = model_lstm.evaluate(X[test], Y[test], batch_size=len(test), verbose=0)
  print(f'Score for fold {fold_no}: \n{model_lstm.metrics_names[0]} of {scores[0]};\n {model_lstm.metrics_names[1]} of {scores[1]*100}%;\n{model_lstm.metrics_names[2]} of {scores[2]};')
  model_with_weights_f1s.append(scores[2])

print(f'average of folds f1s is : {mean(model_with_weights_f1s)}')

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 1: 
loss of 1.2571284770965576;
 accuracy of 62.39193081855774%;
f1_m of 0.29269006848335266;
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 2: 
loss of 1.5422310829162598;
 accuracy of 68.1556224822998%;
f1_m of 0.26367107033729553;
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 3: 
loss of 1.4514930248260498;
 accuracy of 67.6767647266388%;
f1_m of 0.29151976108551025;
---------------------------------------------------

In [None]:
# 5-fold evaluation of the BiLSTM trained WITHOUT class weights (baseline for
# the weighted run); the held-out F1 of every fold is collected and averaged.
model_without_weights_f1s = list()

for fold_no, (train, test) in enumerate(kfold.split(X), start=1):
  print('------------------------------------------------------------------------')
  print(f'Training for fold {fold_no} ...')

  # A new model per fold so no weights leak from one fold into the next.
  model_lstm = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(16)),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])

  model_lstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', f1_m])

  # fit(shuffle=True) reshuffles the training rows every epoch, so no manual
  # shuffling of the fold indices is needed.
  history = model_lstm.fit(X[train], Y[train], batch_size=32, epochs=10,shuffle=True)

  # Score the whole held-out fold as ONE batch. f1_m is computed per batch and
  # Keras reports the mean of those per-batch values; with the default batch
  # size that mean is not the fold's F1 and varies with sample order. Loss and
  # accuracy are per-sample means and are unaffected by the batch size.
  scores = model_lstm.evaluate(X[test], Y[test], batch_size=len(test), verbose=0)
  print(f'Score for fold {fold_no}: \n{model_lstm.metrics_names[0]} of {scores[0]};\n {model_lstm.metrics_names[1]} of {scores[1]*100}%;\n{model_lstm.metrics_names[2]} of {scores[2]};')
  model_without_weights_f1s.append(scores[2])

print(f'average of folds f1s is : {mean(model_without_weights_f1s)}')

------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 1: 
loss of 1.2235075235366821;
 accuracy of 69.30835843086243%;
f1_m of 0.2975965440273285;
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 2: 
loss of 1.2992018461227417;
 accuracy of 64.69740867614746%;
f1_m of 0.2606031596660614;
------------------------------------------------------------------------
Training for fold 3 ...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 3: 
loss of 1.3538717031478882;
 accuracy of 69.84127163887024%;
f1_m of 0.24598611891269684;
---------------------------------------------------