In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [56]:
data = pd.read_csv('normalized_mustard_dataset.csv')
data.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm
0,It's just a privilege to watch your mind at work.,SHELDON,['I never would have identified the fingerprin...,"['LEONARD', 'SHELDON']",BBT,True
1,I don't think I'll be able to stop thinking ab...,PENNY,['This is one of my favorite places to kick ba...,"['HOWARD', 'PENNY', 'HOWARD', 'HOWARD', 'HOWAR...",BBT,True
2,"Since it's not bee season, you can have my epi...",SHELDON,"['Here we go. Pad thai, no peanuts.', 'But doe...","['LEONARD', 'HOWARD', 'LEONARD']",BBT,False
3,"Lois Lane is falling, accelerating at an initi...",SHELDON,['A marathon? How many Superman movies are the...,"['PENNY', 'SHELDON', 'PENNY', 'SHELDON', 'SHEL...",BBT,False
4,I'm just inferring this is a couch because the...,SHELDON,"[""Great Caesar's ghost, look at this place."", ...","['SHELDON', 'LEONARD', 'SHELDON', 'SHELDON', '...",BBT,True


In [57]:
embedding_size = 16
max_length = 60

required_data = data[['utterance', 'sarcasm']]


In [58]:
data['sarcasm'].replace({True:1,False:0},inplace=True)
data.head()

Unnamed: 0,utterance,speaker,context,context_speakers,show,sarcasm
0,It's just a privilege to watch your mind at work.,SHELDON,['I never would have identified the fingerprin...,"['LEONARD', 'SHELDON']",BBT,1
1,I don't think I'll be able to stop thinking ab...,PENNY,['This is one of my favorite places to kick ba...,"['HOWARD', 'PENNY', 'HOWARD', 'HOWARD', 'HOWAR...",BBT,1
2,"Since it's not bee season, you can have my epi...",SHELDON,"['Here we go. Pad thai, no peanuts.', 'But doe...","['LEONARD', 'HOWARD', 'LEONARD']",BBT,0
3,"Lois Lane is falling, accelerating at an initi...",SHELDON,['A marathon? How many Superman movies are the...,"['PENNY', 'SHELDON', 'PENNY', 'SHELDON', 'SHEL...",BBT,0
4,I'm just inferring this is a couch because the...,SHELDON,"[""Great Caesar's ghost, look at this place."", ...","['SHELDON', 'LEONARD', 'SHELDON', 'SHELDON', '...",BBT,1


In [74]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['utterance'].values)
train = tokenizer.texts_to_sequences(data['utterance'].values)
padded_train_sequences = pad_sequences(train, maxlen=max_length, padding='post')
y_train = to_categorical(data['sarcasm'], num_classes=2)

vocab_size = len(tokenizer.word_index)

In [75]:
x_train,x_test,y_train,y_test = train_test_split(padded_train_sequences,y_train, test_size=0.2, random_state=42)

In [61]:
from tqdm import tqdm
import codecs

In [62]:
def load_fasttext_vectors2(fname):
    embeddings_index = {}
    f = codecs.open(fname, encoding='utf-8')
    for line in tqdm(f):
        values = line.rstrip().rsplit(' ')
        ft_word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[ft_word] = coefs
    f.close()
    return embeddings_index

In [63]:
w2v_model =  load_fasttext_vectors2("wiki-news-300d-1M.vec")

999995it [01:28, 11283.37it/s]


In [76]:
from tensorflow.keras.layers import Dense, Flatten, LSTM, Bidirectional, Conv1D, MaxPooling1D, Input, Embedding
from tensorflow.keras.models import Model

In [77]:
words_not_found = []
nb_words = len(tokenizer.word_index)
embedding_matrix = np.zeros((nb_words + 1, 300))
for word, i in tokenizer.word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = w2v_model.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
    else:
        words_not_found.append(word)

In [78]:
input_data = Input(shape=(max_length,), name='main_input')
embedding_layer = Embedding(vocab_size + 1, 300, weights=[embedding_matrix], trainable=False)(input_data)
conv_1 = Conv1D(filters=50, kernel_size=4, activation='relu')(embedding_layer)
max_1 = MaxPooling1D(pool_size=2)(conv_1)
conv_2 = Conv1D(filters=100, kernel_size=3, activation='relu')(max_1)
max_2 = MaxPooling1D(pool_size=2)(conv_2)

lstm_layer = Bidirectional(LSTM(128,return_sequences=True))(max_2)

flatten = Flatten()(lstm_layer)
dense = Dense(100, activation='relu', name='fully_connected')(flatten)
out = Dense(2, activation='softmax')(dense)

model = Model(inputs=[input_data], outputs=[out])

print(model.summary())

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      [(None, 60)]              0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 60, 300)           610800    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 57, 50)            60050     
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 28, 50)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 26, 100)           15100     
_________________________________________________________________
max_pooling1d_11 (MaxPooling (None, 13, 100)           0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 13, 256)           2344

In [79]:
from tensorflow.keras.optimizers import Adam

In [82]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(lr=0.0001), metrics=['accuracy'])
history = model.fit(x_train, y_train,  batch_size=64, epochs=5, verbose=1)

Train on 552 samples
Epoch 6/20


2022-02-27 03:00:49.795291: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_standard_lstm_47618_48103_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_48828' and '__inference___backward_cudnn_lstm_with_fallback_46815_46997' both implement 'lstm_5a046278-f3c2-45d3-b453-1b8327956995' but their signatures do not match.


Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [87]:
user_statement = input("Tell me something sarcastic: \n")
tokenized_statement = tokenizer.texts_to_sequences(user_statement)
tokenized_statement = pad_sequences(tokenized_statement, maxlen=max_length)
output = model.predict(tokenized_statement)[0]

if np.argmax(output) == 0:
    print("Non-sarcastic")
elif np.argmax(output) == 1:
    print("Sarcasm")
print(output)

Non-sarcastic
[0.89285547 0.10714462]


In [93]:
model.save("text-model.h5", overwrite=True, save_format='h5')