In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [3]:
# Read the dataset CSV (path is relative to the notebook's working directory)
# and preview the first rows.
dataset_path = '../normalized_mustard_dataset.csv'
data = pd.read_csv(dataset_path)
data.head()

Unnamed: 0,file_name,utterance,context,sarcasm
0,1_60.wav,It's just a privilege to watch your mind at work.,['I never would have identified the fingerprin...,True
1,1_70.wav,I don't think I'll be able to stop thinking ab...,['This is one of my favorite places to kick ba...,True
2,1_80.wav,"Since it's not bee season, you can have my epi...","['Here we go. Pad thai, no peanuts.', 'But doe...",False
3,1_90.wav,"Lois Lane is falling, accelerating at an initi...",['A marathon? How many Superman movies are the...,False
4,1_105.wav,I'm just inferring this is a couch because the...,"[""Great Caesar's ghost, look at this place."", ...",True


In [4]:
# Hyper-parameters shared by the preprocessing and model cells.
# NOTE(review): embedding_size does not appear to be referenced by the visible
# cells (the Embedding layer is built with 300 dimensions) — confirm before relying on it.
embedding_size: int = 16
# Fixed sequence length handed to pad_sequences / the model Input.
max_length: int = 60
# Passed as pad_sequences(padding=...).
padding_type: str = 'post'

In [5]:
# Encode the boolean `sarcasm` flag as integers (True -> 1, False -> 0).
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `data['sarcasm']`: that form mutates an
# intermediate Series, is deprecated as chained in-place modification, and
# leaves `data` unchanged under pandas copy-on-write. The explicit cast also
# pins the dtype to int64 and is safe to re-run on an already-encoded column.
data['sarcasm'] = data['sarcasm'].astype('int64')
data.head()

Unnamed: 0,file_name,utterance,context,sarcasm
0,1_60.wav,It's just a privilege to watch your mind at work.,['I never would have identified the fingerprin...,1
1,1_70.wav,I don't think I'll be able to stop thinking ab...,['This is one of my favorite places to kick ba...,1
2,1_80.wav,"Since it's not bee season, you can have my epi...","['Here we go. Pad thai, no peanuts.', 'But doe...",0
3,1_90.wav,"Lois Lane is falling, accelerating at an initi...",['A marathon? How many Superman movies are the...,0
4,1_105.wav,I'm just inferring this is a couch because the...,"[""Great Caesar's ghost, look at this place."", ...",1


In [6]:
# Pull the text column and the label column out of the frame as arrays
# (via `.values`) for the tokenizer and the label encoder respectively.
utterances, sarcasm_states = (
    data[column].values for column in ('utterance', 'sarcasm')
)

In [7]:
# --- Text features -----------------------------------------------------------
# Fit the word index on every utterance, then record how many distinct words
# it contains.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(utterances)
vocab_size = len(tokenizer.word_index)

# Map each utterance to a list of word ids and bring every list to
# `max_length` entries (padding side chosen by `padding_type`).
train = tokenizer.texts_to_sequences(utterances)
padded_train_sequences = pad_sequences(
    train,
    maxlen=max_length,
    padding=padding_type,
)

# --- Labels ------------------------------------------------------------------
# Integer-encode the sarcasm labels, then expand them to one-hot rows.
label_encoder = LabelEncoder()
encoded_states = label_encoder.fit_transform(sarcasm_states)
y_text_train = to_categorical(encoded_states)

In [8]:
# Preview the last five rows of the frame.
data.tail(n=5)

Unnamed: 0,file_name,utterance,context,sarcasm
685,2_169.wav,"Hes not right for the part, and if I suggest h...","['What am I gonna do now?', 'Just pass the tap...",1
686,2_235.wav,"Oh yeah he has a caretaker his older brother, ...","['Helo! Anybody in there order a celebrity?', ...",0
687,2_34.wav,Is it me or the greetings gone downhill around...,"['Hey', 'You son of a bitch!']",1
688,2_608.wav,"You are right, by saying nice, I am virtually ...","['Did I go to this school?', ""Hey, there's Mis...",1
689,2_524.wav,"Yes and we are ""very"" excited about it.","[""Anyway, if you don't feel like being alone t...",1


In [9]:
# Hold out 20% of the padded sequences (and their one-hot labels) as a test
# split; the fixed random_state makes the shuffle repeatable.
# NOTE: `y_text_train` is rebound here from the full label matrix to the
# training portion only, so re-running this cell on its own would pair the
# full feature matrix with the already-split labels.
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(
    padded_train_sequences,
    y_text_train,
    test_size=0.2,
    random_state=42,
)

In [10]:
from tqdm import tqdm
import codecs

In [11]:
def load_fasttext_vectors2(fname):
    """Load word vectors from a text-format (``.vec`` style) embedding file.

    Every data line is expected to look like ``<word> <v1> <v2> ... <vN>``
    with single-space separators.

    Args:
        fname: Path of the UTF-8 encoded vector file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` NumPy array holding
        the numbers that follow the word on its line.

    Raises:
        OSError: if the file cannot be opened (e.g. ``FileNotFoundError``).
        ValueError: if a vector component cannot be converted to a float.
    """
    embeddings_index = {}
    # Built-in open() decodes UTF-8 and, used as a context manager, closes the
    # file even when parsing a line raises.
    with open(fname, encoding='utf-8') as f:
        for line_no, line in enumerate(tqdm(f)):
            values = line.rstrip().split(' ')
            # A leading "<word count> <dimension>" header line is metadata,
            # not a word vector; storing it would create a bogus entry whose
            # "vector" has a single element.
            if (line_no == 0 and len(values) == 2
                    and values[0].isdigit() and values[1].isdigit()):
                continue
            # Blank lines (or a word with no numbers) carry no vector.
            if len(values) < 2:
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings_index

In [12]:
# Load the pre-trained word vectors into a {word: vector} dict.
# NOTE: the path is relative to the notebook's working directory; the recorded
# run of this cell failed with FileNotFoundError, so the file must be present
# there before running.
fasttext_path = "wiki-news-300d-1M.vec"
w2v_model = load_fasttext_vectors2(fasttext_path)

FileNotFoundError: [Errno 2] No such file or directory: 'wiki-news-300d-1M.vec'

In [11]:
from tensorflow.keras.layers import Dense, Flatten, LSTM, Bidirectional, Conv1D, MaxPooling1D, Input, Embedding
from tensorflow.keras.models import Model

In [12]:
# Build the weight matrix for the Embedding layer: row i holds the pre-trained
# vector of the word whose tokenizer index is i. Rows stay all-zero for words
# that have no pre-trained vector; those words are collected in
# `words_not_found`.
words_not_found = []
nb_words = len(tokenizer.word_index)
# One extra row because tokenizer indices start at 1 (row 0 is never assigned
# here and stays zero).
embedding_matrix = np.zeros((nb_words + 1, 300))
for word, i in tokenizer.word_index.items():
    # No index guard is needed: the matrix is sized from this same word_index,
    # so every i in 1..nb_words has a row. The previous `i >= nb_words` check
    # skipped the last word and left its row at zero without reporting it.
    embedding_vector = w2v_model.get(word)
    if (embedding_vector is not None) and len(embedding_vector) > 0:
        embedding_matrix[i] = embedding_vector
    else:
        # Words missing from the embedding index keep an all-zero row.
        words_not_found.append(word)

In [115]:
# Text classifier: frozen pre-trained embeddings -> Conv1D -> max-pool ->
# bidirectional LSTM -> dense head with a 2-way softmax.
input_data = Input(shape=(max_length,), name='main_input')

# Token ids are looked up in the pre-built matrix; trainable=False keeps the
# embedding weights fixed during training.
embedding_layer = Embedding(
    vocab_size + 1,
    300,
    weights=[embedding_matrix],
    trainable=False,
)(input_data)

# Convolution over windows of 4 tokens, then halve the sequence length.
conv_1 = Conv1D(
    filters=50,
    kernel_size=4,
    activation='relu',
)(embedding_layer)
max_1 = MaxPooling1D(pool_size=2)(conv_1)

# return_sequences=True keeps one output per time step, so the result is
# flattened before the dense layers.
lstm_layer = Bidirectional(
    LSTM(64, return_sequences=True)
)(max_1)
flatten = Flatten()(lstm_layer)

dense = Dense(
    75,
    activation='relu',
    name='fully_connected',
)(flatten)
out = Dense(2, activation='softmax')(dense)

model = Model(inputs=[input_data], outputs=[out])
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
main_input (InputLayer)      [(None, 60)]              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 60, 300)           610800    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 57, 50)            60050     
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 28, 50)            0         
_________________________________________________________________
bidirectional_6 (Bidirection (None, 28, 128)           58880     
_________________________________________________________________
flatten_6 (Flatten)          (None, 3584)              0         
_________________________________________________________________
fully_connected (Dense)      (None, 75)                2688

In [117]:
# Configure training: Adam optimizer, categorical cross-entropy loss (matching
# the one-hot labels and 2-way softmax output), accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [118]:
# Train for 5 epochs in mini-batches of 64; the returned History object is kept
# in `history`.
# NOTE(review): the held-out X_text_test / y_text_test split is not passed
# here, and no evaluation on it appears in the visible cells.
history = model.fit(
    X_text_train,
    y_text_train,
    epochs=5,
    batch_size=64,
    verbose=1,
)

Train on 552 samples
Epoch 1/5


2022-04-29 12:45:19.790461: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_standard_lstm_326380_326865' and '__inference___backward_standard_lstm_326380_326865_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_327565' both implement 'lstm_1a94dd67-dadb-4876-bd82-00b13a34fc2c' but their signatures do not match.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [119]:
# Classify a single sentence typed by the user.
user_statement = input("Tell me something sarcastic: \n")

# Apply the same preprocessing as the training utterances: convert to word ids
# with the fitted tokenizer, then pad to `max_length`.
tokenized_statement = pad_sequences(
    tokenizer.texts_to_sequences([user_statement]),
    maxlen=max_length,
    padding=padding_type,
)

# predict() returns one row per input; take the only row (two class scores).
output = model.predict(tokenized_statement)[0]

# Index 0 is reported as non-sarcastic, index 1 as sarcastic.
predicted_class = np.argmax(output)
if predicted_class == 0:
    print("Non-sarcastic")
elif predicted_class == 1:
    print("Sarcasm")
print(output)

2022-04-29 12:45:30.697748: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_cudnn_lstm_with_fallback_328077' and '__inference_standard_lstm_327966_specialized_for_model_6_bidirectional_6_forward_lstm_6_StatefulPartitionedCall_at___inference_distributed_function_328757' both implement 'lstm_c8d77348-87e3-41af-af44-f1fb196655d5' but their signatures do not match.


Non-sarcastic
[0.8250987  0.17490135]


In [103]:
# Write the trained model to disk as an HDF5 file, replacing any existing file
# at that path (overwrite=True).
model.save(
    "../models/text_model.h5",
    overwrite=True,
    save_format='h5',
)