In [11]:
from os import chdir, environ, mkdir, listdir
from shutil import rmtree
from os.path import exists
import tensorflow as tf

# set to True to hide all CUDA devices from TensorFlow and run on CPU only
disable_gpu = False
if not disable_gpu:
    # turn on memory growth for every GPU TensorFlow reports
    gpus = tf.config.experimental.list_physical_devices('GPU')
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
else:
    environ["CUDA_VISIBLE_DEVICES"]="-1"

from typing import *
import numpy as np
import pandas as pd
import random as rd
from time import time

# report how many GPUs TensorFlow lists as physical devices
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [7]:
# load datasets
def load_datasets(datasets_paths: Iterable[str]) -> List[pd.DataFrame]:
    """
    read tab-separated dataset files into DataFrames

    args:
        datasets_paths: paths of the tab-separated files to read

    returns:
        list with one DataFrame per path, in the order of 'datasets_paths';
        the columns 'comment' and 'parent_comment' are cast to str where present
    """
    dataframes = [pd.read_csv(path, sep='\t') for path in datasets_paths]
    # cast comments to str (read_csv may have parsed purely numeric comments as numbers)
    for dataframe in dataframes:
        for text_column in ('comment', 'parent_comment'):
            if text_column in dataframe.columns:
                dataframe[text_column] = dataframe[text_column].astype(str)
    return dataframes

# module-level datasets; sarcasm_detector samples its training and validation rows from 'train' and 'test'
train, test = load_datasets(('dataset/final_train.csv', 'dataset/final_test.csv'))

In [8]:
# default model parameters
class defaults:
    """
    fallback values for every model parameter that is not passed explicitly
    (mostly optimized parameters for testing)
    """

    # reproducibility
    random_seed = 31

    # data selection
    training_set_size = 300_000  # max number comments
    testing_set_ratio = 0.2      # percent of training_set_size
    vocab_size = 4000

    # model inputs
    use_parent_comment = False
    meta_features = [
        'subreddit',
        'score',
        'ups',
        'downs',
        'time',
        'day_of_week',
        'comment_length',
        'smileys',
        'sarcasm_indicators',
        'caps_lock',
        'letter_duplication',
    ]

    # architecture
    rnn_dimension = 128
    dense_dimension = 64

    # training
    training_epochs = 15
    batch_size = 16
    use_early_stopping = True

In [9]:
class sarcasm_detector:
    """
    wrapper class for sarcasm detection in comment text
    https://www.kaggle.com/danofer/sarcasm

    uses the module-level 'train' and 'test' DataFrames as datasets

    args:
        load:              load model from disk,        defaults to False,            bool
        use_parent_comment use parent_comment for pred. defaults to class 'defaults', bool
        use_early_stopping use early_stopping for pred. defaults to class 'defaults', bool
        meta_features:     List of meta features,       defaults to class 'defaults', list of strings
        rnn_dimension:     RNN dimensions,              defaults to class 'defaults', int
        dense_dimension:   dense layer dimensions,      defaults to class 'defaults', int
        training_epochs:   number of training epochs,   defaults to class 'defaults', int
        batch_size:        batch size,                  defaults to class 'defaults', int
        training_set_size: size of training dataset,    defaults to class 'defaults', int
                                                         -1 for maximum size
        testing_set_ratio: fraction of training_set_size used as testing set size,
                                                        defaults to class 'defaults', float
        vocab_size:        size of vocabulary,          defaults to class 'defaults', int
        random_seed:       random seed for determinism, defaults to class 'defaults', int
        save:              save fitted model,           defaults to False,            bool
        overwrite:         replace model on disk,       defaults to False,            bool
        *name:             name of model,               required (no default),        str

    public functions:
        predict(dataset: DataFrame, predict_range: Tuple):
            - predicts label and compares to truth -
            dataset: dataset to predict of, optional. default: test_dataset
            predict_range: Tuple of (start, end), optional. default: (0, 50)
                            -1 for full dataset

            returns tuple of (array of [truth, prediction] rows, share of correct predictions)

        evaluate(dataset: DataFrame):
            - alias of tf.model.evaluate() with default params on test_dataset -
            dataset: dataset to predict of, optional. default: test_dataset

            returns evaluation

        create_model_and_fit()
            - model creation and training included -

            returns fitted model
    """

    # model parameters
    _use_parent_comment: bool
    _use_early_stopping: bool
    _meta_features: List[str]      # column names of the metadata fed into the model
    _metadata_dimension: int       # len(_meta_features)
    _rnn_dimension: int
    _dense_dimension: int
    _training_epochs: int
    _batch_size: int
    _training_set_size: int
    _testing_set_size: int         # int(_testing_set_ratio * _training_set_size)
    _testing_set_ratio: float
    _vocab_size: int
    _random_seed: int

    # model settings
    _save: bool
    _name: str
    _fitted: bool
    _overwrite: bool

    # datasets
    train_dataset: pd.DataFrame
    test_dataset: pd.DataFrame

    # encoders
    _comment_encoder: tf.keras.layers.TextVectorization
    _parent_comment_encoder: tf.keras.layers.TextVectorization

    # submodels, models and layers
    _comment_model: tf.keras.Sequential
    _parent_comment_model: tf.keras.Sequential
    _metadata_model: tf.keras.Sequential
    _combined: tf.keras.layers.Concatenate
    _dense: tf.keras.layers.Dense
    _classifier: tf.keras.layers.Dense
    model: tf.keras.Model

    # training history
    _history: tf.keras.callbacks.History
    _vocabulary_coverage: List[float]   # one entry for 'comment', a second one for 'parent_comment' if used
    _time: float                        # duration of create_model_and_fit up to the end of fitting, in minutes

    def __init__(self, load=False, use_parent_comment=None, use_early_stopping=None, meta_features=None, rnn_dimension=None, dense_dimension=None, training_epochs=None,
        batch_size=None, training_set_size=None, testing_set_ratio=None, vocab_size=None, random_seed=None, save=False, overwrite=False, name=None) -> None:
        """
        either load a saved model from 'models/<name>' or store the parameters for a new one

        args:
            see class docstring; every parameter left at None falls back to class 'defaults'.
            with load=True only 'name' is used, the other parameters are ignored.

        raises:
            AssertionError: name is missing, the requested model does not exist, the set sizes
                            are odd / not positive / larger than the balanced data available in
                            'train' and 'test', or save=True would hit an existing model
                            while overwrite=False
        """
        from os import makedirs  # local import; unlike mkdir it also creates a missing 'models' directory

        assert (name is not None), "model name is required! use: name=<name>"
        if load:
            # model gets loaded from file
            assert (exists(f'models/{name}')), 'requested model does not exist!'
            self.model   = tf.keras.models.load_model(f'models/{name}')
            self._name   = name
            self._fitted = True
            self.train_dataset, self.test_dataset = load_datasets((f'models/{name}/train_dataset.csv', f'models/{name}/test_dataset.csv'))
            # column order written by create_model_and_fit: label, comment, [parent_comment], meta features
            features = list(self.train_dataset.keys())[2:]
            if ('parent_comment' in features):
                self._use_parent_comment = True
                self._meta_features = features[1:]
            else:
                self._use_parent_comment = False
                self._meta_features = features
            self._metadata_dimension = len(self._meta_features)
        else:
            # model gets created and trained
            def _or_default(value, fallback):
                # explicit None check, so falsy values like 0 or [] are kept
                return fallback if value is None else value

            if (training_set_size == -1):
                # -1 requests the maximum size, rounded down to a multiple of 100
                training_set_size_maximum = min(min(train['label'].value_counts()), min(test['label'].value_counts())*5)*2
                training_set_size_maximum -= training_set_size_maximum % 100
                training_set_size = training_set_size_maximum

            self._use_parent_comment = _or_default(use_parent_comment, defaults.use_parent_comment)
            self._use_early_stopping = _or_default(use_early_stopping, defaults.use_early_stopping)
            self._meta_features      = _or_default(meta_features,      defaults.meta_features)
            self._rnn_dimension      = _or_default(rnn_dimension,      defaults.rnn_dimension)
            self._dense_dimension    = _or_default(dense_dimension,    defaults.dense_dimension)
            self._training_epochs    = _or_default(training_epochs,    defaults.training_epochs)
            self._batch_size         = _or_default(batch_size,         defaults.batch_size)
            self._training_set_size  = _or_default(training_set_size,  defaults.training_set_size)
            self._vocab_size         = _or_default(vocab_size,         defaults.vocab_size)
            self._random_seed        = _or_default(random_seed,        defaults.random_seed)
            self._testing_set_ratio  = _or_default(testing_set_ratio,  defaults.testing_set_ratio)
            self._save               = _or_default(save,               False)
            self._overwrite          = _or_default(overwrite,          False)
            self._name               = name

            self._testing_set_size   = int(self._testing_set_ratio * self._training_set_size)
            self._metadata_dimension = len(self._meta_features)
            self._fitted             = False

            # make sure parameters will work:
            # create_model_and_fit draws half of each set per label (0 and 1) without replacement,
            # so each half has to fit into the rows available for the rarer label
            train_rows_per_label = min(int((train['label'] == 0).sum()), int((train['label'] == 1).sum()))
            test_rows_per_label = min(int((test['label'] == 0).sum()), int((test['label'] == 1).sum()))
            sizes_are_even = ((self._training_set_size % 2) == 0) and ((self._testing_set_size % 2) == 0)
            sizes_fit_data = (0 < self._training_set_size // 2 <= train_rows_per_label) and (self._testing_set_size // 2 <= test_rows_per_label)
            assert (sizes_are_even and sizes_fit_data), 'invalid training_set_size!'
            assert not (self._overwrite == False and self._save == True and exists(f'models/{self._name}')), "save is set True but model with that name already exists!\nuse overwrite=True or delete model before creating instance."
            if (self._save): makedirs(f'models/{self._name}', exist_ok=True)

            # warning
            if ((self._metadata_dimension % 2) != 0): print('length of meta_features *should* be an even number!')
        print(f'sarcasm_detector \'{self._name}\' initialized!')

    def _evaluate_vocab(self, text_encoder: tf.keras.layers.TextVectorization, comments: pd.Series) -> float:
        """
        evaluate the encoder with given comment data

        args:
            text_encoder: encoder whose vocabulary is evaluated
            comments:     comments to encode

        returns:
            1 - (number of '[UNK]' tokens / number of tokens counted by np.count_nonzero),
            i.e. the share of counted tokens that the vocabulary knows
        """
        token_ids = text_encoder(comments).numpy()
        # translate every token id back into its vocabulary string
        tokens = np.array(text_encoder.get_vocabulary())[token_ids]
        unknown_tokens = np.sum(tokens == '[UNK]')
        # presumably leaves out the empty padding token — TODO confirm
        counted_tokens = np.count_nonzero(tokens)
        return 1 - (unknown_tokens / counted_tokens)

    def predict(self, dataset=None, predict_range=(0, 50)) -> Tuple:
        """
        predicts on dataset and evaluates the binary decision

        args:
            dataset:       DataFrame to predict on, anything else selects self.test_dataset
            predict_range: (start, end) row positions to predict, clamped to the rows of the
                           dataset; -1 for the full dataset

        returns:
            tuple of (array with one [truth, prediction] row per predicted comment,
                      share of predictions that match the truth as float)

        raises:
            AssertionError: model is not fitted or predict_range selects no rows
        """
        assert(self._fitted), "model needs to be fitted first!"
        if not isinstance(dataset, pd.DataFrame): dataset = self.test_dataset
        # uses only labels the model was fitted for, prevents tensor shape mismatch
        dataset = dataset[list(self.train_dataset.keys())]
        if (predict_range == -1): predict_range = (0, dataset.shape[0])
        # clamp the range to existing rows so short datasets do not cause an IndexError
        start, stop, _ = slice(predict_range[0], predict_range[1]).indices(dataset.shape[0])
        assert (stop > start), "predict_range selects no rows!"
        rows = dataset.iloc[start:stop]

        # same flat input list layout as used for fitting and in evaluate()
        inputs = [rows['comment']]
        if (self._use_parent_comment): inputs.append(rows['parent_comment'])
        if (self._metadata_dimension > 0): inputs.append(rows[self._meta_features])

        probabilities = np.asarray(self.model.predict(inputs)).flatten()
        # round the sigmoid output to the nearest label (ties go to the even value, like round())
        predicted = np.rint(probabilities).astype(int)
        truths = rows['label'].to_numpy()
        evaluation = float(np.mean(predicted == truths))
        print_arr = np.column_stack((truths, predicted))
        return (print_arr, evaluation)

    def evaluate(self, dataset=None):
        """
        evaluates model

        args:
            dataset: DataFrame to evaluate on, anything else selects self.test_dataset

        returns:
            the result of self.model.evaluate() on the dataset
        """
        assert(self._fitted), "model needs to be fitted first!"
        if type(dataset) != pd.core.frame.DataFrame:
            dataset = self.test_dataset
        # uses only labels the model was fitted for, prevents tensor shape mismatch
        fitted_columns = list(self.train_dataset.keys())
        dataset = dataset[fitted_columns]
        model_inputs = [dataset['comment']]
        if (self._use_parent_comment):
            model_inputs.append(dataset['parent_comment'])
        if (self._metadata_dimension > 0):
            model_inputs.append(dataset[self._meta_features])
        return self.model.evaluate(x=model_inputs, y=dataset['label'])

    def create_model_and_fit(self) -> tf.keras.Model:
        """
        build the model from the stored parameters, fit it and optionally save it

        samples from the module-level 'train' and 'test' DataFrames. with save=True the model,
        both sampled datasets, the Keras history log and a training log are written
        to 'models/<name>'.

        returns:
            the fitted tf.keras.Model (also stored in self.model)
        """
        starting_time = time()
        # seed python's and tensorflow's random generators so repeated runs start from the same state
        rd.seed(self._random_seed)
        tf.random.set_seed(self._random_seed)

        # prepare scrambled datasets and balance label 1/0
        # uses 'train' and 'test' datasets
        print(f'{self._name}: 1/7 prepare scrambled datasets and balance label 1/0 ...')
        features = ['label', 'comment', 'parent_comment'] if self._use_parent_comment else ['label', 'comment']
        columns = features + self._meta_features
        self.train_dataset = self._balanced_sample(train, self._training_set_size)[columns]
        self.test_dataset = self._balanced_sample(test, self._testing_set_size)[columns]

        # create vocabulary of the 'vocab_size' most used words
        print(f'{self._name}: 2/7 create vocabulary of the {self._vocab_size} most used words ...')
        self._comment_encoder = self._fit_text_encoder('comment')
        if (self._use_parent_comment):
            self._parent_comment_encoder = self._fit_text_encoder('parent_comment')

        # evaluate vocabulary coverage
        print(f'{self._name}: 3/7 evaluate vocabulary coverage ...')
        # stored as plain floats so the training log shows bare numbers
        self._vocabulary_coverage = [float(self._evaluate_vocab(self._comment_encoder, self.train_dataset['comment'].head(10000)))]
        if (self._use_parent_comment):
            self._vocabulary_coverage.append(float(self._evaluate_vocab(self._parent_comment_encoder, self.train_dataset['parent_comment'].head(10000))))

        # create models
        print(f'{self._name}: 4/7 create models ...')
        self._comment_model = self._build_text_branch(self._comment_encoder, 'comment')
        if (self._use_parent_comment):
            self._parent_comment_model = self._build_text_branch(self._parent_comment_encoder, 'parent_comment')
        if (self._metadata_dimension > 0):
            # the metadata branch is just an input layer, the values are passed through unchanged
            self._metadata_model = tf.keras.Sequential([tf.keras.layers.Input(shape=(self._metadata_dimension,), name=f'{self._name}_metadata_input')])

        # concatenate metadata with comment text and compile final model
        print(f'{self._name}: 5/7 concatenate metadata with comment text and compile final model ...')
        outputs = [self._comment_model.output]
        inputs = [self._comment_model.input]
        if (self._use_parent_comment):
            outputs.append(self._parent_comment_model.output)
            inputs.append(self._parent_comment_model.input)
        if (self._metadata_dimension > 0):
            outputs.append(self._metadata_model.output)
            inputs.append(self._metadata_model.input)
        if (len(outputs) > 1):
            self._combined = tf.keras.layers.Concatenate(axis=1, name=f'{self._name}_concat')(outputs)
            dense_input = self._combined
        else:
            # only the comment branch exists, nothing to concatenate
            dense_input = self._comment_model.output
        self._dense = tf.keras.layers.Dense(self._dense_dimension, activation = 'relu', name=f'{self._name}_dense') (dense_input)
        self._classifier = tf.keras.layers.Dense(1, activation = 'sigmoid', name=f'{self._name}_classifier') (self._dense)
        self.model = tf.keras.Model(inputs, self._classifier)
        self.model.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3), metrics=[tf.keras.metrics.BinaryAccuracy()])

        # fit model
        print(f'{self._name}: 6/7 fit model ...')
        input_x = [self.train_dataset['comment']]
        val_x = [self.test_dataset['comment']]
        if (self._use_parent_comment):
            input_x.append(self.train_dataset['parent_comment'])
            val_x.append(self.test_dataset['parent_comment'])
        if (self._metadata_dimension > 0):
            input_x.append(self.train_dataset[self._meta_features])
            val_x.append(self.test_dataset[self._meta_features])
        callbacks = []
        if (self._save): callbacks.append(tf.keras.callbacks.CSVLogger(f'models/{self._name}/history.log'))
        if (self._use_early_stopping): callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True))
        self._history = self.model.fit(
            x=input_x, y=self.train_dataset['label'],
            validation_data=(val_x, self.test_dataset['label']),
            epochs=self._training_epochs, batch_size=self._batch_size,
            callbacks=callbacks
        )

        self._fitted=True
        self._time = (time() - starting_time) / 60

        # save model output
        if (self._save):
            print(f'{self._name}: 7/7 save model output ...')

            # model data
            tf.keras.models.save_model(self.model, f'models/{self._name}')

            # datasets
            self.train_dataset.to_csv(path_or_buf=f'models/{self._name}/train_dataset.csv', sep='\t', index=False)
            self.test_dataset.to_csv(path_or_buf=f'models/{self._name}/test_dataset.csv', sep='\t', index=False)

            # training log
            self._write_train_log()
        else: print(f'{self._name}: 7/7 skip saving model output ...')
        print('done.')
        return self.model

    def _balanced_sample(self, frame: pd.DataFrame, size: int) -> pd.DataFrame:
        # draw size/2 rows for each label (0 and 1) and shuffle them; every step uses the random seed
        rows_per_label = int(size / 2)
        halves = [frame[frame.label == label].sample(rows_per_label, random_state=self._random_seed) for label in (0, 1)]
        return pd.concat(halves).sample(frac=1, random_state=self._random_seed)

    def _fit_text_encoder(self, column: str) -> tf.keras.layers.TextVectorization:
        # create a TextVectorization layer limited to vocab_size tokens and adapt it to one training column
        encoder = tf.keras.layers.TextVectorization(
            max_tokens=self._vocab_size,
            name=self._name + f'_{column}_encoder')
        encoder.adapt(self.train_dataset[column])
        return encoder

    def _build_text_branch(self, encoder: tf.keras.layers.TextVectorization, prefix: str) -> tf.keras.Sequential:
        # submodel for one text column: encoder -> embedding -> dropout -> bidirectional LSTM -> dropout
        return tf.keras.Sequential([
            encoder,
            # NOTE(review): mask_zero=True was commented out in the original and stays disabled here
            tf.keras.layers.Embedding(
                input_dim=len(encoder.get_vocabulary()),
                output_dim=self._rnn_dimension,
                name=f'{self._name}_{prefix}_embedding'),
            tf.keras.layers.Dropout(0.2),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(self._rnn_dimension, name=f'{self._name}_{prefix}_LSTM')),
            tf.keras.layers.Dropout(0.2)
        ])

    def _write_train_log(self) -> None:
        # write parameters, vocabulary coverage, elapsed time and per-epoch metrics to train_log.txt
        loss = self._history.history['loss']
        accuracy = self._history.history['binary_accuracy']
        val_loss = self._history.history['val_loss']
        val_accuracy = self._history.history['val_binary_accuracy']
        with open('models/' + self._name + '/train_log.txt', 'w') as log_file:
            log_file.write('model name: %s\n' % self._name)
            log_file.write('  parameters:\n')
            log_file.write('    use_parent_comment: %s\n' % str(self._use_parent_comment))
            # sequences are wrapped in a 1-tuple so '%' formats them as one value
            log_file.write('    input_features: %s\n' % (self._meta_features,))
            log_file.write('    metadata_dimension: %d\n' % self._metadata_dimension)
            log_file.write('    rnn_dimension: %d\n' % self._rnn_dimension)
            log_file.write('    dense_dimension: %d\n' % self._dense_dimension)
            log_file.write('    training_epochs: %d\n' % self._training_epochs)
            log_file.write('    batch_size: %d\n' % self._batch_size)
            log_file.write('    training_set_size: %d\n' % self._training_set_size)
            log_file.write('    testing_set_size: %d\n' % self._testing_set_size)
            log_file.write('    vocab_size: %d\n' % self._vocab_size)
            log_file.write('    random_seed: %d\n' % self._random_seed)
            log_file.write('  history:\n')
            log_file.write('    comment encoder vocab coverage %s:\n' % (self._vocabulary_coverage,))
            log_file.write('    elapsed time: %6.2fmin\n' % self._time)
            for epoch, metrics in enumerate(zip(loss, accuracy, val_loss, val_accuracy), start=1):
                log_file.write('    epoch: %2d loss: %10.5f accuracy: %10.5f validation-loss: %10.5f validation-accuracy %10.5f\n' % ((epoch,) + metrics))

In [13]:
#test class
# smoke test: fit a small model for two epochs and save it to 'models/tester'
tester = sarcasm_detector(name='tester',training_epochs=2, training_set_size=1000, save=True, overwrite=True)
tester.create_model_and_fit()
tester.predict()
tester.evaluate()
del tester
# load the saved model from disk again and run the same calls on it
tester = sarcasm_detector(name='tester', load=True)
tester.predict()
tester.evaluate()
del tester
# remove the files written by the test
rmtree('models/tester')
print('success!')

length of meta_features *should* be an even number!
sarcasm_detector 'tester' initialized!
tester: 1/7 prepare scrambled datasets and balance label 1/0 ...
tester: 2/7 create vocabulary of the 4000 most used words ...
tester: 3/7 evaluate vocabulary coverage ...
tester: 4/7 create models ...
tester: 5/7 concatenate metadata with comment text and compile final model ...
tester: 6/7 fit model ...
Epoch 1/2
Epoch 2/2
tester: 7/7 save model output ...




INFO:tensorflow:Assets written to: models/tester\assets


INFO:tensorflow:Assets written to: models/tester\assets


done.
sarcasm_detector 'tester' initialized!
success!


In [None]:
# create models and fit them
# doesn't keep instance but saves model to disk

# meta features for the runs that also feed the parent comment into the model
parent_comment_meta_features = ['subreddit', 'score', 'ups', 'downs', 'time', 'day_of_week', 'comment_length',
        'parent_comment_length', 'smileys', 'smileys_parent',
        'sarcasm_indicators', 'sarcasm_indicators_parent', 'caps_lock',
        'caps_lock_parent', 'letter_duplication', 'letter_duplication_parent']

# (model name, parameter overrides) in execution order; parameters not listed fall back to class 'defaults'
experiments = [
    ('default', {}),
    ('batch_size_8', {'batch_size': 8}),
    ('only_comment', {'meta_features': []}),
    ('rnn_dimensions_64', {'rnn_dimension': 64}),
    ('vocab_size_2000', {'vocab_size': 2000}),
    ('with_parent_comment', {'use_parent_comment': True, 'meta_features': list(parent_comment_meta_features)}),
    ('rnn_dimensions_256', {'rnn_dimension': 256}),
    # training_set_size=-1 uses the maximum training set size
    ('full_with_parent_comment', {'training_set_size': -1, 'use_parent_comment': True, 'meta_features': list(parent_comment_meta_features)}),
    ('full_default', {'training_set_size': -1}),
]

for experiment_name, overrides in experiments:
    sarcasm_detector(name=experiment_name, save=True, overwrite=True, **overrides).create_model_and_fit()

In [None]:
# save predictions of model for later visualizations
model_list = listdir('models')

for model_name in model_list:
    # NOTE(review): assumes every entry in 'models' is a completely saved model directory — TODO confirm
    model_class = sarcasm_detector(load=True, name=model_name)
    # predict_range=-1 predicts on the full test dataset that was stored with the model
    predictions, evaluation = model_class.predict(predict_range=-1)
    np.save(f'models/{model_name}/predictions.npy', predictions)
    np.save(f'models/{model_name}/evaluation.npy', evaluation)
    del model_class