In [None]:
from data_processing import *
from utils import *

In [None]:
# File names of the word resources; later cells prefix them with path + "/res/".
dict_filename = "word_list.txt"
word_filename = "word_list_freq.txt"

# Raw example tweets that the cleaning demos below operate on.
tweets = [
    "I think i need a #theoryofadeadman intervention. Literally I've been listening to them for 2 weeks straight. #sendhelp #notreally @TOADM #sarcasm",
    "You can see the love in her eyes #not :) ♥♥♥ #sarcasm",
    "@bintSaquib @bintsiddique oh your up north that's why, well here down south it was hottttt I hardly got any sleep coz of it lmaoooo",
    "It seems as though things are escalating in San Diego, so please everyone stay safe #notallmuslimsarebad #PrayForPeace",
]

### Cleaning process for a set of tweets
<img src="pictures/pipeline_clean_tokens.png">

In [None]:
# Run the initial cleaning step on the raw demo tweets, splitting hashtags
# with `split_hashtags2`.
# NOTE(review): the second argument looks like the file the cleaned tweets are
# written to — confirm against `initial_clean`.
clean_tweets = initial_clean(
    tweets,
    path + "/res/demo/demo_clean_tweets.txt",
    word_filename,
    word_file_is_dict=True,
    split_hashtag_method=split_hashtags2,
)

In [None]:
# Print every raw tweet directly above its cleaned version, with a blank line
# between consecutive pairs.
for raw_tweet, cleaned_tweet in zip(tweets, clean_tweets):
    print("{}\n{}\n".format(raw_tweet, cleaned_tweet))

### Hashtag splitter - based on word frequencies and a convergence score

In [None]:
# Load the word list from <path>/res/<word_filename> and run the hashtag
# splitter on a handful of example hashtags with verbose output enabled.
# NOTE(review): `utils` (as a module name) and `path` are not defined in any
# cell of this notebook; presumably the star imports in the first cell expose
# them — confirm, otherwise this cell fails on a fresh kernel.
word_list = utils.load_dictionary(path + "/res/" + word_filename)
# The first two pairs are the same hashtag in CamelCase and in lowercase,
# presumably to compare how casing affects the split.
split_hashtags2("#HillaryClinton", word_list, verbose=True)
split_hashtags2("#hillaryclinton", word_list, verbose=True)
split_hashtags2("#BajiraoMastani", word_list, verbose=True)
split_hashtags2("#bajiraomastani", word_list, verbose=True)
# Longer all-lowercase hashtags made of several words.
split_hashtags2("#neverknowwhatyouhave", word_list, verbose=True)
split_hashtags2("#drinkwinecauseitseasier", word_list, verbose=True)
split_hashtags2("#thingspresenterssay", word_list, verbose=True)

In [None]:
# Load tokens and pos tags for the previously cleaned tweets.
# Each file is read as a whole and split on newlines, giving one entry per line.
# NOTE(review): this rebinds `tweets`, which until now held the raw example
# tweets; every later cell that uses `tweets` sees the token lines instead.
tweets = load_file(path + "/res/demo/tokens_demo.txt").split("\n")
pos_tags = load_file(path + "/res/demo/pos_demo.txt").split("\n")

### Grammatical cleaning process for tweets
<img src="pictures/pipeline_gramm_tokens.png">

In [None]:
# Grammatical cleaning with emoji translation, slang replacement and
# lowercasing all disabled.
gramm_tweets = grammatical_clean(
    tweets,
    pos_tags,
    path + "/res/" + dict_filename,
    path + '/res/demo/demo_gramm_tweets.txt',
    translate_emojis=False,
    replace_slang=False,
    lowercase=False,
)

In [None]:
# Print each cleaned tweet above its grammatically cleaned counterpart,
# with a blank line between consecutive pairs.
for cleaned_tweet, gramm_tweet in zip(clean_tweets, gramm_tweets):
    print("{}\n{}\n".format(cleaned_tweet, gramm_tweet))

In [None]:
# Grammatical cleaning again, this time with emoji translation, slang
# replacement and lowercasing all enabled.
finest_gramm_tweets = grammatical_clean(
    tweets,
    pos_tags,
    path + "/res/" + dict_filename,
    path + '/res/demo/demo_finest_gramm_tweets.txt',
    translate_emojis=True,
    replace_slang=True,
    lowercase=True,
)

In [None]:
# Print each cleaned tweet above the result of the fully enabled grammatical
# cleaning, with a blank line between consecutive pairs.
for cleaned_tweet, finest_tweet in zip(clean_tweets, finest_gramm_tweets):
    print("{}\n{}\n".format(cleaned_tweet, finest_tweet))

### Strict cleaning process for tweets
<img src="pictures/pipeline_strict_tokens.png">

In [None]:
# Apply the strict cleaning step to the current contents of `tweets`.
# NOTE(review): the second argument looks like an output file path — confirm
# against `strict_clean`.
strict_tweets = strict_clean(tweets, path + '/res/demo/demo_strict_tweets.txt')

In [None]:
# Print each input line above its strictly cleaned version, with a blank line
# between consecutive pairs. When the cells are run in order, `tweets` holds
# the token lines loaded from tokens_demo.txt at this point, not the raw tweets.
for input_tweet, strict_tweet in zip(tweets, strict_tweets):
    print("{}\n{}\n".format(input_tweet, strict_tweet))

### Datasets

<table style="width:100%">
  <tr>
    <th rowspan="2">Corpus</th>
    <th colspan="2">Train Set</th> 
    <th colspan="2">Test Set</th>
  </tr>
  <tr>
    <th>Sarcastic</th>
    <th>Non-sarcastic</th>
    <th>Sarcastic</th>
    <th>Non-sarcastic</th>
  </tr>
  <tr>
    <td>Ghosh</td>
    <td>24453</td> 
    <td>26736</td>
    <td>1419</td>
    <td>2323</td>
  </tr>
  <tr>
    <td>Riloff</td>
    <td>215</td> 
    <td>1153</td>
    <td>93</td>
    <td>495</td>
  </tr>
  <tr>
    <td>SarcasmDetector</td>
    <td>26739</td> 
    <td>167235</td>
    <td>2971</td>
    <td>18582</td>
  </tr>
  <tr>
    <td>Ptacek</td>
    <td>9200</td> 
    <td>5140</td>
    <td>2300</td>
    <td>1285</td>
  </tr>
</table>


## Feature Extraction

#### Pragmatic Features

- tweet length in characters
- tweet length in tokens
- average token size
- count of capitalized words
- count of user-specific markers (punctuation, hashtags, user mentions, emojis and laughter)
- count of intensifiers (strong affirmatives, negations, interjections)
- total of 6 pragmatic features

#### Word uni-grams
- Word Unigrams (5718 features) on a heavily filtered vocabulary
- memory requirements too high to afford higher n-grams than unigrams

#### POS n-grams
- based on CMU Twitter Part-of-Speech Tagger 
- POS Unigrams (25 features)
- POS Bigrams (483 features)
- POS Trigrams (3515 features) 

#### Sentiment
- underlying sentiment of emojis
- underlying sentiment of words
- subjectivity of words (weak/strong)
- total number of words with underlying sentiment
- Sentiment intensity analyser for the whole tweet (Vader score)
- SentiWordNet average score for positive, negative and neutral sentiment words
- total of 16 sentiment features

#### Topics
- based on the train set, obtain a number of topics by training an LDA model
- a feature = the probability of a topic in the current tweet (n topics → n features)
- for a new tweet: the previously trained LDA model is loaded, the tweet is converted to doc2bow, the distributions of the topics are predicted and used as features 

<img src="pictures/topic_distribution_just_nouns.png">

<img src="pictures/topics_top10words.png">

#### Similarity-measure between words
- two features: the biggest and the smallest cosine similarity scores
- corresponding to the most similar and most dissimilar pairs of words

<img src="pictures/cosine_sim.png">

## Model Training Pipeline

<img src="pictures/training_pipeline.png">

In [None]:
# Dimensionality of the embeddings; it selects the vector file below
# ('glove.6B.100d.txt' for 100) and is reused by the later embedding cells.
embedding_dim = 100
# NOTE(review): `utils` is referenced as a module but is never imported by name
# in this notebook — presumably exposed by a star import; confirm.
word2vec_map = utils.load_vectors(filename='glove.6B.%dd.txt' % embedding_dim)

In [None]:
# Fetch the "demo" dataset: tokens, POS tags and labels, first for the train
# split and then for the test split.
(train_tokens, train_pos, train_labels,
 test_tokens, test_pos, test_labels) = get_dataset(dataset="demo")

In [None]:
from ml_models_demo import *

In [None]:
# Build the feature sets for the train and test splits from their tokens and
# POS tags, using the loaded word vectors; verbose output is enabled.
train_features, test_features = process_features(
    train_tokens,
    train_pos,
    test_tokens,
    test_pos,
    word2vec_map,
    verbose=True,
)

In [None]:
# Each row is one feature configuration to evaluate. Within a row, a flag set
# to True activates the feature family at that position, from left to right:
# [Pragmatic, Lexical-grams, POS-grams, Sentiment, LDA topics, Similarity]
feature_options = [
    [False, False, True, True, True, True],
    [False, False, False, True, False, True],
    [False, True, False, False, False, False],
    [False, False, True, True, False, False],
]

In [None]:
# Run the models on the extracted features, passing the list of feature
# configurations defined in `feature_options`.
# NOTE(review): presumably one run per configuration row — confirm in `ml_model`.
ml_model(train_features, test_features, train_labels, test_labels, feature_options)

## Results

<br>
<font size="3" color="red">
- have a picture here with the results, highlighting the best/worst results<br>
- have here a picture/ plot/ graph on how you compare to other researchers/studies<br>
- have something like mine vs. theirs<br>
</font>
<img src="pictures/ml_analysis.png">

## Word embeddings - GloVe

In [None]:
# Options forwarded to utils.get_tweets_embeddings in the cells below
# (as init_unk, variance and weighted_average respectively).
init_unk = True
var = None
weighted = False

# Lower-case every tweet before looking up word embeddings.
x_train = [tweet.lower() for tweet in train_tokens]
x_test = [tweet.lower() for tweet in test_tokens]

In [None]:
# Compute word-embedding features for the lower-cased train and test tweets,
# using the same settings for both splits.
x_train_word_emb = utils.get_tweets_embeddings(
    x_train,
    word2vec_map,
    embedding_dim,
    init_unk=init_unk,
    variance=var,
    weighted_average=weighted,
)
x_test_word_emb = utils.get_tweets_embeddings(
    x_test,
    word2vec_map,
    embedding_dim,
    init_unk=init_unk,
    variance=var,
    weighted_average=weighted,
)

In [None]:
# Linear SVM on the word-embedding features, with class_ratio='balanced'.
# NOTE(review): `classifiers` is not imported by name in this notebook —
# presumably exposed by a star import; confirm.
classifiers.linear_svm(x_train_word_emb, train_labels, x_test_word_emb, test_labels, class_ratio='balanced')

## Emoji embeddings - emoji2vec
<img src="pictures/emoji2vec.png">

In [None]:
# Load the emoji vectors whose file name matches `embedding_dim`
# (e.g. 'emoji_embeddings_100d.txt' when embedding_dim is 100).
emoji2vec_map = utils.load_vectors(filename='emoji_embeddings_%dd.txt' % embedding_dim)

In [None]:
# Two analogy queries against the emoji vectors.
# NOTE(review): presumably "a is to b as c is to ?" — confirm the argument
# order in `utils.complete_analogy`.
# Only the value of the last call is displayed as the cell output; anything
# else shown comes from printing inside the helper.
utils.complete_analogy('👪', '👦', '👧', emoji2vec_map)
utils.complete_analogy('👑', '🚹', '🚺', emoji2vec_map)

## DeepMoji - predicting emojis
<img src="pictures/deepmojis.png">

In [None]:
# Load predicted emojis for each tweet of the train and test splits.
# NOTE(review): "train.txt" / "test.txt" are bare file names; presumably
# `get_demo_emojis` resolves them against a demo resource directory — confirm.
x_train_emojis = utils.get_demo_emojis("train.txt", x_train)
x_test_emojis = utils.get_demo_emojis("test.txt", x_test)

In [None]:
# Compute emoji-embedding features for the train and test splits from the
# predicted emojis, with the same settings used for the word embeddings.
x_train_emoji_emb = utils.get_tweets_embeddings(
    x_train_emojis,
    emoji2vec_map,
    embedding_dim,
    init_unk=init_unk,
    variance=var,
    weighted_average=weighted,
)
x_test_emoji_emb = utils.get_tweets_embeddings(
    x_test_emojis,
    emoji2vec_map,
    embedding_dim,
    init_unk=init_unk,
    variance=var,
    weighted_average=weighted,
)

In [None]:
# Linear SVM on the emoji-embedding features only, with class_ratio='balanced'.
classifiers.linear_svm(x_train_emoji_emb, train_labels, x_test_emoji_emb, test_labels, class_ratio='balanced')

## Word + Emoji embeddings

In [None]:
# Build one feature vector per tweet by appending its emoji embedding to its
# word embedding (concatenation along axis 0), then report the resulting shape.
x_train_features_concat = [
    np.concatenate((word_vec, emoji_vec), axis=0)
    for word_vec, emoji_vec in zip(x_train_word_emb, x_train_emoji_emb)
]
print("\nShape of concatenated train features: ", np.array(x_train_features_concat).shape)

# Same construction for the test split.
x_test_features_concat = [
    np.concatenate((word_vec, emoji_vec), axis=0)
    for word_vec, emoji_vec in zip(x_test_word_emb, x_test_emoji_emb)
]
print("\nShape of concatenated test features: ", np.array(x_test_features_concat).shape)

In [None]:
# Linear SVM on the concatenated word + emoji features, with class_ratio='balanced'.
classifiers.linear_svm(x_train_features_concat, train_labels, x_test_features_concat, test_labels, class_ratio='balanced')

<img src="pictures/embedding_analysis.png">

## Comparison with BoW

<img src="pictures/bow_analysis.png">

## Simple Neural Network instead of SVM

<img src="pictures/embeddings_dnn_boxplot.png">

<img src="pictures/bow_nn_boxplot.png">

## How are these decisions based on words actually taken?

- are the key features in labeling taken by identifying sarcastic words?

- for each word, calculate the number of times it belongs to a sarcastic tweet (ns)
- for each word, calculate the number of times it belongs to a regular tweet (nr)
- for each word, calculate its relative frequency of occurrence (nf ∊ [-1, 1])

$$nf = \frac{ns - nr}{ns + nr}$$

- to classify a tweet, add the individual relative frequency for each word
- if the sum is greater than 0, predict it as sarcastic
- if the sum is less than 0, predict it as regular

In [None]:
from bag_of_words import rule_based_comparison

In [None]:
# Rule-based comparison on the lower-cased tweet text of the train/test splits.
rule_based_comparison(x_train, train_labels, x_test, test_labels)

In [None]:
# Same rule-based comparison, this time on the predicted emojis instead of the words.
rule_based_comparison(x_train_emojis, train_labels, x_test_emojis, test_labels)

## Deep Neural Networks

### Embedding Layers
- can learn embeddings based on my corpus
- or can use pre-trained embeddings

<img src="pictures/emb_layer.png">

### Long Short Term Memory (LSTM)

<img src="pictures/lstm.png">
<img src="pictures/lstm_sequence.png">
<img src="pictures/lstm sequence classifier.png">
<img src="pictures/rnn_cell_backpropagation.png">

In [None]:
# Load LSTM model
# HTML is used further down to render the saved visualization files inline.
from IPython.display import HTML
from data_prep_for_visualization import *
# NOTE(review): this rebinds `x_test`, which earlier held the lower-cased test
# tweets; re-running an earlier cell that reads `x_test` after this one will
# see the value returned here instead.
model, index_to_word, x_test = train_lstm_for_visualization()

In [None]:
# Visualization of the LSTM hidden units
# `tweet_number` selects which test example is visualized; 2118 is one of the
# examples from the list below.
# Other nice test examples: 935, 996, 1022, 1063, 2118, 3473
one_tweet_visualization(model, x_test, index_to_word, tweet_number=2118, verbose=True)

In [None]:
# Display the saved HTML visualization inline as the cell output.
# NOTE(review): presumably this file is produced by the visualization call in
# the previous cell — confirm.
HTML(filename= path + '/plots/html_visualizations/lstm_layer_vis_0.html')

##### Does the LSTM-based model take into account the word order? And how are the embeddings affecting the LSTM-based model?

<img src="pictures/lstm_results_ghosh.png">

<img src="pictures/lstm_results_all.png">

## LSTM with an Attention Mechanism
<img src="pictures/attention_mechanism.png">
<img src="pictures/attention_model.png">
<img src="pictures/attention_results_ghosh.png">
<img src="pictures/attention_results_all.png">

<img src="pictures/attention_results.png">

In [None]:
# Visualization of the attention mechanism on clean data
# The HTML file is read from disk and displayed inline as the cell output;
# nothing in this notebook regenerates it.
HTML(filename= path + '/plots/html_visualizations/attention_vis.html')

In [None]:
# Visualization of the attention mechanism on grammatical data
# HTML(filename= path + '/plots/html_visualizations/attention_vis_grammatical.html')

## Comparing my results with previous studies (same datasets, different approaches)

#### Ghosh et al.
- paper: "Fracking Sarcasm using Neural Network" (2016)
- best f-score using recursive SVMs, using BOW + POS: 0.663
- best f-score using recursive SVMs, using BOW + POS + Sentiment: 0.691
- best f-score using recursive SVMs, using BOW + POS + Sentiment + HT-splitter: 0.732
- best f-score for CNN + LSTM + DNN: 0.921
- best f-score for LSTM + LSTM: 0.879

#### Riloff et al.
- paper: "Sarcasm as Contrast between a Positive Sentiment and Negative Situation" (2013)
- best f-score was 0.51, using contrast(+VPs, –Situations), ordered & contrast(+Preds, –Situations)

#### SarcasmDetector (author is Mathieu Cliche, 2014)
- www.thesarcasmdetector.com
- obtained f-scores in range 0.50 - 0.55 (SVM-based model, used sentiment, topics and n-grams)

In [None]:
import pandas as pd

# Scores per dataset (one column each); the three rows are the SVM, LSTM and
# Attention models, in that order.
d = {
    'Ghosh': [0.66, 0.75, 0.83],
    'Riloff': [0.83, 0.84, 0.98],
    'SarDet': [0.67, 0.87, 0.90],
}
df = pd.DataFrame(data=d, index=["SVM", "LSTM", "Attention"])

# Draw one line per dataset on a single axes, with the values also rendered as
# a table (table=True). The x axis is hidden, presumably because the table
# already carries the row labels.
# NOTE(review): `plt` is not imported by name in this notebook — presumably
# exposed by a star import; confirm.
fig, ax = plt.subplots(1, 1)
ax.get_xaxis().set_visible(False)
df.plot(table=True, ax=ax, figsize=(12, 8), linewidth=5, marker='o')