## Import libraries

In [None]:
import pandas as pd
import preprocessor as p
import functools
import operator
import numpy as np
import re
import tensorflow_hub as hub

from emoji import *
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel, TFAutoModel, BertTokenizer, BertModel
from gensim.models import KeyedVectors
from sentence_transformers import SentenceTransformer
from sklearn.metrics import precision_recall_fscore_support
from scipy.stats import ttest_ind

In [2]:
# custom packages

import sys
sys.path.append('./code')

from data_preprocessing import *
from tweets_embedding import *
from frnn_owa_eval import *

## Analyze data

In [298]:
# Read dataset

irony_train = pd.read_csv('./data/train.En.csv', index_col=0)

In [299]:
# Loon on the dataset

irony_train.head(1)

Unnamed: 0,tweet,sarcastic,rephrase,sarcasm,irony,satire,understatement,overstatement,rhetorical_question
0,The only thing I got from college is a caffein...,1,"College is really difficult, expensive, tiring...",0.0,1.0,0.0,0.0,0.0,0.0


In [43]:
#Look on tweets

irony_train['tweet'][:10]

0    The only thing I got from college is a caffeine addiction                                                                                                                                                                                     
1    I love it when professors draw a big question mark next to my answer on an exam because I’m always like yeah I don’t either ¯\_(ツ)_/¯                                                                                                         
2    Remember the hundred emails from companies when Covid started getting real? I’ve gotten three in regards to support for protests. And only @SavageXFenty shared helpful links and actually said black lives matter... we love capitalism 🥰🙌🏼  
3    Today my pop-pop told me I was not “forced” to go to college 🙃 okay sure sureeee                                                                                                                                                              
4    @VolphanCarol @litt

In [4]:
# Datasets charachteristics

print('Number of train instances: ', len(irony_train)) 
print('Size of the sarcastic class: ', len(irony_train[irony_train.sarcastic == 1])) 
print('Size of the non-sarcastic class: ', len(irony_train[irony_train.sarcastic == 0])) 

Number of train instances:  3468
Size of the irony class:  867
Size of the non-irony class:  2601


In [300]:
# For the Subtask A we need only binary classes presented at "sarcastic" column

irony_train_select = irony_train[['tweet', 'sarcastic']]

### Data preprocessing

In [9]:
# Apply cleaning function on tweets

irony_train_select['cleaned_tweet'] = irony_train_select['tweet'].apply(lambda x: clean_tweet(x))

In [7]:
# Delete stopwords

stop_words = list(set(stopwords.words('english')))

irony_train_select['cleaned_tweet_wt_stopwords'] = irony_train_select['cleaned_tweet'].apply(lambda x:
                                         ' '.join([i for i in x.split(' ') if i not in stop_words]))

## Apply text embedding techniques 

**roBERTa**

In [None]:
# Upload the pre-downloaded irony-based Twitter-roBERTa model 

model_roberta_path = r"./model/twitter-roberta-base-irony"
tokenizer_roberta = AutoTokenizer.from_pretrained(model_roberta)
model_roberta = TFAutoModel.from_pretrained(model_roberta_path)

In [17]:
# Apply roBERTa-based

irony_train_select['Vector_roBERTa'] = irony_train_select['tweet'].apply(lambda x: 
                                                     get_vector_roberta(x, tokenizer_roberta, model))

irony_train_select['Vector_roBERTa_cleaned'] = irony_train_select['cleaned_tweet'].apply(lambda x: 
                                                     get_vector_roberta(x, tokenizer_roberta, model))
irony_train_select['Vector_roBERTa_wt_stopwords'] = 
                    irony_train_select['cleaned_tweet_wt_stopwords'].apply(lambda x: 
                                                   get_vector_roberta(x, tokenizer_roberta, model))

**Word2Vec**

In [None]:
# Upload the pre-downloaded Word2Vec model 

w2v_path = './model/GoogleNews-vectors-negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(w2v_path, binary=True)

In [None]:
# Apply Word2Vec 

irony_train_select["Vector_w2v"] = irony_train_select['tweet'].apply(lambda x: 
                                                                     get_vector_w2v(x, w2v_model))
irony_train_select["Vector_w2v_cleaned"] = irony_train_select['cleaned_tweet'].apply(lambda x: 
                                                                     get_vector_w2v(x, w2v_model))
irony_train_select["Vector_w2v_wt_stopwords"] = 
        irony_train_select['cleaned_tweet_wt_stopwords'].apply(lambda x: get_vector_w2v(x, w2v_model))

**USE**

In [None]:
# Upload Universal Sentence Encoder  (USE)

model_use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-large/5")

In [106]:
# Apply USE

irony_train_select['Vector_use'] = irony_train_select['tweet'].apply(lambda x: 
                                                                     get_vector_use(x, model_use))
irony_train_select['Vector_use_cleaned'] = irony_train_select['cleaned_tweet'].apply(lambda x: 
                                                                         get_vector_use(x, model_use))
irony_train_select['Vector_use_wt_stopwords'] = 
        irony_train_select['cleaned_tweet_wt_stopwords'].apply(lambda x: get_vector_use(x, model_use))

**SBERT**

In [None]:
# Upload Sentence-SBERT

model_sbert = SentenceTransformer('distilbert-base-nli-mean-tokens')

In [107]:
# Apply SBERT

irony_train_select['Vector_sbert'] = irony_train_select['tweet'].apply(lambda x: 
                                                                   get_vector_sbert(x, model_sbert))
irony_train_select['Vector_sbert_cleaned'] = irony_train_select['cleaned_tweet'].apply(lambda x: 
                                                                   get_vector_sbert(x, model_sbert))
irony_train_select['Vector_sbert_wt_stopwords'] = 
    irony_train_select['cleaned_tweet_wt_stopwords'].apply(lambda x: get_vector_sbert(x, model_sbert))

**BERT**

In [None]:
# Upload BERT

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-uncased')
model_bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

In [108]:
# Apply BERT 

irony_train_select['Vector_bert'] = irony_train_select['tweet'].apply(lambda x: 
                                                      get_vector_bert(x, tokenizer_bert, model_bert))
irony_train_select['Vector_bert_cleaned'] = irony_train_select['cleaned_tweet'].apply(lambda x: 
                                                      get_vector_bert(x, tokenizer_bert, model_bert))
irony_train_select['Vector_bert_wt_stopwords'] = 
                    irony_train_select['cleaned_tweet_wt_stopwords'].apply(lambda x: 
                                                    get_vector_bert(x, tokenizer_bert, model_bert))

**DeepMoji**

In [None]:
# Apply DeepMoji

irony_train_select['Vector_deepmoji'] = get_vectors_deepmoji(irony_train_select, 'tweet')
irony_train_select['Vector_deepmoji_cleaned'] = get_vectors_deepmoji(irony_train_select, 'cleaned_tweet')
irony_train_select['Vector_deepmoji_wt_stopwords'] = get_vectors_deepmoji(irony_train_select, 'cleaned_tweet_wt_stopwords')

## Apply FRNN-OWA 

### On the single embedding methods

In [7]:
# We will use 5-fold cross-validation

K_fold = 5

We cleaned cells' outputs to reduce size of the Jupyter Notebook

**roBERTa**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_roBERTa_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_roBERTa_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [136]:
# Perform t-test to compare results
# If 0.05 > p-value - arrays are different

print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_sw))
print(ttest_ind(f1_list, f1_list_wt_sw))

Ttest_indResult(statistic=-0.04680521585228239, pvalue=0.9631838653423619)
Ttest_indResult(statistic=20.120009270873474, pvalue=8.686849539224575e-14)
Ttest_indResult(statistic=20.401014807526536, pvalue=6.83542870303569e-14)


Original or cleaned are the same. W/t stopwords are worse. 

The best: original tweets with k = 5, F1 = 0.3722

**BERT**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_bert'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_bert_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_bert_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [144]:
print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_stopwords))
print(ttest_ind(f1_list, f1_list_wt_stopwords))

Ttest_indResult(statistic=1.3844085091913392, pvalue=0.1801161254898383)
Ttest_indResult(statistic=-4.556261390751205, pvalue=0.00015504283655645406)
Ttest_indResult(statistic=-2.8396722723654793, pvalue=0.009534682719594437)


Embeddings for cleaned tweets without stopwords are better.

The best: k = 5, F1 = 0.2351

**SBERT**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_sbert'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_sbert_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_sbert_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [148]:
#if 0.05 > p-value - arrays are different

print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_stopwords))
print(ttest_ind(f1_list, f1_list_wt_stopwords))

Ttest_indResult(statistic=0.34927406544647377, pvalue=0.7302045757668506)
Ttest_indResult(statistic=1.176300680212196, pvalue=0.25204665429833584)
Ttest_indResult(statistic=1.3738676445845028, pvalue=0.18332052919051617)


They all are the same.

The best is for original tweets, with k = 7, F1 = 0.1618

**USE**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_use'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_use_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_use_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [152]:
#if 0.05 > p-value - arrays are different

print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_stopwords))
print(ttest_ind(f1_list, f1_list_wt_stopwords))

Ttest_indResult(statistic=0.8801216223546391, pvalue=0.38830675246570723)
Ttest_indResult(statistic=1.706769695671753, pvalue=0.10194258358825793)
Ttest_indResult(statistic=2.488074573613451, pvalue=0.02090406227474988)


The best is embedding for the original tweets, k = 5, F1 = 0.2808

**DeepMoji**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_deepmoji'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_deepmoji_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_deepmoji_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [177]:
#if 0.05 > p-value - arrays are different

print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_stopwords))
print(ttest_ind(f1_list, f1_list_wt_stopwords))

Ttest_indResult(statistic=1.5677094486136374, pvalue=0.13122100768593062)
Ttest_indResult(statistic=2.327942293253145, pvalue=0.02950835720454593)
Ttest_indResult(statistic=3.896138327258023, pvalue=0.0007767139224709184)


Embeddings for original and cleaned tweets are similar and better than for cleaned without stopwords

The best is cleaned, k = 5, F1 = 0.3157

**Word2Vec**

In [None]:
# Original tweets

f1_list = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, ['Vector_w2v'], 
                                                    'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets

f1_list_cleaned = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                        ['Vector_w2v_cleaned'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_cleaned.append(f1_irony)
    print(k, f1_irony)

In [None]:
# Cleaned tweets without stopwords

f1_list_wt_sw = []
for k in [5, 7, 9, 11, 13, 15, 17, 19, 21]:
    irony_train_roberta = cross_validation_ensemble_owa(irony_train_select, 
                    ['Vector_w2v_wt_stopwords'], 'sarcastic' K_fold, [k], additive(), additive())
    p_irony, r_irony, f1_irony, support = 
        precision_recall_fscore_support(irony_train_roberta["sarcastic"].to_list(), 
                            irony_train_roberta["Labels"].to_list(), average = "binary", pos_label=1)
    f1_list_wt_sw.append(f1_irony)
    print(k, f1_irony)

In [160]:
#if 0.05 > p-value - arrays are different

print(ttest_ind(f1_list, f1_list_cleaned))
print(ttest_ind(f1_list_cleaned, f1_list_wt_stopwords))
print(ttest_ind(f1_list, f1_list_wt_stopwords))

Ttest_indResult(statistic=-1.2436440220376481, pvalue=0.2267200781180628)
Ttest_indResult(statistic=-1.4258594434672731, pvalue=0.16794491080267582)
Ttest_indResult(statistic=-2.447818599784717, pvalue=0.02281533955976826)


The best embedding is for cleaned tweets without stopwords, for k = 5, F1 = 0.2050

### Ensemble of models

The best setups:
    
1. roBERTa: raw tweets, k = 5, F1 = 0.3722
2. DeepMoji: cleaned tweets, k = 5, F1 = 0.3157 
3. USE: raw tweets, k = 5, F1 = 0.2808
4. BERT: cleaned tweets without stop-words, k = 5, F1 = 0.2351
5. Word2Vec: cleaned tweets without stop-words, k = 5, F1 = 0.2050
6. SBERT: raw tweets, k = 7, F1 = 0.1618 \\

**All embeddings**

In [323]:
irony_ensemble_labels = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa', 
                       'Vector_bert_wt_stopwords', 'Vector_sbert', 'Vector_use', 
                       'Vector_deepmoji_cleaned', 'Vector_w2v_wt_stopwords'], 'sarcastic', K_fold, 
                          [5, 5, 7, 5, 5, 5], additive(), additive())
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_ensemble_labels["sarcastic"].to_list(), 
    [np.round(i) for i in irony_ensemble_labels["Labels"].to_list()], average = "binary", pos_label=1)
print("F1-score: ", f1_irony)

F1-score:  0.09956709956709958


**TOP-5**

In [324]:
irony_ensemble_labels = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa', 
                       'Vector_bert_wt_stopwords', 'Vector_w2v_wt_stopwords', 'Vector_use', 
                       'Vector_deepmoji_cleaned'], 'sarcastic', K_fold, [5, 5, 7, 5, 5], 
                          additive(), additive())
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_ensemble_labels["sarcastic"].to_list(), 
    [np.round(i) for i in irony_ensemble_labels["Labels"].to_list()], average = "binary", pos_label=1)
print("F1-score: ", f1_irony)

F1-score:  0.18664047151277016


**TOP-4**

In [326]:
irony_ensemble_labels = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa', 
                       'Vector_bert_wt_stopwords', 'Vector_use', 'Vector_deepmoji_cleaned'], 
                      'sarcastic', K_fold, [5, 5, 5, 5], additive(), additive())
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_ensemble_labels["sarcastic"].to_list(), 
    [np.round(i) for i in irony_ensemble_labels["Labels"].to_list()], average = "binary", pos_label=1)
print("F1-score: ", f1_irony)

F1-score:  0.13179916317991633


**TOP-3**

In [328]:
irony_ensemble_labels = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa', 
                       'Vector_use', 'Vector_deepmoji_cleaned'], 'sarcastic', K_fold, 
                          [5, 5, 5], additive(), additive())
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_ensemble_labels["sarcastic"].to_list(), 
    [np.round(i) for i in irony_ensemble_labels["Labels"].to_list()], average = "binary", pos_label=1)
print("F1-score: ", f1_irony)

F1-score:  0.29416884247171454


**TOP-2**

In [329]:
irony_ensemble_labels = cross_validation_ensemble_owa(irony_train_select, ['Vector_roBERTa', 
                   'Vector_deepmoji_cleaned'], 'sarcastic', K_fold, [5, 5], additive(), additive())
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_ensemble_labels["sarcastic"].to_list(), 
    [np.round(i) for i in irony_ensemble_labels["Labels"].to_list()], average = "binary", pos_label=1)
print("F1-score: ", 0.18669314796425027)

F1-score:  0.18669314796425027


Ensembles are weaker than the single roBERTa model.

**The best architecture:**

For cross-validation F1-score = 0.3722

- Embeddings: roBERTa
- Preprocessing: no
- k: 5

## Evaluate on the test data

In [307]:
# Read datasets

irony_test = pd.read_csv('./data/taskA.En.input.csv')

In [313]:
# Check dataset 

irony_test.head(5)

Unnamed: 0,text
0,"Size on the the Toulouse team, That pack is mo..."
1,Pinball!
2,So the Scottish Government want people to get ...
3,villainous pro tip : change the device name on...
4,I would date any of these men 🥺


In [None]:
# Apply roBERTa model

irony_test['Vector_roBERTa'] = irony_test['text'].apply(lambda x: 
                                                    get_vector_roberta(x, tokenizer_roberta, model))

In [27]:
# Calculate predicted labels

irony_test_labels = test_ensemble(irony_train_select, irony_train_select['sarcastic'], 
                                  irony_test, ['Vector_roBERTa'], [5], additive(), additive())

In [35]:
# Save predictions in .txt format for the submition

irony_test_labels_int = [int(i) for i in irony_test_labels]
with open('labels.txt', 'w') as f:
    for item in irony_test_labels_int:
        f.write("%s\n" % item)
        
f.close()

## After the competition ended

### Test data with labels

In [38]:
# Read predicted labels from file is needed 

with open('labels.txt','r') as f:
    lines = f.readlines()
f.close()

lines = lines[1:]
pred_labels = [int(i.replace('\n', '')) for i in lines]
pred_labels

# Otherwise, rename variable:
# pred_labels = irony_test_labels_int

In [31]:
# Read labeled test data

irony_test_labeled = pd.read_csv('./data/task_A_En_test.csv')

In [32]:
# Take a look on the labeled data

irony_test_labeled.head(1)

Unnamed: 0,text,sarcastic
0,"Size on the the Toulouse team, That pack is mo...",0


In [41]:
# Save real labels as a list

real_labels = irony_test_labeled['sarcastic'].to_list()

In [43]:
# Calculate marco F1-score

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(real_labels, pred_labels, average = "macro")
print("F1-score rounded labels: ", f1_irony)

F1-score rounded labels:  0.6552349845763019


In [44]:
# Calculate sarcastic F1-score - our score from leader-board

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(real_labels, pred_labels, average = "binary", pos_label=1)
print("F1-score rounded labels: ", f1_irony)

F1-score rounded labels:  0.4242424242424242


## Error analysis

### Check neigbours of test tweets

In [53]:
# Detect wrong predictions

error_ind = [i for i in range(len(real_labels)) if real_labels[i] != pred_labels[i]]

**Example of the CORRECT prediction**

In [121]:
# Test instance to examine 

i = 2
print(irony_test_labeled['text'].iloc[i])
print(irony_test_labeled['sarcastic'].iloc[i])

So the Scottish Government want people to get their booster shots so badly that the website doesn't even work
1


In [122]:
# Find neighbouring train instances 

test_vector = irony_test['Vector_roBERTa'].iloc[i]
get_neigbours(test_vector, irony_train_select, 'Vector_roBERTa', 5, 'tweet', 'sarcastic')

(['I\'d like to thank middle aged men watching the Olympics for solving the mental health crisis. Getting someone to shout "get some perspective" at every athlete struggling with anxiety will definitely solve their problem.',
  'I love health insurance so much fun paying $100’s a month and then $100’s for basic medical services that somehow aren’t covered. #America',
  'YouTube just gave me a gamer targeted covid vaccine ad and anyway get vaccinated gamers.',
  'V excited to grant an extension to students who email me and my co-teacher and get both of our names wrong',
  'We really need to remind people that universal healthcare means never needing to sit through another annual explanation of benefits meeting'],
 [1, 1, 0, 1, 1])

**Example of the WRONG prediction**

In [118]:
i = error_ind[0]
print(irony_test_labeled['text'].iloc[i])
print(irony_test_labeled['sarcastic'].iloc[i])

Sometimes I lay in bed and think about how today will be the day I make my life better. Exercise, drinking water, eating healthy. Then I wake up. 
1


In [119]:
test_vector = irony_test['Vector_roBERTa'].iloc[i]

get_neigbours(test_vector, irony_train_select, 'Vector_roBERTa', 5, 'tweet', 'sarcastic')

(['me: I’m gonna wash my hair and shave my legs! \r\nMe instead: I’m gonna dissociate in the shower for 45 minutes',
  "I think I've figured out why Ted Cruz always looks like he's melting.  It's because of the flames constantly emanating from his pants. #TXSenateDebate #LyinTed #LoseCruz #PantsOnFire",
  'July 1st. Half way point of the year. Well I think we can all agree that 2020 has gone swimmingly so far. \r\nCan’t wait for part 2. \r\n#murderhornets',
  "@nypost Since going keto, I haven't had sunburn, and I haven't used sunscreen either.  Also, since losing weight, when all my obese friends are sweating their nuts off, I seem to have a high heat tolerance.  One of many surprising benefits of a low carb lifestyle.",
  'Well, my vaccinated therapist tested positive for Covid.  Hope she’s okay and hope my mental health hangs on for 10 extra days.'],
 [0, 0, 1, 0, 0])

### Try the weighted kNN classification model instead of OWA-FRNN

In [None]:
# Apply the wkNN instead of OWA-FRNN in our best setup (roBERTa with k=5) with cross-validation 

irony_train_wknn = cross_validation_ensemble_wknn(irony_train_select, 'Vector_roBERTa', 
                                                  'sarcastic', 5, 5)

In [257]:
# Calculate macro F1-score 

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_train_wknn["Prediction"].to_list(), irony_train_select["sarcastic"].to_list(), 
    average = "macro")
print(f1_irony)

0.6015477117692167

In [258]:
# Calculate sarcastic F1-score

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_train_wknn["Prediction"].to_list(), irony_train_select["sarcastic"].to_list(), 
    average = "binary", pos_label=1)
print(f1_irony)

0.35698282300224055

In [178]:
# Apply the wkNN to test data instead of OWA-FRNN to our best setup - roBERTa with k=5

pred_labels_wknn = irony_test['Vector_roBERTa'].apply(lambda x: 
                                          weighted_knn(irony_train_select, 'Vector_roBERTa', x, 5))

In [179]:
# Calculate macro F1-score (we got 0.6552 for OWA-FRNN)

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(real_labels, 
                                                                  pred_labels_wknn, average = "macro")
print("F1-score rounded labels: ", f1_irony) 

F1-score rounded labels:  0.6635198814833494


In [180]:
# Calculate sarcastic F1-score (we got 0.4242 for OWA-FRNN)

p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(real_labels, 
                                                  pred_labels_wknn, average = "binary", pos_label=1)
print("F1-score rounded labels: ", f1_irony)

F1-score rounded labels:  0.42990654205607476


In [317]:
# Calculate the sarcastic F1-score with cross-validation for k=17

irony_train_wknn = cross_validation_ensemble_wknn(irony_train_select, 'Vector_roBERTa', 
                                                  'sarcastic', 5, 17)
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(
    irony_train_wknn["Prediction"].to_list(), irony_train_select["sarcastic"].to_list(), 
    average = "binary", pos_label=1)
print(f1_irony)

0.2790294627383016


In [318]:
# Calculate the sarcastic F1-score for test data with k=17

pred_labels_wknn = irony_test['Vector_roBERTa'].apply(lambda x: 
                                          weighted_knn(irony_train_select, 'Vector_roBERTa', x, 17))
p_irony, r_irony, f1_irony, support = precision_recall_fscore_support(real_labels, 
                                                  pred_labels_wknn, average = "binary", pos_label=1)
print(f1_irony)

0.49693251533742333
