In [None]:
# 4.15.0
!pip install transformers
!pip install ekphrasis
!pip install emoji
!pip install sentencepiece
!pip install emoji

In [None]:
import transformers
from transformers import BertTokenizerFast, TFBertModel
from PIL import Image

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from collections import Counter
from tensorflow.keras.callbacks import ModelCheckpoint

import pandas as pd
import numpy as np
import os

transformers.__version__

In [3]:
df_train = pd.read_csv('/content/train.En.csv')

In [4]:
df_train['tweet'] = df_train['tweet']. fillna('')

In [None]:
df_train.head()

In [6]:
Counter(df_train['sarcastic'])

Counter({0: 2601, 1: 867})

In [None]:
df_train.info()

<h6>TPU CONF</h6>

In [None]:
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='grpc://' + os.environ['COLAB_TPU_ADDR'])

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
print("All devices: ", tf.config.list_logical_devices('TPU'))

In [None]:
text = df_train['tweet']
Y_sarcastic = df_train['sarcastic']
Counter(Y_sarcastic)

In [10]:
text = np.array(text)
Y_sarcastic = np.array(Y_sarcastic)

<h6>Text Train Split</h6>

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
text_train, text_val, Y_train_sarcastic, Y_val_sarcastic = train_test_split(text, Y_sarcastic, test_size=0.1, random_state=3)

In [13]:
text_train = pd.Series(text_train)
text_val = pd.Series(text_val)

In [None]:
print("Train")
print(np.shape(text_train), type(text_train))
print(np.shape(Y_train_sarcastic), type(Y_train_sarcastic))
print(Counter(Y_train_sarcastic))
print("Val")
print(np.shape(text_val))
print(np.shape(Y_val_sarcastic))
print(Counter(Y_val_sarcastic))

<h6>Text pre-processing</h6>

In [15]:
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
text_processor = TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
)

In [17]:
def print_text(texts,i,j):
    """Print ``texts[i]`` through ``texts[j - 1]``, each followed by an empty line."""
    for position in range(i, j):
        # end="\n\n" terminates the item's own line and adds the blank separator line.
        print(texts[position], end="\n\n")

In [None]:
print_text(text_train,0,5)
print("##############################################################################################################")
print_text(text_val,0,5)

In [None]:
import re
#removing website names
# A token counts as a website reference when it contains an http(s) URL, a
# "www." address, or a ".com" / ".net" / ".co" domain suffix. The dots are
# escaped and the suffix must end at a word boundary, so ordinary words such
# as "welcome" or "internet" are left alone.
_WEBSITE_RE = re.compile(r"https?://\S+|www\.\S+|\.(?:com|net|co)\b", re.IGNORECASE)


def remove_website(text):
    """Blank out every space-delimited token that looks like a URL or website name.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        ``text`` with each matching token replaced by an empty string. The
        tokens are re-joined with single spaces, so a removed token leaves
        its surrounding spaces behind.
    """
    return " ".join("" if _WEBSITE_RE.search(word) else word for word in text.split(" "))

# Training set 
text_train = text_train.apply(lambda text: remove_website(text))
print_text(text_train,0,5)

print("**************************************************************************")

# Validation set 
text_val = text_val.apply(lambda text: remove_website(text))
print_text(text_val,0,5)

In [20]:
# Chat-slang lookup tables used by chat_words_conversion().
# Each usable line of slang.txt has the form "<abbreviation>=<expansion>".
with open("slang.txt", "r") as f:
    # The context manager closes the file handle once the contents are read.
    chat_words_str = f.read()

chat_words_map_dict = {}
for line in chat_words_str.split("\n"):
    parts = line.split("=")
    if len(parts) < 2:
        # Blank line or a line without a separator: nothing to map.
        continue
    cw, cw_expanded = parts[0], parts[1]
    chat_words_map_dict[cw] = cw_expanded

# Set of known abbreviations for membership tests.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Replace known chat abbreviations in ``text`` with their expansions.

    Tokens are obtained by whitespace splitting; a token is expanded when its
    upper-cased form is in ``chat_words_list``, otherwise it is kept as-is.
    The result is re-joined with single spaces.
    """
    def expand(token):
        key = token.upper()
        return chat_words_map_dict[key] if key in chat_words_list else token

    return " ".join(expand(token) for token in text.split())

In [None]:
# Chat word conversion
# Training set
text_train = text_train.apply(lambda text: chat_words_conversion(text))
print_text(text_train,0,5)

print("********************************************************************************")

# Validation set
text_val = text_val.apply(lambda text: chat_words_conversion(text))
print_text(text_val,0,5)

In [None]:
# Function for converting emojis to their text names
import emoji

def convert_emojis(text):
    """Replace emojis with their names and turn every '_' and '-' into a space.

    ``emoji.demojize`` is called with a single space as both delimiters; the
    substitution afterwards is applied to the whole string.
    """
    return re.sub("_|-", " ", emoji.demojize(text, delimiters=(" ", " ")))

# Training set
text_train = text_train.apply(lambda text: convert_emojis(text))
print_text(text_train,0,5)

print("**************************************************************************")

# Validation set
text_val = text_val.apply(lambda text: convert_emojis(text))
print_text(text_val,0,5)

In [23]:
def ekphrasis_pipe(sentence):
    """Run ``sentence`` through ``text_processor`` and return the tokens joined by spaces."""
    tokens = text_processor.pre_process_doc(sentence)
    return " ".join(tokens)

In [None]:
# Training set
text_train = text_train.apply(lambda text: ekphrasis_pipe(text))
print("Training set completed.......")
#Validation set
text_val = text_val.apply(lambda text: ekphrasis_pipe(text))
print("Validation set completed.......")

In [None]:
# Space-delimited token count of every pre-processed training tweet.
u = lambda text: len(text.split(" "))
sentence_lengths = [u(tweet) for tweet in text_train]

# Show the 200 largest counts, then how many tweets were measured.
longest_last = sorted(sentence_lengths)
print(longest_last[-200:])
print(len(sentence_lengths))

<h6>Text processing complete</h6>

In [26]:
from transformers import RobertaTokenizerFast, TFRobertaModel, MPNetTokenizerFast, TFMPNetModel, ElectraTokenizerFast, TFElectraModel, XLNetTokenizerFast, TFXLNetModel, AlbertTokenizerFast, TFAlbertModel, DebertaTokenizer, TFDebertaModel

In [None]:
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')
tokenizer = MPNetTokenizerFast.from_pretrained("microsoft/mpnet-base")
# tokenizer = ElectraTokenizerFast.from_pretrained('google/electra-base-discriminator')

In [None]:
train_encodings = tokenizer(list(text_train), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(train_encodings["input_ids"]))

val_encodings = tokenizer(list(text_val), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(val_encodings["input_ids"]))

In [31]:
def task1(input_shape):
    """Build the Task 1 binary classifier: MPNet encoder plus a small dense head.

    Parameters
    ----------
    input_shape : tuple
        Shape (without the batch axis) of the token-id input; the attention
        mask input uses the same shape.

    Returns
    -------
    keras.Model
        Model taking ``[input_ids, attention_mask]`` and producing a single
        sigmoid probability per example.
    """
    inputs = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')

    # Text encoder. The first sub-layer of the pretrained model is called
    # directly on the symbolic inputs; element [0] of its output is used as
    # the per-token representation (presumably the last hidden state --
    # verify against the transformers version in use).
    encoder = TFMPNetModel.from_pretrained('microsoft/mpnet-base')
    layer = encoder.layers[0]
    embeddings = layer([inputs, input_masks])[0]
    # Representation at sequence position 0. (For BERT/ALBERT the pooler
    # output [1] would be used instead; for XLNet, embeddings[:, -1].)
    features = embeddings[:, 0, :]

    X = keras.layers.Dense(64,activation='elu')(features)

    X = keras.layers.BatchNormalization()(X)

    # Linear logit. The previous 'elu' activation here bounded the logit
    # below at -1, so the sigmoid could never output less than ~0.27.
    X = keras.layers.Dense(1,kernel_regularizer=keras.regularizers.l2(0.01))(X)

    # Map the logit to a probability.
    X = keras.layers.Activation('sigmoid')(X)

    # Wrap the graph from the two inputs to the probability in a Model.
    model = keras.Model(inputs=[inputs,input_masks], outputs=[X])
    return model

In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

In [33]:
from sklearn.metrics import classification_report

In [34]:
class EvaluationMetric(keras.callbacks.Callback):
    """Callback that prints a classification report on held-out data after every epoch.

    Parameters
    ----------
    val_encodings : token-id inputs passed to ``model.predict``.
    val_masks : attention-mask inputs passed to ``model.predict``.
    Y_val : ground-truth binary labels for the same examples.
    """

    def __init__(self, val_encodings, val_masks, Y_val):
        super(EvaluationMetric, self).__init__()
        self.val_encodings = val_encodings
        self.val_masks = val_masks
        self.Y_val = Y_val

    def on_epoch_begin(self, epoch, logs=None):
        # logs defaults to None rather than a shared mutable dict.
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        print("\nEvaluating...")
        val_prediction = self.model.predict([self.val_encodings, self.val_masks])

        # One score per example (first output column), thresholded at 0.5:
        # strictly greater than 0.5 -> class 1, otherwise class 0.
        scores = np.asarray(val_prediction).reshape(len(val_prediction), -1)[:len(self.Y_val), 0]
        pred = (scores > 0.5).astype(int)

        print(classification_report(self.Y_val, pred, digits=3))
        
evaluation_metric = EvaluationMetric(val_encodings["input_ids"], val_encodings["attention_mask"], Y_val_sarcastic)

In [None]:
with strategy.scope():
  model = task1((70,))
  optimizer = keras.optimizers.Adam(learning_rate=4e-5)
  loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
  metric = [tf.metrics.BinaryAccuracy(), tf.metrics.Precision(), tf.metrics.Recall()]
  model.compile(optimizer=optimizer, loss=loss_fun, metrics=metric)

In [None]:
model.summary()

In [None]:
tf.keras.utils.plot_model(
    model)

In [38]:
# Save the weights at the end of every epoch; the epoch number is formatted
# into the file name. `save_freq='epoch'` replaces the former `epoch=1`
# keyword, which is not a documented ModelCheckpoint argument.
checkpoint = ModelCheckpoint(filepath='/content/sarcasm-1.{epoch:03d}.h5',
                             verbose=0,
                             save_weights_only=True,
                             save_freq='epoch')

In [None]:
from sklearn.utils.class_weight import compute_class_weight
# 'balanced' weights for the two classes of the Task 1 training labels.
# scikit-learn documents `classes` as an ndarray, so an array is passed
# instead of a plain list.
sarcastic = compute_class_weight('balanced', classes=np.array([0, 1]), y=Y_train_sarcastic)
class_weights_sarcastic = {0: sarcastic[0], 1: sarcastic[1]}
print("sarcastic")
print(class_weights_sarcastic)

In [40]:
history = model.fit(
    x = [train_encodings["input_ids"], train_encodings["attention_mask"]],
    y = Y_train_sarcastic,
    validation_data = ([val_encodings["input_ids"],val_encodings["attention_mask"]],Y_val_sarcastic),
    callbacks = [evaluation_metric, checkpoint],
    batch_size = 128,
    shuffle=True,
    epochs=10,
    class_weight=class_weights_sarcastic
)

In [41]:
model.load_weights("/content/sarcasm-1.008.h5")

<h5>Task 1 test</h5>

In [None]:
df_test_a = pd.read_csv('/content/taskA.En.input.csv')
df_test_a.head()

In [None]:
df_test_a.info()

In [None]:
text_test_a = df_test_a['text']
print(type(text_test_a))
print(len(text_test_a))

In [None]:
text_test_a = text_test_a.apply(lambda text: remove_website(text))
text_test_a = text_test_a.apply(lambda text: chat_words_conversion(text))
text_test_a = text_test_a.apply(lambda text: convert_emojis(text))
text_test_a = text_test_a.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
test_encodings_a = tokenizer(list(text_test_a), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(test_encodings_a["input_ids"]))

In [47]:
pred_test_a = model.predict([test_encodings_a["input_ids"], test_encodings_a["attention_mask"]])

In [None]:
test_answer_a = np.array(np.round(pred_test_a[:,0])).astype(int)
Counter(test_answer_a)

In [49]:
# Submission file for Task A: a header line, then one label per line with no
# trailing newline after the last label. Joining the labels also covers the
# empty case, where only the header is written.
with open('answer.txt', 'w') as outf:
  outf.write('task_a_en' + '\n')
  outf.write('\n'.join(str(label) for label in test_answer_a))

In [50]:
import zipfile
# The context manager closes the archive explicitly, which is what writes the
# zip's central directory, instead of relying on garbage collection.
# NOTE(review): the archive name says "electra" while the cells above load
# MPNet -- name kept unchanged.
with zipfile.ZipFile('sarcasm_electra_a.zip', mode='w') as archive:
  archive.write("answer.txt")

<h5>Task 2</h5>

In [51]:
df_train_b = df_train[df_train['sarcastic'] == 1]

In [None]:
df_train_b.head(3)

In [None]:
df_train_b.info()

In [54]:
text_b = np.array(df_train_b['tweet'])
sarcasm = np.array(df_train_b['sarcasm'])
irony = np.array(df_train_b['irony'])
satire = np.array(df_train_b['satire'])
under = np.array(df_train_b['understatement'])
over = np.array(df_train_b['overstatement'])
rhetorical = np.array(df_train_b['rhetorical_question'])

In [None]:
print("Sarcasm")
print(Counter(sarcasm))
print("Irony")
print(Counter(irony))
print("Satire")
print(Counter(satire))
print("Understatement")
print(Counter(under))
print("Over Statement")
print(Counter(over))
print("Rhetorical")
print(Counter(rhetorical))

In [56]:
(text_train_b, text_val_b,
 Y_sarcasm_train, Y_sarcasm_val,
 Y_irony_train, Y_irony_val,
 Y_satire_train, Y_satire_val,
 Y_under_train, Y_under_val,
 Y_over_train, Y_over_val,
 Y_rhetorical_train, Y_rhetorical_val) = train_test_split(text_b, sarcasm, irony, satire, under, over, rhetorical, test_size=0.05, random_state=3)

In [None]:
print("Train")
print(Counter(Y_sarcasm_train), type(Y_sarcasm_train))
print(Counter(Y_irony_train), type(Y_irony_train))
print(Counter(Y_satire_train), type(Y_satire_train))
print(Counter(Y_under_train), type(Y_under_train))
print(Counter(Y_over_train), type(Y_over_train))
print(Counter(Y_rhetorical_train), type(Y_rhetorical_train))

In [None]:
print("Val")
print(Counter(Y_sarcasm_val), type(Y_sarcasm_val))
print(Counter(Y_irony_val), type(Y_irony_val))
print(Counter(Y_satire_val), type(Y_satire_val))
print(Counter(Y_under_val), type(Y_under_val))
print(Counter(Y_over_val), type(Y_over_val))
print(Counter(Y_rhetorical_val), type(Y_rhetorical_val))

In [None]:
text_train_b = pd.Series(text_train_b)
text_val_b = pd.Series(text_val_b)

text_train_b = text_train_b.apply(lambda text: remove_website(text))
text_train_b = text_train_b.apply(lambda text: chat_words_conversion(text))
text_train_b = text_train_b.apply(lambda text: convert_emojis(text))
text_train_b = text_train_b.apply(lambda text: ekphrasis_pipe(text))
print("Training set completed.......")

text_val_b = text_val_b.apply(lambda text: remove_website(text))
text_val_b = text_val_b.apply(lambda text: chat_words_conversion(text))
text_val_b = text_val_b.apply(lambda text: convert_emojis(text))
text_val_b = text_val_b.apply(lambda text: ekphrasis_pipe(text))
print("Val set completed.......")

In [None]:
train_encodings_b = tokenizer(list(text_train_b), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(train_encodings_b["input_ids"]))

val_encodings_b = tokenizer(list(text_val_b), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(val_encodings_b["input_ids"]))

In [62]:
def task_2(input_shape):
    """Build the six-output sarcasm-category model on a shared MPNet encoder.

    Takes ``[input_ids, attention_mask]`` (both of shape ``input_shape``) and
    returns a keras.Model whose outputs are, in order: sarcasm, irony, satire,
    understatement, overstatement, rhetorical question -- each a single
    sigmoid unit.
    """
    inputs = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')

    # Text encoder; the representation at sequence position 0 feeds every head.
    # (BERT/ALBERT would use the pooler output [1]; XLNet embeddings[:, -1].)
    encoder = TFMPNetModel.from_pretrained('microsoft/mpnet-base')
    embeddings = encoder.layers[0]([inputs, input_masks])[0]
    features = embeddings[:, 0, :]

    def category_head(shared_features):
        # Dense(64, relu) -> BatchNorm -> Dense(1, sigmoid, L2-regularised kernel).
        hidden = keras.layers.Dense(64,activation='relu')(shared_features)
        hidden = keras.layers.BatchNormalization()(hidden)
        return keras.layers.Dense(1,activation='sigmoid',kernel_regularizer=keras.regularizers.l2(0.01))(hidden)

    # Heads are created one after another in the output order listed above,
    # so layers are instantiated in the same sequence as six hand-written heads.
    heads = [category_head(features) for _ in range(6)]

    model = keras.Model(inputs=[inputs,input_masks], outputs=heads)
    return model

In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

In [64]:
class EvaluationMetricB(keras.callbacks.Callback):
    """Callback that prints per-category validation metrics after every epoch.

    Parameters
    ----------
    val_encodings : token-id inputs passed to ``model.predict``.
    val_masks : attention-mask inputs passed to ``model.predict``.
    Y_val : 2-D array of ground-truth labels; column ``k`` is compared with
        the model's ``k``-th output.
    """

    # Display titles; position k corresponds to Y_val[:, k] and model output k.
    CATEGORY_TITLES = ("Sarcasm", "Irony", "Satire", "Under statement", "Over statement", "Rhetorical")

    def __init__(self, val_encodings, val_masks, Y_val):
        super(EvaluationMetricB, self).__init__()
        self.val_encodings = val_encodings
        self.val_masks = val_masks
        self.Y_val = Y_val

    def on_epoch_begin(self, epoch, logs=None):
        # logs defaults to None rather than a shared mutable dict.
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        print("\nEvaluating...")
        val_prediction = self.model.predict([self.val_encodings, self.val_masks])

        # Round every output's scores to 0/1 at once.
        pred = np.round(val_prediction)

        from sklearn.metrics import classification_report

        for index, title in enumerate(self.CATEGORY_TITLES):
            print(title)
            report = classification_report(self.Y_val[:,index], pred[index][:,0], digits=3, output_dict=True)
            # The report is keyed by the string form of each label. The
            # positive class is missing when neither the labels nor the
            # predictions of this split contain it, so look it up defensively
            # instead of raising KeyError in the middle of training.
            positive = report.get('1.0', report.get('1'))
            if positive is None:
                print("positive class absent from labels and predictions")
            else:
                print(positive)
            print(report['macro avg'])
            print("##################################################################")

Y_val_b = np.column_stack((Y_sarcasm_val, Y_irony_val, Y_satire_val, Y_under_val, Y_over_val, Y_rhetorical_val))        
evaluation_metric_b = EvaluationMetricB(val_encodings_b["input_ids"], val_encodings_b["attention_mask"], Y_val_b)

In [65]:
from tensorflow.keras.losses import Loss
from tensorflow.keras import backend as K
class weightedBinaryCrossEntropy(Loss):
  """Binary cross-entropy in which each element is scaled by a class-dependent weight.

  `weights` must be indexable by 0 and 1: weights[0] scales elements whose
  label is 0 and weights[1] scales elements whose label is 1.
  """

  def __init__(self, weights):
    super().__init__()
    self.zero = weights[0]
    self.one = weights[1]
  
  def call(self, y_true, y_pred):
    """Return the mean of the weighted element-wise binary cross-entropy.

    y_pred is treated as probabilities (from_logits=False). Returns 0.0 when
    the loss tensor has length 0.
    """
    # Keep probabilities strictly inside (0, 1).
    y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
    loss = tf.keras.backend.binary_crossentropy(y_true, y_pred, from_logits=False)
    # Per-element weight: self.one where y_true is 1, self.zero where y_true is 0.
    weight_vector = (y_true*self.one) + ((1-y_true)*self.zero)
    loss = loss*weight_vector
    # NOTE(review): presumably guards against taking the mean of an empty
    # batch -- confirm this branch behaves as intended in graph mode.
    if(len(loss) != 0):
      return tf.keras.backend.mean(loss)
    else:
      return 0.0

In [66]:
def get_weights(Y_train, name):
  """Compute, print and return 'balanced' class weights for binary labels.

  Parameters
  ----------
  Y_train : array-like of labels taking the values 0 and 1.
  name : str
      Title printed above the weights.

  Returns
  -------
  dict
      ``{0: weight_for_class_0, 1: weight_for_class_1}``.
  """
  # scikit-learn documents `classes` as an ndarray, so an array is passed
  # instead of a plain list.
  weights = compute_class_weight('balanced', classes=np.array([0, 1]), y=Y_train)
  class_weights = {0: weights[0], 1: weights[1]}
  print(name)
  print(class_weights)
  return class_weights

In [None]:
class_weights_sarcasm = get_weights(Y_sarcasm_train, "sarcasm")
class_weights_irony = get_weights(Y_irony_train, "irony")
class_weights_satire = get_weights(Y_satire_train, "satire")
class_weights_under = get_weights(Y_under_train, "understatement")
class_weights_over = get_weights(Y_over_train, "overstatement")
class_weights_rhetorical = get_weights(Y_rhetorical_train, "rhetorical")

In [68]:
weighted_loss = [weightedBinaryCrossEntropy(class_weights_sarcasm),
                 weightedBinaryCrossEntropy(class_weights_irony),
                 weightedBinaryCrossEntropy(class_weights_satire),
                 weightedBinaryCrossEntropy(class_weights_under),
                 weightedBinaryCrossEntropy(class_weights_over),
                 weightedBinaryCrossEntropy(class_weights_rhetorical)
                 ]

In [None]:
with strategy.scope():
  model_b = task_2((70,))
  optimizer = keras.optimizers.Adam(learning_rate=4e-5)
  loss_fun = tf.keras.losses.BinaryCrossentropy(from_logits=False)
  metric = [tf.metrics.BinaryAccuracy(), tf.metrics.Precision(), tf.metrics.Recall()]
  model_b.compile(optimizer=optimizer, loss=weighted_loss, metrics=metric)

In [None]:
model_b.summary()

In [None]:
tf.keras.utils.plot_model(
    model_b)

In [72]:
# Save the weights at the end of every epoch; the epoch number is formatted
# into the file name. `save_freq='epoch'` replaces the former `epoch=1`
# keyword, which is not a documented ModelCheckpoint argument.
checkpoint_b = ModelCheckpoint(filepath='/content/sarcasm-2.{epoch:03d}.h5',
                               verbose=0,
                               save_weights_only=True,
                               save_freq='epoch')

In [None]:
# electra
history = model_b.fit(
    x = [train_encodings_b["input_ids"], train_encodings_b["attention_mask"]],
    y = [Y_sarcasm_train, Y_irony_train, Y_satire_train, Y_under_train, Y_over_train, Y_rhetorical_train],
    callbacks = [evaluation_metric_b, checkpoint_b],
    batch_size = 128,
    shuffle=True,
    epochs=10
)

In [None]:
model_b.load_weights('/content/sarcasm-2.010.h5')

<h6>Test B</h6>

In [None]:
df_test_b = pd.read_csv('/content/taskB.En.input.csv')
df_test_b.head()

In [None]:
df_test_b.info()

In [None]:
text_test_b = df_test_b['text']
print(type(text_test_b))
print(len(text_test_b))

In [None]:
text_test_b = text_test_b.apply(lambda text: remove_website(text))
text_test_b = text_test_b.apply(lambda text: chat_words_conversion(text))
text_test_b = text_test_b.apply(lambda text: convert_emojis(text))
text_test_b = text_test_b.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
test_encodings_b = tokenizer(list(text_test_b), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(test_encodings_b["input_ids"]))

In [78]:
pred_test_b = model_b.predict([test_encodings_b["input_ids"], test_encodings_b["attention_mask"]])

In [79]:
def get_answer_list(answer):
  """Round the six per-category score arrays to integer labels and print their counts.

  Parameters
  ----------
  answer : sequence of six arrays, each indexed as ``[:, 0]``; element ``k``
      holds the scores of category ``k`` (sarcasm, irony, satire,
      understatement, overstatement, rhetorical question).

  Returns
  -------
  numpy.ndarray
      Integer array with the same shape as ``np.shape(answer)``.
  """
  print(np.shape(answer))
  # `np.int` was removed in NumPy 1.24; the builtin `int` is the same dtype.
  final = (np.round(answer)).astype(int)
  titles = ("Sarcasm", "Irony", "Satire", "Understatement", "Overstatement", "Rhetorical")
  for index, title in enumerate(titles):
    print(title)
    print(Counter(final[index][:,0]))
  return final

In [None]:
final_answer = get_answer_list(pred_test_b)

In [None]:
np.shape(final_answer)

In [82]:
# Submission file for Task B: a header line, then one comma-separated row per
# test example with the six category labels in header order.
with open('answer.txt', 'w') as outf:
  outf.write('sarcasm,irony,satire,understatement,overstatement,rhetorical_question\n')
  for row in range(len(test_encodings_b['input_ids'])):
    outf.write(','.join(str(final_answer[head][row,0]) for head in range(6)) + '\n')

In [83]:
import zipfile
# The context manager closes the archive explicitly, which is what writes the
# zip's central directory, instead of relying on garbage collection.
with zipfile.ZipFile('sarcasm_electra_b.zip', mode='w') as archive:
  archive.write("answer.txt")

<h6>Task C</h6>

In [84]:
df_train_c = df_train[df_train['sarcastic'] == 1]

In [None]:
df_train_c.info()

In [None]:
df_train_c.head(2)

In [87]:
text_c = np.array(df_train_c['tweet'])
rephrase_c = np.array(df_train_c['rephrase'])

In [None]:
Y_sarcastic_c = np.ones((len(text_c,)), dtype='int32')
Y_rephrase_c = np.zeros((len(text_c,)), dtype='int32')
print(np.shape(Y_sarcastic_c))
print(np.shape(Y_rephrase_c))
print(Counter(Y_sarcastic_c))
print(Counter(Y_rephrase_c))

In [89]:
# For the first N_SWAPPED_PAIRS pairs, exchange the tweet and its rephrase so
# that the sarcastic text sits in the second slot, and flip both labels to
# match. Pairs already marked as swapped (Y_sarcastic_c == 0) are skipped, so
# re-running this cell cannot swap the texts back while the labels stay flipped.
N_SWAPPED_PAIRS = 400
for i in range(min(N_SWAPPED_PAIRS, len(text_c))):
  if Y_sarcastic_c[i] == 0:
    continue
  text_c[i], rephrase_c[i] = rephrase_c[i], text_c[i]
  Y_sarcastic_c[i] = 0
  Y_rephrase_c[i] = 1

In [90]:
text_c = pd.Series(text_c)
rephrase_c = pd.Series(rephrase_c)

In [92]:
text_train_c, text_val_c, rephrase_train_c, rephrase_val_c, Y_c_train, Y_c_val, Y_rephrase_train, Y_rephrase_val = train_test_split(text_c, rephrase_c, Y_sarcastic_c, Y_rephrase_c, test_size=0.1, random_state=3)

In [None]:
print(Counter(Y_c_train))
print(Counter(Y_rephrase_train))

print(Counter(Y_c_val))
print(Counter(Y_rephrase_val))

In [None]:
print("Train")
print(len(text_train_c), type(text_train_c))
print(len(rephrase_train_c), type(rephrase_train_c))
print(len(Y_c_train), type(Y_c_train))
print(len(Y_rephrase_train), type(Y_rephrase_train))

print("Val")
print(len(text_val_c), type(text_val_c))
print(len(rephrase_val_c), type(rephrase_val_c))
print(len(Y_c_val), type(Y_c_val))
print(len(Y_rephrase_val), type(Y_rephrase_val))

In [None]:
print("Sarcastic")
text_train_c = text_train_c.apply(lambda text: remove_website(text))
text_train_c = text_train_c.apply(lambda text: chat_words_conversion(text))
text_train_c = text_train_c.apply(lambda text: convert_emojis(text))
text_train_c = text_train_c.apply(lambda text: ekphrasis_pipe(text))
print("Rephrase")
rephrase_train_c = rephrase_train_c.apply(lambda text: remove_website(text))
rephrase_train_c = rephrase_train_c.apply(lambda text: chat_words_conversion(text))
rephrase_train_c = rephrase_train_c.apply(lambda text: convert_emojis(text))
rephrase_train_c = rephrase_train_c.apply(lambda text: ekphrasis_pipe(text))
print("Train set completed.......")

In [None]:
print("Sarcastic")
text_val_c = text_val_c.apply(lambda text: remove_website(text))
text_val_c = text_val_c.apply(lambda text: chat_words_conversion(text))
text_val_c = text_val_c.apply(lambda text: convert_emojis(text))
text_val_c = text_val_c.apply(lambda text: ekphrasis_pipe(text))
print("Rephrase")
rephrase_val_c = rephrase_val_c.apply(lambda text: remove_website(text))
rephrase_val_c = rephrase_val_c.apply(lambda text: chat_words_conversion(text))
rephrase_val_c = rephrase_val_c.apply(lambda text: convert_emojis(text))
rephrase_val_c = rephrase_val_c.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
train_encodings_c = tokenizer(list(text_train_c), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(train_encodings_c["input_ids"]))

val_encodings_c = tokenizer(list(text_val_c), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(val_encodings_c["input_ids"]))

In [None]:
train_encodings_rc = tokenizer(list(rephrase_train_c), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(train_encodings_rc["input_ids"]))

val_encodings_rc = tokenizer(list(rephrase_val_c), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(val_encodings_rc["input_ids"]))

In [99]:
def euclidean_distance(vects):
    """Return the Euclidean distance between the two tensors in ``vects``.

    Squared differences are summed over axis 1 (kept as a size-1 axis), and
    the sum is floored at the Keras epsilon before the square root is taken.
    """
    left, right = vects
    squared_diff = tf.math.square(left - right)
    sum_square = tf.math.reduce_sum(squared_diff, axis=1, keepdims=True)
    floored = tf.math.maximum(sum_square, tf.keras.backend.epsilon())
    return tf.math.sqrt(floored)

In [100]:
def get_model(input_shape):
  """Build the Task C model: one shared encoder applied to two texts.

  The returned keras.Model takes four inputs
  [ids_1, mask_1, ids_2, mask_2], each of shape `input_shape`, and produces
  three outputs: a sigmoid score for text 1 (x_1), a sigmoid score for
  text 2 (x_2), and a score derived from the distance between the two
  encoded texts (output_layer).
  """
  # Inputs of the shared encoder sub-model.
  inputs = keras.Input(shape=input_shape, dtype='int32')
  input_masks = keras.Input(shape=input_shape, dtype='int32')

  # Inputs for the first text of each pair.
  inputs_1 = keras.Input(shape=input_shape, dtype='int32')
  input_masks_1 = keras.Input(shape=input_shape, dtype='int32')

  # Inputs for the second text of each pair.
  inputs_2 = keras.Input(shape=input_shape, dtype='int32')
  input_masks_2 = keras.Input(shape=input_shape, dtype='int32')

  # Shared encoder: pretrained MPNet, position-0 representation, Dense(64, elu).
  model = TFMPNetModel.from_pretrained('microsoft/mpnet-base')
  layer = model.layers[0]
  embeddings = layer([inputs, input_masks])[0]
  features = embeddings[:, 0, :]
  features = keras.layers.Dense(64,activation='elu')(features)
  siamese_ = keras.Model([inputs, input_masks], features)

  # The same sub-model instance is called on both texts.
  tower_1 = siamese_([inputs_1, input_masks_1])
  tower_2 = siamese_([inputs_2, input_masks_2])

  # Classifier head on the first text's encoding.
  x_1 = keras.layers.Dense(64,activation='elu')(tower_1)
  x_1 = keras.layers.BatchNormalization()(x_1)
  x_1 = keras.layers.Dense(1,activation='sigmoid')(x_1)

  # Classifier head on the second text's encoding (separate layers from head 1).
  x_2 = keras.layers.Dense(64,activation='elu')(tower_2)
  x_2 = keras.layers.BatchNormalization()(x_2)
  x_2 = keras.layers.Dense(1,activation='sigmoid')(x_2)

  # Distance branch: Euclidean distance between the two encodings.
  merge_layer = keras.layers.Lambda(euclidean_distance)([tower_1, tower_2])
  normal_layer = tf.keras.layers.BatchNormalization()(merge_layer)
  # NOTE(review): relu followed by sigmoid keeps this output in [0.5, 1) --
  # confirm that is intended.
  output_layer = tf.keras.layers.Dense(1, activation="relu")(normal_layer)
  output_layer = tf.keras.layers.Activation('sigmoid')(output_layer)

  siamese = keras.Model(inputs=[inputs_1, input_masks_1, inputs_2, input_masks_2], outputs=[x_1, x_2, output_layer])

  return siamese

In [101]:
def loss(margin=1):
    """Return a loss function closed over `margin`.

    The returned function computes
    mean(y_true * max(margin - y_pred, 0) ** 2)
    and returns 0.0 when that mean is NaN.
    """
    def contrastive_loss(y_true, y_pred):
      # Squared hinge on how far y_pred falls short of the margin.
      margin_square = tf.math.square(tf.math.maximum(margin - (y_pred), 0))
      # Only the margin term is present; elements with y_true == 0 contribute nothing.
      # Note: this local `loss` shadows the enclosing function's name inside this scope.
      loss = tf.math.reduce_mean((y_true) * margin_square)
      if(tf.math.is_nan(loss)):
        return 0.0
      else:
        return loss
    return contrastive_loss

In [None]:
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
with strategy.scope():
  model_c = get_model((70,))
  optimizer = keras.optimizers.Adam(learning_rate=4e-5)
  loss_func = [tf.keras.losses.BinaryCrossentropy(from_logits=False), tf.keras.losses.BinaryCrossentropy(from_logits=False), loss()]
  metric = [tf.metrics.BinaryAccuracy(), tf.metrics.Precision(), tf.metrics.Recall()]
  model_c.compile(optimizer=optimizer, loss=loss_func, metrics=metric)

In [None]:
model_c.summary()

In [None]:
tf.keras.utils.plot_model(
    model_c)

In [106]:
class EvaluationMetricC(keras.callbacks.Callback):
    """Callback that prints validation reports for both classifier heads after every epoch.

    Parameters
    ----------
    val_encodings, val_masks : token ids and attention mask of the first text.
    rephrase_encodings, rephrase_masks : token ids and attention mask of the
        second text.
    Y_val : 2-D array of ground-truth labels; column 0 is compared with the
        model's first output and column 1 with its second output.
    """

    # Display titles; position k corresponds to Y_val[:, k] and model output k.
    HEAD_TITLES = ("Sarcasm", "Rephrase")

    def __init__(self, val_encodings, val_masks, rephrase_encodings, rephrase_masks, Y_val):
        super(EvaluationMetricC, self).__init__()
        self.val_encodings = val_encodings
        self.val_masks = val_masks
        self.rephrase_encodings = rephrase_encodings
        self.rephrase_masks = rephrase_masks
        self.Y_val = Y_val

    def on_epoch_begin(self, epoch, logs=None):
        # logs defaults to None rather than a shared mutable dict.
        print("\nTraining...")

    def on_epoch_end(self, epoch, logs=None):
        print("\nEvaluating...")
        val_prediction = self.model.predict([self.val_encodings, self.val_masks, self.rephrase_encodings, self.rephrase_masks])

        # Round every output's scores to 0/1 at once.
        pred = np.round(val_prediction)

        from sklearn.metrics import classification_report

        for index, title in enumerate(self.HEAD_TITLES):
            print(title)
            print(classification_report(self.Y_val[:,index], pred[index][:,0], digits=3))
            print("##################################################################")
      

Y_val_c = np.column_stack((Y_c_val, Y_rephrase_val))        
evaluation_metric_c = EvaluationMetricC(val_encodings_c["input_ids"], val_encodings_c["attention_mask"], val_encodings_rc["input_ids"], val_encodings_rc["attention_mask"], Y_val_c)

In [107]:
# Save the weights at the end of every epoch; the epoch number is formatted
# into the file name. `save_freq='epoch'` replaces the former `epoch=1`
# keyword, which is not a documented ModelCheckpoint argument.
checkpoint_c = ModelCheckpoint(filepath='/content/sarcasm-3.{epoch:03d}.h5',
                               verbose=0,
                               save_weights_only=True,
                               save_freq='epoch')

In [None]:
history = model_c.fit(
    x = [train_encodings_c["input_ids"], train_encodings_c["attention_mask"], train_encodings_rc["input_ids"], train_encodings_rc["attention_mask"]],
    y = [Y_c_train, Y_rephrase_train, np.ones((len(Y_c_train)))],
    callbacks = [evaluation_metric_c, checkpoint_c],
    batch_size = 128,
    shuffle=True,
    epochs=10
)

In [None]:
model_c.load_weights('/content/sarcasm-3.007.h5')

<h6>Test C</h6>

In [None]:
df_test_c = pd.read_csv('/content/taskC.En.input.csv')
df_test_c.head()

In [109]:
text_0_test = df_test_c['text_0']
text_1_test = df_test_c['text_1']

In [None]:
text_0_test = text_0_test.apply(lambda text: remove_website(text))
text_0_test = text_0_test.apply(lambda text: chat_words_conversion(text))
text_0_test = text_0_test.apply(lambda text: convert_emojis(text))
text_0_test = text_0_test.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
text_1_test = text_1_test.apply(lambda text: remove_website(text))
text_1_test = text_1_test.apply(lambda text: chat_words_conversion(text))
text_1_test = text_1_test.apply(lambda text: convert_emojis(text))
text_1_test = text_1_test.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
test_encodings_c_0 = tokenizer(list(text_0_test), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(test_encodings_c_0["input_ids"]))

test_encodings_c_1 = tokenizer(list(text_1_test), max_length=70, truncation=True, padding="max_length", return_tensors='tf')
print(np.shape(test_encodings_c_1["input_ids"]))

In [113]:
pred_c = model_c.predict([test_encodings_c_0['input_ids'], test_encodings_c_0['attention_mask'], test_encodings_c_1['input_ids'], test_encodings_c_1['attention_mask']])

In [None]:
np.shape(pred_c)

In [115]:
pred_test_0 = np.array(np.round(pred_c[0][:,0])).astype(int)
pred_test_1 = np.array(np.round(pred_c[1][:,0])).astype(int)

In [None]:
Counter(pred_test_0)

In [None]:
Counter(pred_test_1)

In [None]:
# final_answer = np.where(pred_test_0 == 0, 1, 0)
# Use one of pred_test_0 or pred_test_1
final_answer = pred_test_1
Counter(final_answer)

In [119]:
# Submission file for Task C: a header line, then one label per line.
# The labels come from `final_answer` (chosen in the cell above) rather than
# from `pred_test_1` directly, so changing that choice also changes the file.
with open('answer.txt', 'w') as outf:
  outf.write('task_c_en\n')
  for label in final_answer:
    outf.write(str(label) + '\n')

In [120]:
import zipfile
# The context manager closes the archive explicitly, which is what writes the
# zip's central directory, instead of relying on garbage collection.
with zipfile.ZipFile('sarcasm_electra_c.zip', mode='w') as archive:
  archive.write("answer.txt")