In [None]:
!pip install ekphrasis

In [None]:
!pip install transformers==4.2.1

In [None]:
pip install tf-models-official

In [4]:
import tensorflow as tf
import os
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
import re
import os
from collections import Counter
from official import nlp
import official.nlp.optimization

import ekphrasis
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
# Connect to the TPU whose address is read from the COLAB_TPU_ADDR environment
# variable and initialise it; `resolver` is reused later to build the strategy.
tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_address)

tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

tpu_devices = tf.config.list_logical_devices('TPU')
print("All devices: ", tpu_devices)

In [None]:
# Shared ekphrasis pre-processor; ekphrasis_pipe() calls its pre_process_doc()
# on every sentence of every split.
text_processor = TextPreProcessor(
    # terms that will be normalized ('url' was listed twice; one entry suffices)
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens

    # corpus from which the word statistics are going to be used
    # for word segmentation
    segmenter="twitter",

    # corpus from which the word statistics are going to be used
    # for spell correction
    corrector="twitter",

    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=True,  # spell correction for elongated words

    # select a tokenizer. You can use SocialTokenizer, or pass your own.
    # The tokenizer should take a string as input and return a list of tokens.
    tokenizer=SocialTokenizer(lowercase=True).tokenize,

    # list of dictionaries, for replacing tokens extracted from the text
    # with other expressions. You can pass more than one dictionary.
    dicts=[emoticons]
)

In [None]:
# Load the training split. Missing humor_rating values become 0 and missing
# humor_controversy values become 2.
df_train = pd.read_csv('/content/train.csv', encoding='utf-8').assign(
    humor_rating=lambda frame: frame['humor_rating'].fillna(0),
    humor_controversy=lambda frame: frame['humor_controversy'].fillna(2),
)
text_train = df_train["text"]
df_train.head(15)

In [None]:
# Load the validation (dev) split with the same placeholder filling as the
# training split: humor_rating -> 0, humor_controversy -> 2.
df_val = pd.read_csv('/content/dev.csv', encoding='utf-8').assign(
    humor_rating=lambda frame: frame['humor_rating'].fillna(0),
    humor_controversy=lambda frame: frame['humor_controversy'].fillna(2),
)
text_val = df_val["text"]
df_val.head(15)

In [None]:
# Load the public test split; only its "text" column (and, later, "id") is
# used by this notebook, and no label columns are read from it.
df_test = pd.read_csv('/content/public_test.csv', encoding='utf-8')
text_test = df_test["text"]
df_test.head(15)

In [None]:
# Number of examples in the train, validation and test splits, in that order.
for split_texts in (text_train, text_val, text_test):
    print(len(split_texts))

In [11]:
def print_text(texts, i, j):
    """Print ``texts[i]`` up to (not including) ``texts[j]``, one blank line after each.

    ``texts`` is indexed with ``texts[k]`` for every integer ``k`` in ``range(i, j)``.
    """
    for position in range(i, j):
        # One call emits the item plus the separating empty line.
        print(texts[position], end="\n\n")

In [None]:
# Preview the first ten raw training texts, a separator line, then the first
# ten raw validation texts.
print_text(text_train,0,10)
print("##############################################################################################################")
print_text(text_val,0,10)

In [13]:
# Functions for chat word conversion
# Build the slang lookup from "slang.txt". Each non-empty line is split on "=":
# the part before the first "=" is the abbreviation, the part after it is the
# expansion. chat_words_conversion() looks tokens up upper-cased.
chat_words_map_dict = {}

# Context manager so the file handle is closed once the contents are read.
with open("slang.txt", "r") as f:
    chat_words_str = f.read()

for line in chat_words_str.split("\n"):
    parts = line.split("=")
    # Skip blank lines and lines without "=" rather than failing with IndexError.
    if len(parts) < 2:
        continue
    chat_words_map_dict[parts[0]] = parts[1]

# Set of known abbreviations for fast membership tests.
chat_words_list = set(chat_words_map_dict)

def chat_words_conversion(text):
    """Replace known chat abbreviations in ``text`` with their expansions.

    The text is split on whitespace. A token whose upper-cased form is in
    ``chat_words_list`` is replaced by ``chat_words_map_dict[<upper-cased token>]``;
    any other token is kept as is. Tokens are re-joined with single spaces.
    """
    def expand(token):
        key = token.upper()
        return chat_words_map_dict[key] if key in chat_words_list else token

    return " ".join(expand(token) for token in text.split())

In [None]:
# Chat word conversion
# Training set
text_train = text_train.apply(lambda text: chat_words_conversion(text))
print_text(text_train,0,10)

print("********************************************************************************")

# Validation set
text_val = text_val.apply(lambda text: chat_words_conversion(text))
print_text(text_val,0,10)

# Test set
text_test = text_test.apply(lambda text: chat_words_conversion(text))
# print_text(text_test,0,10)

In [15]:
def ekphrasis_pipe(sentence):
    """Pre-process ``sentence`` with the module-level ``text_processor``.

    Returns the items yielded by ``text_processor.pre_process_doc(sentence)``
    joined into one string with single spaces.
    """
    tokens = text_processor.pre_process_doc(sentence)
    return " ".join(tokens)

In [None]:
# Training set
text_train = text_train.apply(lambda text: ekphrasis_pipe(text))
print("Training set completed.......")
#Validation set
text_val = text_val.apply(lambda text: ekphrasis_pipe(text))
print("Validation set completed.......")
#Test set
text_test = text_test.apply(lambda text: ekphrasis_pipe(text))
print("Test set completed.......")

In [None]:
# Finding length of longest array
maxLen = len(max(text_train,key = lambda text: len(text.split(" "))).split(" "))
print(maxLen)

In [None]:
def u(text):
    """Number of pieces obtained by splitting ``text`` on single spaces."""
    return len(text.split(" "))


# Per-text lengths for the training split: show the 50 largest, then the count.
sentence_lengths = [u(x) for x in text_train]
print(sorted(sentence_lengths)[-50:])
print(len(sentence_lengths))

In [None]:
# Label columns of the training split.
is_humor = df_train["is_humor"]
humor_rating = df_train["humor_rating"]
offense_rating = df_train["offense_rating"]
# Controversy is cast to int so it can be counted as a class label.
humor_controversy = df_train["humor_controversy"].astype(int)

# Class distribution of the two categorical labels.
for label_column in (is_humor, humor_controversy):
    print(Counter(label_column))

In [None]:
# Label columns of the validation split.
is_humor_val = df_val["is_humor"]
humor_rating_val = df_val["humor_rating"]
offense_rating_val = df_val["offense_rating"]
# Controversy is cast to int so it can be counted as a class label.
humor_controversy_val = df_val["humor_controversy"].astype(int)

# Class distribution of the two categorical labels.
for label_column in (is_humor_val, humor_controversy_val):
    print(Counter(label_column))

In [22]:
from transformers import RobertaTokenizerFast, TFRobertaModel, TFBertModel, BertTokenizerFast, ElectraTokenizerFast, TFElectraModel, AlbertTokenizerFast, TFAlbertModel, XLNetTokenizerFast, TFXLNetModel, MPNetTokenizerFast, TFMPNetModel
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [23]:
# strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Define tokenizer as per requirement
# Tokenizer for the 'bert-base-uncased' checkpoint, the same checkpoint name
# that hahackathon_task_1() loads for the encoder. Change both together.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [31]:
# Materialise each split as a plain Python list before handing it to the tokenizer.
text_train, text_val, text_test = map(list, (text_train, text_val, text_test))

In [32]:
# Identical tokenizer settings for all three splits: every example is truncated
# or padded to exactly 150 positions and returned as TensorFlow tensors.
encode_kwargs = dict(max_length=150, truncation=True, padding="max_length", return_tensors='tf')

train_encodings = tokenizer(text_train, **encode_kwargs)
val_encodings = tokenizer(text_val, **encode_kwargs)
test_encodings = tokenizer(text_test, **encode_kwargs)

In [None]:
# Shape of the input_ids tensor for the train, validation and test splits.
for split_encodings in (train_encodings, val_encodings, test_encodings):
    print(np.shape(split_encodings["input_ids"]))

In [None]:
# Show the encoded input_ids of the first training and first validation example.
print(train_encodings["input_ids"][0])
print("***************************************************************************")
print(val_encodings["input_ids"][0])

In [45]:
def hahackathon_task_1(input_shape, model_name='bert-base-uncased', dropout_rate=0.3):
    """Build the Task 1a (is-humor) binary classifier on a pretrained BERT encoder.

    Args:
        input_shape: Shape of one tokenised example without the batch axis
            (the notebook passes ``(150,)``). Used for both the token-id input
            and the attention-mask input.
        model_name: Checkpoint name given to ``TFBertModel.from_pretrained``.
            Defaults to the checkpoint this notebook originally hard-coded.
        dropout_rate: Dropout applied to the pooled representation before the
            output unit. Defaults to the originally hard-coded 0.3.

    Returns:
        An uncompiled ``keras.Model`` named ``'Task_1_a'`` that takes
        ``[input_ids, attention_mask]`` and outputs one sigmoid unit per example.
    """
    # Import model as required
    pretrained = TFBertModel.from_pretrained(model_name)
    # First sub-layer of the pretrained model (presumably the transformer main
    # layer); it is called directly so it can be wired into a functional model.
    encoder = pretrained.layers[0]

    # Model
    inputs = keras.Input(shape=input_shape, dtype='int32')
    input_masks = keras.Input(shape=input_shape, dtype='int32')

    encoder_outputs = encoder([inputs, input_masks])
    # First output is indexed as (batch, position, feature) below.
    sequence_output = encoder_outputs[0]
    # Representation at position 0. Use for bert, roberta, albert, mpnet, electra.
    # For XLNet take the last position instead: sequence_output[:, -1]
    pooled_output = sequence_output[:, 0, :]

    # Humour classification
    humor_prob = layers.Dropout(dropout_rate)(pooled_output)
    # humor_prob = layers.Dense(128, activation="gelu")(humor_prob)
    humor_prob = layers.Dense(1, activation="sigmoid")(humor_prob)

    return keras.Model(inputs=[inputs, input_masks], outputs=humor_prob, name='Task_1_a')

In [None]:
# Distribution strategy built from the TPU `resolver` set up earlier; its scope
# is used when the model is created and compiled.
strategy = tf.distribute.TPUStrategy(resolver)

In [None]:
# Create and compile the model inside the strategy scope.
with strategy.scope():
    model = hahackathon_task_1((150,))
    optimizer = keras.optimizers.Adam(learning_rate=2e-5)
    # The model already ends in a sigmoid, hence from_logits=False.
    loss_fun = [tf.keras.losses.BinaryCrossentropy(from_logits=False)]
    metric = [
        metric_cls()
        for metric_cls in (
            tf.keras.metrics.BinaryAccuracy,
            tf.keras.metrics.Precision,
            tf.keras.metrics.Recall,
        )
    ]
    model.compile(optimizer=optimizer, loss=loss_fun, metrics=metric)

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [49]:
# Weights-only checkpoint callback; "{epoch:03d}" in the path is replaced by
# the epoch number, so each save goes to its own file. The former `epoch=4`
# keyword is not a ModelCheckpoint argument and has been removed; the number
# of training epochs is set in model.fit().
checkpoint = ModelCheckpoint(filepath='/content/task-1-a-model-name.{epoch:03d}.h5',
                             verbose=0,
                             save_weights_only=True)

In [None]:
# Class balance of the training is_humor labels.
count_humor = Counter(is_humor)
print(count_humor)
print(count_humor.keys())

zero, one = count_humor[0], count_humor[1]
total = zero + one

print("Not humorous: ",zero)
print("Humorous: ",one)
print("Total: ",total)

In [None]:
# Per-class loss weights: weight_c = max_count / (max_count + count_c), so the
# larger class gets 0.5 and the smaller class gets a weight above 0.5.
maxi = max(zero, one)
weight_for_0 = (maxi / (maxi + zero))
weight_for_1 = (maxi / (maxi + one))

# `class_weight_` is the name passed to model.fit(class_weight=...). It was
# previously left as an empty dict, so the weights computed here were never used.
class_weight_ = {0: weight_for_0, 1: weight_for_1}
# Kept as an alias for any code that still refers to the old name.
class_weight_motivation = class_weight_

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))

In [None]:
# albert
history_task_1_a = model.fit(
    x = [train_encodings["input_ids"], train_encodings["attention_mask"]],
    y = is_humor,
    validation_data = ([val_encodings["input_ids"],val_encodings["attention_mask"]], is_humor_val),
    callbacks = [checkpoint],
    batch_size=16,
    class_weight=class_weight_,
    shuffle=True,
    epochs=4)

In [53]:
# Sigmoid outputs of the model for the validation split.
val_answer = model.predict([val_encodings["input_ids"],val_encodings["attention_mask"]])

In [54]:
# Round each predicted probability to the nearest integer value (0 or 1).
val_answer = np.round(val_answer)

In [55]:
# Drop the trailing size-1 axis left by the single-unit output layer.
val_answer = np.squeeze(val_answer, axis=-1)

In [56]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision/recall/F1 of the rounded validation predictions against
# the true is_humor labels, printed to four decimal places.
print(classification_report(is_humor_val, val_answer, digits=4))

In [58]:
# Sigmoid outputs of the model for the test split.
test_answer = model.predict([test_encodings["input_ids"],test_encodings["attention_mask"]])

In [59]:
# Round each predicted probability to the nearest integer value (0 or 1).
test_answer = np.round(test_answer)

In [60]:
# Drop the trailing size-1 axis left by the single-unit output layer.
test_answer = np.squeeze(test_answer, axis=-1)

In [None]:
# Sanity check: shape of the squeezed test predictions.
np.shape(test_answer)

In [62]:
# Example identifiers from the test frame, paired with the predictions below.
test_id = df_test["id"]

In [63]:
# Submission columns for Task 1a. `test_answer` holds the rounded outputs of
# the is-humor classifier, so the column is labelled "is_humor" (it was
# previously mislabelled "offense_rating").
test_dict = {
    "id" : test_id,
    "is_humor" : test_answer
}

In [None]:
# Build the submission frame from the id/prediction dict.
# NOTE: this rebinds `df_test`, replacing the raw test frame loaded earlier.
df_test = pd.DataFrame(test_dict)
df_test.head()

In [65]:
# Write the submission frame to CSV without the pandas index column.
df_test.to_csv('answer-1-a.csv', index=False)