In [None]:
import os
from silence_tensorflow import silence_tensorflow
import numpy as np
from transformers import TFAutoModel
from sklearn import metrics
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate, MaxPool1D
from tensorflow.keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, GRU, Dropout
from tensorflow.keras import backend as K
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Lambda
import warnings
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm
import gc
# Seed every RNG this notebook relies on. NumPy alone is not enough: Keras
# weight initialisation, dropout and batch shuffling draw from TensorFlow's RNG.
SEED = 21
np.random.seed(SEED)
tf.random.set_seed(SEED)
# Hide Python warnings and TensorFlow log noise in the cell outputs.
warnings.filterwarnings('ignore')
silence_tensorflow()

In [None]:
# --- Run configuration -------------------------------------------------------
# Checkpoint name of the pretrained transformer (used for tokenizer and model).
model_name = 'vinai/bertweet-base'
# Maximum number of token ids kept per text (truncation and padding length).
MAX_LEN = 125
# Mini-batch size used for both fitting and prediction.
BATCH_SIZE = 32

# NOTE(review): `do_lower_case` is forwarded to the tokenizer as-is; confirm
# that the tokenizer class behind this checkpoint actually honours it.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

In [None]:
# Load the task 1b splits.
train = pd.read_csv("../data/ext_data/task1b/train_full.csv")
test = pd.read_csv("../data/ext_data/task1b/test.csv")

# Hand-crafted numeric columns fed to the model next to the transformer output.
FEATURES = [
    "dale_chall", "bad_words", "num_words", "all_caps", "emoji", "capitals",
    "total_length", "caps_vs_length", "num_unique_words", "words_vs_unique",
    "num_urls", "!", "?",
]

# Missing feature values are treated as 0.
features = train[FEATURES].fillna(0)
test_features = test[FEATURES].fillna(0)

# NOTE(review): the scaler statistics are estimated on train AND test rows
# stacked together; confirm this transductive scaling is intended.
ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)

In [None]:
#Function for cutting off the middle part of long texts.
def text_process(text):
    """Shorten a long text by dropping its middle part.

    The text is split on single spaces. When it has more than 130 pieces, only
    the first 90 and the last 40 are kept and re-joined with single spaces;
    otherwise the text is returned unchanged.
    """
    words = text.split(' ')
    if len(words) <= 130:
        return text
    return ' '.join(words[:90] + words[-40:])

In [None]:
# One column per class; these are the targets of the 3-way classifier.
LABEL_COLUMNS = ["NONE", "OFFN", "PRFN"]
y_train = train[LABEL_COLUMNS].values
y_test = test[LABEL_COLUMNS].values

# Missing texts must be replaced BEFORE the str() conversion: str(NaN) is the
# literal "nan", after which fillna() has nothing left to fill.
X_train = train['c_text'].fillna("something").apply(lambda x: text_process(str(x))).values.tolist()
X_test = test['c_text'].fillna("something").apply(lambda x: text_process(str(x))).values.tolist()

In [None]:
def tokenize_sentences(sentences, tokenizer, max_seq_len = 125 ):
    """Encode each sentence into a list of token ids.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: object exposing ``encode(text, truncation=..., add_special_tokens=...,
            max_length=...)`` and returning a list of ids.
        max_seq_len: maximum number of ids per sentence (longer inputs are truncated
            by the tokenizer).

    Returns:
        A 1-D NumPy object array with one list of ids per sentence. The lists are
        NOT padded and may have different lengths.
    """
    tokenized_sentences = [
        tokenizer.encode(
            sentence,
            truncation=True,
            add_special_tokens=True,
            max_length=max_seq_len,
        )
        for sentence in tqdm(sentences)
    ]

    # The id lists are ragged. ``np.array(list_of_lists)`` raises ValueError on
    # recent NumPy for ragged input, and silently becomes a 2-D array when all
    # lengths happen to match. Filling an object array element by element always
    # yields the same 1-D layout.
    ragged = np.empty(len(tokenized_sentences), dtype=object)
    for row, token_ids in enumerate(tokenized_sentences):
        ragged[row] = token_ids
    return ragged

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 mask per sentence: 1 where the token id is > 0, else 0.

    NOTE(review): this treats id 0 as padding, so a real token whose id is 0
    would also be masked out; verify against the tokenizer's vocabulary.

    Args:
        tokenized_and_padded_sentences: iterable of rows of token ids.

    Returns:
        NumPy array built from the per-row lists of 0/1 integers.
    """
    return np.asarray([
        [int(token_id > 0) for token_id in row]
        for row in tokenized_and_padded_sentences
    ])
def regular_encode(texts,tokenizer,maxlen=MAX_LEN):
    """Tokenize, pad and mask a list of texts.

    Args:
        texts: iterable of strings.
        tokenizer: tokenizer passed through to ``tokenize_sentences``; its
            ``pad_token_id`` (when available) is used as the padding value.
        maxlen: length every sequence is truncated / right-padded to.

    Returns:
        Tuple ``(input_ids, attention_masks)``, both of shape ``(len(texts), maxlen)``.
        The mask is 1 on real tokens and 0 on padding positions.
    """
    # Honour the `maxlen` argument instead of silently using the global MAX_LEN.
    token_ids = tokenize_sentences(texts, tokenizer, maxlen)

    # Pad with the tokenizer's own padding id; id 0 is not the pad token for
    # every vocabulary. Fall back to 0 when the tokenizer does not define one.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0
    input_ids = pad_sequences(token_ids, maxlen=maxlen, dtype="long", value=pad_id,
                              truncating="post", padding="post")

    # Derive the mask from the true sequence lengths rather than from the id
    # values, so a real token is never masked just because its id equals the
    # padding value. Padding is "post", so real tokens occupy the first positions.
    lengths = np.array([min(len(ids), maxlen) for ids in token_ids], dtype=np.int64)
    attention_masks = (np.arange(maxlen)[None, :] < lengths[:, None]).astype(np.int64)
    return input_ids,attention_masks

In [None]:
# Turn the raw texts into padded token-id matrices plus attention masks
# (test split first, then train).
x_test, x_test_att = regular_encode(X_test, tokenizer, MAX_LEN)
x_train, x_train_att = regular_encode(X_train, tokenizer, MAX_LEN)

In [None]:
class RocAucEvaluation(Callback):
    """Keras callback: ROC-AUC on held-out data, best-weights checkpoint, early stop.

    Every ``interval`` epochs the attached model predicts on the validation
    inputs and the ROC-AUC against the validation targets is printed. When the
    score beats the best one seen so far, the model weights are written to
    ``checkpoint_path``. After more than ``patience`` consecutive evaluations
    without improvement, training is stopped.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is passed to
            ``model.predict`` unchanged.
        interval: evaluate when ``epoch % interval == 0``.
        checkpoint_path: file the best weights are saved to.
        patience: number of non-improving evaluations tolerated before stopping.
    """

    def __init__(self, validation_data=(), interval=1,
                 checkpoint_path="../checkpoints/ft_fine_1b.h5", patience=2):
        # super().__init__() runs Callback.__init__; the previous
        # super(Callback, self) form skipped it and called the grandparent.
        super().__init__()
        self.interval = interval
        self.X_val, self.y_val = validation_data
        self.checkpoint_path = checkpoint_path
        self.patience = patience
        self.max_score = 0
        self.not_better_count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Evaluate ROC-AUC and checkpoint / early-stop as described on the class."""
        if epoch % self.interval != 0:
            return

        y_pred = self.model.predict(self.X_val, verbose=1)
        score = roc_auc_score(self.y_val, y_pred)
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

        if score > self.max_score:
            print("*** New High Score (previous: %.6f) \n" % self.max_score)
            # Save the model this callback is attached to, not whatever happens
            # to be bound to a global called `model`.
            self.model.save_weights(self.checkpoint_path)
            self.max_score = score
            self.not_better_count = 0
            return

        self.not_better_count += 1
        if self.not_better_count > self.patience:
            print("Epoch %05d: early stopping, high score = %.6f" % (epoch,self.max_score))
            self.model.stop_training = True

In [None]:
def get_model(bert_model, features ,clipvalue=1.,num_filters=40,dropout=0.5,max_len=125,
              learning_rate=0.001):
    """Build and compile the transformer + hand-crafted-features classifier.

    The transformer's pooled output goes through Dense(50, relu) and dropout,
    is concatenated with the numeric feature vector, and is mapped to a 3-way
    softmax.

    Args:
        bert_model: callable transformer; called as
            ``bert_model(input_ids, attention_mask=...)`` and expected to expose
            ``pooler_output`` on its result.
        features: 2-D array-like; only ``features.shape[1]`` (feature count) is used.
        clipvalue: gradient clipping value handed to Adam.
        num_filters: unused; kept so existing callers keep working.
        dropout: dropout rate applied after the Dense(50) layer.
        max_len: length of the token-id and attention-mask inputs.
        learning_rate: Adam learning rate (default equals Adam's own default).

    Returns:
        A compiled Keras ``Model`` taking ``[input_ids, attention_masks, features]``.
    """
    features_input = Input(shape=(features.shape[1],))
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    attention_masks = Input(shape=(max_len,), dtype=tf.int32, name="input_att_masks")

    bert_output = bert_model(input_ids, attention_mask=attention_masks)
    cls_token = bert_output.pooler_output
    cls_token = Dense(50, activation="relu")(cls_token)
    # Use the `dropout` argument; it was previously ignored in favour of 0.5.
    cls_token = Dropout(dropout)(cls_token)

    x = concatenate([cls_token,features_input])
    outp = Dense(3, activation="softmax")(x)
    model = Model(inputs=[input_ids,attention_masks,features_input], outputs=outp)

    adam = tf.optimizers.Adam(learning_rate=learning_rate, clipvalue=clipvalue)
    # A softmax over mutually exclusive classes pairs with categorical
    # cross-entropy. With 'binary_crossentropy' Keras also resolves 'accuracy'
    # to binary accuracy, which overstates multi-class performance.
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [None]:
# Build the model once up front only to display its architecture.
transformer_layer = TFAutoModel.from_pretrained(model_name,output_hidden_states=True)
model = get_model(transformer_layer, features)
model.summary()

epochs = 100
num_folds = 10
checkpoint_path = "../checkpoints/ft_fine_1b.h5"
# Make sure the checkpoint folder exists before the first save_weights call.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Test-set probabilities, averaged over the folds.
predict = np.zeros((test.shape[0],3))
kf = KFold(n_splits=num_folds, shuffle=True, random_state=239)

# Token ids are integer indices into the embedding table; keep them integral
# (the model inputs are declared int32) instead of casting them to float32.
x_train = np.asarray(x_train).astype(np.int32)
x_test = np.asarray(x_test).astype(np.int32)

for fold, (train_index, valid_index) in enumerate(kf.split(x_train), start=1):
    print(f"fold: {fold}")
    kfold_y_train, kfold_y_valid = y_train[train_index], y_train[valid_index]
    train_inputs = [x_train[train_index], x_train_att[train_index], features[train_index]]
    valid_inputs = [x_train[valid_index], x_train_att[valid_index], features[valid_index]]

    # Start every fold from the pretrained weights. Reusing one transformer
    # object would carry weights fine-tuned on earlier folds (which contained
    # this fold's validation rows) into this fold.
    del model, transformer_layer
    gc.collect()
    K.clear_session()
    transformer_layer = TFAutoModel.from_pretrained(model_name,output_hidden_states=True)
    model = get_model(transformer_layer, features)

    # The callback scores the held-out part each epoch, checkpoints the best
    # weights and stops training once the score stops improving.
    ra_val = RocAucEvaluation(validation_data=(valid_inputs, kfold_y_valid), interval = 1)
    model.fit(train_inputs, kfold_y_train, batch_size=BATCH_SIZE, epochs=epochs, verbose=1,
            callbacks = [ra_val])
    gc.collect()

    # Predict the test set with this fold's best checkpoint, not its last epoch.
    model.load_weights(checkpoint_path)
    predict += model.predict([x_test,x_test_att,test_features], batch_size=BATCH_SIZE,verbose=1) / num_folds
print("Done")

In [None]:
import json
# Persist the fold-averaged test probabilities. Create the output folder first
# so a missing directory cannot fail the write after the whole training run.
os.makedirs("./predictions", exist_ok=True)
with open("./predictions/ft_fine_1b.json","w") as f:
    json.dump(predict.tolist(),f, indent=4)