In [1]:
import os
from silence_tensorflow import silence_tensorflow
import numpy as np
from transformers import TFAutoModel
from sklearn import metrics
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate, MaxPooling1D, Conv1D
from tensorflow.keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import LSTM, GRU, Dropout, Flatten
from tensorflow.keras import backend as K
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import Callback
from tensorflow.keras import optimizers
from tensorflow.keras.layers import Lambda
import warnings
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tqdm.notebook import tqdm
import gc
import wandb
from wandb.keras import WandbCallback
# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
np.random.seed(21)
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings('ignore')
# Presumably quiets TensorFlow's log output (helper from the silence_tensorflow package).
silence_tensorflow()

In [2]:
# Tunable constants for this notebook.
BATCH_SIZE = 4        # batch size handed to model.predict
MAX_LEN = 125         # token-sequence length used for truncation and padding
model_name = 'vinai/bertweet-base'

# Tokenizer for the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
# Raw train / test tables.
train = pd.read_csv("../data/task1b/train.csv")
test = pd.read_csv("../data/test.csv")

# Hand-crafted numeric columns fed to the model next to the text.
FEATURES = [
    "dale_chall",
    "bad_words",
    "num_words",
    "all_caps",
    "emoji",
    "capitals",
    "total_length",
    "caps_vs_length",
    "num_unique_words",
    "words_vs_unique",
    "num_urls",
    "!",
    "?",
]

# Missing values become 0; the scaler is fitted on the training rows only
# and then reused unchanged for the test rows.
ss = StandardScaler()
features = ss.fit_transform(train[FEATURES].fillna(0))
test_features = ss.transform(test[FEATURES].fillna(0))

In [4]:
#Function for cutting off the middle part of long texts.
def text_process(text):
    """Shorten a long text by dropping its middle words.

    The text is split on single spaces. Texts of up to 130 pieces are
    returned unchanged; longer ones keep only the first 90 and the last 40
    pieces, joined back with single spaces.
    """
    words = text.split(' ')
    if len(words) <= 130:
        return text
    head, tail = words[:90], words[-40:]
    return ' '.join(head + tail)

In [5]:
def _extract_texts(frame):
    """Return the `c_text` column of `frame` as a list of shortened strings."""
    # NOTE(review): every value is passed through str() before fillna runs, so
    # if `c_text` contains missing values they are already text by then and
    # fillna has nothing left to replace. Confirm this matches the
    # preprocessing used when the checkpoints were trained before changing it.
    shortened = frame['c_text'].apply(lambda raw: text_process(str(raw)))
    return shortened.fillna("something").values.tolist()


# Label columns, one per class.
y_train = train[["NONE","OFFN","HATE","PRFN"]].values
X_train = _extract_texts(train)
X_test = _extract_texts(test)

In [6]:
def tokenize_sentences(sentences, tokenizer, max_seq_len = 125 ):
    """Encode every sentence into token ids with the given tokenizer.

    Args:
        sentences: iterable of strings to encode.
        tokenizer: object exposing ``encode(text, truncation=...,
            add_special_tokens=..., max_length=...)``.
        max_seq_len: maximum number of ids kept per sentence; longer
            encodings are truncated by the tokenizer.

    Returns:
        A NumPy array of the encoded sentences. When all encodings have the
        same length this is a regular 2-D array; otherwise it is a 1-D
        ``object`` array whose elements are the per-sentence id lists.
    """
    tokenized_sentences = [
        tokenizer.encode(
            sentence,
            truncation=True,
            add_special_tokens=True,
            max_length=max_seq_len,
        )
        for sentence in tqdm(sentences)
    ]

    # Equal-length (or empty) input converts cleanly, exactly as before.
    if len({len(ids) for ids in tokenized_sentences}) <= 1:
        return np.array(tokenized_sentences)

    # Variable-length input: recent NumPy releases refuse to build an array
    # from ragged nested lists implicitly, so fill the object array by hand.
    ragged = np.empty(len(tokenized_sentences), dtype=object)
    for position, ids in enumerate(tokenized_sentences):
        ragged[position] = ids
    return ragged

def create_attention_masks(tokenized_and_padded_sentences):
    """Build a 0/1 mask for each sequence of token ids.

    A position gets 1 when its token id is greater than 0 and 0 otherwise.
    The masks are returned stacked in a NumPy array, one row per sequence.
    """
    # NOTE(review): this treats id 0 as padding. RoBERTa-style vocabularies
    # often reserve a different id for <pad>; confirm this matches both the
    # tokenizer in use and the way the checkpoints were trained.
    return np.asarray([
        [int(token_id > 0) for token_id in sentence]
        for sentence in tokenized_and_padded_sentences
    ])
def regular_encode(texts,tokenizer,maxlen=MAX_LEN):
    """Tokenize, pad and mask a collection of texts.

    Args:
        texts: iterable of strings to encode.
        tokenizer: tokenizer forwarded to ``tokenize_sentences``.
        maxlen: sequence length used both for tokenizer truncation and for
            padding. Previously this argument was ignored in favour of the
            module-level ``MAX_LEN``.

    Returns:
        Tuple ``(input_ids, attention_masks)``: the ids padded/truncated at
        the end to ``maxlen`` with value 0, and the masks produced by
        ``create_attention_masks`` for those padded ids.
    """
    input_ids = tokenize_sentences(texts, tokenizer, maxlen)
    input_ids = pad_sequences(input_ids, maxlen=maxlen, dtype="long", value=0, truncating="post", padding="post")
    attention_masks = create_attention_masks(input_ids)
    return input_ids, attention_masks

In [7]:
# Encode both splits into padded token ids plus attention masks
# (test first, then train; each call shows its own progress bar).
x_test,x_test_att = regular_encode(X_test, tokenizer, maxlen=MAX_LEN)
x_train,x_train_att = regular_encode(X_train,tokenizer,maxlen=MAX_LEN)

  0%|          | 0/1281 [00:00<?, ?it/s]

  0%|          | 0/13403 [00:00<?, ?it/s]

In [8]:
def get_model(bert_model, features ,clipvalue=1.,num_filters=40,dropout=0.5,max_len=125):
    """Build and compile the transformer + multi-kernel CNN classifier.

    The hidden states of the transformer's last four layers are concatenated
    on the last axis and fed to four parallel ``Conv1D`` branches (kernel
    sizes 2, 3, 4, 5), each followed by max pooling. The pooled branches are
    concatenated along axis 1, regularised with dropout, flattened and
    projected by a 128-unit dense layer. The result is joined with the
    hand-crafted feature vector before a 4-way softmax output.

    Args:
        bert_model: callable transformer whose output exposes
            ``hidden_states`` (i.e. it must return all hidden states).
        features: array-like; only ``features.shape[1]`` is read, to size the
            extra feature input.
        clipvalue: gradient clipping value passed to the Adam optimizer.
        num_filters: accepted for interface compatibility but not used. The
            convolution branches stay at 128 filters, because the default
            here (40) differs from that value and wiring it in would change
            the default architecture.
        dropout: rate applied by both dropout layers. It was previously
            ignored in favour of a hard-coded 0.5, which is also the default.
        max_len: length of the token-id and attention-mask inputs.

    Returns:
        A compiled ``Model`` taking ``[input_ids, attention_masks,
        features]`` and producing 4 class probabilities.
    """
    # Single local import (the original imported the module twice).
    import tensorflow as tf

    # Inputs are created in this order so auto-generated layer names are stable.
    features_input = Input(shape=(features.shape[1],))
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    attention_masks = Input(shape=(max_len,), dtype=tf.int32, name="input_att_masks")

    bert_output = bert_model(input_ids, attention_mask=attention_masks)
    hidden_states = bert_output.hidden_states
    last_four_layers = [hidden_states[i] for i in (-1, -2, -3, -4)]
    x = tf.concat(last_four_layers, -1)

    # One convolution + pooling branch per kernel size.
    convs = []
    filter_sizes = [2, 3, 4, 5]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(x)
        l_pool = MaxPooling1D(pool_size=3)(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)

    x = Dropout(dropout)(l_merge)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    x = Dropout(dropout)(x)

    # Append the hand-crafted features just before the classification layer.
    x = concatenate([x, features_input])
    outp = Dense(4, activation="softmax")(x)
    model = Model(inputs=[input_ids, attention_masks, features_input], outputs=outp)

    adam = tf.optimizers.Adam(clipvalue=clipvalue)
    # NOTE(review): a softmax output is paired with binary_crossentropy here;
    # confirm this is intentional before reusing the function for training.
    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    return model

In [9]:
# Pretrained encoder, configured to return every layer's hidden states
# (get_model reads them).
transformer_layer = TFAutoModel.from_pretrained(model_name, output_hidden_states=True)
transformer_layer.compile()

# Build the model once up front to print its architecture.
model = get_model(transformer_layer, features)
model.summary()
gc.collect()
K.clear_session()

num_folds = 10
# Running average of the per-fold class probabilities.
predict = np.zeros((test.shape[0], 4))
# NOTE(review): the ids are cast to float32 although get_model declares the
# id input as int32; confirm the cast is actually required.
x_test = np.asarray(x_test).astype(np.float32)

for i in range(num_folds):
    print(f"fold: {i}")
    # Release the previous fold's model before building a fresh one.
    gc.collect()
    K.clear_session()
    del model
    model = get_model(transformer_layer, features)
    gc.collect()
    # NOTE(review): hard-coded absolute path; checkpoints are numbered from 1.
    weights_path = f"/scratch/arjunth2001/ft_fine_1b_cnn_{i+1}.h5"
    model.load_weights(weights_path)
    fold_probs = model.predict(
        [x_test, x_test_att, test_features],
        batch_size=BATCH_SIZE,
        verbose=1,
    )
    predict += fold_probs / num_folds
print("Done")

Some layers from the model checkpoint at vinai/bertweet-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at vinai/bertweet-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 125)]        0                                            
__________________________________________________________________________________________________
input_att_masks (InputLayer)    [(None, 125)]        0                                            
__________________________________________________________________________________________________
tf_roberta_model (TFRobertaMode TFBaseModelOutputWit 134899968   input_word_ids[0][0]             
                                                                 input_att_masks[0][0]            
__________________________________________________________________________________________________
tf_op_layer_concat (TensorFlowO [(None, 125, 3072)]  0           tf_roberta_model[0][13

In [10]:
# Fold-averaged class probabilities, one row per test example.
predict

array([[0.35910564, 0.22461091, 0.17350277, 0.24278067],
       [0.14524221, 0.29460305, 0.16939676, 0.39075798],
       [0.12718632, 0.2785843 , 0.14822241, 0.44600698],
       ...,
       [0.10494383, 0.18032335, 0.06702482, 0.64770799],
       [0.16175017, 0.31157096, 0.26332267, 0.26335619],
       [0.13654618, 0.27457375, 0.13893416, 0.44994591]])

In [11]:
# Build the prediction table as a fresh frame that shares the test index.
# The earlier `df = test` only aliased the test frame, so assigning the
# prediction columns silently added them to `test` as well.
df = pd.DataFrame(predict, columns=["NONE","OFFN","HATE","PRFN"], index=test.index)

In [12]:
# Preview the per-class prediction columns (NONE, OFFN, HATE, PRFN).
df

Unnamed: 0,NONE,OFFN,HATE,PRFN
0,0.359106,0.224611,0.173503,0.242781
1,0.145242,0.294603,0.169397,0.390758
2,0.127186,0.278584,0.148222,0.446007
3,0.413207,0.178135,0.211414,0.197243
4,0.152802,0.311913,0.181249,0.354036
...,...,...,...,...
1276,0.286577,0.244598,0.422946,0.045878
1277,0.125728,0.282413,0.144306,0.447552
1278,0.104944,0.180323,0.067025,0.647708
1279,0.161750,0.311571,0.263323,0.263356


In [13]:
# Create the output directory if needed so the write does not fail on a
# fresh checkout, then save the predictions (index included).
os.makedirs("./predictions", exist_ok=True)
df.to_csv("./predictions/ft_1b_cnn.csv")