In [None]:
import json
import re
import sys
import pandas as pd
import numpy as np

# TODO: Check whether spacy is allowed
import spacy

In [None]:
# TODO: Load latest multimodal_transformers from GitHub to allow dataframes

In [None]:
from dataclasses import dataclass, field
import json
import logging
import os
from typing import Optional

from transformers import (
    AutoTokenizer,
    AutoConfig,
    Trainer,
    EvalPrediction,
    set_seed
)
from transformers.training_args import TrainingArguments

from multimodal_transformers.data import load_data_from_folder
from multimodal_transformers.model import TabularConfig
from multimodal_transformers.model import AutoModelWithTabular

logging.basicConfig(level=logging.INFO)
os.environ['COMET_MODE'] = 'DISABLED'


In [None]:
def convert_label(label):
    """Translate a textual class label into its integer class id.

    Args:
        label: Either "rumour" or "non-rumour".

    Returns:
        1 for "rumour", 0 for "non-rumour".

    Raises:
        Exception: If `label` is neither of the two known classes.
    """
    # Guard clauses: return as soon as a known class is recognised.
    if label == "rumour":
        return 1
    if label == "non-rumour":
        return 0
    raise Exception("label classes must be 'rumour' or 'non-rumour'")

In [None]:
def convert_prediction(pred):
    """Translate a predicted class id back into its textual label.

    Args:
        pred: Predicted class id, expected to compare equal to 0 or 1.

    Returns:
        "rumour" for 1, "non-rumour" for 0.

    Raises:
        Exception: If `pred` is neither 0 nor 1.
    """
    # Guard clauses: return as soon as a known class id is recognised.
    if pred == 1:
        return "rumour"
    if pred == 0:
        return "non-rumour"
    raise Exception("prediction classes must be '0' or '1'")

In [None]:
def extract_features(tweet):
    """Flatten one tweet object into a single dict of tweet- and user-level features.

    Args:
        tweet: Mapping with the keys 'text', 'retweet_count', 'favorite_count',
            'entities' and 'user'. From 'user' the keys 'statuses_count',
            'listed_count', 'followers_count', 'friends_count',
            'favourites_count', 'verified' and 'geo_enabled' are required;
            'description' and 'profile_background_image_url' are optional.

    Returns:
        dict: Tweet features followed by user features, in insertion order.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    text = tweet['text']

    # --- Tweet features ---
    tweet_features = {
        'text': text,
        'retweet_count': tweet['retweet_count'],
        'favorite_count': tweet['favorite_count'],
        # Whether the tweet text contains a question mark
        'question_mark': '?' in text,
    }

    entities = tweet['entities']

    # URL count and presence flag (a missing 'urls' entry counts as zero)
    number_of_urls = len(entities['urls']) if 'urls' in entities else 0
    tweet_features['contains_url'] = number_of_urls > 0
    tweet_features['number_urls'] = number_of_urls

    # Native media presence flag (a missing 'media' entry counts as zero)
    number_of_media = len(entities['media']) if 'media' in entities else 0
    tweet_features['contains_media'] = number_of_media > 0

    # --- User features ---
    user = tweet['user']
    user_features = {
        'statuses_count': user['statuses_count'],
        'listed_count': user['listed_count'],
        'followers_count': user['followers_count'],
        'friends_count': user['friends_count'],
        # Only checks that the key exists; its value is not inspected
        'contains_profile_background_image': 'profile_background_image_url' in user,
    }

    followers = user_features['followers_count']
    friends = user_features['friends_count']
    # Reputation 1: followers / (followings + 1)
    user_features['reputation_score_1'] = followers / (friends + 1)
    # Reputation 2: followers / (followers + followings + 1)
    user_features['reputation_score_2'] = followers / (followers + friends + 1)

    # Number of tweets the user has liked ("user favorites")
    user_features['favourites_count'] = user['favourites_count']

    # TODO: account age in days, following rate, favorite rate, user engagement
    #       (each divided by account age + 1) and response time decay between
    #       context and source tweet are not implemented yet.

    user_features['verified'] = user['verified']
    user_features['geo_enabled'] = user['geo_enabled']

    # Description length is measured with len() on the string, i.e. in characters;
    # a missing or None description counts as length 0.
    description = user['description'] if 'description' in user else None
    length_description = len(description) if description is not None else 0
    user_features['has_description'] = length_description > 0
    user_features['length_description'] = length_description

    # Merge: tweet features first, then user features
    return {**tweet_features, **user_features}


    

## Preprocessing

1--> Expanding contractions (decontraction)

2--> Dealing with HashTags

3--> Removing URLs and Email

4--> Removing Stopwords and Lemmatization

In [None]:
def decontracted(phrase):
    """Expand common English contractions in `phrase`.

    The substitutions are applied one after another in the order listed, so
    the two irregular forms ("won't", "can't") are handled before the generic
    "n't" rule can touch them.

    Args:
        phrase: Input text.

    Returns:
        str: The text with the contractions expanded.
    """
    substitutions = (
        # specific
        (r"won\'t", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in substitutions:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase

In [None]:
import spacy
import re
nlp = spacy.load("en_core_web_sm")

def preprocessing(text):
    """Normalise a raw tweet into a space-separated string of lemmas.

    Steps, in order:
      1. drop '#' characters (hashtag text is kept),
      2. expand contractions via `decontracted`,
      3. remove whitespace-delimited tokens containing '@' (mentions, e-mails),
      4. remove URLs,
      5. lower-case and replace every character outside a-z with a space,
      6. run the spaCy pipeline `nlp` and keep the lemma of each token that is
         not a stop word and is longer than two characters.

    Args:
        text: Raw tweet text.

    Returns:
        str: The cleaned text; may be empty.
    """
    text = text.replace('#', '')
    text = decontracted(text)
    text = re.sub(r'\S*@\S*\s?', '', text)
    text = re.sub(r'http[s]?:(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)

    # The previous class '[^A-z]' spans the ASCII range 'A'..'z', which also
    # contains '[', '\', ']', '^', '_' and '`', so those characters survived the
    # cleaning. The text is lower-cased first, so 'a-z' is the intended set.
    text = re.sub(r'[^a-z]', ' ', text.lower())

    doc = nlp(text)
    lemmas = [tok.lemma_ for tok in doc if not tok.is_stop and len(tok) > 2]
    return ' '.join(lemmas).strip()

In [None]:
def load_data(data_file, label_file):
    """Load a JSON-lines file of tweet threads into a feature DataFrame.

    Each non-blank line of `data_file` is a JSON list of tweets; the first
    element is treated as the source tweet and the remaining ones as follow-up
    tweets. One row is produced per line.

    Args:
        data_file: Path to the JSON-lines file with one tweet thread per line.
        label_file: Path to a JSON file mapping tweet id (as a string) to
            "rumour"/"non-rumour", or None when no labels are available.

    Returns:
        pandas.DataFrame: One row per thread with the source tweet 'id', the
        features from `extract_features`, a 'text' column holding the
        preprocessed source text followed by the preprocessed follow-up texts
        (each terminated by " [SEP] "), and a 'label' column when `label_file`
        is given.

    Raises:
        KeyError: If a labelled tweet id is missing from the label file.
    """
    y_true = None
    if label_file is not None:
        # Context manager so the label file handle is closed deterministically.
        with open(label_file, 'r') as labels_fh:
            y_true = json.load(labels_fh)

    data_list = []
    with open(data_file, 'r') as data_fh:
        for line in data_fh:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            tweets_in_event = json.loads(line)
            source_tweet = tweets_in_event[0]

            tweet = {'id': source_tweet['id']}
            tweet.update(extract_features(source_tweet))

            # Preprocessed text of every follow-up tweet, each followed by a separator.
            follow_up_tweets = "".join(
                preprocessing(reply['text']) + " [SEP] "
                for reply in tweets_in_event[1:]
            )
            tweet['text'] = preprocessing(tweet['text']) + " [SEP] " + follow_up_tweets

            if label_file is not None:
                tweet['label'] = convert_label(y_true[str(tweet['id'])])

            data_list.append(tweet)

    return pd.DataFrame(data_list)


    

In [None]:
# Labelled splits: training and development data with their label files.
train_df = load_data(
    data_file='../data/train.data.jsonl',
    label_file='../data/train.label.json',
)
dev_df = load_data(
    data_file='../data/dev.data.jsonl',
    label_file='../data/dev.label.json',
)


In [None]:
# Workaround to make size of training dataset even 
# Better to set drop_last = True in Pytorch DataLoader

#train_df.drop(train_df.tail(1).index,inplace=True)
#len(train_df)

In [None]:
# Unlabelled test split: no label file, so the frame has no 'label' column.
test_df = load_data(
    data_file='../data/test.data.jsonl',
    label_file=None,
)

In [None]:
# Pre-processing applied directly
#train_df.text = train_df.text.apply(lambda x : preprocessing(x))
#dev_df.text = dev_df.text.apply(lambda x : preprocessing(x))
#test_df.text = test_df.text.apply(lambda x : preprocessing(x))

#train_df.text = train_df.follow_up_tweets.apply(lambda x : preprocessing(x))
#dev_df.text = dev_df.follow_up_tweets.apply(lambda x : preprocessing(x))
#test_df.text = test_df.follow_up_tweets.apply(lambda x : preprocessing(x))

In [None]:
#train_df.to_csv('train.csv')
#dev_df.to_csv('val.csv')
#test_df.to_csv('test.csv')

In [None]:
import matplotlib.pyplot as plt

# Histogram of the integer labels in the training frame, drawn on explicit axes.
fig, ax = plt.subplots()
ax.hist(train_df.label)
ax.set_title('Train Data')
ax.set_xlabel('Target Distribution')
ax.set_ylabel('Samples')
plt.show()

In [None]:
# Stack train and dev rows into one frame with a fresh 0..n-1 index.
# `DataFrame.append` is deprecated (removed in pandas 2.0); `pd.concat` is the
# supported equivalent.
combined_df = pd.concat([train_df, dev_df], ignore_index=True)
# Preview only the first rows instead of rendering the whole frame.
combined_df.head()

In [None]:
# Inspect the processed text of the first row (index label 0).
combined_df.loc[0, 'text']

## Multi-modal BERT

In [18]:
@dataclass
class ModelArguments:
    """Selects the pretrained model, config and tokenizer to fine-tune from.

    Only `model_name_or_path` is required; the other fields default to None.
    """

    # Required: model path or hub identifier.
    model_name_or_path: str = field(
        metadata={
            "help": "Path to pretrained model or model identifier from huggingface.co/models"
        },
    )
    # Optional override for the config location.
    config_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained config name or path if not the same as model_name"
        },
    )
    # Optional override for the tokenizer location.
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={
            "help": "Pretrained tokenizer name or path if not the same as model_name"
        },
    )
    # Optional download cache directory.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={
            "help": "Where do you want to store the pretrained models downloaded from s3"
        },
    )


@dataclass
class MultimodalDataTrainingArguments:
    """
    Arguments pertaining to how we combine tabular features.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.

    The column description is supplied either directly through `column_info`
    or through `column_info_path`, a JSON file that is loaded into
    `column_info` after initialisation. The post-init assertion rejects
    instances where both are left at their default of None.
    """

    data_path: str = field(metadata={
                            'help': 'the path to the csv file containing the dataset'
                        })
    # Optional because the default is None (column_info may be passed instead).
    column_info_path: Optional[str] = field(
      default=None,
      metadata={
          'help': 'the path to the json file detailing which columns are text, categorical, numerical, and the label'
    })

    # Optional because the default is None (column_info_path may be passed instead).
    column_info: Optional[dict] = field(
      default=None,
      metadata={
          'help': 'a dict referencing the text, categorical, numerical, and label columns'
                  'its keys are text_cols, num_cols, cat_cols, and label_col'
    })

    categorical_encode_type: str = field(default='ohe',
                                        metadata={
                                            'help': 'sklearn encoder to use for categorical data',
                                            'choices': ['ohe', 'binary', 'label', 'none']
                                        })
    numerical_transformer_method: str = field(default='yeo_johnson',
                                            metadata={
                                                'help': 'sklearn numerical transformer to preprocess numerical data',
                                                'choices': ['yeo_johnson', 'box_cox', 'quantile_normal', 'none']
                                            })
    task: str = field(default="classification",
                    metadata={
                        "help": "The downstream training task",
                        "choices": ["classification", "regression"]
                    })

    mlp_division: int = field(default=4,
                            metadata={
                                'help': 'the ratio of the number of '
                                        'hidden dims in a current layer to the next MLP layer'
                            })
    combine_feat_method: str = field(default='individual_mlps_on_cat_and_numerical_feats_then_concat',
                                    metadata={
                                        'help': 'method to combine categorical and numerical features, '
                                                'see README for all the method'
                                    })
    mlp_dropout: float = field(default=0.1,
                              metadata={
                                'help': 'dropout ratio used for MLP layers'
                              })
    numerical_bn: bool = field(default=True,
                              metadata={
                                  'help': 'whether to use batchnorm on numerical features'
                              })
    # Annotated as bool to match its default (it was previously declared `str`).
    use_simple_classifier: bool = field(default=True,
                                      metadata={
                                          'help': 'whether to use single layer or MLP as final classifier'
                                      })
    mlp_act: str = field(default='relu',
                        metadata={
                            'help': 'the activation function to use for finetuning layers',
                            'choices': ['relu', 'prelu', 'sigmoid', 'tanh', 'linear']
                        })
    gating_beta: float = field(default=0.2,
                              metadata={
                                  'help': "the beta hyperparameters used for gating tabular data "
                                          "see https://www.aclweb.org/anthology/2020.acl-main.214.pdf"
                              })

    def __post_init__(self):
        """Validate the column description and load it from disk if needed.

        Raises:
            AssertionError: If `column_info` and `column_info_path` compare
                equal, which is the case when both are left as None.
        """
        assert self.column_info != self.column_info_path, (
            'provide either column_info or column_info_path'
        )
        # An explicitly passed dict wins; the file is only read when no dict was given.
        if self.column_info is None and self.column_info_path:
            with open(self.column_info_path, 'r') as f:
                self.column_info = json.load(f)

In [136]:
# Column roles handed to the multimodal data loader.
text_cols = ['text']
cat_cols = ['question_mark', 'contains_url', 'contains_media', 'contains_profile_background_image', 'verified', 'geo_enabled', 'has_description']
# Every name here must be a column produced by `load_data` / `extract_features`.
# The list previously also named 'follow_tweets', which neither function creates.
numerical_cols = ['retweet_count', 'favorite_count', 'number_urls','statuses_count', 'listed_count', 'reputation_score_1', 'reputation_score_2', 'favourites_count','length_description']

# Fail early, with a readable message, if a configured column is absent.
missing_cols = [col for col in text_cols + cat_cols + numerical_cols if col not in train_df.columns]
assert not missing_cols, f"columns missing from train_df: {missing_cols}"

column_info_dict = {
    'text_cols': text_cols,
    'num_cols': numerical_cols,
    'cat_cols': cat_cols,
    'label_col': 'label',
    'label_list': [0, 1]
}


model_args = ModelArguments(
    model_name_or_path='bert-base-uncased'
)

data_args = MultimodalDataTrainingArguments(
    data_path='.',
    combine_feat_method='gating_on_cat_and_num_feats_then_sum',
    column_info=column_info_dict,
    task='classification'
)

training_args = TrainingArguments(
    output_dir="./logs/model_name",
    logging_dir="./logs/runs",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=4,
    num_train_epochs=5,
    evaluate_during_training=True,
    logging_steps=25,
    eval_steps=250,
    dataloader_drop_last=True
)

# Seed with the value carried by the training arguments.
set_seed(training_args.seed)

In [137]:
# Prefer an explicitly configured tokenizer; otherwise reuse the model identifier.
tokenizer_path_or_name = model_args.tokenizer_name or model_args.model_name_or_path
print('Specified tokenizer: ', tokenizer_path_or_name)

# TODO: Check if tokens should be converted to lower-case?!
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_path_or_name,
    cache_dir=model_args.cache_dir,
)

Specified tokenizer:  bert-base-uncased


In [138]:
# Build the train / dev / test datasets from the three DataFrames, using the
# column roles stored in `data_args.column_info`.
train_dataset, dev_dataset, test_dataset = load_data_from_folder(
    train_df,
    dev_df,
    test_df,
    data_args.column_info['text_cols'],
    tokenizer,
    categorical_cols=data_args.column_info['cat_cols'],
    numerical_cols=data_args.column_info['num_cols'],
    label_col=data_args.column_info['label_col'],
    label_list=data_args.column_info['label_list'],
    sep_text_token_str=tokenizer.sep_token,
)

INFO:multimodal_transformers.data.data_utils:9 numerical columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
INFO:multimodal_transformers.data.data_utils:20 categorical columns
INFO:multimodal_transformers.data.data_utils:9 numerical columns
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
INFO:multimodal_transformers.data.load_data:Text columns: ['text']
INFO:multimodal_transformers.data.load_data:Raw text example: sep jews label like anti semite campaign person company finish sep sep imcharliehebdo sep ditto sep i

In [139]:
dev_dataset

<multimodal_transformers.data.tabular_torch_dataset.TorchTabularTextDataset at 0x7fc68ba77110>

In [140]:
# Number of distinct label values present in the training dataset.
num_labels = np.unique(train_dataset.labels).size
num_labels

2

In [141]:
# Load the base config; an explicit config name takes precedence over the model identifier.
config = AutoConfig.from_pretrained(
    model_args.config_name or model_args.model_name_or_path,
    cache_dir=model_args.cache_dir,
)

# Describe the tabular side: feature widths come from the training dataset,
# everything else is forwarded from the data arguments.
tabular_config = TabularConfig(
    num_labels=num_labels,
    numerical_feat_dim=train_dataset.numerical_feats.shape[1],
    cat_feat_dim=train_dataset.cat_feats.shape[1],
    **vars(data_args),
)
config.tabular_config = tabular_config

In [142]:
# Load the pretrained weights from the model identifier. The previous code
# passed `config_name` here when it was set, but that field names the *config*
# location (see its help text in ModelArguments); the config is already
# supplied separately through `config=`.
model = AutoModelWithTabular.from_pretrained(
    model_args.model_name_or_path,
    config=config,
    cache_dir=model_args.cache_dir,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertWithTabular: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertWithTabular from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertWithTabular from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertWithTabular were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifi

In [143]:
import numpy as np
from scipy.special import softmax
from sklearn.metrics import (
    auc,
    precision_recall_curve,
    roc_auc_score,
    f1_score,
    confusion_matrix,
    matthews_corrcoef,
)

def calc_classification_metrics(p: EvalPrediction):
    """Compute evaluation metrics from raw model outputs.

    Args:
        p: Evaluation output; `p.predictions` is indexed as a 2-D score array
            (argmax / softmax over axis 1) and `p.label_ids` holds the labels.

    Returns:
        dict: When the labels contain exactly two distinct values: ROC AUC,
        PR AUC, the decision threshold that maximises F1 on the
        precision-recall curve together with the recall / precision / F1 at
        that threshold, and the confusion-matrix counts of the argmax
        predictions. Otherwise: accuracy, F1, their mean, and the Matthews
        correlation coefficient of the argmax predictions.
    """
    logits = p.predictions
    y_true = p.label_ids
    y_pred = np.argmax(logits, axis=1)
    # Softmax score of column 1 (the positive class)
    positive_scores = softmax(logits, axis=1)[:, 1]

    if len(np.unique(y_true)) != 2:
        # Not a two-class label set: report the simple summary metrics.
        accuracy = (y_pred == y_true).mean()
        f1 = f1_score(y_true=y_true, y_pred=y_pred)
        return {
            "acc": accuracy,
            "f1": f1,
            "acc_and_f1": (accuracy + f1) / 2,
            "mcc": matthews_corrcoef(y_true, y_pred)
        }

    # Binary classification
    roc_auc_pred_score = roc_auc_score(y_true, positive_scores)
    precisions, recalls, thresholds = precision_recall_curve(y_true, positive_scores)

    # F1 at every point of the curve; 0/0 yields NaN, which is mapped to 0.
    fscore = (2 * precisions * recalls) / (precisions + recalls)
    fscore[np.isnan(fscore)] = 0
    best = np.argmax(fscore)
    threshold = thresholds[best].item()

    pr_auc = auc(recalls, precisions)
    # Counts come from the argmax predictions, not from the tuned threshold.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()

    return {
        'roc_auc': roc_auc_pred_score,
        'threshold': threshold,
        'pr_auc': pr_auc,
        'recall': recalls[best].item(),
        'precision': precisions[best].item(),
        'f1': fscore[best].item(),
        'tn': tn.item(),
        'fp': fp.item(),
        'fn': fn.item(),
        'tp': tp.item()
    }

In [144]:
# TODO: Verify dataloader_drop_last = True

# Wire model, arguments, datasets and the metric callback into the Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    compute_metrics=calc_classification_metrics,
)

In [145]:
# `torch` is not imported anywhere else in this notebook, so without this
# import the call below raises NameError on a fresh kernel.
import torch

# Release cached GPU memory held by the PyTorch allocator before training.
torch.cuda.empty_cache()

In [146]:
%%time
trainer.train()

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/1160 [00:00<?, ?it/s]

{'loss': 0.7329568481445312, 'learning_rate': 4.978448275862069e-05, 'epoch': 0.021551724137931036, 'step': 25}
{'loss': 0.6096640014648438, 'learning_rate': 4.9568965517241384e-05, 'epoch': 0.04310344827586207, 'step': 50}
{'loss': 0.6685610961914062, 'learning_rate': 4.935344827586207e-05, 'epoch': 0.06465517241379311, 'step': 75}
{'loss': 0.6038140869140625, 'learning_rate': 4.913793103448276e-05, 'epoch': 0.08620689655172414, 'step': 100}
{'loss': 0.642603759765625, 'learning_rate': 4.892241379310345e-05, 'epoch': 0.10775862068965517, 'step': 125}
{'loss': 0.6176510620117187, 'learning_rate': 4.870689655172414e-05, 'epoch': 0.12931034482758622, 'step': 150}
{'loss': 0.6286431884765625, 'learning_rate': 4.849137931034483e-05, 'epoch': 0.15086206896551724, 'step': 175}
{'loss': 0.5317636108398438, 'learning_rate': 4.827586206896552e-05, 'epoch': 0.1724137931034483, 'step': 200}
{'loss': 0.6091281127929687, 'learning_rate': 4.806034482758621e-05, 'epoch': 0.1939655172413793, 'step': 2

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.571140755381849, 'eval_roc_auc': 0.7157981803143094, 'eval_threshold': 0.39266160130500793, 'eval_pr_auc': 0.538579544613773, 'eval_recall': 0.6505376344086021, 'eval_precision': 0.5170940170940171, 'eval_f1': 0.5761904761904763, 'eval_tn': 330, 'eval_fp': 60, 'eval_fn': 112, 'eval_tp': 74, 'epoch': 0.21551724137931033, 'step': 250}
{'loss': 0.63719970703125, 'learning_rate': 4.762931034482759e-05, 'epoch': 0.23706896551724138, 'step': 275}
{'loss': 0.57749755859375, 'learning_rate': 4.741379310344828e-05, 'epoch': 0.25862068965517243, 'step': 300}
{'loss': 0.6854901123046875, 'learning_rate': 4.719827586206897e-05, 'epoch': 0.2801724137931034, 'step': 325}
{'loss': 0.6259033203125, 'learning_rate': 4.698275862068966e-05, 'epoch': 0.3017241379310345, 'step': 350}
{'loss': 0.6268218994140625, 'learning_rate': 4.6767241379310346e-05, 'epoch': 0.3232758620689655, 'step': 375}
{'loss': 0.540374755859375, 'learning_rate': 4.655172413793104e-05, 'epoch': 0.3448275862068966, '

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]



{'eval_loss': 0.5854036925981442, 'eval_roc_auc': 0.7018748276812793, 'eval_threshold': 0.2108280509710312, 'eval_pr_auc': 0.5062255826405996, 'eval_recall': 0.7634408602150538, 'eval_precision': 0.461038961038961, 'eval_f1': 0.5748987854251013, 'eval_tn': 329, 'eval_fp': 61, 'eval_fn': 123, 'eval_tp': 63, 'epoch': 0.43103448275862066, 'step': 500}
{'loss': 0.64150390625, 'learning_rate': 4.5474137931034485e-05, 'epoch': 0.4525862068965517, 'step': 525}
{'loss': 0.522335205078125, 'learning_rate': 4.5258620689655176e-05, 'epoch': 0.47413793103448276, 'step': 550}
{'loss': 0.650234375, 'learning_rate': 4.504310344827587e-05, 'epoch': 0.4956896551724138, 'step': 575}
{'loss': 0.626070556640625, 'learning_rate': 4.482758620689655e-05, 'epoch': 0.5172413793103449, 'step': 600}
{'loss': 0.642850341796875, 'learning_rate': 4.461206896551724e-05, 'epoch': 0.5387931034482759, 'step': 625}
{'loss': 0.655572509765625, 'learning_rate': 4.4396551724137933e-05, 'epoch': 0.5603448275862069, 'step': 

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]



{'eval_loss': 0.5469962555087275, 'eval_roc_auc': 0.7555004135649298, 'eval_threshold': 0.23558713495731354, 'eval_pr_auc': 0.5979558263838503, 'eval_recall': 0.7688172043010753, 'eval_precision': 0.5070921985815603, 'eval_f1': 0.611111111111111, 'eval_tn': 372, 'eval_fp': 18, 'eval_fn': 142, 'eval_tp': 44, 'epoch': 0.646551724137931, 'step': 750}
{'loss': 0.760955810546875, 'learning_rate': 4.331896551724138e-05, 'epoch': 0.6681034482758621, 'step': 775}
{'loss': 0.56223388671875, 'learning_rate': 4.3103448275862066e-05, 'epoch': 0.6896551724137931, 'step': 800}
{'loss': 0.5667236328125, 'learning_rate': 4.288793103448276e-05, 'epoch': 0.7112068965517241, 'step': 825}
{'loss': 0.672664794921875, 'learning_rate': 4.267241379310345e-05, 'epoch': 0.7327586206896551, 'step': 850}
{'loss': 0.5685498046875, 'learning_rate': 4.245689655172414e-05, 'epoch': 0.7543103448275862, 'step': 875}
{'loss': 0.59909912109375, 'learning_rate': 4.224137931034483e-05, 'epoch': 0.7758620689655172, 'step': 

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]



{'eval_loss': 0.5283502750098705, 'eval_roc_auc': 0.7745795423214777, 'eval_threshold': 0.29524073004722595, 'eval_pr_auc': 0.6061894631737952, 'eval_recall': 0.7688172043010753, 'eval_precision': 0.5375939849624061, 'eval_f1': 0.6327433628318584, 'eval_tn': 359, 'eval_fp': 31, 'eval_fn': 118, 'eval_tp': 68, 'epoch': 0.8620689655172413, 'step': 1000}
{'loss': 0.59927734375, 'learning_rate': 4.116379310344828e-05, 'epoch': 0.8836206896551724, 'step': 1025}
{'loss': 0.5568017578125, 'learning_rate': 4.094827586206897e-05, 'epoch': 0.9051724137931034, 'step': 1050}
{'loss': 0.570390625, 'learning_rate': 4.073275862068966e-05, 'epoch': 0.9267241379310345, 'step': 1075}
{'loss': 0.60729248046875, 'learning_rate': 4.0517241379310344e-05, 'epoch': 0.9482758620689655, 'step': 1100}
{'loss': 0.5515869140625, 'learning_rate': 4.0301724137931035e-05, 'epoch': 0.9698275862068966, 'step': 1125}
{'loss': 0.5043212890625, 'learning_rate': 4.0086206896551726e-05, 'epoch': 0.9913793103448276, 'step': 1

Iteration:   0%|          | 0/1160 [00:00<?, ?it/s]

{'loss': 0.57208251953125, 'learning_rate': 3.9870689655172416e-05, 'epoch': 1.0129310344827587, 'step': 1175}
{'loss': 0.55347900390625, 'learning_rate': 3.965517241379311e-05, 'epoch': 1.0344827586206897, 'step': 1200}
{'loss': 0.48760986328125, 'learning_rate': 3.94396551724138e-05, 'epoch': 1.0560344827586208, 'step': 1225}
{'loss': 0.63018798828125, 'learning_rate': 3.922413793103448e-05, 'epoch': 1.0775862068965518, 'step': 1250}


Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.5352216431250175, 'eval_roc_auc': 0.7588089330024815, 'eval_threshold': 0.26584872603416443, 'eval_pr_auc': 0.5922371253809741, 'eval_recall': 0.7795698924731183, 'eval_precision': 0.5197132616487455, 'eval_f1': 0.6236559139784947, 'eval_tn': 355, 'eval_fp': 35, 'eval_fn': 119, 'eval_tp': 67, 'epoch': 1.0775862068965518, 'step': 1250}
{'loss': 0.53931396484375, 'learning_rate': 3.9008620689655174e-05, 'epoch': 1.0991379310344827, 'step': 1275}
{'loss': 0.62081787109375, 'learning_rate': 3.8793103448275865e-05, 'epoch': 1.1206896551724137, 'step': 1300}
{'loss': 0.60524169921875, 'learning_rate': 3.8577586206896555e-05, 'epoch': 1.1422413793103448, 'step': 1325}
{'loss': 0.6106298828125, 'learning_rate': 3.8362068965517246e-05, 'epoch': 1.1637931034482758, 'step': 1350}
{'loss': 0.52974853515625, 'learning_rate': 3.814655172413794e-05, 'epoch': 1.1853448275862069, 'step': 1375}
{'loss': 0.6096630859375, 'learning_rate': 3.793103448275862e-05, 'epoch': 1.206896551724138, 

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]

{'eval_loss': 0.529348997813132, 'eval_roc_auc': 0.7674524400330853, 'eval_threshold': 0.36584705114364624, 'eval_pr_auc': 0.6051101215550081, 'eval_recall': 0.6344086021505376, 'eval_precision': 0.6178010471204188, 'eval_f1': 0.6259946949602122, 'eval_tn': 359, 'eval_fp': 31, 'eval_fn': 116, 'eval_tp': 70, 'epoch': 1.293103448275862, 'step': 1500}
{'loss': 0.679052734375, 'learning_rate': 3.685344827586207e-05, 'epoch': 1.3146551724137931, 'step': 1525}
{'loss': 0.62488525390625, 'learning_rate': 3.663793103448276e-05, 'epoch': 1.3362068965517242, 'step': 1550}
{'loss': 0.547412109375, 'learning_rate': 3.642241379310345e-05, 'epoch': 1.3577586206896552, 'step': 1575}
{'loss': 0.66468505859375, 'learning_rate': 3.620689655172414e-05, 'epoch': 1.3793103448275863, 'step': 1600}
{'loss': 0.65642578125, 'learning_rate': 3.5991379310344833e-05, 'epoch': 1.4008620689655173, 'step': 1625}
{'loss': 0.6310986328125, 'learning_rate': 3.5775862068965524e-05, 'epoch': 1.4224137931034484, 'step': 1

Evaluation:   0%|          | 0/72 [00:00<?, ?it/s]



{'eval_loss': 0.5372722078528669, 'eval_roc_auc': 0.7675765095119934, 'eval_threshold': 0.3246932923793793, 'eval_pr_auc': 0.6059458167644938, 'eval_recall': 0.6182795698924731, 'eval_precision': 0.6149732620320856, 'eval_f1': 0.6166219839142091, 'eval_tn': 371, 'eval_fp': 19, 'eval_fn': 133, 'eval_tp': 53, 'epoch': 1.5086206896551724, 'step': 1750}
{'loss': 0.72595703125, 'learning_rate': 3.4698275862068966e-05, 'epoch': 1.5301724137931034, 'step': 1775}
{'loss': 0.585771484375, 'learning_rate': 3.4482758620689657e-05, 'epoch': 1.5517241379310345, 'step': 1800}
{'loss': 0.5639697265625, 'learning_rate': 3.426724137931035e-05, 'epoch': 1.5732758620689655, 'step': 1825}
{'loss': 0.553359375, 'learning_rate': 3.405172413793103e-05, 'epoch': 1.5948275862068966, 'step': 1850}
{'loss': 0.5990087890625, 'learning_rate': 3.383620689655172e-05, 'epoch': 1.6163793103448276, 'step': 1875}
{'loss': 0.660166015625, 'learning_rate': 3.3620689655172414e-05, 'epoch': 1.6379310344827587, 'step': 1900}

KeyboardInterrupt: 

In [147]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [148]:
%tensorboard --logdir ./logs/runs

### Inference

In [149]:
# Run inference on the test dataset with the fine-tuned multimodal model.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: Input contains NaN, infinity or a value too large for dtype('float64')".
# The cause is not visible from this notebook — TODO confirm whether the
# unlabelled test set reaches the metric computation or the features contain NaN.
result = trainer.predict(test_dataset=test_dataset)

Prediction:   0%|          | 0/72 [00:00<?, ?it/s]

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
result

In [None]:
# `result` is the object returned by `trainer.predict`, not a sequence of class
# ids: the per-row scores live in `result.predictions`. Take the argmax over
# the class axis (as `calc_classification_metrics` does) and map each class id
# to its textual label.
predicted_labels = [
    convert_prediction(int(class_id))
    for class_id in np.argmax(result.predictions, axis=1)
]


In [None]:
# Pair every test tweet id with its predicted textual label.
output = pd.DataFrame({'id': test_df['id'], 'target': predicted_labels})
output

In [None]:
# Build the {tweet id: label} mapping and write it out as JSON.
id_to_label = pd.Series(data=output.target.values, index=output.id)
submission = id_to_label.to_dict()
with open('test-output.json', 'w') as out_file:
    json.dump(submission, out_file)

## Text-only BERT

## Loading BertTokenizer

Load tokenizer based on wordpiece approach

In [None]:
# TODO: Check whether transformers is allowed
from transformers import BertTokenizer

# NOTE: this rebinds the notebook-level name `tokenizer`, which the multimodal
# section above assigned from AutoTokenizer; cells below use this instance.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [None]:
import numpy as np
import tensorflow as tf 

## BERT encoding

Data is encoded according to BERT's input requirements. The tokenizer class provides a very helpful method, `encode_plus`, which performs the following operations in a single call:

- Tokenize the text

- Add special tokens - [CLS] and [SEP]

- Create token IDs

- Pad the sentences to a common length

- Create attention masks for the above PAD tokens

In [None]:
def bert_encode(data, maximum_length):
    """Encode every entry of `data.text` into fixed-length BERT inputs.

    Uses the notebook-level `tokenizer`. Each text is wrapped in special
    tokens, truncated to `maximum_length` and padded up to `maximum_length`.

    Args:
        data: Object exposing an iterable `text` attribute (e.g. a DataFrame
            with a 'text' column).
        maximum_length: Target sequence length for truncation and padding.

    Returns:
        tuple: `(input_ids, attention_masks)` as two NumPy arrays with one row
        per text.
    """
    input_ids = []
    attention_masks = []

    # Iterate over the values themselves rather than `data.text[i]`: the latter
    # is a label lookup and fails on frames whose index is not 0..n-1.
    for text in data.text:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

`bert_encode` returns two NumPy arrays. Let me briefly go over them:

1) input_ids : list of token ids to be fed to a model

2) attention_masks: list of 0/1 flags specifying which tokens should be attended to by the model. Real input tokens are marked with 1 and padding tokens with 0, so the mask lets the model tell the two apart.

Note: token type IDs (segment IDs) are not needed here, as they are only used in two-sentence problems (to differentiate the two sentences).

In [None]:
# Encode both frames to BERT inputs of 512 tokens, the input length
# `create_model` expects.
combined_input_ids, combined_attention_masks = bert_encode(combined_df, maximum_length=512)
test_input_ids, test_attention_masks = bert_encode(test_df, maximum_length=512)

In [None]:
combined_input_ids

In [None]:
len(combined_input_ids)

## Creating Custom Model

Base TFBert Model with Dense layer and sigmoid activation as head.

In [None]:
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.optimizers import Adam

def create_model(bert_model, max_length=512, learning_rate=6e-6):
    """Build and compile a binary classifier on top of a BERT encoder.

    Architecture: encoder output [1] -> Dense(32, relu) -> Dropout(0.2)
    -> Dense(1, sigmoid).

    Args:
        bert_model: Callable Keras BERT encoder that accepts
            `[input_ids, attention_masks]`.
        max_length: Length of the token id / attention mask inputs; must match
            the length used when encoding the data. Defaults to 512.
        learning_rate: Adam learning rate. Defaults to 6e-6.

    Returns:
        tf.keras.Model: Compiled model with binary cross-entropy loss and
        accuracy, precision and recall metrics.
    """
    input_ids = tf.keras.Input(shape=(max_length,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_length,), dtype='int32')

    encoder_outputs = bert_model([input_ids, attention_masks])
    # Element 1 of the encoder output is used as the sequence representation
    # (presumably the pooled output for TFBertModel — TODO confirm).
    sequence_repr = encoder_outputs[1]

    hidden = tf.keras.layers.Dense(32, activation='relu')(sequence_repr)
    hidden = tf.keras.layers.Dropout(0.2)(hidden)
    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=probability)
    model.compile(
        # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
        optimizer=Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
    )
    return model

## TFBertModel

The bare BERT model transformer outputting raw hidden states without any specific head on top. https://huggingface.co/transformers/model_doc/bert.html#tfbertmodel

In [None]:
#torch.cuda.empty_cache()

In [None]:
from transformers import TFBertModel

# Load the pretrained TensorFlow BERT encoder that `create_model` wraps.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

## Implementing custom model

In [None]:
# Wrap the pretrained encoder with the classification head and print the layer summary.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the multimodal model.
model = create_model(bert_model)
model.summary()

## Training

Fit for **5 epochs**:

In [None]:
# Stop training once the monitored quantity has not improved for 3 consecutive epochs.
# NOTE(review): monitor='loss' watches the training loss, not 'val_loss' — confirm this is intended.
callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=3)

In [None]:
# Fine-tune on the combined train+dev data, holding out 10% of it for validation.
history = model.fit(
    [combined_input_ids, combined_attention_masks],
    combined_df.label,
    batch_size=4,
    epochs=5,
    validation_split=0.1,
    callbacks=[callback],
)

In [None]:
# Pull the per-epoch training / validation curves out of the fit history.
acc, val_acc, loss, val_loss = (
    history.history[metric]
    for metric in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)

In [None]:
import matplotlib.pyplot as plt

epochs = range(1, len(acc) + 1)

# Figure 1: accuracy per epoch (dots = training, line = validation)
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.legend()

# Figure 2: loss per epoch (dots = training, line = validation)
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.legend()

plt.show()

## Testing

In [None]:
# Predict sigmoid probabilities for the test inputs, then round them to 0/1 class ids.
test_probabilities = model.predict([test_input_ids, test_attention_masks])
result = np.round(test_probabilities).astype(int)

In [None]:
# Map every rounded prediction to its textual label.
predicted_labels = list(map(convert_prediction, result))

In [None]:
# Pair every test tweet id with its predicted textual label.
output = pd.DataFrame({'id': test_df['id'], 'target': predicted_labels})
output

In [None]:
# Build the {tweet id: label} mapping and write it out as JSON.
# This writes to the same 'test-output.json' path as the multimodal section.
id_to_label = pd.Series(data=output.target.values, index=output.id)
submission = id_to_label.to_dict()
with open('test-output.json', 'w') as out_file:
    json.dump(submission, out_file)