## BERT

In [None]:
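# Run once: clone the TensorFlow Model Garden into ./models so that the
# `official` package imported below can be found via sys.path.
# !git clone --depth 1 -b v2.3.0 https://github.com/tensorflow/models.git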

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub  as hub
import sys
sys.path.append('models')
from sklearn.model_selection import train_test_split
from official.nlp.data import classifier_data_lib
from official.nlp.bert import tokenization
from official.nlp import optimization
import seaborn as sns

In [None]:
# Load the training CSV; later cells read its `text` and `target` columns.
df = pd.read_csv(
    filepath_or_buffer=r'C:\Users\Admin\tweet\train.csv',
)

## Data preprocessing

In [None]:
# Hold out 10% of the rows for validation. A fixed seed keeps the split, and
# therefore every reported validation metric, reproducible across notebook
# runs instead of changing each time this cell is executed.
train_df, eval_df = train_test_split(df, test_size=0.10, random_state=42)

In [None]:
# Wrap each split as a tf.data.Dataset of (text, target) pairs, one element
# per DataFrame row.
train_data = tf.data.Dataset.from_tensor_slices(
    (train_df["text"].values, train_df["target"].values)
)
valid_data = tf.data.Dataset.from_tensor_slices(
    (eval_df["text"].values, eval_df["target"].values)
)

In [None]:
# Label values forwarded to the feature converter (binary task).
label_list = [0, 1]
# Sequence length used for the encoded features and the model's input shape.
max_seq_length = 150

# Pre-trained BERT encoder from TF Hub; trainable=True so its weights are
# updated during fine-tuning.
bert_layer = hub.KerasLayer(
    'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2',
    trainable=True,
)

# Build the tokenizer from the vocabulary file and lower-casing flag that are
# stored on the loaded hub module.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [None]:
def to_feature(text, label, label_list=label_list,
               max_seq_length=max_seq_length, tokenizer=tokenizer):
    """Convert one (text, label) pair into BERT input features.

    Both `text` and `label` must support `.numpy()` (i.e. be eager tensors),
    which is why this function is called through `tf.py_function`.

    Args:
        text: Tensor holding the raw text of a single example.
        label: Tensor holding the label of that example.
        label_list: Label values forwarded to the feature converter.
        max_seq_length: Sequence length forwarded to the feature converter.
        tokenizer: Tokenizer forwarded to the feature converter.

    Returns:
        A tuple `(input_ids, input_mask, segment_ids, label_id)` read from the
        converted feature object.
    """
    # Single-sentence example: no guid and no second text segment.
    single_example = classifier_data_lib.InputExample(
        guid=None,
        text_a=text.numpy(),
        text_b=None,
        label=label.numpy(),
    )
    # The leading 0 is passed as the first positional argument for every
    # example; the remaining arguments are forwarded unchanged.
    converted = classifier_data_lib.convert_single_example(
        0, single_example, label_list, max_seq_length, tokenizer
    )
    return (
        converted.input_ids,
        converted.input_mask,
        converted.segment_ids,
        converted.label_id,
    )
    

In [None]:
def to_feature_map(text, label):
    """Graph-mode wrapper around `to_feature` for use with `Dataset.map`.

    Runs `to_feature` via `tf.py_function`, restores the static shapes that
    `py_function` does not provide, and packs the three input tensors into a
    dict keyed by the model's input names.

    Args:
        text: Text tensor for one example.
        label: Label tensor for one example.

    Returns:
        A `(features, label_id)` pair, where `features` is a dict with keys
        'input_word_ids', 'input_mask' and 'input_type_ids'.
    """
    encoded = tf.py_function(
        to_feature,
        inp=[text, label],
        Tout=[tf.int32] * 4,
    )
    input_ids, input_mask, segment_ids, label_id = encoded

    # py_function outputs carry no static shape, so set them explicitly:
    # the three sequences are vectors of length max_seq_length, the label
    # is a scalar.
    for sequence_tensor in (input_ids, input_mask, segment_ids):
        sequence_tensor.set_shape([max_seq_length])
    label_id.set_shape([])

    features = {
        'input_word_ids': input_ids,
        'input_mask': input_mask,
        'input_type_ids': segment_ids,
    }
    return (features, label_id)
 

In [None]:
# Training pipeline: encode every example, shuffle with a 1000-element
# buffer, then emit fixed-size batches of 32 and prefetch.
train_data = (
    train_data
    .map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(1000)
    .batch(32, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same encoding, no shuffling. The final partial batch
# is kept (drop_remainder=False) so that every held-out example contributes
# to the validation metrics instead of up to 31 of them being discarded.
valid_data = (
    valid_data
    .map(to_feature_map, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(32, drop_remainder=False)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## Building model

In [None]:
def get_model():
    """Build the binary classifier on top of the shared `bert_layer`.

    Three int32 inputs of length `max_seq_length` are fed to `bert_layer`;
    its pooled output goes through dropout (rate 0.4) and a single sigmoid
    unit.

    Returns:
        A `tf.keras.Model` whose inputs are a dict keyed by 'input_word_ids',
        'input_mask' and 'input_type_ids', and whose output is the sigmoid
        layer named "output".
    """
    input_names = ('input_word_ids', 'input_mask', 'input_type_ids')
    inputs = {
        name: tf.keras.layers.Input(
            shape=(max_seq_length,), dtype=tf.int32, name=name
        )
        for name in input_names
    }

    # bert_layer is called with a list in this fixed order; only the pooled
    # output is used, the per-token sequence output is discarded.
    pooled_output, _ = bert_layer([inputs[name] for name in input_names])

    regularized = tf.keras.layers.Dropout(0.4)(pooled_output)
    output = tf.keras.layers.Dense(
        1, activation='sigmoid', name="output"
    )(regularized)

    return tf.keras.Model(inputs=inputs, outputs=output)

In [None]:
model = get_model()
# The BERT hub layer is trainable, so the whole encoder is being fine-tuned.
# Adam's default learning rate (1e-3) is far too large for that and tends to
# wipe out the pre-trained weights; use a small rate from the range commonly
# recommended for BERT fine-tuning (2e-5 to 5e-5).
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy()],
)
model.summary()

In [None]:
# Fine-tune the BERT classifier for `epochs` passes over the training data,
# evaluating on the validation data during training; `his` keeps the object
# returned by fit().
epochs = 4
his = model.fit(
    x=train_data,
    epochs=epochs,
    validation_data=valid_data,
    verbose=1,
)