<h2 align=center> Fine-Tune BERT for Text Classification with TensorFlow</h2>

In [None]:
#Importing the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

#bert libraries
from transformers import TFAutoModel, BertTokenizer


In [None]:
# Report the versions of the core libraries so results can be reproduced later
for lib_name, lib in (('Pandas', pd), ('Numpy', np), ('Seaborn', sns)):
    print(f'{lib_name} Version: {lib.__version__}')

In [None]:
#nlp = spacy.load("en_core_web_sm")

### Exploratory Data Analysis

In [None]:
# Read the raw CSV and keep only the text column and its target label
df = pd.read_csv(r'df.csv')[['Keyword', 'Label']]
df.head(10)

In [None]:
def get_analysis_values(dataframe, columnname):
    """Print a summary of `dataframe` and plot a histogram of one of its columns.

    Args:
        dataframe: pandas DataFrame to summarise.
        columnname: name of the column whose value counts are printed and plotted.

    Returns:
        None. The summary is printed and the histogram is drawn on a new figure.
    """
    print(f'Shape of the dataframe is {dataframe.shape}')
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would emit an extra "None" line.
    dataframe.info()
    print('Data labels Distribution')
    print(dataframe[columnname].value_counts())
    print()
    plt.figure(figsize=(10, 5))
    plt.hist(dataframe[columnname], color="skyblue", lw=0)
    plt.xlabel('Types of Labels')
    plt.ylabel('Number of Instances')
    # trailing ';' keeps the notebook from echoing the Text object returned by title()
    plt.title('Distribution of Label in Dataset');
    

In [None]:
# Summarise the raw frame and plot how many rows each 'Label' value has
get_analysis_values(df, 'Label')

### Data Preparation

In [None]:
def minimum_label_count(dataframe, columnname, min_count=100):
    """Drop every row whose label occurs fewer than `min_count` times.

    Args:
        dataframe: pandas DataFrame holding the labelled data.
        columnname: name of the label column to count.
        min_count: labels with strictly fewer occurrences than this are removed
            (defaults to 100).

    Returns:
        A filtered DataFrame without the rare labels. The input frame is not
        modified, so the caller must use the returned value. When no label is
        rare, the input frame is returned as-is.
    """
    label_count = dataframe[columnname].value_counts()
    rare_labels = label_count[label_count < min_count].index.tolist()

    if not rare_labels:
        # Nothing to remove; indexing the first rare label here would raise IndexError.
        print(f'No label fulfills the criteria < {min_count}. Dataframe left unchanged')
        return dataframe

    for remove_col_name in rare_labels:
        print(f'Column {remove_col_name} fulfills the criteria < {min_count}. Thus, removing it out of our dataframe')

    # Compare against the label values themselves, not a quoted variable name.
    dataframe = dataframe[~dataframe[columnname].isin(rare_labels)]
    print(f'New Shape of the Dataframe : {dataframe.shape}')
    return dataframe

In [None]:
# NOTE(review): the return value is not assigned here, so `df` itself is left unchanged by this call
minimum_label_count(df, 'Label')

In [None]:
def check_if_columns_in_data_frame(df, columns_needed):
    """Verify that every required column exists in `df`.

    Args:
        df: pandas DataFrame to inspect.
        columns_needed: iterable of column names that must be present.

    Raises:
        ValueError: if at least one required column is absent; the message lists
            the missing columns and the required ones that were found.
    """
    checked, missing_cols = [], []
    for col in columns_needed:
        if col in df.columns.values:
            checked.append(col)
        else:
            missing_cols.append(col)

    if missing_cols:
        raise ValueError(
            f"Column(s) {missing_cols} not found in dataset, found {checked}. Please change your column names"
        )

In [None]:
def _prepare_df(df):
    """Lower-case the column names, validate them and de-duplicate on keyword.

    The column names of `df` are lower-cased in place; the de-duplication is NOT
    in place, so callers must use the returned frame.

    Args:
        df: pandas DataFrame expected to contain 'keyword' and 'label' columns
            (in any letter case).

    Returns:
        A DataFrame in which each 'keyword' value appears only once.

    Raises:
        ValueError: propagated from check_if_columns_in_data_frame when a
            required column is missing.
    """
    df.columns = [str.lower(name) for name in df.columns]
    required = ["keyword", "label"]
    check_if_columns_in_data_frame(df, required)
    deduplicated = df.drop_duplicates(subset=["keyword"])
    return deduplicated

In [None]:
# _prepare_df returns the de-duplicated frame instead of dropping rows in place,
# so the result has to be assigned back for the duplicates to actually be removed.
df = _prepare_df(df)
df.head(10)

### Data Preprocessing

In [None]:
def preprocess_data(dataframe, columntoencode, newcolumn):
    """Encode a label column as integer class ids, in place.

    Each distinct value of `columntoencode` gets an id in order of first
    appearance (0, 1, 2, ...). The ids are stored in `newcolumn` and the
    original column is removed from `dataframe`.

    Args:
        dataframe: pandas DataFrame to modify in place.
        columntoencode: name of the column holding the raw labels.
        newcolumn: name of the column that receives the integer ids.

    Returns:
        The first 10 rows of the modified frame, for display.
    """
    possible_labels = dataframe[columntoencode].unique()
    label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}

    # Encode exactly once. Applying the mapping a second time to the already
    # encoded values would re-map them whenever the raw labels are themselves
    # integers (e.g. {1: 0, 0: 1} applied twice restores the original values).
    encoded = dataframe[columntoencode].map(label_dict)

    # Drop first, then add, so the encoded column survives even when both names are equal.
    dataframe.drop(columns=[columntoencode], inplace=True)
    dataframe[newcolumn] = encoded
    return dataframe.head(10)

In [None]:
# Replace the 'label' column with integer class ids stored in a new 'Label' column (modifies df in place)
preprocess_data(df,'label','Label')

### Tokenize and Form Input Layers for Bert

In [None]:
# Names of the pretrained checkpoints (English and Dutch)
engmodel = 'bert-base-cased'
dutchmodel = 'GroNLP/bert-base-dutch-cased'

# Every keyword is padded / truncated to this many tokens
seq_len = 128
num_samples = len(df)

# One row per sample: token ids and the matching attention mask, both zero-filled for now
x_ids = np.zeros(shape=(num_samples, seq_len))
x_mask = np.zeros_like(x_ids)

print(x_ids.shape, x_mask.shape, sep='\n')

In [None]:
# Load the tokenizer that belongs to the English checkpoint
tokenizer = BertTokenizer.from_pretrained(engmodel)

# Options shared by every encode_plus call: fixed-length, padded, with special tokens
encode_options = dict(max_length=seq_len, truncation=True,
                      padding='max_length', add_special_tokens=True,
                      return_tensors='tf')

# Tokenize each keyword and copy its ids / mask into the matching row of the arrays
for i, phrase in enumerate(df['keyword']):
    tokens = tokenizer.encode_plus(phrase, **encode_options)
    x_ids[i, :] = tokens['input_ids']
    x_mask[i, :] = tokens['attention_mask']

In [None]:
# Token-id matrix filled by the loop above: one row per keyword, seq_len columns
x_ids

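In the array above, 101 is the id of the [CLS] token that starts every sequence, and 0 is the id of the [PAD] token used to pad each sequence to the fixed length.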

In [None]:
# Attention-mask matrix filled by the tokenization loop, same shape as x_ids
x_mask

In [None]:
#one hot encoding 
# Step 1: pull the integer class ids out of the 'Label' column as an array
arr = df['Label'].values
arr

In [None]:
#initialize a zero array 
#columns for each class
# arr.max()+1 columns, i.e. one column per class id assuming ids run from 0 to arr.max()
labels = np.zeros((num_samples, arr.max()+1)) 
labels.shape

In [None]:
#specify the row 
# For row i, set the column given by arr[i] to 1, producing the one-hot encoding
labels[np.arange(num_samples), arr] = 1
labels

In [None]:
#putting them into format that tensorflow will read

# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top
import tensorflow as tf 
# Each dataset element is a (token ids, attention mask, one-hot label) triple for one sample
dataset = tf.data.Dataset.from_tensor_slices((x_ids, x_mask, labels))

#shows the very top batch/ sample
dataset.take(1)

In [None]:
# Record the TensorFlow version used for this run
print(tf.__version__)

Each sample in our dataset is a tuple containing a single x_ids, x_mask and label tensor. However, when feeding data into our neural network we need a two-item tuple in the format (\<inputs>, \<outputs>). Because we have two input tensors, we pass the \<inputs> item as a dictionary:
```python
{
    'input_ids': <input_id_tensor>,
    'attention_mask': <mask_tensor>
}
```

In [None]:
def map_func(input_ids, masks, labels):
    """Repack a three-item sample as an (inputs, outputs) pair.

    Args:
        input_ids: token ids of one sample.
        masks: attention mask of the same sample.
        labels: target of the same sample.

    Returns:
        A two-item tuple: a dict with keys 'input_ids' and 'attention_mask',
        followed by `labels` unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels


# then we use the dataset map method to apply this transformation
# After this, every element is ({'input_ids': ..., 'attention_mask': ...}, labels)
dataset = dataset.map(map_func)

dataset.take(1)

In [None]:
batch_size = 32

#for shuffling the data values
# By default shuffle() draws a new order every time the dataset is iterated. This
# dataset is later split with take()/skip(), so reshuffling on each pass would move
# samples between the training and validation parts from one epoch to the next.
# reshuffle_each_iteration=False shuffles once and keeps that order for every pass
# (the training batches therefore arrive in the same order each epoch).
#drop remainder is making sure the batches are 32 and the final, smaller batch is discarded
dataset = dataset.shuffle(10000, reshuffle_each_iteration=False).batch(batch_size, drop_remainder=True)

dataset.take(1)

In [None]:
# Number of batches in the batched dataset
len(dataset)

In [None]:
# Fraction of the batches that the next cell assigns to the training set
split = 0.2

# (number of samples / batch size) approximates the number of batches; scaled by `split`
# NOTE(review): with split = 0.2 only about 20% of the batches go to training and the
# remaining ~80% go to validation — confirm this ratio is intended.
# NOTE(review): despite its name, this value is used below as the number of training
# batches passed to take()/skip(), not as a Keras steps_per_epoch argument.
steps_per_epoch = int((x_ids.shape[0] / batch_size) * split)
print(steps_per_epoch)

In [None]:
# Training set: the first `steps_per_epoch` batches of the dataset
train_ds = dataset.take(steps_per_epoch)
print(len(train_ds))

# Validation set: every batch after the ones taken for training
val_ds = dataset.skip(steps_per_epoch)
print(len(val_ds))

# The combined dataset name is no longer needed once both splits exist
del dataset

In [None]:
# Example sentence; the module-level `text` is not referenced again by the cells visible in this notebook
text = "The quick brown fox jumps over the lazy dog."

In [None]:
# Contextual word-embedding augmenter built on 'bert-base-uncased'
# NOTE(review): `naw`, `ACT` and `TOPK` are not imported or defined in any cell visible
# in this notebook, so this cell raises NameError on a fresh kernel — confirm where they come from.
aug_bert = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action=ACT, top_k=TOPK)

In [None]:
def augment_text(df,samples=300):
    """Add augmented copies of randomly drawn rows with Label == 3 to `df`.

    Args:
        df: pandas DataFrame with a 'Label' column and a 'text' column.
        samples: number of augmented rows to generate (rows are drawn with replacement).

    Returns:
        A shuffled DataFrame containing the original rows plus the augmented ones.

    NOTE(review): `naw`, `ACT`, `TOPK`, `tqdm` and `shuffle` are not imported or
    defined in the cells visible in this notebook — confirm where they come from.
    NOTE(review): the function reads a 'text' column and writes 'text'/'target'
    columns, while the frame prepared in this notebook has 'keyword' and 'Label'
    columns — confirm the intended schema.
    """
    aug_bert = naw.ContextualWordEmbsAug(
        model_path='bert-base-uncased', action=ACT, top_k=TOPK)

    ##selecting the minority class samples
    df_n=df[df.Label==3].reset_index(drop=True)

    ## data augmentation loop
    new_text=[]
    for i in tqdm(np.random.randint(0,len(df_n),samples)):
        text = df_n.iloc[i]['text']
        augmented_text = aug_bert(text)
        new_text.append(augmented_text)

    ## dataframe
    new=pd.DataFrame({'text':new_text,'target':1})
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
    df=shuffle(pd.concat([df, new]).reset_index(drop=True))
    return df
   
# NOTE(review): train_ds was created with dataset.take(...), whereas augment_text indexes its
# argument like a DataFrame (df.Label, .iloc) — confirm what this call is meant to receive.
train_ds = augment_text(train_ds)

In [None]:
# Load the pretrained TensorFlow model for the English checkpoint named in `engmodel`
bert = TFAutoModel.from_pretrained(engmodel)

bert.summary()

In [None]:
def create_classifier_model():
    """Build and compile a Keras classifier with dense layers on top of the loaded BERT model.

    Relies on the notebook-level names `bert`, `seq_len` and `arr`.

    Returns:
        A compiled `tf.keras.Model` with inputs named 'input_ids' and
        'attention_mask' and a softmax output with `arr.max()+1` units.
    """
    # Two integer inputs of length seq_len; their names match the keys of the dataset's feature dict
    token_input = tf.keras.layers.Input(shape=(seq_len,), name='input_ids', dtype='int32')
    mask_input = tf.keras.layers.Input(shape=(seq_len,), name='attention_mask', dtype='int32')

    # Element [1] of the BERT output is what gets fed to the dense layers
    # (presumably the pooled 2D output — TODO confirm for the transformers version in use)
    bert_outputs = bert.bert(token_input, attention_mask=mask_input)
    embeddings = bert_outputs[1]

    # Dense head: one hidden layer, then one softmax unit per class
    hidden = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
    num_classes = arr.max()+1
    probabilities = tf.keras.layers.Dense(num_classes, activation='softmax', name='outputs')(hidden)

    model = tf.keras.Model(inputs=[token_input, mask_input], outputs=probabilities)

    # Categorical loss / accuracy because the labels are one-hot encoded
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5, decay=1e-6),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy('accuracy')],
    )

    return model

In [None]:
# Build the compiled classifier and list its layers
model = create_classifier_model()
model.summary()

In [None]:
#model.compile(optimizer=optimizer, loss = loss, metrics=[acc])
# Train for 4 epochs on train_ds, evaluating on val_ds after each epoch;
# `history` keeps the per-epoch metrics used by the plots below
history = model.fit(train_ds,validation_data=val_ds,
                    epochs = 4)

In [None]:
#Model Accuracy 
# Per-epoch accuracy on the training and validation data
fig, ax = plt.subplots(figsize=(9,5))
ax.plot(history.history['accuracy'], color = 'blueviolet', marker = 'h', label = 'Train Set')
ax.plot(history.history['val_accuracy'], color = 'lightcoral', marker = 'd', label = 'Test Set')

ax.set_title('Model Accuracy')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Accuracy Score')
ax.legend(loc='upper left')
plt.show()

In [None]:
#Model Loss 
# Per-epoch loss on the training and validation data
fig, ax = plt.subplots(figsize=(9,5))
ax.plot(history.history['loss'], color = 'blueviolet', marker = 'h', label = 'Train Set')
ax.plot(history.history['val_loss'], color = 'lightcoral', marker = 'd', label = 'Test Set')

ax.set_title('Model Loss')
ax.set_xlabel('Number of Epochs')
ax.set_ylabel('Loss Score')
ax.legend(loc='upper left')
plt.show()

## Model Results

In [None]:
# The model is compiled with a single metric, so evaluate() yields (loss, accuracy)
train_loss, train_acc = model.evaluate(train_ds)
test_loss, test_acc = model.evaluate(val_ds)
print(f"Training Set Accuracy: {train_acc:.2f}")
print(f"Test Set Accuracy: {test_acc:.2f}")

In [None]:
def evaluate_model(y_pred=None, y_true=None):
    """Plot the confusion matrix and print the classification report for `val_ds`.

    Relies on the notebook-level names `model` and `val_ds`.

    Args:
        y_pred: accepted for backward compatibility only; the predictions are
            always recomputed from `model` and `val_ds`.
        y_true: accepted for backward compatibility only; the true labels are
            always re-read from `val_ds`.

    Returns:
        None. The heatmap is shown and the report is printed.
    """
    # Walk over val_ds exactly once, collecting predictions and labels batch by batch.
    # Iterating the dataset separately for predictions and for labels can yield the
    # batches in a different order each time when the dataset is shuffled, which
    # would pair predictions with the wrong labels.
    batch_predictions = []
    batch_targets = []
    for features, targets in val_ds:
        batch_predictions.append(model.predict_on_batch(features))
        batch_targets.append(np.asarray(targets))

    # argmax over axis 1 turns class probabilities / one-hot rows into class ids
    y_pred = np.argmax(np.concatenate(batch_predictions, axis=0), axis=1)
    y_true = np.argmax(np.concatenate(batch_targets, axis=0), axis=1)

    print(len(y_pred))
    print(len(y_true))

    #Plot Confusion Matrix
    sns.heatmap(confusion_matrix(y_true,y_pred), annot = True, cmap = plt.cm.Blues, fmt = ".1f")
    plt.title("Confusion Matrix of Test Data")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

    #Plot Classification Report
    print(classification_report(y_true, y_pred))

In [None]:
# Persist the trained model under the path 'label_model'
model.save('label_model')

### Prediction on New Keywords

In [None]:
import tensorflow 

In [None]:
# initialize tokenizer from transformers
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prep_data(text):
    """Tokenize one text and return the model's input dictionary.

    Args:
        text: the string to classify.

    Returns:
        A dict with keys 'input_ids' and 'attention_mask', each a float64
        tensor produced from the tokenizer output.
    """
    # Same fixed-length encoding as used for training: 128 tokens, padded / truncated
    encoding_options = dict(max_length=128,
                            truncation=True,
                            padding='max_length',
                            add_special_tokens=True,
                            return_token_type_ids=False,
                            return_tensors='tf')
    tokens = tokenizer.encode_plus(text, **encoding_options)
    # Cast both tensors to float64 (the training arrays were created with np.zeros, i.e. float64)
    return {key: tensorflow.cast(tokens[key], tensorflow.float64)
            for key in ('input_ids', 'attention_mask')}

In [None]:
# Encode one example keyword and take the first (only) row of the model's output
keyword = prep_data("best camera for me")
label = model.predict(keyword)[0]
label

In [None]:
# NOTE(review): numpy is already imported as np in the first cell; this re-import is redundant
import numpy as np

# Index of the highest score in `label`, i.e. the predicted class id
np.argmax(label)