## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn 
from tqdm import tqdm

import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from transformers import TFBertModel, BertConfig, BertTokenizerFast
from tensorflow.python.keras import backend as K

import keras
import torch.nn.functional as Fun
import torch

## Import Data

In [2]:
#Read data
#Change the path to read data
#The data must be in 1 Csv File with text and value column
path_data='/flush5/sou090/model/Train_data/data.csv'
df=pd.read_csv(path_data)
df.head()

Unnamed: 0.1,Unnamed: 0,text,value
0,0,"(1) a Federal, State, or local law enforcement...",2.0
1,1,"(1) a Federal, State, or local law enforcement...",2.0
2,2,(1) announcing the change on the home page of ...,7.0
3,3,(1) comply with the law or legal processes;,2.0
4,4,(1) comply with the law or with legal process;,2.0


### 1. Encode Labels

In [3]:
from sklearn.preprocessing import OneHotEncoder 

In [4]:
one=OneHotEncoder(sparse=False)
encoded=one.fit_transform(df[['value']])
len(encoded)

29740

### 2. Creat new data with the encoded value

In [5]:
head = {'text' : [], 'value' : []}
df_model = pd.DataFrame(head,dtype=object)
for k in tqdm(range (len(df))):
    df_model.loc[k]=[df['text'][k],encoded[k]]
df_model.head(10)

100%|██████████| 29740/29740 [01:32<00:00, 321.16it/s]


Unnamed: 0,text,value
0,"(1) a Federal, State, or local law enforcement...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,"(1) a Federal, State, or local law enforcement...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,(1) announcing the change on the home page of ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."
3,(1) comply with the law or legal processes;,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,(1) comply with the law or with legal process;,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,(1) following the \,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,(1) identify the terms of any special offers y...,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,(1) identify the terms of any special offers y...,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,(1) information we receive from you,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,"(1) personal information,","[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### 3. Split data

In [6]:
#Split data
train_df, test_df = train_test_split(df_model,test_size=0.1)
print(len(train_df),len(test_df))

26766 2974


### Check the len max 

In [7]:
ex_len = []
for i in train_df['text']:
  ex_len.append(len(i.split()))
print('max length of text: ', max(ex_len), 'words')

max length of text:  247 words


## Tokenize

In [8]:
bert_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_model)

In [9]:
# Encoder the function to tokenize the data
def encoder(df, tokenizer, label = 'text', maxLen = 254):
    input_id = []
    token_type = []
    attention_mask = []
    for i in df[label].values:
        token = tokenizer(i, max_length = maxLen, truncation = True, padding = 'max_length', add_special_tokens = True)
        input_id.append(token['input_ids'])
        token_type.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    return np.array(input_id), np.array(token_type), np.array(attention_mask)

In [10]:
#Call the function to tokenize data
train_d = encoder(train_df, tokenizer)
test_d = encoder(test_df, tokenizer)

## Config Bert

In [11]:
model_config = BertConfig.from_pretrained(bert_model)

In [12]:
model_config.output_hidden_states = True
model_config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [13]:
bert = TFBertModel.from_pretrained(bert_model, config = model_config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### 1. Parameters

In [14]:
# tunable parameters
max_len = 254
#fix learning_rate
learning_rate = 1e-4
epochs = 3

#callbacks
#Change the path_checkpoint
ckpt_dir = '/flush5/sou090/model/save/ckpt{epoch:02d}.h5'
ckpt = ModelCheckpoint(
    filepath = ckpt_dir,
    save_freq = 'epoch',
    save_weights_only=True)
callbacks = [ckpt]

In [15]:
#optimizer = Adam(learning_rate = learning_rate)
optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate)
#Loss
loss = 'categorical_crossentropy'
#Metrics
metrics=tf.keras.metrics.CategoricalAccuracy(name="accuracy", dtype=None)

In [16]:
input_ids_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'input_ids')
token_type_ids_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'token_type_ids')
attention_mask_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'attention_mask')
inputs = [input_ids_i, token_type_ids_i, attention_mask_i]

bert_output = bert(input_ids_i, token_type_ids = token_type_ids_i, attention_mask = attention_mask_i)[0]

### 2. Layers 

In [17]:
bert_output = bert(input_ids_i, token_type_ids = token_type_ids_i, attention_mask = attention_mask_i)[0]
output = bert_output[:, 0, :]

output = Dropout(0.15)(output)

output =Dense(50,activation='relu')(output)
output=Dense(25,activation='relu')(output)
output=Dense(10,activation='relu')(output)
output = Dense(10, activation = 'softmax')(output) #Adding a softmax layer for softmax regression with categorical 
#cross entropy

model = Model(inputs = inputs, outputs = output)

model.compile(loss = loss, optimizer = optimizer, metrics = metrics)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 254)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 254)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 254)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]',     

In [18]:
model.layers[3].trainable = False # Trying to make the bert layers non-trainable
model.compile(loss = loss, optimizer = optimizer, metrics = metrics)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 254)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 254)]        0           []                               
                                                                                                  
 token_type_ids (InputLayer)    [(None, 254)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]',     

### 3. Data

In [19]:
# Datas
train_l = train_df['value'].values

val_prob = 0.1
split = int(len(train_l)*(1 - val_prob))

train_x = tuple(np.array(train_d)[:, :split, :])
train_y =np.array(train_l)[:split]

val_x = tuple(np.array(train_d)[:, split:, :])
val_y =np.array(train_l)[split:]

In [20]:
train_yl = []
val_yl = []
for i in range(0,train_y.shape[0]):
    train_yl.append(list(train_y[i]))
for i in range(0,val_y.shape[0]):
    val_yl.append(list(val_y[i]))

In [21]:
train_yll = np.asarray(train_yl).astype(np.float32)
val_yl1 = np.asarray(val_yl).astype(np.float32)

## Train the model 

In [None]:
history=model.fit(train_x, train_yll, validation_data = (val_x, val_yl1), epochs = 50,callbacks = callbacks)

Epoch 1/50


In [None]:
#Change the path to save the model
model.save('/flush5/sou090/model/save/model_project.h5')

In [None]:
model_save= loaded_model = tf.keras.models.load_model('/flush5/sou090/model/save/model_project.h5', custom_objects={"TFBertModel": TFBertModel})

In [None]:
print(history.history.keys())
#print(model_save.history.keys)

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig('accuracy.pdf')
plt.show()

# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.savefig('loss.pdf')
plt.show()
