## Import libraries


In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import plot_model
from transformers import TFBertModel, BertConfig, BertTokenizerFast
from tensorflow.python.keras import backend as K
import matplotlib.pyplot as plt


In [3]:
import torch.nn.functional as Fun
import torch

## Import Data

In [4]:
#Read data
df=pd.read_csv('/flush5/sou090/project7/data_c_10.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,text,value
0,0,"(1) a Federal, State, or local law enforcement...",4.0
1,1,"(1) a Federal, State, or local law enforcement...",4.0
2,2,(1) announcing the change on the home page of ...,4.0
3,3,(1) comply with the law or legal processes;,4.0
4,4,(1) comply with the law or with legal process;,4.0


In [5]:
type(df['value'][2])

numpy.float64

### 1. Encode Labels

In [6]:
from sklearn.preprocessing import OneHotEncoder 

In [7]:
one=OneHotEncoder(sparse=False)
encoded=one.fit_transform(df[['value']])
len(encoded)

29740

### 2. Creat new data with the encoded value

In [8]:
head = {'text' : [], 'value' : []}
df_model = pd.DataFrame(head,dtype=object)

In [9]:
for k in tqdm(range (len(df))):
    df_model.loc[k]=[df['text'][k],encoded[k]]
df_model.head(10)

100%|██████████| 29740/29740 [01:30<00:00, 328.04it/s]


Unnamed: 0,text,value
0,"(1) a Federal, State, or local law enforcement...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
1,"(1) a Federal, State, or local law enforcement...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
2,(1) announcing the change on the home page of ...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
3,(1) comply with the law or legal processes;,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
4,(1) comply with the law or with legal process;,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
5,(1) following the \,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]"
6,(1) identify the terms of any special offers y...,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]"
7,(1) identify the terms of any special offers y...,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
8,(1) information we receive from you,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"
9,"(1) personal information,","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]"


### 2. Create the new data

In [41]:
#Split data
train_df, test_df = train_test_split(df_model)

print(train_df)

                                                    text  \
22799  shopping cart, screen preferences, and the pag...   
13894  communicate related information, such as order...   
28518  with third party processors for processing on ...   
17250  in the sites and in our communications with yo...   
24132                                    third parties f   
...                                                  ...   
2286   Geekdo collects information about your transac...   
4544                                    Mapping features   
9185   We may use cookies, web beacons/pixel tags, lo...   
6710   The information we learn from customers helps ...   
6272   Submit to opt out of online Site behavior used...   

                                     value  
22799  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]  
13894  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]  
28518  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]  
17250  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]  
24132  [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0]  
...      

### Check the len max of the word

In [11]:
ex_len = []
for i in train_df['text']:
  ex_len.append(len(i.split()))
print('max length of text: ', max(ex_len), 'words')

max length of text:  254 words


## Tokenize

In [12]:
bert_model = 'bert-base-uncased'
tokenizer = BertTokenizerFast.from_pretrained(bert_model)

In [13]:
# Encoder the function to tokenize the data
def encoder(df, tokenizer, label = 'text', maxLen = 210):
    input_id = []
    token_type = []
    attention_mask = []
    for i in df[label].values:
        token = tokenizer(i, max_length = maxLen, truncation = True, padding = 'max_length', add_special_tokens = True)
        input_id.append(token['input_ids'])
        token_type.append(token['token_type_ids'])
        attention_mask.append(token['attention_mask'])
    return np.array(input_id), np.array(token_type), np.array(attention_mask)

In [14]:
#Call the function to tokenize data
train_d = encoder(train_df, tokenizer)
test_d = encoder(test_df, tokenizer)

# Config of Bert

In [15]:
model_config = BertConfig.from_pretrained(bert_model)

In [16]:
model_config.output_hidden_states = True
model_config

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

## Parameters

In [17]:
bert = TFBertModel.from_pretrained(bert_model, config = model_config)

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


### Tunable parameters

In [48]:
# tunable parameters
max_len = 210
#fix learning_rate
learning_rate = 1e-4
epochs = 3

In [49]:
#callbacks
ckpt_dir = 'ckpt{epoch:02d}.h5'
ckpt = ModelCheckpoint(
    filepath = ckpt_dir,
    save_freq = 'epoch',
    save_weights_only=True)
callbacks = [ckpt]

In [None]:
#optimizer = Adam(learning_rate = learning_rate)
optimizer=tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name="Adam",
    **kwargs
)

In [None]:
#Loss
loss = 'categorical_crossentropy'

In [None]:
#Metrics
metrics=tf.keras.metrics.CategoricalAccuracy(name="accuracy", dtype=None)

### Bert

In [None]:
input_ids_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'input_ids')
token_type_ids_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'token_type_ids')
attention_mask_i = Input(shape = (max_len, ), dtype = tf.int32, name = 'attention_mask')
inputs = [input_ids_i, token_type_ids_i, attention_mask_i]

bert_output = bert(input_ids_i, token_type_ids = token_type_ids_i, attention_mask = attention_mask_i)[0]

## Add layers

In [None]:
bert_output = bert(input_ids_i, token_type_ids = token_type_ids_i, attention_mask = attention_mask_i)[0]
output = bert_output[:, 0, :]

output = Dropout(0.3)(output)

output = Dense(10, activation = 'relu')(output)

output = Dense(7, activation = 'softmax')(output) #Adding a softmax layer for softmax regression with categorical 
#cross entropy

model = Model(inputs = inputs, outputs = output)

model.compile(loss = loss, optimizer = optimizer, metrics = metrics)

model.summary()

In [61]:
bert_output

<KerasTensor: shape=(None, 210, 768) dtype=float32 (created by layer 'tf_bert_model')>

In [62]:
# Datas
train_l = train_df['value'].values

val_prob = 0.2
split = int(len(train_l)*(1 - val_prob))

train_x = tuple(np.array(train_d)[:, :split, :])
train_y =np.array(train_l)[:split]

val_x = tuple(np.array(train_d)[:, split:, :])
val_y =np.array(train_l)[split:]

In [63]:
train_yl = []
val_yl = []
for i in range(0,train_y.shape[0]):
    train_yl.append(list(train_y[i]))
for i in range(0,val_y.shape[0]):
    val_yl.append(list(val_y[i]))

In [64]:
train_yll = np.asarray(train_yl).astype(np.float32)
val_yl1 = np.asarray(val_yl).astype(np.float32)

## Train the model 

In [None]:
history=model.fit(train_x, train_yll, validation_data = (val_x, val_yl1), epochs = 10)

In [None]:
print(history.history.keys())

In [None]:
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Train', 'Validation'], loc='upper left')
plt.show()


## Test the model 

In [29]:
test_model=tuple(np.array(test_d))
test_model_val=np.array(test_df['value'].values)

In [30]:
test_yl = []

for i in range(0,len(test_model)):
    test_yl.append(list(test_model_val[i]))

In [31]:
a=model.predict(test_model)

In [32]:
a

array([[7.9964260e-03, 2.5284803e-03, 9.9971414e-02, ..., 2.8065926e-01,
        6.0672247e-01, 1.1110840e-03],
       [3.1416974e-04, 2.3464370e-04, 9.6047288e-01, ..., 3.7956160e-02,
        6.2877976e-04, 1.9828625e-05],
       [2.9507941e-03, 9.8873502e-01, 2.0062386e-03, ..., 1.4966296e-03,
        4.5724483e-03, 1.7803730e-04],
       ...,
       [5.1689416e-04, 1.6180331e-05, 9.9644178e-01, ..., 1.8473691e-03,
        1.1502355e-03, 1.1271140e-06],
       [1.1378243e-03, 4.2802579e-04, 5.1281315e-01, ..., 4.8310748e-01,
        9.5566647e-04, 4.8137709e-04],
       [3.5086492e-04, 1.5715865e-05, 2.4540357e-03, ..., 9.9703264e-01,
        3.5865214e-05, 6.2416031e-05]], dtype=float32)

In [33]:
sum(a[0])

1.0000000067520887

In [34]:
len(a)

7435

In [35]:
test_model_val[0]

array([0., 0., 0., 0., 1., 0., 0.])

In [36]:
len(test_model_val)

7435

In [37]:
def maximum (l):
    u=0
    max=0
    for k in range (len(l)):
        if l[k]>max:
            max=l[k]
            u=k
    return(u)

In [38]:
i=0
j=0
for k in range (len(a)):
    if maximum(a[k])==maximum(test_model_val[k]):
        i=i+1
    else: 
        j=j+1
print('the model has {} good answers'.format(i))
print('the model has {} bad answers'.format(j))
print('The accuracy is {}'.format(i/(i+j)))


the model has 5364 good answers
the model has 2071 bad answers
The accuracy is 0.7214525891055817


In [None]:
Y=[]
X=[k for k in range (len(a))]
for k in range (len(a)):
    Y.append(100*max(a[k]))
plt.plot(X,Y)

###### 