In [1]:
import numpy as np 
import pandas as pd 
import tensorflow as tf
import sklearn
from tqdm import tqdm




In [3]:
df = pd.read_csv('../data/IEEEData-dict2.csv')
df.sample()

Unnamed: 0,word2,label
3144,9d345009-a-62cb3a1a-s-sites googlegroups com ...,1


In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development





All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
example = df['word2'][0]
tokens = tokenizer.tokenize(example)
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)

['bela', '##jar', '##pro', '##gram', '##ming', 'com', 'new', '##pa', '##yp', '##al', 'log', '##gy', 'pay', '##pal', '.', 'h', '##tm']
[20252, 16084, 21572, 13113, 6562, 4012, 2047, 4502, 22571, 2389, 8833, 6292, 3477, 12952, 1012, 1044, 21246]


In [7]:
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109483778 (417.65 MB)
Trainable params: 109483778 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [10]:
train = df[:76625]
test = df[76625:]

In [11]:
print(len(train))
print(len(test))

train = train.rename(columns={"word2": "text"})
test = test.rename(columns={"word2": "text"})
train.to_csv("../data/IEEE-dict2-train.csv", index=False)
test.to_csv("../data/IEEE-dict2-val.csv", index=False)

76625
19157


In [13]:
def convert2inputexamples(train, test, text, label): 
    trainexamples = train.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[text], 
                                                          label = x[label]), axis = 1)

    validexamples = test.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x[text], 
                                                          label = x[label]), axis = 1)
  
    return trainexamples, validexamples

trainexamples, validexamples = convert2inputexamples(train,  test, 'word2',  'label')

In [14]:
def convertexamples2tf(examples, tokenizer, max_length=32):
    features = []

    for i in tqdm(examples):
        input_dict = tokenizer.encode_plus(
            i.text_a,
            add_special_tokens=True,    # Add 'CLS' and 'SEP'
            max_length=max_length,    # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            pad_to_max_length=True, # pads to the right by default # CHECK THIS for pad_to_max_length
            truncation=True
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],input_dict["token_type_ids"], input_dict['attention_mask'])
        features.append(InputFeatures( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=i.label) )

    def generate():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        generate,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'word2'
LABEL_COLUMN = 'label'            

In [15]:
train_data = convertexamples2tf(list(trainexamples), tokenizer)
train_data = train_data.shuffle(100).batch(16).repeat(2)

100%|██████████████████████████████████████████████████████████████████████████| 76625/76625 [00:30<00:00, 2501.18it/s]


In [16]:
validation_data = convertexamples2tf(list(validexamples), tokenizer)
validation_data = validation_data.batch(16)

100%|██████████████████████████████████████████████████████████████████████████| 19157/19157 [00:08<00:00, 2368.93it/s]


In [17]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [18]:
model.fit(train_data, epochs=1, validation_data=validation_data)




<keras.src.callbacks.History at 0x18c5cc43890>

In [19]:
input_ids_test = np.asarray([tokenizer(sent, padding="max_length", truncation=True)["input_ids"] for sent in tqdm(test.word2)])
attention_mask_test = np.asarray([tokenizer(sent,padding="max_length",truncation=True)["attention_mask"] for sent in tqdm(test.word2)])
token_type_ids_test = np.asarray([tokenizer(sent,padding="max_length",truncation=True)["token_type_ids"] for sent in tqdm(test.word2)])

100%|██████████████████████████████████████████████████████████████████████████| 19157/19157 [00:08<00:00, 2233.78it/s]
100%|██████████████████████████████████████████████████████████████████████████| 19157/19157 [00:08<00:00, 2236.12it/s]
100%|██████████████████████████████████████████████████████████████████████████| 19157/19157 [00:08<00:00, 2281.34it/s]


In [20]:
# Get predictions
y_pred = model.predict([input_ids_test, attention_mask_test, token_type_ids_test])
y_pred_proba = [float(x[1]) for x in tf.nn.softmax(y_pred.logits)]
y_pred_label = [0 if x[0] > x[1] else 1 for x in tf.nn.softmax(y_pred.logits)]


# Evaluate the model
from sklearn.metrics import (
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)

print("Confusion Matrix : ")

Confusion Matrix : 


In [21]:
print(confusion_matrix(test.label.values, y_pred_label))

[[9408  142]
 [ 138 9469]]
