# Train model and validation
Here, I use BERT as base-model, and fine-tune it

### References
  * [kaggle: Real/Fake Job Posting Prediction](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)
  * [進擊的 BERT：NLP 界的巨人之力與遷移學習](https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html)

In [1]:
%load_ext autoreload
%autoreload 2

## Install transformers
Colab is not installed in default
* before you install, check your notebook settings or wiew runtime resource, change your notebook hardware accelerator to GPU
* more details, you can click following link:
[Google Colab - Using Free GPU](https://www.tutorialspoint.com/google_colab/google_colab_using_free_gpu.htm)


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
import re
import os
import copy
import torch

import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.nn.utils.rnn import pad_sequence

## Implement a dataset class to transform data to tensor

In [15]:
class JobPostingDataset(Dataset):    
    def __init__(self, data, tokenizer, has_label=True):        
        self.df = copy.deepcopy(data).fillna("")
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.has_label = has_label
        self.label_map = None
        
        if self.has_label:
          self.label_map = {
              label:index for index, label in enumerate(self.df.iloc[:, -1].unique().tolist())
          }
    
    def __getitem__(self, idx):
        if self.has_label:          
          text, label = self.df.iloc[idx, :].values
          label_id = self.label_map[label]
          label_tensor = torch.tensor(label_id)
            
        else:
          text = self.df.iloc[idx, :].values
          label_tensor = None           

        # Limit word pieces length to avoid error   
        text_word_pieces = self.tokenizer.tokenize(text, max_length=512, truncation=True)
        text_len = len(text_word_pieces)
                
        ids = self.tokenizer.convert_tokens_to_ids(text_word_pieces)
        tokens_tensor = torch.tensor(ids)

        segments_tensor = torch.tensor([0]*text_len, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

## Define mini_batch function for processing batch data

In [5]:
def mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # If sample has label
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero padding to same sequence length
    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)
    
    # attention masks，let BERT focus on non-zero padding values
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

## Define get_predictions function for getting predictions and computing metrics

In [6]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    recall_total = 0
    recall_correct = 0
    precision_total = 0
    precision_correct = 0
      
    with torch.no_grad():        
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(
                input_ids=tokens_tensors, 
                token_type_ids=segments_tensors, 
                attention_mask=masks_tensors
                )
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            if compute_acc:
                labels = data[3]
                
                total += labels.size(0)                
                recall_total += (labels==1).sum().item()
                precision_total += (pred==1).sum().item()
                
                correct += (pred==labels).sum().item()
                recall_correct += ((pred==labels)&(labels==1)).sum().item()
                precision_correct += ((pred==labels)&(pred==1)).sum().item()
                
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    # Computing Mertics
    if compute_acc:
        acc = correct / total
        
        recall = 0
        if recall_total > 0:
            recall = recall_correct/recall_total
        
        precision = 0
        if precision_total > 0:
            precision = precision_correct/precision_total
        
        return predictions, acc, recall, precision
    return predictions

## Variables setting

In [10]:
# Default mount path on colab
DRIVE_MOUNT_PATH = "/content/drive"

# Project path on your google drive
# You can change this line 
PROJECT_PATH = "MyDrive/Fake-JD-Detector"

# Target or label column name
TARGET_COLUMN = "fraudulent"

# Used based-model
BESED_MODEL = "bert-base-cased"
NUM_CLASS = 2

#
BATCH_SIZE = 16
TRAIN_RUNS = 10
MAX_EPOCHS = 10

## Create model and tokenizer from pretrained

In [8]:
tokenizer = AutoTokenizer.from_pretrained(BESED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BESED_MODEL, num_labels=NUM_CLASS)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

### mount your google drive

In [11]:
from google.colab import drive
drive.mount(DRIVE_MOUNT_PATH)

Mounted at /content/drive


## Loading training data

In [12]:
data_path = os.path.join(DRIVE_MOUNT_PATH, PROJECT_PATH, "splitted_text_data")
train_df = pd.read_csv(data_path+"/train.csv")
train_df

Unnamed: 0,text,fraudulent
0,Production Technician 1781 US NC Garner 1781Jo...,1
1,PHP Software Engineer GR I Athens Cookisto is ...,0
2,TV Makeup Artist US FL Miami Applied Memetics ...,0
3,Credit and Collections Clerk US WA Basil Lifec...,1
4,Entry Level Sales US VA Norfolk General Summar...,0
...,...,...
14257,Pay Per Click Specialist US NJ Hoboken SpotSou...,0
14258,Customer Service Technical Specialist US MN St...,0
14259,Lead Business Analyst US NY New York City Job ...,1
14260,Ruby on Rails Expert Disrupt the financial ind...,0


In [16]:
%%time

model.train()

# Using Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

for run in range(TRAIN_RUNS):
    
    # Because of computing resource, I only use small sample data to training
    train_sample_dataset = JobPostingDataset(data=train_df.sample(frac=0.01), tokenizer=tokenizer)
    train_sample_data_loader = DataLoader(train_sample_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)
    acc, recall, precision, eps, eps_loss = 0, 0, 0, 0, 0

    for epoch in range(MAX_EPOCHS):

        running_loss = 0.0

        for data in train_sample_data_loader:

            tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

            optimizer.zero_grad()
            outputs = model(
                input_ids=tokens_tensors,
                token_type_ids=segments_tensors,
                attention_mask=masks_tensors,
                labels=labels
            )

            loss = outputs[0]
            # backward
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        
        _, acc, recall, precision = get_predictions(model, train_sample_data_loader, compute_acc=True)
        eps, eps_loss = epoch, running_loss

        if recall == 1 and precision == 1:            
            break

    print("[run %d][epoch %d] loss: %.3f, acc: %.3f, recall: %.3f, precision: %3f" % (run+1, eps+1, eps_loss, acc, recall, precision))

device: cuda
[run 1][epoch 10] loss: 0.765, acc: 0.965, recall: 0.286, precision: 1.000000
[run 2][epoch 10] loss: 0.502, acc: 0.965, recall: 0.286, precision: 1.000000
[run 3][epoch 10] loss: 0.257, acc: 0.986, recall: 0.667, precision: 1.000000
[run 4][epoch 8] loss: 0.284, acc: 1.000, recall: 1.000, precision: 1.000000
[run 5][epoch 6] loss: 0.233, acc: 1.000, recall: 1.000, precision: 1.000000
[run 6][epoch 6] loss: 0.116, acc: 1.000, recall: 1.000, precision: 1.000000
[run 7][epoch 4] loss: 0.554, acc: 1.000, recall: 1.000, precision: 1.000000
[run 8][epoch 4] loss: 0.189, acc: 1.000, recall: 1.000, precision: 1.000000
[run 9][epoch 4] loss: 0.216, acc: 1.000, recall: 1.000, precision: 1.000000
[run 10][epoch 3] loss: 0.209, acc: 1.000, recall: 1.000, precision: 1.000000
CPU times: user 19min 46s, sys: 2.41 s, total: 19min 48s
Wall time: 19min 58s


In [17]:
model.save_pretrained("/content/drive/MyDrive/Fake-JD-Detector/fine_tuned_model")

## Validation

In [24]:
validation_df = pd.read_csv(data_path+"/validation.csv")
validation_df

Unnamed: 0,text,fraudulent
0,Professional Services Specialist GB LND As a g...,0
1,Sr Windows Server Administrator US NC Sanford ...,0
2,Account Payable Rep US CA Chula Vista QB Medic...,0
3,Maintenance Person GB ABD Aberdeenshire Priori...,0
4,Online Marketing Specialist GB LND London incr...,0
...,...,...
3561,Graduate Software Engineer PK PB Lahore Founde...,0
3562,Tile Installer Setter US CA Fairfield This pos...,0
3563,Software Engineer Android Japanese US CA San F...,0
3564,English Teacher Abroad US NY New York We help ...,0


In [28]:
validation_dataset = JobPostingDataset(data=validation_df, tokenizer=tokenizer)
validation_data_loader = DataLoader(validation_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)

In [29]:
_, acc, recall, precision = get_predictions(model, validation_data_loader, compute_acc=True)
print("classification acc: ", acc)
print("classification recall:", recall)
print("classification precision:", precision)

classification acc:  0.9610207515423443
classification recall: 0.38372093023255816
classification precision: 0.6666666666666666


## Testing saved model

In [30]:
# If you want to test saved model
test_df = pd.read_csv(data_path+"/validation.csv")
test_df

Unnamed: 0,text,fraudulent
0,Professional Services Specialist GB LND As a g...,0
1,Sr Windows Server Administrator US NC Sanford ...,0
2,Account Payable Rep US CA Chula Vista QB Medic...,0
3,Maintenance Person GB ABD Aberdeenshire Priori...,0
4,Online Marketing Specialist GB LND London incr...,0
...,...,...
3561,Graduate Software Engineer PK PB Lahore Founde...,0
3562,Tile Installer Setter US CA Fairfield This pos...,0
3563,Software Engineer Android Japanese US CA San F...,0
3564,English Teacher Abroad US NY New York We help ...,0


In [31]:
test_dataset = JobPostingDataset(data=test_df, tokenizer=tokenizer)
test_data_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)

In [32]:
saved_model = AutoModelForSequenceClassification.from_pretrained("/content/drive/MyDrive/Fake-JD-Detector/fine_tuned_model")

In [33]:
_, acc, recall, precision = get_predictions(model, test_data_loader, compute_acc=True)
print("classification acc: ", acc)
print("classification recall:", recall)
print("classification precision:", precision)

classification acc:  0.9613011777902412
classification recall: 0.4127906976744186
classification precision: 0.6574074074074074
