# Some References for Inspiration

* [Real/Fake Job Posting Prediction](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)
* [Fine-tuing BERT model for text classification](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m70.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m118.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [3]:
import re
import os
import copy
import torch

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.nn.utils.rnn import pad_sequence

In [4]:
class JobPostingDataset(Dataset):    
    def __init__(self, mode, data, label_column, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode        
        self.df = copy.deepcopy(data).fillna("")
        self.len = len(self.df)
        self.label_column = label_column
        self.label_map = {label:index for index, label in enumerate(list(set(self.df[label_column].to_list())))}
        self.tokenizer = tokenizer
    
    def __getitem__(self, idx):
        if self.mode == "test":
            text = self.df.iloc[idx, :1].values
            label_tensor = None
        else:
            text, label = self.df.iloc[idx, :].values            
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
            
        text_word_pieces = self.tokenizer.tokenize(text, max_length=512, truncation=True)
        text_len = len(text_word_pieces)
                
        ids = self.tokenizer.convert_tokens_to_ids(text_word_pieces)
        tokens_tensor = torch.tensor(ids)

        segments_tensor = torch.tensor([0]*text_len, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

In [5]:
def mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [6]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    recall_total = 0
    recall_correct = 0
    precision_total = 0
    precision_correct = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                
                total += labels.size(0)                
                recall_total += (labels==1).sum().item()
                precision_total += (pred==1).sum().item()
                
                correct += (pred==labels).sum().item()
                recall_correct += ((pred==labels)&(labels==1)).sum().item()
                precision_correct += ((pred==labels)&(pred==1)).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        
        recall = 0
        if recall_total > 0:
            recall = recall_correct/recall_total
        
        precision = 0
        if precision_total > 0:
            precision = precision_correct/precision_total
        
        return predictions, acc, recall, precision
    return predictions

In [7]:
#
TARGET_COLUMN = "fraudulent"
BESED_MODEL = "bert-base-cased"
NUM_CLASS = 2

#
BATCH_SIZE = 16
TRAIN_RUNS = 10
MAX_EPOCHS = 10

In [8]:
tokenizer = AutoTokenizer.from_pretrained(BESED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BESED_MODEL, num_labels=NUM_CLASS)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
train_df = pd.read_csv("/content/drive/MyDrive/Fake-JD-Detector/splitted_text_data/train.csv")
train_df

Unnamed: 0,text,fraudulent
0,Production Technician 1781 US NC Garner 1781Jo...,1
1,PHP Software Engineer GR I Athens Cookisto is ...,0
2,TV Makeup Artist US FL Miami Applied Memetics ...,0
3,Credit and Collections Clerk US WA Basil Lifec...,1
4,Entry Level Sales US VA Norfolk General Summar...,0
...,...,...
14257,Pay Per Click Specialist US NJ Hoboken SpotSou...,0
14258,Customer Service Technical Specialist US MN St...,0
14259,Lead Business Analyst US NY New York City Job ...,1
14260,Ruby on Rails Expert Disrupt the financial ind...,0


In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda


In [12]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

for run in range(TRAIN_RUNS):
    
    train_sample_dataset = JobPostingDataset("train", data=train_df.sample(frac=0.01), label_column=TARGET_COLUMN, tokenizer=tokenizer)
    train_sample_data_loader = DataLoader(train_sample_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)
    acc, recall, precision, eps, eps_loss = 0, 0, 0, 0, 0

    for epoch in range(MAX_EPOCHS):

        running_loss = 0.0
        for data in train_sample_data_loader:

            tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

            # 將參數梯度歸零
            optimizer.zero_grad()

            # forward pass
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors, 
                            labels=labels)

            loss = outputs[0]
            # backward
            loss.backward()
            optimizer.step()


            # 紀錄當前 batch loss
            running_loss += loss.item()

        # 計算分類準確率
        _, acc, recall, precision = get_predictions(model, train_sample_data_loader, compute_acc=True)
        eps, eps_loss = epoch, running_loss

        if recall == 1 and precision == 1:            
            break

    print("[run %d][epoch %d] loss: %.3f, acc: %.3f, recall: %.3f, precision: %3f" % (run+1, eps+1, eps_loss, acc, recall, precision))

device: cuda
[run 1][epoch 10] loss: 0.394, acc: 0.986, recall: 0.714, precision: 1.000000
[run 2][epoch 7] loss: 0.408, acc: 1.000, recall: 1.000, precision: 1.000000
[run 3][epoch 10] loss: 0.376, acc: 0.979, recall: 0.000, precision: 0.000000
[run 4][epoch 5] loss: 0.312, acc: 1.000, recall: 1.000, precision: 1.000000
[run 5][epoch 10] loss: 0.582, acc: 0.972, recall: 0.333, precision: 1.000000
[run 6][epoch 10] loss: 0.166, acc: 0.993, recall: 0.800, precision: 1.000000
[run 7][epoch 10] loss: 0.128, acc: 0.993, recall: 0.800, precision: 1.000000
[run 8][epoch 6] loss: 0.131, acc: 1.000, recall: 1.000, precision: 1.000000
[run 9][epoch 10] loss: 0.134, acc: 1.000, recall: 1.000, precision: 1.000000
[run 10][epoch 6] loss: 0.277, acc: 1.000, recall: 1.000, precision: 1.000000
CPU times: user 27min 3s, sys: 4.48 s, total: 27min 8s
Wall time: 27min 40s


In [13]:
model.save_pretrained("/content/drive/MyDrive/Fake-JD-Detector/fine_tuned_model")

# Testing

In [15]:
test_df = pd.read_csv("/content/drive/MyDrive/Fake-JD-Detector/splitted_text_data/verify.csv")
test_df

Unnamed: 0,text,fraudulent
0,Professional Services Specialist GB LND As a g...,0
1,Sr Windows Server Administrator US NC Sanford ...,0
2,Account Payable Rep US CA Chula Vista QB Medic...,0
3,Maintenance Person GB ABD Aberdeenshire Priori...,0
4,Online Marketing Specialist GB LND London incr...,0
...,...,...
3561,Graduate Software Engineer PK PB Lahore Founde...,0
3562,Tile Installer Setter US CA Fairfield This pos...,0
3563,Software Engineer Android Japanese US CA San F...,0
3564,English Teacher Abroad US NY New York We help ...,0


In [16]:
# Testing
test_dataset = JobPostingDataset("train", data=test_df, label_column=TARGET_COLUMN, tokenizer=tokenizer)
test_data_loader = DataLoader(train_sample_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)

In [17]:
_, acc, recall, precision = get_predictions(model, test_data_loader, compute_acc=True)
print("classification acc: ", acc)
print("classification recall:", recall)
print("classification precision:", precision)

classification acc:  1.0
classification recall: 1.0
classification precision: 1.0
