# Some References for Inspiration

* [Real/Fake Job Posting Prediction](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)
* [Fine-tuing BERT model for text classification](https://www.kaggle.com/datasets/shivamb/real-or-fake-fake-jobposting-prediction)

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
import os
import copy
import torch

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.nn.utils.rnn import pad_sequence

In [3]:
class JobPostingDataset(Dataset):    
    def __init__(self, mode, data, label_column, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode        
        self.df = copy.deepcopy(data).fillna("")
        self.len = len(self.df)
        self.label_column = label_column
        self.label_map = {label:index for index, label in enumerate(list(set(self.df[label_column].to_list())))}
        self.tokenizer = tokenizer
    
    def __getitem__(self, idx):
        if self.mode == "test":
            text = self.df.iloc[idx, :1].values
            label_tensor = None
        else:
            text, label = self.df.iloc[idx, :].values            
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
            
        text_word_pieces = self.tokenizer.tokenize(text, max_length=512, truncation=True)
        text_len = len(text_word_pieces)
                
        ids = self.tokenizer.convert_tokens_to_ids(text_word_pieces)
        tokens_tensor = torch.tensor(ids)

        segments_tensor = torch.tensor([0]*text_len, dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len

In [4]:
def mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [5]:
def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0
    recall_total = 0
    recall_correct = 0
    precision_total = 0
    precision_correct = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                
                total += labels.size(0)                
                recall_total += (labels==1).sum().item()
                precision_total += (pred==1).sum().item()
                
                correct += (pred==labels).sum().item()
                recall_correct += ((pred==labels)&(labels==1)).sum().item()
                precision_correct += ((pred==labels)&(pred==1)).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
    
    if compute_acc:
        acc = correct / total
        
        recall = 0
        if recall_total > 0:
            recall = recall_correct/recall_total
        
        precision = 0
        if precision_total > 0:
            precision = precision_correct/precision_total
        
        return predictions, acc, recall, precision
    return predictions

In [6]:
#
TARGET_COLUMN = "fraudulent"
BESED_MODEL = "bert-base-cased"
NUM_CLASS = 2

#
BATCH_SIZE = 16
TRAIN_RUNS = 10
MAX_EPOCHS = 10

In [7]:
tokenizer = AutoTokenizer.from_pretrained(BESED_MODEL)
model = AutoModelForSequenceClassification.from_pretrained(BESED_MODEL, num_labels=NUM_CLASS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [8]:
train_df = pd.read_csv("splitted_text_data/train.csv")
train_df

Unnamed: 0,text,fraudulent
0,Part Time Administrative Assistant US NJ Elmwo...,0
1,Back End Engineer Platform Team GB Lost My Nam...,0
2,Sr Accountant Financial Analyst US The positio...,0
3,Social Universe Project Manager MX DIF Grandat...,0
4,Junior Developer NL NH Amsterdam About the Com...,0
...,...,...
14299,Outside Sales Professional Kenosha US WI Kenos...,0
14300,Commercial Real Estate Salesperson NJ US NJ El...,0
14301,Sales Executive US CA Sacramento Dice is a hig...,0
14302,Customer Experience Professionals US TX Austin...,0


In [10]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = model.to(device)

for run in range(TRAIN_RUNS):
    
    train_sample_dataset = JobPostingDataset("train", data=train_df.sample(frac=0.01), label_column=TARGET_COLUMN, tokenizer=tokenizer)
    train_sample_data_loader = DataLoader(train_sample_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)
    acc, recall, precision, eps, eps_loss = 0, 0, 0, 0, 0

    for epoch in range(MAX_EPOCHS):

        running_loss = 0.0
        for data in train_sample_data_loader:

            tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]

            # 將參數梯度歸零
            optimizer.zero_grad()

            # forward pass
            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors, 
                            labels=labels)

            loss = outputs[0]
            # backward
            loss.backward()
            optimizer.step()


            # 紀錄當前 batch loss
            running_loss += loss.item()

        # 計算分類準確率
        _, acc, recall, precision = get_predictions(model, train_sample_data_loader, compute_acc=True)
        eps, eps_loss = epoch, running_loss

        if recall == 1 and precision == 1:            
            break

    print("[run %d][epoch %d] loss: %.3f, acc: %.3f, recall: %.3f, precision: %3f" % (run+1, eps+1, eps_loss, acc, recall, precision))

device: cuda:0
[run 1][epoch 10] loss: 1.005, acc: 0.951, recall: 0.000, precision: 0.000000
[run 2][epoch 9] loss: 0.250, acc: 1.000, recall: 1.000, precision: 1.000000
[run 3][epoch 7] loss: 0.459, acc: 1.000, recall: 1.000, precision: 1.000000
[run 4][epoch 7] loss: 0.166, acc: 1.000, recall: 1.000, precision: 1.000000
[run 5][epoch 3] loss: 0.356, acc: 1.000, recall: 1.000, precision: 1.000000
[run 6][epoch 9] loss: 0.123, acc: 1.000, recall: 1.000, precision: 1.000000
[run 7][epoch 5] loss: 0.447, acc: 1.000, recall: 1.000, precision: 1.000000
[run 8][epoch 3] loss: 0.211, acc: 1.000, recall: 1.000, precision: 1.000000
[run 9][epoch 3] loss: 0.254, acc: 1.000, recall: 1.000, precision: 1.000000
[run 10][epoch 5] loss: 0.295, acc: 1.000, recall: 1.000, precision: 1.000000
CPU times: user 12min 14s, sys: 7min 6s, total: 19min 21s
Wall time: 19min 25s


In [11]:
model.save_pretrained("fine_tuned_model")

# Testing

In [12]:
test_df = pd.read_csv("splitted_text_data/test.csv")
test_df

Unnamed: 0,text,fraudulent
0,UX Front End Developer GR I Athens Nubis is at...,0
1,Production Manager Heavy Duty Diesel 2022 US T...,1
2,Security System Installer 2GIG US DE Wilmingto...,0
3,Talent Acquisition Specialist USA US NY New Yo...,0
4,Graduates English Teacher Abroad Conversationa...,0
...,...,...
3571,Outside Sales Professional US East Peoria ABC ...,0
3572,Head of Online Marketing DE BY Munich hello wo...,0
3573,English Teacher Abroad US NY Alfred We help te...,0
3574,Sales Director Business to Business Channel US...,0


In [13]:
# Testing
test_dataset = JobPostingDataset("train", data=test_df, label_column=TARGET_COLUMN, tokenizer=tokenizer)
test_data_loader = DataLoader(train_sample_dataset, batch_size=BATCH_SIZE, collate_fn=mini_batch)

In [14]:
_, acc, recall, precision = get_predictions(model, test_data_loader, compute_acc=True)
print("classification acc: ", acc)
print("classification recall:", recall)
print("classification precision:", precision)

classification acc:  0.986013986013986
classification recall: 0.8571428571428571
classification precision: 0.8571428571428571
