In [2]:
from transformers import (
   BertTokenizerFast,
   AutoModelForCausalLM
)

# masked language model (ALBERT, BERT)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
pre_train_model = AutoModelForCausalLM.from_pretrained('ckiplab/gpt2-base-chinese') # or other models above


In [3]:
# embedding空間維度
d_model = pre_train_model.config.hidden_size
# 字典長度
n_token = tokenizer.vocab_size
# 類別數
catogories = 2
# 一句話進model的token數
seq_len = 20
# batch size
batch_size = 600

In [4]:
import torch
from typing import List

def get_pretrained_embed_matrix(input_txts:List[str], seq_len:int):

    # 取得input_ids
    input_ids = tokenizer(input_txts ,return_tensors="pt", padding=True, truncation=True)['input_ids']


    if input_ids.size()[1] < seq_len:
        # 補到長度為20
        input_ids = torch.cat((input_ids, torch.zeros((input_ids.size()[0], seq_len - input_ids.size()[1]), dtype=torch.long)), dim=1)
        
    elif input_ids.size()[1] > seq_len:
        # 截斷到長度為20
        input_ids = input_ids[:, :seq_len]

    # 查看model.transformer有哪些屬性
    embedding_matrix = pre_train_model.get_input_embeddings().weight.data[input_ids].squeeze() 

    # shape: [batch_size, seq_len, d_model] -> [seq_len, batch_size, d_model]
    embedding_matrix = torch.transpose(embedding_matrix, 0, 1)

    return embedding_matrix

In [5]:
import math
import torch
from torch import nn, Tensor, softmax, argmax
from torch.nn import TransformerEncoder, TransformerEncoderLayer


class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: Tensor) -> Tensor:
        """
        Arguments:
            x: Tensor, shape ``[seq_len, batch_size, embedding_dim]``
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

class TransformerModel(nn.Module):

    def __init__(self, catogories: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5):
        super().__init__()
        self.model_type = 'Transformer'
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout) 
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embedding = get_pretrained_embed_matrix
        self.d_model = d_model
        self.linear = nn.Linear(d_model, catogories)
        # self.linear_test = nn.Linear(d_model, catogories)


    def forward(self, src_texts: Tensor, src_mask: Tensor = None) -> Tensor:
        """
        Arguments:
            src: Tensor, shape ``[seq_len, batch_size]``
            src_mask: Tensor, shape ``[seq_len, seq_len]``

        Returns:
            output Tensor of shape ``[seq_len, batch_size, catogories]``
        """
        src = self.embedding(input_txts=src_texts, seq_len=seq_len)

        src = src* math.sqrt(self.d_model) 
        src = self.pos_encoder(src)
        if src_mask is None:
            """Generate a square causal mask for the sequence. The masked positions are filled with float('-inf').
            Unmasked positions are filled with float(0.0).
            """
            # src_mask = nn.Transformer.generate_square_subsequent_mask(len(src)).to(device)
            src_mask = nn.Transformer.generate_square_subsequent_mask(len(src))
        output = self.transformer_encoder(src, src_mask)
        
        output = torch.mean(output, dim=0)
        output = self.linear(output)
        # 過個softmax
        output = softmax(output, dim=1)
        # 取機率較大的那個類別
        # output = argmax(output, dim=1)
        return output

In [7]:
from torch.utils.data import Dataset

# 自定义数据集类(for dataloader用)
class MyDataset(Dataset):
    def __init__(self, inputs, outputs):
        self.inputs = inputs
        self.outputs = outputs

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index], self.outputs[index]

In [8]:
import pandas as pd
from torch.utils.data import DataLoader
from torch.utils.data.sampler import WeightedRandomSampler
# 讀資料
data_csv = pd.read_csv("content_df.csv").dropna().sample(frac=1, random_state=42).reset_index(drop=True)
y = data_csv["Click"]
X = data_csv["title"]
# X = data_csv['Click'].apply(lambda label: "妳好嗎" if label == 1 else "天氣不錯") # 測試
cutpoint = len(y)//10*8 # 切8成當訓練資料
y_train, y_test = y[:cutpoint], y[cutpoint:]
X_train, X_test = X[:cutpoint], X[cutpoint:]

# weight
true_weight = len(y_train)/(y_train==1).sum()
false_weight = len(y_train)/(y_train==0).sum()
train_weights = y_train.apply(lambda label: true_weight if label == 1 else false_weight).to_list()

# # 定義訓練時的sampler
weights = [len(y_train)/(y_train==0).sum(), len(y_train)/(y_train==1).sum()]
sample_set_nums = ((y_train==1).sum())*1 if ((y_train==1).sum())*1 < len(y_train) else len(y_train)
# sample_set_nums = X_train.shape[0]
train_sampler = WeightedRandomSampler(weights=train_weights, num_samples=int(sample_set_nums), replacement=False)

# # 创建数据集和数据加载器
train_dataset = MyDataset(X_train, y_train)
test_dataset = MyDataset(X_test, y_test)

# # 定義訓練時的dataloader
train_dataloader = DataLoader(dataset=train_dataset, sampler=train_sampler, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=len(y_test), shuffle=False) #驗證時不用隨機丟

In [9]:
model = TransformerModel(catogories=catogories, d_model=d_model, nhead=1, d_hid=128, nlayers=1, dropout=0.2)


In [10]:
epochs = 100000
criterion = nn.CrossEntropyLoss()
# lr = 5.0  # learning rate
# optimizer = torch.optim.Adam(model.parameters(), lr=lr)
optimizer = torch.optim.Adam(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)

In [12]:
import time
best_val_loss = float('inf')

def train(model: nn.Module) -> None:

    model.train()  # turn on train mode
    total_loss = 0
    log_interval = 5
    start_time = time.time()

    for batch, (train_data, labels) in enumerate(train_dataloader):

        batch += 1
        
        # 轉float
        output = model.forward(train_data)
        targets = torch.eye(2)[labels.long()]

        # 計算損失
        loss = criterion(output, targets)
        total_loss += loss.item()
        

        # 反向傳播
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()
        
        
        if batch % log_interval == 0 and batch > 1:

            lr = scheduler.get_last_lr()[0]
            ms_per_batch = (time.time() - start_time) * 1000 / log_interval
            cur_loss = total_loss / log_interval
            ppl = math.exp(cur_loss)
            print(f'| epoch {epoch:3d} | batch {(batch):5d} | '
                    f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
                    f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            total_loss = 0
            start_time = time.time()

from sklearn.metrics import f1_score

def f1_test(model):
    model.eval()
    y_pred = torch.argmax(model.forward(X_test.to_list()), dim=1)
    f1 = f1_score(y_test, y_pred.detach().numpy())
    print("test f1: ",f1)

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    f1_test(model)

    scheduler.step()

| epoch   1 | batch     5 | lr 0.00 | ms/batch 1046.00 | loss  0.72 | ppl     2.06
test f1:  0.18518518518518517
| epoch   2 | batch     5 | lr 0.00 | ms/batch 1026.04 | loss  0.67 | ppl     1.96
test f1:  0.4011857707509881
| epoch   3 | batch     5 | lr 0.00 | ms/batch 1079.42 | loss  0.63 | ppl     1.88
test f1:  0.5379609544468547
| epoch   4 | batch     5 | lr 0.00 | ms/batch 999.78 | loss  0.61 | ppl     1.83
test f1:  0.5536159600997507
| epoch   5 | batch     5 | lr 0.00 | ms/batch 1044.38 | loss  0.59 | ppl     1.80
test f1:  0.5381460213289582
| epoch   6 | batch     5 | lr 0.00 | ms/batch 1054.55 | loss  0.59 | ppl     1.80
test f1:  0.5594149908592322
| epoch   7 | batch     5 | lr 0.00 | ms/batch 1024.93 | loss  0.58 | ppl     1.78
test f1:  0.5742424242424241
| epoch   8 | batch     5 | lr 0.00 | ms/batch 1052.21 | loss  0.58 | ppl     1.79
test f1:  0.5815602836879432
| epoch   9 | batch     5 | lr 0.00 | ms/batch 1105.34 | loss  0.57 | ppl     1.77
test f1:  0.571226080

KeyboardInterrupt: 