In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

from datetime import datetime
from tqdm import tqdm

from transformers import AutoConfig, AutoTokenizer, AutoModel

In [2]:
class CustomDataset(Dataset):
    """Dataset of (post, response) text pairs with one label per pair.

    Each item is tokenized on access: the post and the response are joined
    with the tokenizer's separator token and padded to the tokenizer's
    maximum length.

    Args:
        X: Indexable collection; ``X[i]`` must unpack into exactly two
            strings ``(post, response)``.
        y: Indexable collection of labels, aligned with ``X``.
        tokenizer: Callable tokenizer exposing ``special_tokens_map`` with a
            ``'sep_token'`` entry and accepting ``padding`` and
            ``return_tensors`` keyword arguments.
        device: Device every tokenized tensor is moved to in ``__getitem__``.
    """

    def __init__(self, X, y, tokenizer, device='cpu'):
        self.X = X
        self.y = y
        self.tokenizer = tokenizer
        self.device = device

    def __getitem__(self, i):
        """Return ``(tokenized_inputs, label)`` for the ``i``-th pair."""
        # Read from this dataset's own samples. Using a module-level ``X``
        # here would pair the labels of a split with rows of whatever array
        # happens to be bound to that global name.
        post, resp = self.X[i]
        sep_token = self.tokenizer.special_tokens_map['sep_token']
        el_sent = ' '.join([post, sep_token, resp])
        tokenized_el = self.tokenizer(el_sent, padding='max_length', return_tensors='pt')

        for k, v in tokenized_el.items():
            # The tokenizer returns tensors with a leading batch axis of size
            # one; drop only that axis so the DataLoader can stack the items.
            tokenized_el[k] = v.to(self.device).squeeze(0)

        return tokenized_el, self.y[i]

    def __len__(self):
        """Number of samples, defined by the number of labels."""
        return len(self.y)
    

In [3]:
class ClassifyBERTurk(nn.Module):
    """Binary classifier: a transformer encoder followed by a linear head.

    The hidden state of the first token of the encoder output is projected to
    a single logit and passed through a sigmoid, so ``forward`` returns
    probabilities of shape ``(batch, 1)``.

    Args:
        config: Transformers configuration of the encoder; must expose
            ``hidden_size``.
        pretrained_name_or_path: Optional checkpoint to load encoder weights
            from. When omitted, ``config.name_or_path`` is used if it is set.
            If neither is available the encoder is built from ``config`` only
            and therefore starts with randomly initialised weights.
    """

    def __init__(self, config, pretrained_name_or_path=None):
        super(ClassifyBERTurk, self).__init__()

        # ``AutoModel.from_config`` only builds the architecture; it does not
        # load any weights. Prefer ``from_pretrained`` whenever we know which
        # checkpoint the config belongs to, otherwise the encoder is random.
        weights_source = pretrained_name_or_path or getattr(config, 'name_or_path', None)
        if weights_source:
            self.berturk = AutoModel.from_pretrained(weights_source, config=config)
        else:
            self.berturk = AutoModel.from_config(config)

        self.fc = nn.Linear(config.hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, **inputs):
        """Run the encoder on ``inputs`` and return sigmoid probabilities."""
        bert_out = self.berturk(**inputs)
        # First element of the encoder output, restricted to the first token
        # of every sequence.
        first_token_hidden = bert_out[0][:, 0, :]

        fc_out = self.fc(first_token_hidden)
        return self.sigmoid(fc_out)
    

In [4]:
def fine_tune(model, dataloader, epochs, loss_fn, optimizer,
              val_dataloader=None, threshold=.4, device='cpu'):
    """Train the classification head of ``model`` with the encoder frozen.

    Args:
        model: Module exposing a ``berturk`` sub-module (frozen here) and
            callable as ``model(**inputs)``.
        dataloader: Iterable of ``(inputs, labels)`` batches, where ``inputs``
            is a mapping of tensors and ``labels`` is a tensor.
        epochs: Number of passes over ``dataloader``.
        loss_fn: Callable ``loss_fn(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer already bound to the model parameters.
        val_dataloader: Optional loader; when given, accuracy on it is
            computed with ``test`` and printed after every epoch.
        threshold: Outputs strictly above this value count as class 1.
        device: Device the model and every batch are moved to.

    Returns:
        Tuple ``(model, losses)`` where ``losses`` holds the loss of every
        training step across all epochs, in order.
    """
    model = model.to(device)

    # Freeze the encoder; only parameters outside ``berturk`` receive grads.
    for param in model.berturk.parameters():
        param.requires_grad = False

    losses = []

    for epoch in range(epochs):
        model.train()

        # Progress-bar statistics are per epoch, so reset them here.
        num_true, total_count = 0, 0
        epoch_loss_sum = 0.0

        pbar = tqdm(dataloader)
        pbar.set_description(f"Epoch #{epoch+1}")
        for step, batch in enumerate(pbar, start=1):
            X, y = batch
            X = {k: v.to(device) for k, v in X.items()}
            y = y.float().to(device)

            optimizer.zero_grad()

            # Drop only the trailing size-1 axis. A bare ``squeeze()`` would
            # also drop the batch axis when a batch holds a single sample,
            # and the loss would then see mismatched shapes.
            outputs = model(**X).squeeze(-1)

            loss = loss_fn(outputs, y)
            loss_value = loss.item()
            losses.append(loss_value)
            epoch_loss_sum += loss_value
            loss.backward()

            optimizer.step()

            pred = (outputs > threshold).float()
            num_true += (pred == y).sum().item()
            total_count += len(y)
            acc = num_true / total_count

            pbar.set_postfix(acc=f"{100*acc:.2f}", bce_loss=f"{epoch_loss_sum/step}")

        if val_dataloader is not None:
            val_acc = float(test(model, val_dataloader, threshold, device))
            print(f"Epoch #{epoch+1} validation accuracy: {100*val_acc:.2f}")

    return model, losses


def test(model, dataloader, threshold=.4, device='cpu'):
    true_count = 0
    total_count = 0

    model = model.to(device)
    model.eval()

    with torch.no_grad():
        for (X, y) in tqdm(dataloader):
            y = y.to(device)
            outputs = model(**X).squeeze()
            
            pred = (outputs > threshold).float()
            true_count += sum(pred == y)
            total_count += len(y)
    acc = true_count/total_count
    return acc

In [5]:
def get_tokenizable(df, tokenizer):
    """Keep only the rows whose joined post/response fits the tokenizer.

    Each row's ``post`` and ``response`` are joined with the tokenizer's
    separator token and tokenized; the row is kept when the token count is
    below ``tokenizer.model_max_length - 1``.

    Args:
        df: DataFrame with string columns ``post`` and ``response``.
        tokenizer: Object exposing ``special_tokens_map['sep_token']``,
            ``tokenize(text)`` and ``model_max_length``.

    Returns:
        The subset of ``df`` (original index preserved) that passes the
        length check.
    """
    sep_token = tokenizer.special_tokens_map['sep_token']
    length_limit = tokenizer.model_max_length - 1

    keep_positions = []
    # Track the row *position*, not the index label that ``iterrows`` yields:
    # the selection below is positional, and labels only coincide with
    # positions for a default RangeIndex.
    for position, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
        el_sent = ' '.join([row.post, sep_token, row.response])
        tok_sent = tokenizer.tokenize(el_sent)

        if len(tok_sent) < length_limit:
            keep_positions.append(position)

    return df.iloc[keep_positions, :]

In [6]:
MODEL_PATH = 'dbmdz/bert-base-turkish-cased'
BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
THRESHOLD = 0.4
EPOCHS = 10
LR = 0.01
TARGET_ACC = 0.55    # stop retrying once test accuracy reaches this value
MAX_ATTEMPTS = 20    # hard cap so the retry loop always terminates

# Everything below up to the loop is identical for every attempt, so it is
# done once instead of re-downloading and re-reading on each retry.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, force_download=True)
bert_config = AutoConfig.from_pretrained(MODEL_PATH, force_download=True)

df = pd.read_csv('../input/taboo-datasets/post_resp_dataset_75.csv')
df = get_tokenizable(df, tokenizer)

y = df['label'].to_numpy()
X = df.drop('label', axis=1).to_numpy()

# NOTE(review): each attempt draws a new random split and the loop stops on
# the accuracy measured on that attempt's test split, so the final number is
# an optimistic estimate. Consider a fixed held-out set for reporting.
last_acc = 0
for attempt in range(1, MAX_ATTEMPTS + 1):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    train_set = CustomDataset(X=X_train, y=y_train, tokenizer=tokenizer, device=DEVICE)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)

    test_set = CustomDataset(X=X_test, y=y_test, tokenizer=tokenizer, device=DEVICE)
    # Evaluation order does not affect accuracy, so no shuffling is needed.
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)

    bert_model = ClassifyBERTurk(config=bert_config)

    bce_loss = nn.BCELoss()
    opt = optim.Adam(bert_model.parameters(), lr=LR)

    bert_model, losses = fine_tune(bert_model, train_loader, EPOCHS, loss_fn=bce_loss,
                                   optimizer=opt, threshold=THRESHOLD, device=DEVICE)

    # Force a plain float so comparisons and the file name below do not
    # depend on whether ``test`` returns a tensor or a number.
    acc = float(test(bert_model, test_loader, threshold=THRESHOLD, device=DEVICE))
    print(f"Accuracy: {100*acc:.2f}")

    if acc > last_acc:
        torch.save(bert_model, f'adequacy_model_{acc}')
        last_acc = acc

    if acc >= TARGET_ACC:
        torch.save(bert_model, f'adequacy_model_best')
        break
else:
    print(f"Target accuracy {TARGET_ACC} not reached in {MAX_ATTEMPTS} attempts; "
          f"best accuracy was {100*last_acc:.2f}")


In [17]:
import matplotlib.pyplot as plt

# Per-step training losses returned by the last `fine_tune` run.
fig, ax = plt.subplots()
ax.plot(losses)
ax.set(title='Training loss', xlabel='Training step', ylabel='BCE loss')
plt.show()

In [15]:
# Persist the whole model object under a name stamped with the current time.
checkpoint_name = f'adequacy_model_{datetime.now():%Y%m%d%H%M%S}'
torch.save(bert_model, checkpoint_name)