In [19]:
import pandas as pd

import torch
from torch.nn import functional as F
from torch.nn import LSTM
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from pytorch_lightning import LightningModule, Trainer

from transformers import AutoModel, AutoTokenizer
from datasets import Dataset as D

from tqdm import tqdm

#!git clone https://github.com/TQuad/turkish-nlp-qa-dataset

In [20]:
def tquad2df(path):
    df = pd.DataFrame(columns=['title', 'content', 'question'])
    
    dataset = D.from_json(path)['data'][0]
    
    for data in tqdm(dataset):
        title = data['title']
        for para in data['paragraphs']:
            context = para['context']
            for qa in para['qas']:
                question = qa['question']
                
                df_el = pd.DataFrame([{'title': title, 
                                       'content': context, 
                                       'question': question}])
                df = pd.concat([df, df_el], ignore_index=True)
    
    return df

In [21]:
def get_tokenizable(df, tokenizer):
    ls = []
    for i, row in tqdm(df.iterrows()):
        content_len = len(tokenizer.tokenize(row.content))
        question_len = len(tokenizer.tokenize(row.question))
        model_len = tokenizer.model_max_length-1
        if content_len < model_len and question_len < model_len:
            ls.append(i)
            
    return df.iloc[ls, :]

In [38]:
class TquadDataset(Dataset):
    def __init__(self, path, tokenizer=None):
        super(TquadDataset, self).__init__()
        self.df = get_tokenizable(tquad2df(path), tokenizer)
        self.tokenizer = tokenizer
        self.device = device
        
        
    def __getitem__(self, i):
        if self.tokenizer is None:
            raise ValueError('Tokenizer not provided..')
            
        df_element = self.df.iloc[i]
        content, question = df_element.content, df_element.question
        
        X = tokenizer(content, padding='max_length', return_tensors='pt')
        y = tokenizer(question, padding='max_length', return_tensors='pt')
        
        return X, y

    def __len__(self):
        return len(self.df)
        

In [40]:
class AutoModelForTextGeneration(LightningModule):
    def __init__(self, model_path, lr):
        super(AutoModelForTextGeneration, self).__init__()
        self.lr = lr
        
        self.bert_layer = AutoModel.from_pretrained(model_path)
        conf = self.bert_layer.config
        self.lstm_layer = LSTM(conf.hidden_size, 1, 3, bidirectional=True, batch_first=True)
        
    def forward(self, x):
        bert_out = self.bert_layer(**x)
        last_hidden_layer = bert_out[0][:, 0, :][None, :]
        lstm_out = self.lstm_layer(last_hidden_layer)
        return lstm_out
    
    def configure_optimizers(self):
        return AdamW(self.parameters(), lr=self.lr)
    
    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        y = y['input_ids']
        _, (y_pred, _) = self(x)
        y_pred = torch.argmax(y_pred, axis=2)
        loss = F.cross_entropy(y, y_pred)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        y = y['input_ids']
        _, (y_pred, _) = self(x)
        y_pred = torch.argmax(y_pred, axis=2)
        val_loss = F.cross_entropy(y, y_pred)
        return val_loss


In [39]:
TRAIN_DIR = 'turkish-nlp-qa-dataset/train-v0.1.json'
DEV_DIR = 'turkish-nlp-qa-dataset/dev-v0.1.json'
MODEL_PATH = 'dbmdz/bert-base-turkish-cased'

BATCH_SIZE = 256
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
THRESHOLD = 0.4
EPOCHS = 10
LR = 0.01

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

train_dataset = TquadDataset(TRAIN_DIR, tokenizer, device=DEVICE)
dev_dataset = TquadDataset(DEV_DIR, tokenizer, device=DEVICE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_loader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True)

bert_model = AutoModelForTextGeneration(model_path=MODEL_PATH, lr=LR)
trainer = Trainer()
trainer.fit()

Using custom data configuration default-396ab1108c46a4d9
Reusing dataset json (/Users/bugrahamzagundog/.cache/huggingface/datasets/json/default-396ab1108c46a4d9/0.0.0)
100%|██████████| 681/681 [00:05<00:00, 122.29it/s]
0it [00:00, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (599 > 512). Running this sequence through the model will result in indexing errors
8308it [00:07, 1114.25it/s]


({'input_ids': tensor([[    2,  4384, 11449, 27187,  3234,    18, 12462,  4573,  2862, 10785,
           8588,  5207,  2140,  6593,  1996,  9411,  3855,  1007,  8442,  3107,
          13603,  2268,  2050,  3458, 10416,  1998,    18,  8866,  2048,  9411,
             16,  3070,   554,  2054,  7554,  1991,  5739,  1033,    12,  5739,
           1033,  4129, 24163,  8928,  1022,    13,  2133, 12211,  2323,    17,
           2537,  1974,    70,    18,  7397,  2323,    17, 27794,  7742,    12,
           5462,  1030,  3870, 10878,  1087,    19,  3226, 24559,    13,   554,
          12018,    18, 14299,    12, 18325,  1050,    13, 13764, 24542,  1992,
          15194,  5347,  2154, 11315,  2356, 13419,  2048,  3129,  1009,    16,
          18090,  9823,  7141,    16,  3550,  6732,  7554,   554,  2054,  2688,
           2904,  3311,  4384, 13334,  3926,  3139, 20579,  1996,  4484,  2133,
          13603,  1992,  4729,  7504,  2029,  2074,  5999,  5659,    18,  2673,
            554,  3041,  3

In [37]:
len(train_dataset)

8308

In [None]:
# data
dataset = MNIST('', train=True, download=True, transform=transforms.ToTensor())
mnist_train, mnist_val = random_split(dataset, [55000, 5000])

train_loader = DataLoader(mnist_train, batch_size=32)
val_loader = DataLoader(mnist_val, batch_size=32)

# model
model = LitAutoEncoder()

# training
trainer = pl.Trainer(gpus=1, num_nodes=8, precision=16, limit_train_batches=0.5)
trainer.fit(model, train_loader, val_loader)
    