In [1]:
INPUT_FILE = 'dataset/app.db'

Load training dataset

In [65]:
from pathlib import Path
import sqlite3
import pandas as pd

with sqlite3.connect(Path(INPUT_FILE)) as db:
	questions_ids = pd.read_sql('SELECT * FROM train_dataset', db)
	questions_ids = questions_ids['import_questions_id']
	questions_ids = questions_ids.unique()
	questions_ids = questions_ids.tolist()
 
	query = f'SELECT * FROM import_questions WHERE id IN({str(questions_ids)[1:][:-1]})'
	questions_txt = pd.read_sql(query, db)
 
	text_ids = questions_txt['import_text_id']
	text_ids = text_ids.unique()
	text_ids = text_ids.tolist()
	query = f'SELECT * FROM import_text WHERE id IN({str(text_ids)[1:][:-1]})'
	text_txt = pd.read_sql(query, db)
 

Table `import_questions`

In [67]:
questions_txt

Unnamed: 0,id,import_text_id,question,answer
0,111,42,Who was Nicolaus Copernicus?,"Nicholas Copernicus was a Polish astronomer, m..."
1,112,42,In what years did Nicolaus Copernicus live?,Nicholas Copernicus lived in the years 1473-1543.
2,113,42,What theory did Nicolaus Copernicus develop?,Nicholas Copernicus developed a heliocentric t...
3,114,42,What is the most important work of Nicolaus Co...,The most important work of Nicolaus Copernicus...
4,115,42,When was 'De revolutionibus orbium coelestium'...,The work 'De revolutionibus orbium coelestium'...
...,...,...,...,...
64,196,50,Was Baroque a delicious style?,"Yes, baroque was a delicious style."
65,197,50,Which centuries does the Baroque era cover?,"The Baroque era covers the 16th, 17th and 18th..."
66,198,50,Did baroque refer to counterweights and movement?,"Yes, baroque referred to counterweights and mo..."
67,199,50,Did Baroque influence architecture?,"Yes, Baroque had a great influence on the deve..."


Table `import_text_id`

In [68]:
text_txt

Unnamed: 0,id,import_raw_id,text
0,42,1,Nicholas Copernicus (1473-1543) was a Polish a...
1,43,1,Maria Skłodowska-Curie (1867-1934) was a Polis...
2,44,1,The Great Wall of China is a system of defensi...
3,45,1,"Gothic architecture, which developed in Europe..."
4,46,1,The French Revolution (1789-1799) was a period...
5,48,1,Leonardo da Vinci was an Italian Renaissance a...
6,50,1,Baroque is an era in the history of art that p...


Prepare the model

In [258]:
from transformers import GPT2ForTokenClassification, GPT2TokenizerFast

model_name = 'openai-community/gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
model = GPT2ForTokenClassification.from_pretrained(model_name)

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at openai-community/gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [259]:
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

Prepare dataset class

In [260]:
from torch.utils.data import Dataset

class QADataset(Dataset):
	def __init__(self, text: pd.DataFrame, questions: pd.DataFrame):
		super().__init__()
		self.text = text
		self.questions = questions
  
	def __len__(self):
		return len(self.questions)
  
	def __getitem__(self, indx, process=True):
		# default Dataset.__iter__ is not implemented
		if indx >= len(self):
			raise IndexError
     
		cols = ['import_text_id', 'question', 'answer']
		txt_id, question, answer = self.questions.loc[indx, cols]
		txt = self.text.loc[self.text['id'] == txt_id, 'text']
		txt = txt.item()
  
		if process:
			data = self.process(txt, question, answer)
			return data

		return txt, question, answer

	def process(self, txt, question, answer):
		input_format = f'''
		[CONTEXT] {txt}

		[QUESTION] {question}
		'''

		output_format = f'[ANSWER] {answer}'

		processed_input = tokenizer(
			input_format, 
   			return_tensors='pt', 
      		padding='max_length', 
            truncation=True
        )
		processed_output = tokenizer(
      		output_format, 
        	return_tensors='pt',
            padding='max_length', 
            truncation=True
    	)
  
		processed_input['labels'] = processed_output['input_ids']
		return processed_input

In [261]:
dataset = QADataset(text_txt, questions_txt)
dataset[0]

{'input_ids': tensor([[  198,   197,   197,  ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[   58, 15037, 45532,  ..., 50257, 50257, 50257]])}

Training phase

In [262]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
	output_dir='training_out',
	overwrite_output_dir=True, 
	do_train=True,
	num_train_epochs=10,
	logging_first_step=True,
	logging_steps=10,
	save_total_limit=5,
	fp16=True
)
trainer = Trainer(
	model=model, 
	args=args, 
	train_dataset=dataset,
	tokenizer=tokenizer
)

In [263]:
trainer.train()

  0%|          | 0/90 [00:00<?, ?it/s]

: 

Save the model

In [None]:
trainer.save_model('model')