In [1]:
INPUT_FILE = 'dataset/app.db'

Load datasets

In [2]:
from pathlib import Path
import sqlite3
import pandas as pd

def get_dataset(name: str):
	with sqlite3.connect(Path(INPUT_FILE)) as db:
		questions_ids = pd.read_sql(f'SELECT * FROM {name}', db)
		questions_ids = questions_ids['import_questions_id']
		questions_ids = questions_ids.unique()
		questions_ids = questions_ids.tolist()
	
		query = f'SELECT * FROM import_questions WHERE id IN({str(questions_ids)[1:][:-1]})'
		questions_txt = pd.read_sql(query, db)
	
		text_ids = questions_txt['import_text_id']
		text_ids = text_ids.unique()
		text_ids = text_ids.tolist()
		query = f'SELECT * FROM import_text WHERE id IN({str(text_ids)[1:][:-1]})'
		text_txt = pd.read_sql(query, db)

	return questions_txt, text_txt
 

In [3]:
train_questions_txt, train_text_txt = get_dataset('train_dataset')
test_questions_txt, test_text_txt = get_dataset('test_dataset')

Table `import_questions`

In [4]:
train_questions_txt

Unnamed: 0,id,import_text_id,question,answer
0,111,42,Who was Nicolaus Copernicus?,"Nicholas Copernicus was a Polish astronomer, m..."
1,112,42,In what years did Nicolaus Copernicus live?,Nicholas Copernicus lived in the years 1473-1543.
2,113,42,What theory did Nicolaus Copernicus develop?,Nicholas Copernicus developed a heliocentric t...
3,114,42,What is the most important work of Nicolaus Co...,The most important work of Nicolaus Copernicus...
4,115,42,When was 'De revolutionibus orbium coelestium'...,The work 'De revolutionibus orbium coelestium'...
...,...,...,...,...
64,196,50,Was Baroque a delicious style?,"Yes, baroque was a delicious style."
65,197,50,Which centuries does the Baroque era cover?,"The Baroque era covers the 16th, 17th and 18th..."
66,198,50,Did baroque refer to counterweights and movement?,"Yes, baroque referred to counterweights and mo..."
67,199,50,Did Baroque influence architecture?,"Yes, Baroque had a great influence on the deve..."


Table `import_text_id`

In [5]:
train_text_txt

Unnamed: 0,id,import_raw_id,text
0,42,1,Nicholas Copernicus (1473-1543) was a Polish a...
1,43,1,Maria Skłodowska-Curie (1867-1934) was a Polis...
2,44,1,The Great Wall of China is a system of defensi...
3,45,1,"Gothic architecture, which developed in Europe..."
4,46,1,The French Revolution (1789-1799) was a period...
5,48,1,Leonardo da Vinci was an Italian Renaissance a...
6,50,1,Baroque is an era in the history of art that p...


Prepare the model

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = 'openai-community/gpt2'
# model_name = 'EleutherAI/gpt-neo-1.3B'
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'})
tokenizer.add_tokens(['[ANSWER]', '[QUESTION]', '[CONTEXT]'])

3

In [7]:
model = AutoModelForCausalLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

Embedding(50263, 768)

Prepare dataset class

In [8]:
from torch.utils.data import Dataset

class QADataset(Dataset):
	def __init__(self, text: pd.DataFrame, questions: pd.DataFrame):
		super().__init__()
		self.text = text
		self.questions = questions
  
	def __len__(self):
		return len(self.questions)
  
	def __getitem__(self, indx, process=True):
		# default Dataset.__iter__ is not implemented
		if indx >= len(self):
			raise IndexError
	 
		cols = ['import_text_id', 'question', 'answer']
		txt_id, question, answer = self.questions.loc[indx, cols]
		txt = self.text.loc[self.text['id'] == txt_id, 'text']
		txt = txt.item()
  
		if process:
			data = self.process(txt, question, answer)
			return data

		return txt, question, answer

	def process(self, txt, question, answer):
		input_format = f'[BOS][CONTEXT]{txt}[QUESTION]{question}[ANSWER]{answer}[EOS]'
		input_format = input_format\
      		# .replace('\n', '')\
            # .replace('\t', '')\
        	# .strip()\
            # .replace('  ', '')

		processed_input = tokenizer(
			input_format, 
   			return_tensors='pt', 
	  		padding='max_length', 
			truncation=True
		)
		# processed_output = tokenizer(
	  	# 	output_format, 
		# 	return_tensors='pt',
		# 	padding='max_length', 
		# 	truncation=True
		# )
  
		processed_input['labels'] = processed_input['input_ids']
		# processed_input['labels'] = processed_output['input_ids']
		return processed_input

In [9]:
train_dataset = QADataset(train_text_txt, train_questions_txt)
test_dataset = QADataset(test_text_txt, test_questions_txt)

In [10]:
train_dataset[0]

{'input_ids': tensor([[50258, 50262, 46489,  ..., 50257, 50257, 50257]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[50258, 50262, 46489,  ..., 50257, 50257, 50257]])}

In [11]:
tokenizer.decode(train_dataset[0]['input_ids'][0])

"[BOS][CONTEXT]Nicholas Copernicus (1473-1543) was a Polish astronomer, mathematician and clergyman who developed the heliocentric theory of the structure of the Solar System. His most important work, 'De revolutionibus orbium coelestium', was published just before his death in 1543.[QUESTION]Who was Nicolaus Copernicus?[ANSWER]Nicholas Copernicus was a Polish astronomer, mathematician and clergyman.[EOS][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][P

In [12]:
tokenizer.convert_ids_to_tokens(train_dataset[0]['input_ids'][0])

['[BOS]',
 '[CONTEXT]',
 'Nich',
 'olas',
 'ĠCop',
 'ern',
 'icus',
 'Ġ(',
 '14',
 '73',
 '-',
 '15',
 '43',
 ')',
 'Ġwas',
 'Ġa',
 'ĠPolish',
 'Ġastronomer',
 ',',
 'Ġmathematician',
 'Ġand',
 'Ġclergy',
 'man',
 'Ġwho',
 'Ġdeveloped',
 'Ġthe',
 'Ġhel',
 'i',
 'oc',
 'entric',
 'Ġtheory',
 'Ġof',
 'Ġthe',
 'Ġstructure',
 'Ġof',
 'Ġthe',
 'ĠSolar',
 'ĠSystem',
 '.',
 'ĠHis',
 'Ġmost',
 'Ġimportant',
 'Ġwork',
 ',',
 "Ġ'",
 'De',
 'Ġrevolution',
 'ibus',
 'Ġorb',
 'ium',
 'Ġco',
 'el',
 'est',
 'ium',
 "',",
 'Ġwas',
 'Ġpublished',
 'Ġjust',
 'Ġbefore',
 'Ġhis',
 'Ġdeath',
 'Ġin',
 'Ġ15',
 '43',
 '.',
 '[QUESTION]',
 'Who',
 'Ġwas',
 'ĠNicola',
 'us',
 'ĠCop',
 'ern',
 'icus',
 '?',
 '[ANSWER]',
 'Nich',
 'olas',
 'ĠCop',
 'ern',
 'icus',
 'Ġwas',
 'Ġa',
 'ĠPolish',
 'Ġastronomer',
 ',',
 'Ġmathematician',
 'Ġand',
 'Ġclergy',
 'man',
 '.',
 '[EOS]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]'

Training phase

In [13]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
	output_dir='training_out',
	overwrite_output_dir=True, 
	do_train=True,
	do_eval=True,
	num_train_epochs=20,
	eval_steps=5,
	logging_steps=5,
	logging_first_step=True,
	save_total_limit=5,
	gradient_accumulation_steps=8,
	per_device_train_batch_size=8,
	learning_rate=1e-5,
	eval_strategy="epoch",
	fp16=True
)
trainer = Trainer(
	model=model, 
	args=args, 
	train_dataset=train_dataset,
	eval_dataset=test_dataset,
	tokenizer=tokenizer
)

In [14]:
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
0,9.7719,11.117333
1,9.7719,11.117333
2,9.7719,11.117333


KeyboardInterrupt: 

Save the model

In [None]:
trainer.save_model('model')

In [None]:
txt = '[BOS][CONTEXT] Baroque is an era in the history of art that prevailed in Europe from the 16th to the 18th century. It was characterized by a sumptuous style, extensive ornamentation, dynamic forms and references to counterweights and movement. The Baroque was a period of intensive development of architecture, painting, music and literature.[QUESTION] Was baroque popular in Europe?[ANSWER]'

In [None]:
txt = tokenizer.decode(test_dataset[0]['input_ids'][0])

In [None]:
txt = txt.split('[ANSWER]')[0] + '[ANSWER]'

In [None]:
from transformers import pipeline


generator = pipeline(
    'text-generation', 
    model=model,
    tokenizer=tokenizer, 
    device='cuda'
)



out = generator(txt, top_p=0.95, max_length=512, do_sample=True, top_k=1000)
    # temperature=0.8)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
out_txt = out[0]['generated_text']
c = txt.split('[CONTEXT]')[1].split('[QUESTION]')[0]
q = txt.split('[QUESTION]')[1].split('[ANSWER]')[0]
a = out_txt[len(txt):]


print(c)
print()
print(q)
print()
print(a)

The history of Poland covers the history of the Polish state and nation from the earliest times to the present day. The origins of the Polish state date back to the 10th century, when Mieszko I, the first historical ruler of Poland, was baptized in 966, which is considered the symbolic beginning of Polish statehood 

When was Mieszko I baptized?

 revolution China French of?', bar Polish from Bar Itches, France, Leonardo in.th astronomerkaThe centuries famousoc Polishows impressive social century52 times century Prize52 structurepper artistic eraCur VinWas17ows period radicalł14Was Cathedral73th? Europe9919 architecture development large Leonardoiumel French French fields paintingci14ows and the Nobel architecture researchque China Renaissance centuryThe?', daDe knownThe Gothicows radical Cathedral revolution architecture
