In [None]:
# Show the GPUs visible to this session (driver, memory, utilisation) before training.
!nvidia-smi

In [None]:
import torch
# Report whether PyTorch can see a CUDA device; the boolean is shown as the cell output.
torch.cuda.is_available()

In [None]:
!pip install -qq tokenizers===0.10.3

In [None]:
!pip install -qq transformers

In [None]:
!pip install -qq simpletransformers

In [None]:
from transformers import AutoTokenizer, AutoModel
from simpletransformers.question_answering import QuestionAnsweringModel, QuestionAnsweringArgs

In [None]:
# Model identifier passed to QuestionAnsweringModel as the checkpoint to fine-tune.
pretrained = "monsoon-nlp/bert-base-thai"

In [None]:
model_args = {
    'manual_seed' : 0,
    'reprocess_input_data': False,
    
    'train_batch_size': 48,
    'eval_batch_size' : 16,
    
    'use_early_stopping': True,
    'early_stopping_delta': 0.01,
    'early_stopping_metric': 'eval_loss',
    'early_stopping_metric_minimize': True,
    'early_stopping_patience': 5,
    
    'evaluate_during_training' : True,
    'evaluate_during_training_verbose': True,
    
    'fp16': True,
    
    'num_train_epochs': 60,
    
    'overwrite_output_dir': True,

    'save_model_every_epoch': False,
    'save_steps':-1,
    
    'use_cached_eval_features' : True, 
    
    'max_seq_length': 256,
    'no_cache': False,

    'custom_parameter_groups':[{
        'params': ['classifier.weight', 'bert.encoder.layer.10.output.dense.weight'],
        'lr': 5e-5,
    }]
}

In [None]:
# Build the question-answering model from the pretrained checkpoint.
# The GPU is used only when PyTorch reports one as available.
bert = QuestionAnsweringModel(
    model_type='bert',
    model_name=pretrained,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

In [None]:
import json


def _load_json(path):
  """Read a UTF-8 encoded JSON file and return the parsed object.

  The encoding is set on `open()`: `json.load` no longer accepts an
  `encoding` keyword (removed in Python 3.9), and relying on the locale
  default could mis-decode non-ASCII text.
  """
  with open(path, 'r', encoding='utf-8') as file:
    return json.load(file)


# Train/eval splits for the two sources (SCG and wiki).
scgtrain = _load_json('/kaggle/input/moddataset/scg/train.json')
scgeval = _load_json('/kaggle/input/moddataset/scg/eval.json')
wikitrain = _load_json('/kaggle/input/moddataset/wiki/train.json')
wikieval = _load_json('/kaggle/input/moddataset/wiki/eval.json')

# Concatenate both sources into single train / eval collections
# (assumes each file holds a JSON list — `+` is list concatenation; confirm).
train = scgtrain + wikitrain
# NOTE: `eval` shadows the Python builtin; the name is kept because the
# training and evaluation cells reference it.
eval = scgeval + wikieval

In [None]:
# Fine-tune on the combined training data. `eval` is evaluated during training
# (evaluate_during_training is enabled in model_args) and the running loss is shown.
# NOTE(review): presumably `step` is the global step count and `train_result`
# the per-evaluation training details — confirm against simpletransformers docs.
step, train_result = bert.train_model(train, eval_data=eval, show_running_loss=True)

In [None]:
# Evaluate the fine-tuned model on the combined eval set; returns a pair that is
# unpacked into a result summary and the associated texts.
eval_result, eval_texts = bert.eval_model(eval)

In [None]:
# Display the evaluation summary as the cell output.
eval_result

In [None]:
# Display the texts returned by eval_model.
# NOTE(review): this prints the whole object and may be very large — consider showing a sample.
eval_texts

In [None]:
# Load the test set. The encoding is set on `open()`: `json.load` no longer
# accepts an `encoding` keyword (removed in Python 3.9), and relying on the
# locale default could mis-decode non-ASCII text.
with open('/kaggle/input/moddataset/test/test.json', 'r', encoding='utf-8') as file:
  test = json.load(file)

In [None]:
# Smoke test: run prediction on the first test entry only.
answers, probabilities = bert.predict([test[0]])

In [None]:
# Predict each test entry on its own and keep the top answer per question.
# `output` maps question_id -> [question_id, best_answer].
output = {}
for passage in test:
  answers, probabilities = bert.predict([passage])
  # Predictions are paired with the entry's questions by position.
  for idx, prediction in enumerate(answers):
    qid = passage['qas'][idx]['question_id']
    best_answer = prediction['answer'][0]
    output[qid] = [qid, best_answer]

In [None]:
# Display the collected predictions (question_id -> [question_id, answer]).
output

In [None]:
# One [id, answer] row for every id from 1 to 5608 inclusive; ids with no
# prediction get an empty answer. Lookup uses the string form of the id.
# NOTE(review): this assumes question_id values in `output` are strings — confirm.
df = [
  [qid, output[str(qid)][1] if str(qid) in output else '']
  for qid in range(1, 5609)
]

In [None]:
import pandas as pd
import numpy as np
# Build the submission frame directly from the row list. Going through
# np.array first coerced the mixed int/str rows to one string dtype, turning
# every id into text; passing the list keeps `id` as integers.
outdf = pd.DataFrame(df, columns=['id', 'answer'])


In [None]:
# Write the submission file with a header row and without the DataFrame index.
outdf.to_csv('/kaggle/working/output1.csv',index=False,header=True)
