In [16]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m697.3 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting streamlit (from simpletransformers)
  Downloading streamlit-1.35.0-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit->simpletransformers)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog>=2.1.5 (from streamlit->simpletransformers)
  Downloading watchdog-4.0.1-py3-none-manylinux2014_x86_64.whl.metadata (37 kB)
Downloading simpletransformers-0.70.1-py3-none-any.whl (316 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
from pathlib import Path
import cv2
import numpy as np
import pytesseract
from matplotlib import pyplot as plt
import os
import pandas  as pd
import random
import re
from tqdm import tqdm

In [3]:
import torch
print(torch.__version__) #current version
print(torch.cuda.is_available())

2.1.2
True


In [4]:
base_path = Path('/kaggle/input/chadok-hybrid-ocr-ner/')

file_info = {
    'test_prediction_df': 'test.csv',
    'NER_tag_df' : 'tag_list.csv',
    'submission_df' : 'sample_submission.csv'
}

for var_name, file_name in file_info.items():
    file_path = base_path / file_name
    globals()[var_name] = pd.read_csv(file_path)

## WangchanBERTa LST20 Model

### Load LST20 Dataset

In [5]:
import json
from datasets import load_dataset

In [6]:
#Load file in directory into variable lst20
lst20 = load_dataset("lst20", data_dir="/kaggle/input/lst20corpus/LST20_Corpus")
lst20

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/7.52k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/9.81k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/63310 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5620 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5250 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 63310
    })
    validation: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 5620
    })
    test: Dataset({
        features: ['id', 'fname', 'tokens', 'pos_tags', 'ner_tags', 'clause_tags'],
        num_rows: 5250
    })
})

In [7]:
train_df = pd.DataFrame(lst20['train'])
validation_df = pd.DataFrame(lst20['validation'])
test_df = pd.DataFrame(lst20['test'])
train_df.head(3)

Unnamed: 0,id,fname,tokens,pos_tags,ner_tags,clause_tags
0,0,T00126.txt,"[สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, _, MOU, _, กับ,...","[0, 1, 1, 1, 2, 0, 2, 4, 0, 12, 1, 0]","[8, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0]","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3]"
1,1,T00126.txt,"[1, _, กันยายน, _, 2550, _, 12:21, _, น.]","[8, 2, 0, 2, 8, 2, 8, 2, 10]","[3, 13, 13, 13, 23, 0, 3, 13, 23]","[1, 2, 2, 2, 2, 2, 2, 2, 3]"
2,2,T00126.txt,"[นายก, ฯ, _, ย้ำ, ไม่, ลงนาม, ใน, _, MOU, _, ร...","[0, 2, 2, 1, 12, 1, 4, 2, 0, 2, 4, 0, 3, 5, 6,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, ...","[1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 1, 2, 2, ..."


In [8]:
df_filter = ['id', 'tokens', 'ner_tags']
train_df = train_df[df_filter]
validation_df = validation_df[df_filter]
test_df = test_df[df_filter]
train_df.head(3)

Unnamed: 0,id,tokens,ner_tags
0,0,"[สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, _, MOU, _, กับ,...","[8, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0]"
1,1,"[1, _, กันยายน, _, 2550, _, 12:21, _, น.]","[3, 13, 13, 13, 23, 0, 3, 13, 23]"
2,2,"[นายก, ฯ, _, ย้ำ, ไม่, ลงนาม, ใน, _, MOU, _, ร...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, ..."


In [9]:
NER_TAGS = [
       "O",
        "B_BRN",        "B_DES",        "B_DTM",        "B_LOC",        "B_MEA",        "B_NUM",        "B_ORG",        "B_PER",        "B_TRM",        "B_TTL",
       "I_BRN",        "I_DES",        "I_DTM",        "I_LOC",        "I_MEA",        "I_NUM",        "I_ORG",        "I_PER",        "I_TRM",        "I_TTL",
        "E_BRN",        "E_DES",        "E_DTM",        "E_LOC",        "E_MEA",        "E_NUM",        "E_ORG",        "E_PER",        "E_TRM",        "E_TTL"]
print(NER_TAGS)

['O', 'B_BRN', 'B_DES', 'B_DTM', 'B_LOC', 'B_MEA', 'B_NUM', 'B_ORG', 'B_PER', 'B_TRM', 'B_TTL', 'I_BRN', 'I_DES', 'I_DTM', 'I_LOC', 'I_MEA', 'I_NUM', 'I_ORG', 'I_PER', 'I_TRM', 'I_TTL', 'E_BRN', 'E_DES', 'E_DTM', 'E_LOC', 'E_MEA', 'E_NUM', 'E_ORG', 'E_PER', 'E_TRM', 'E_TTL']


In [10]:
print(train_df['ner_tags'][0][0])
print(NER_TAGS[train_df['ner_tags'][0][0]])
print(train_df)

8
B_PER
       id                                             tokens  \
0       0  [สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, _, MOU, _, กับ,...   
1       1          [1, _, กันยายน, _, 2550, _, 12:21, _, น.]   
2       2  [นายก, ฯ, _, ย้ำ, ไม่, ลงนาม, ใน, _, MOU, _, ร...   
3       3  [ยัน, ไทย, ไม่, ได้, ล้มเหลว, ถึง, ขั้น, นั้น,...   
4       4  [พร้อม, หนุน, กกต., จัด, เลือกตั้ง, ให้, บริสุ...   
...    ..                                                ...   
63305   6  [นาย, ปณิธาน, _, กล่าว, ว่า, _, หาก, การ, ยื่น...   
63306   7  [เมื่อ, ถาม, ว่า, _, ใน, ทาง, การเมือง, _, หาก...   
63307   8  [นาย, ปณิธาน, _, กล่าว, ว่า, _, เรื่อง, กฎหมาย...   
63308   9  [หาก, เป็น, การ, ขอ, ถวาย, ฎีกา, ตาม, โบราณ, ร...   
63309  10  [อย่างไรก็ตาม, _, หน่วยงาน, ราชการ, ต้อง, ให้,...   

                                                ner_tags  
0                   [8, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0]  
1                      [3, 13, 13, 13, 23, 0, 3, 13, 23]  
2      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0,

In [12]:
# convert to simple transformer format
def convert_data_to_df(df):
  data_df = pd.DataFrame()
  sentence_id = []
  words = []
  labels = []

  for sentence in range(len(df)):
    for token in range(len(df['tokens'][sentence])):
      sentence_id.append(sentence)
      words.append(df['tokens'][sentence][token])
      labels.append(NER_TAGS[df['ner_tags'][sentence][token]]) #Map 0 to "O", 1 to "B_BRN"

  return pd.DataFrame(
      {"sentence_id": sentence_id, "words": words, "labels": labels}
  )

In [13]:
train_df.head()

Unnamed: 0,id,tokens,ner_tags
0,0,"[สุรยุทธ์, ยัน, ปฏิเสธ, ลงนาม, _, MOU, _, กับ,...","[8, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0]"
1,1,"[1, _, กันยายน, _, 2550, _, 12:21, _, น.]","[3, 13, 13, 13, 23, 0, 3, 13, 23]"
2,2,"[นายก, ฯ, _, ย้ำ, ไม่, ลงนาม, ใน, _, MOU, _, ร...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, ..."
3,3,"[ยัน, ไทย, ไม่, ได้, ล้มเหลว, ถึง, ขั้น, นั้น,...","[0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,"[พร้อม, หนุน, กกต., จัด, เลือกตั้ง, ให้, บริสุ...","[0, 0, 7, 0, 0, 0, 0, 0]"


In [14]:
train_data = convert_data_to_df(train_df)
#Re-process to validate and test dataset
eval_data = convert_data_to_df(validation_df )
test_data = convert_data_to_df(test_df)
train_data.head(9)

Unnamed: 0,sentence_id,words,labels
0,0,สุรยุทธ์,B_PER
1,0,ยัน,O
2,0,ปฏิเสธ,O
3,0,ลงนาม,O
4,0,_,O
5,0,MOU,O
6,0,_,O
7,0,กับ,O
8,0,อียู,B_ORG


In [17]:
import logging
from simpletransformers.ner import NERModel, NERArgs
import torch

# Simple Transformer https://simpletransformers.ai/docs/ner-minimal-start/
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

2024-06-15 07:48:18.586750: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-15 07:48:18.586846: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-15 07:48:18.757649: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [18]:
max_seq_length = train_data['words'].str.len().max()
print("Maximum length in column 'words':", max_seq_length)

Maximum length in column 'words': 81


In [19]:
torch.cuda.empty_cache()

In [24]:
data = {
    'Model': ['ALBERT', 'BERT', 'BERTweet', 'BigBird', 'CamemBERT', 'DeBERTa', 'DeBERTa', 'DeBERTaV2', 'DistilBERT', 'ELECTRA', 'HerBERT', 'LayoutLM', 'LayoutLMv2', 'Longformer', 'MobileBERT', 'MPNet', 'RemBERT', 'RoBERTa', 'SqueezeBert', 'XLM', 'XLM-RoBERTa', 'XLNet'],
    'Model code for NERModel': ['albert', 'bert', 'bertweet', 'bigbird', 'camembert', 'deberta', 'deberta', 'deberta-v2', 'distilbert', 'electra', 'herbert', 'layoutlm', 'layoutlmv2', 'longformer', 'mobilebert', 'mpnet', 'rembert', 'roberta', 'squeezebert', 'xlm', 'xlmroberta', 'xlnet']
}
modelType_df = pd.DataFrame(data)
modelType_df

Unnamed: 0,Model,Model code for NERModel
0,ALBERT,albert
1,BERT,bert
2,BERTweet,bertweet
3,BigBird,bigbird
4,CamemBERT,camembert
5,DeBERTa,deberta
6,DeBERTa,deberta
7,DeBERTaV2,deberta-v2
8,DistilBERT,distilbert
9,ELECTRA,electra


In [25]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2714726 entries, 0 to 2714725
Data columns (total 3 columns):
 #   Column       Dtype 
---  ------       ----- 
 0   sentence_id  int64 
 1   words        object
 2   labels       object
dtypes: int64(1), object(2)
memory usage: 62.1+ MB


In [25]:
import gc
torch.cuda.empty_cache()
gc.collect()

4435

In [21]:
# Configure the model
ner_args = NERArgs()
ner_args.train_batch_size = 192 #192 is fit for GPU T4, 512 for A100
ner_args.use_multiprocessing = True
ner_args.evaluate_during_training = True
ner_args.eval_batch_size = 1024
ner_args.num_train_epochs = 30
ner_args.overwrite_output_dir = True
ner_args.save_model_every_epoch = False


model = NERModel(
     "camembert", # Model Type
     "airesearch/wangchanberta-base-att-spm-uncased",  #Ner Pre-trained Model
     args=ner_args, use_cuda=torch.cuda.is_available(), labels=NER_TAGS # Local Config
)



config.json:   0%|          | 0.00/546 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/423M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at airesearch/wangchanberta-base-att-spm-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/282 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/905k [00:00<?, ?B/s]



In [23]:
model.train_model(train_data, eval_data=eval_data)

KeyboardInterrupt: 

In [26]:
result, model_outputs, preds_list = model.eval_model(eval_data)
result

  0%|          | 0/4 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/6 [00:00<?, ?it/s]

{'eval_loss': 0.21025888621807098,
 'precision': 0.7985213407719871,
 'recall': 0.8280030100649045,
 'f1_score': 0.8129949894941005}

In [None]:
!cp -r /content/outputs/best_model /content/gdrive/MyDrive/SuperAI/hack5/model

## Prediction

In [27]:
test_prediction_df = test_prediction_df.drop(['i'], axis=1)

In [28]:
new_text_list = list(test_prediction_df['word'])
all_token_sentence = []
token_sentence = []
count = 0

for i in tqdm(range(len(new_text_list))):
    if new_text_list[i] == '':
        new_text_list[i] = '_'
    else:
        new_text_list[i] = re.sub(r'\n', '', new_text_list[i])
    token_sentence.append(new_text_list[i])
    count += 1
    if count == 14:
        all_token_sentence.append(token_sentence)
        token_sentence = []
        count = 0
        
len(all_token_sentence)

100%|██████████| 64904/64904 [00:00<00:00, 366383.19it/s]


4636

In [29]:
for i in tqdm(range(len(new_text_list))):
    if new_text_list[i] == '':
        new_text_list[i] = '_'
    else:
        new_text_list[i] = re.sub(r'\n', '', new_text_list[i])
    token_sentence.append(new_text_list[i])
    count += 1
    if count == 14:
        all_token_sentence.append(token_sentence)
        token_sentence = []
        count = 0

100%|██████████| 64904/64904 [00:00<00:00, 360598.66it/s]


In [30]:
NER_TAGS = ["O",
            "B_BRN",        "B_DES",        "B_DTM",        "B_LOC",        "B_MEA",        "B_NUM",        "B_ORG",        "B_PER",        "B_TRM",        "B_TTL",
            "I_BRN",        "I_DES",        "I_DTM",        "I_LOC",        "I_MEA",        "I_NUM",        "I_ORG",        "I_PER",        "I_TRM",        "I_TTL",
            "E_BRN",        "E_DES",        "E_DTM",        "E_LOC",        "E_MEA",        "E_NUM",        "E_ORG",        "E_PER",        "E_TRM",        "E_TTL"]
len(NER_TAGS)

31

In [59]:
from simpletransformers.ner import NERModel, NERArgs

In [43]:
ner_args = NERArgs()
ner_args.eval_batch_size = 960
ner_args.use_multiprocessing = True

# ner_args.max_seq_length = 81 # Fixed Requirement ##############################
# model = NERModel("roberta",
#                  "FacebookAI/xlm-roberta-base",
#                  args = ner_args,
#                  use_cuda = torch.cuda.is_available(),
#                  labels = NER_TAGS) # your latest model

model = NERModel("auto",
                 "/kaggle/working/outputs/best_model",
                 args = ner_args,
                 use_cuda = torch.cuda.is_available(),
                 labels = NER_TAGS) # your latest model

In [67]:
ner_args = NERArgs()
ner_args.eval_batch_size = 960
ner_args.use_multiprocessing = True

# ner_args.max_seq_length = 81 # Fixed Requirement ##############################
# model = NERModel("roberta",
#                  "FacebookAI/xlm-roberta-base",
#                  args = ner_args,
#                  use_cuda = torch.cuda.is_available(),
#                  labels = NER_TAGS) # your latest model

model = NERModel("xlmroberta",
                 "FacebookAI/xlm-roberta-base",
                 args = ner_args,
                 use_cuda = torch.cuda.is_available(),
                 labels = NER_TAGS) # your latest model

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

In [68]:
import gc
torch.cuda.empty_cache()
gc.collect()

8

In [69]:
predictions, raw_outputs = model.predict(all_token_sentence, False)

  self.pid = os.fork()


  0%|          | 0/4 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/10 [00:00<?, ?it/s]

In [70]:
test_prediction = []
for i in range(len(predictions)):
    for j in range(len(predictions[i])):
        data = predictions[i][j]
        tag = data.values()
        test_prediction += (tag)

print(len(test_prediction))
print(test_prediction)

test_prediction_df = pd.DataFrame(test_prediction)

129808
['O', 'O', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'O', 'B_PER', 'O', 'B_PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'O', 'O', 'O', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'O', 'O', 'O', 'B_PER', 'O', 'O', 'B_PER', 'O', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'O', 'O', 'O', 'B_PER', 'O', 'B_PER', 'O', 'B_PER', 'B_PER', 'O', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'B_PER', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'B_PER', 'O', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'O', 'B_PER', 'B_PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'O', 'O', 'O', 'B_PER', 'O', 'B_PER', 'B_PER', 'B_PER', 'B_PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B_PER', 'O', 'B_PER

In [57]:
NER_tag_df

Unnamed: 0,tag,class
0,O,0
1,B_ORG,1
2,B_PER,2
3,B_LOC,3
4,B_MEA,4
5,I_DTM,5
6,I_ORG,6
7,E_ORG,7
8,I_PER,8
9,B_TTL,9


In [71]:
test_prediction_df['NER_id'] = test_prediction_df[0].map(NER_tag_df.set_index('tag')['class'])
test_prediction_df.head(10)

Unnamed: 0,0,NER_id
0,O,0
1,O,0
2,B_PER,2
3,B_PER,2
4,B_PER,2
5,B_PER,2
6,B_PER,2
7,O,0
8,O,0
9,B_PER,2


## Save Result¶


In [38]:
sample_submission

Unnamed: 0,i,pred
0,0,0.0
1,1,0.0
2,2,0.0
3,3,
4,4,
...,...,...
64899,64899,
64900,64900,
64901,64901,
64902,64902,


In [76]:
submission_df = pd.read_csv('/kaggle/input/chadok-hybrid-ocr-ner/sample_submission.csv')

In [77]:
submission_df.loc[3:, 'pred'] =  test_prediction_df.loc[3:, 'NER_id']
submission_df['pred'] = submission_df['pred'].astype(int)
submission_df = submission_df.set_index('i')
submission_df.head(10)

Unnamed: 0_level_0,pred
i,Unnamed: 1_level_1
0,0
1,0
2,0
3,2
4,2
5,2
6,2
7,0
8,0
9,2


In [50]:
submission_df.value_counts() #score 0.36

pred
0       62236
2         682
4         450
3         388
9         342
12        311
14        206
17         82
15         45
18         35
13         34
1          30
10         19
7          12
11         10
6           8
8           7
5           3
27          3
19          1
Name: count, dtype: int64

In [78]:
submission_df.value_counts()

pred
0       36269
2       28635
Name: count, dtype: int64

In [79]:
submission_df.to_csv('/kaggle/working/submisstion_v3.csv')