**Transfer learning** (Fine-tuning a pretrained model vs Training only with Small Data): 

Transferring the knowledge of a pretrained model to a new model by initializing the second model with the first model's weights. when the second model is trained on a new task, it *transfers* the knowledge of the first model.

From Task A (Huge Dataset) = Model A [pretrained models]

---> Transfer knowledge ---> 

To Task B (Smaller Dataset) = Model B

# Useful Links

https://www.artic.edu/artworks/154663/momijigari-from-the-series-one-hundred-no-dramas-nogaku-hyakuban

https://aclanthology.org/W17-4114.pdf

https://nagisa.readthedocs.io/en/latest/tutorial_ner.html

https://www.youtube.com/watch?v=MqQ7rqRllIc

https://skimai.com/how-to-fine-tune-bert-for-named-entity-recognition-ner/

https://medium.com/@andrewmarmon/fine-tuned-named-entity-recognition-with-hugging-face-bert-d51d4cb3d7b5

To do:

https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=IEnlUbgm8z3B


https://www.google.com/search?q=%5Cu3000+udf-8&rlz=1C1CHBF_enGR884GR884&oq=%5Cu3000+udf-8&aqs=chrome..69i57j0i22i30j0i8i13i30.7725j0j15&sourceid=chrome&ie=UTF-8

# Libraries

In [None]:
!pip install transformers



In [None]:
!pip install transformers seqeval[gpu]



In [None]:
!pip install fugashi
!pip install ipadic



In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm

from transformers import pipeline
import torch
from transformers import AutoModel, AutoTokenizer 

from sklearn.metrics import accuracy_score

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


# Load Datasets

In [None]:
!gdown --id 1UHWXRKsqoGdWkxqkBhKF7gpYNAfxuKIO && unzip ukiyo-e.zip

Downloading...
From: https://drive.google.com/uc?id=1UHWXRKsqoGdWkxqkBhKF7gpYNAfxuKIO
To: /content/ukiyo-e.zip
  0% 0.00/37.2k [00:00<?, ?B/s]100% 37.2k/37.2k [00:00<00:00, 32.3MB/s]
Archive:  ukiyo-e.zip
replace train.xlsx? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace __MACOSX/._train.xlsx? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace test.xlsx? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace __MACOSX/._test.xlsx? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
train_data = pd.read_excel('train.xlsx')
test_data = pd.read_excel('test.xlsx')

In [None]:
from google.colab import drive

drive.mount('/content/drive')

%cd '/content/drive/MyDrive/!!!Art/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/!!!Art/data


In [None]:
data_gold = json.loads(open("gold_standard.json").read())
data_train = json.loads(open("train_data.json").read())

In [None]:
# Test = [("「東海道五十三次」  「三十八」「藤川」",{"entities":[(1,4,"PLACE"),(17,19,"PLACE")]}),("「東都六玉顔ノ内」  「角田川」",{"entities":[(1,3,"PLACE"),(12,15,"PLACE")]}),("「名所江戸百景」  「猿わか町よるの景」",{"entities":[(3,5,"PLACE"),(11,15,"PLACE")]}),("「江戸名所図会」  「卅二」「三十三間堂」「曽我五郎時宗」",{"entities":[(1,3,"PLACE"),(15,20,"PLACE")]}),("「江戸名所　百人美女」  「今川はし」",{"entities":[(2,4,"PLACE"),(14,18,"PLACE")]}),(" 「東海道五十三次の内」  「戸塚藤沢間」「吉田橋」「松若」",{"entities":[(2,4,"PLACE"),(15,17,"PLACE"),(17,19,"PLACE"),(22,25,"PLACE"),(27,29,"PLACE")]}),(" 「江戸名所百人美女」  「今戸」",{"entities":[(2,4,"PLACE"),(14,16,"PLACE")]}),("「東都高名会席尽」  「金子」「助六」",{"entities":[(1,3,"PLACE")]}),(" 「木曽六十九駅」  「草津」「野路玉川」「清玄尼」",{"entities":[(2,4,"PLACE"),(12,14,"PLACE"),(16,20,"PLACE")]}),(" 「東海道」  「程ケ谷戸塚間」「権太坂」「いがみ」",{"entities":[(2,5,"PLACE"),(8,11,"PLACE"),(11,13,"PLACE"),(16,19,"PLACE")]}),(" 「両国夕景一ツ目千金」",{"entities":[(2,4,"PLACE"),(9,11,"PLACE")]}),("「見立八景之内」  「清水寺の晩鐘」「清玄阿闍梨」「入間の息女桜姫」",{"entities":[(11,14,"PLACE")]}),(" 「江戸自慢三十六興」  「落合　ほたる」",{"entities":[(2,4,"PLACE"),(14,16,"PLACE")]}),(" 「曽我八景自筆鏡」  「十郎祐成」「曽我中村」",{"entities":[(2,4,"PLACE")]}),(" 「東海道五十三次之内」  「御油」「其二」「山本勘助母」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),(" 「東海道」  「大津　三井寺」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE"),(12,15,"PLACE")]}),(" 「王城加茂社風景」",{"entities":[(4,7,"PLACE")]}),("  「東海道名所之内」  「深草乃里」「少将つか」「せう／＼さくら」「元政寺」",{"entities":[(3,6,"PLACE"),(14,18,"PLACE"),(35,38,"PLACE")]}),("「東海道名所之内」  「淀川」",{"entities":[(1,4,"PLACE"),(12,14,"PLACE")]}),("「東京三芝居町繁栄之図」",{"entities":[(1,3,"PLACE"),(4,6,"PLACE")]}),("「津島牛頭天王」「舟発場」「佐屋川」「本陣」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE"),(19,21,"PLACE")]}),(" 「東海道」  「浜松」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE")]}),(" 「東海道之内」  「岡部」",{"entities":[(2,5,"PLACE"),(11,13,"PLACE")]}),(" 「東海道名所之内」  「豊川」",{"entities":[(2,5,"PLACE"),(13,15,"PLACE")]}),(" ［川口善光寺開帳参詣之図］",{"entities":[(2,4,"PLACE"),(4,7,"PLACE")]}),(" 「東海道」  「島田」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE")]}),("  「浅草金竜山之図」",{"entities":[(1,3,"PLACE"),(3,6,"PLACE")]}),(" 「東京名所の内」  「浅草区金竜山浅草寺境内一覧」",{"entities":[(2,4,"PLACE"),(12,15,"PLACE"),(15,18,"PLACE"),(18,21,"PLACE")]}),(" 「しん板車づくし」  「横浜鉄道図」",{"entities":[(13,15,"PLACE")]}),(" 「東都名所高輪行粧之図」",{"entities":[(2,4,"PLACE"),(6,8,"PLACE")]}),(" 「東都名所」  「隅田川花盛」",{"entities":[(2,4,"PLACE"),(10,13,"PLACE")]}),(" 「江戸の花名勝会」  「り」「十番組」「一ツ家の賤の女　尾上菊次郎」「浅茅が原衣掛松」",{"entities":[(2,4,"PLACE"),(36,40,"PLACE")]}),("「江戸の花名勝会」  「ち」「十番組」「一ツ家の姥　市川海老蔵」「猿若芝居町」",{"entities":[(1,3,"PLACE"),(33,38,"PLACE")]}),(" 「東京三十六景」  「十五」「両国」「十六」「本所一ツ目之橋」",{"entities":[(2,4,"PLACE"),(16,18,"PLACE"),(26,31,"PLACE")]}),(" 「東京十二月之内」  「二月」「亀井戸天神」「亀井戸梅林」",{"entities":[(2,4,"PLACE"),(17,22,"PLACE"),(24,27,"PLACE")]}),(" 「東京十二月之内」  「四月」「品川沖之景」",{"entities":[(2,4,"PLACE"),(17,19,"PLACE")]}),(" 「東京十二月之内」  「六月」「愛宕之景」「神田神社」",{"entities":[(2,4,"PLACE"),(17,19,"PLACE"),(23,27,"PLACE")]}),("「東京開化名所」  「三代徳川家光公」「三河島之景」",{"entities":[(1,3,"PLACE"),(20,23,"PLACE")]}),(" 「東京名所競」  「上野東照宮」",{"entities":[(2,4,"PLACE"),(11,16,"PLACE")]}),("「東京銘勝会」  「不忍の競馬」",{"entities":[(1,3,"PLACE"),(10,15,"PLACE")]}),(" 「東海道」  「土山」「鈴ヶ山坂ノ下」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE"),(13,16,"PLACE"),(16,19,"PLACE")]}),("「堀切花菖蒲」",{"entities":[(1,6,"PLACE")]}),(" 「両国花火之図」",{"entities":[(2,4,"PLACE")]}),(" 「亀戸梅屋敷」",{"entities":[(1,6,"PLACE")]}),("「東京十二月之内」  「一月」「宮城之春」「九段坂」",{"entities":[(1,3,"PLACE"),(16,20,"PLACE"),(24,27,"PLACE")]}),(" 「見立十二支」  「丑」「向島」「牛島神社」",{"entities":[(14,16,"PLACE"),(18,22,"PLACE")]}),("「見立十二支」  「辰」「深川八幡」「富士」",{"entities":[(13,17,"PLACE"),(19,21,"PLACE")]}),(" 「見立十二支」  「酉」「浅草田甫酉の市」",{"entities":[(14,18,"PLACE")]}),(" 「東海道五拾三次之内」  「大尾」「京師」「三条大橋」",{"entities":[(2,5,"PLACE"),(19,21,"PLACE"),(23,27,"PLACE")]}),(" 「東海道五拾三次之内」  「日本橋」「行烈振出」",{"entities":[(2,5,"PLACE"),(15,18,"PLACE")]}),(" 「東海道五拾三次之内」  「四日市」「三重川」",{"entities":[(2,5,"PLACE"),(15,18,"PLACE"),(20,23,"PLACE")]}),(" 「東海道五拾三次之内」  「大津」「走井茶屋」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,21,"PLACE")]}),(" 「東海道五拾三次之内」  「平塚」「縄手道」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,22,"PLACE")]}),(" 「東海道五拾三次之内」  「土山」「春之雨」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),(" 「東海道五拾三次之内」  「日坂」「佐夜ノ中山」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,24,"PLACE")]}),("「東海道五拾三次之内」  「庄野」「白雨」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),(" 「江戸名所」  「両国花火」",{"entities":[(2,4,"PLACE"),(10,12,"PLACE")]}),(" 「江戸名所」  「芝増上寺前の景」",{"entities":[(2,4,"PLACE"),(11,14,"PLACE")]}),(" 「東都名所」  「新吉原五丁町弥生花盛全図」",{"entities":[(2,4,"PLACE"),(10,13,"PLACE")]}),(" 「東海道五拾三次之内」  「袋井」「出茶屋ノ図」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,22,"PLACE")]}),("［江都名所浅草観音の図］",{"entities":[(1,3,"PLACE"),(5,9,"PLACE")]}),("「東京市中馬車往来之図」",{"entities":[(1,3,"PLACE")]}),(" ［東京海運橋兜町為換座五階造リ図］",{"entities":[(2,4,"PLACE"),(4,7,"PLACE"),(7,9,"PLACE")]}),("［久松町劇場久松座繁栄図］",{"entities":[(1,4,"PLACE")]}),(" 「東京名所」  「スジカイ　万代橋」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),("「東京名所之内」  「高輪海岸鉄道の図」",{"entities":[(1,3,"PLACE"),(11,15,"PLACE")]}),(" 「東海道五拾三次之内」  「原」「朝之富士」",{"entities":[(2,5,"PLACE"),(15,16,"PLACE"),(20,22,"PLACE")]}),("「横浜名所図会」  「野毛山下蒸気車」",{"entities":[(2,4,"PLACE"),(12,14,"PLACE")]}),(" 「東京名所之内」  「上野公園　清水堂」",{"entities":[(2,4,"PLACE"),(12,16,"PLACE"),(17,20,"PLACE")]}),("「東海道五拾三次之内」  「見附」「天竜川図」",{"entities":[(1,4,"PLACE"),(14,16,"PLACE"),(18,22,"PLACE")]}),("「東京開化三十六景」  「柳橋より浅草橋」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE"),(17,20,"PLACE")]}),(" 「東京名所之内」  「浅草金竜山」",{"entities":[(2,4,"PLACE"),(12,14,"PLACE"),(14,17,"PLACE")]}),("「伊勢名所」「伊勢土産名所図画」  「二見浦之景」「賓日館及海水景」「六」",{"entities":[(1,3,"PLACE"),(19,22,"PLACE"),(26,29,"PLACE"),(29,32,"PLACE")]}),(" 「大和名所」  「大仏殿」",{"entities":[(1,3,"PLACE"),(10,13,"PLACE")]}),(" 「大和名所」  「春日神社」「若草山」",{"entities":[(2,4,"PLACE"),(10,14,"PLACE"),(16,19,"PLACE")]}),(" 「府県名所図会」  「兵庫県」「神戸　布引滝？」「県庁之位地　八部郡神戸」",{"entities":[(12,15,"PLACE"),(17,19,"PLACE"),(20,23,"PLACE"),(32,37,"PLACE")]}),(" 「神田御社眺望」",{"entities":[(2,6,"PLACE")]}),(" 「大阪名所」  「桜乃宮より造幣局を望む」「天神橋之図」「造幣局」",{"entities":[(2,4,"PLACE"),(10,13,"PLACE"),(23,26,"PLACE"),(30,33,"PLACE")]}),(" 「東海道之内」  「関」",{"entities":[(2,5,"PLACE")]}),("「観音霊験記」  「秩父順礼廿九番」「笹の戸　見目山　長泉院」",{"entities":[(23,25,"PLACE"),(27,30,"PLACE")]}),("［東京三井組ハウス］",{"entities":[(1,9,"PLACE")]}),("「東京名所　浅草観音之図」",{"entities":[(1,3,"PLACE"),(6,10,"PLACE")]}),(" 「名所江戸百景」  「鎧の渡し　小網町」",{"entities":[(4,6,"PLACE"),(16,20,"PLACE")]}),("「諸国滝廻リ」  「木曽海道　小野ノ瀑布」",{"entities":[(10,14,"PLACE")]}),("「五十三次名所図会」「丗二」  「あら井　渡舟着岸御関所」",{"entities":[(17,20,"PLACE")]}),("「五十三次名所図会　四十」「池鯉鮒　八ツ橋むら　杜若の古せき」",{"entities":[(14,17,"PLACE"),(18,23,"PLACE")]}),("「五十三次名所図会」「丗八」  「藤川　山中の里別名宮路山」",{"entities":[(17,19,"PLACE"),(20,24,"PLACE"),(26,29,"PLACE")]}),(" 「川崎」  「神奈川へ二リ半」",{"entities":[(1,3,"PLACE"),(7,10,"PLACE")]}),(" 「江都名所」  「洲崎しほ干狩」",{"entities":[(1,3,"PLACE"),(9,11,"PLACE")]}),(" 「諸国名橋奇覧」  「摂州安治川口　天保山」",{"entities":[(11,13,"PLACE"),(13,16,"PLACE"),(18,21,"PLACE")]}),("「東都名所」  「永代橋深川新地」",{"entities":[(1,3,"PLACE"),(9,12,"PLACE"),(12,16,"PLACE")]}),("「五十三次名所図会」「一」  「日本橋　東雲の景",{"entities":[(17,20,"PLACE")]}),("「五十三次名所図会」「六」  「戸塚　山道より不二眺望」",{"entities":[(16,18,"PLACE"),(19,21,"PLACE"),(23,25,"PLACE")]}),(" 「富士三十六景」  「東都佃沖」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),("「江戸名所之内」  「真乳山」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),(" 「名所江戸百景」  「千住の大はし」",{"entities":[(3,5,"PLACE"),(11,17,"PLACE")]}),("「諸国名所百景」  「遠州秋葉遠景袋井凧」",{"entities":[(13,15,"PLACE"),(17,19,"PLACE")]})]


In [None]:
# Train = [("「東海道　京都之内」「大内能上覧図」", {"entities":[(1,4,"PLACE"),(5,7,"PLACE")]}),("「東海道　京都名所之内」「四条河原」", {"entities":[(1,4,"PLACE"),(5,7,"PLACE"),(13,17,"PLACE")]}),("「東海道名所之内」「御能拝見之図」",{"entities":[(1,4,"PLACE")]}),("「東海道」「京都  紫震殿」",{"entities":[(1,4,"PLACE"),(6,8,"PLACE"),(9,12,"PLACE")]}),("「東海道之内」「京都参内」",{"entities":[(1,4,"PLACE"),(8,10,"PLACE")]}),("「東海道之内」「京」「大内蹴鞠之遊覧」",{"entities":[(1,4,"PLACE"),(8,9,"PLACE"),(11,13,"PLACE")]}),("「東海道名所之内」「上加茂」「岩本社」「三本杉」「片岡社」「楼門」「御供所」「若宮」「別雷皇太神宮」「杉尾社」「仮殿」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(15,18,"PLACE"),(20,23,"PLACE"),(25,28,"PLACE"),(30,32,"PLACE"),(34,37,"PLACE"),(39,41,"PLACE"),(43,49,"PLACE"),(51,54,"PLACE"),(56,58,"PLACE")]}),("「東海道名所之内」「京加茂」「山科」「黒谷」「吉田山」「将軍塚」「比叡山」「比良」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(15,17,"PLACE"),(19,21,"PLACE"),(23,26,"PLACE"),(28,31,"PLACE"),(33,36,"PLACE"),(38,40,"PLACE")]}),("「東海道名所之内」「加茂の競馬」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),(" 「東海道名所之内」「糺河原」「糺川原」「みたらし川」「河合社」",{"entities": [(1,4,"PLACE"),(10,13,"PLACE"),(15,18,"PLACE"),(20,25,"PLACE"),(27,30,"PLACE")]}),("「東海道名所之内」「祇園祭礼」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),("「東海道 京都名所之内」「島原」",{"entities":[(1,4,"PLACE"),(5,7,"PLACE"),(13,15,"PLACE")]}),("「東海道名所」「京洛中ノ内」「五条橋」",{"entities": [(1,4,"PLACE"),(8,10,"PLACE"),(15,18,"PLACE")]}),("「東海道名所之内」「京都等持院足利十五代木像之図」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE"),(12,15,"PLACE")]}),("「東海道名所つゝき」「羅生門之古図」「春雨ノ社」「金札石刀石トモ云」「鬼カミノハシラ」",{"entities":[(1,4,"PLACE"),(11,14,"PLACE"),(19,23,"PLACE")]}),("「東海道之内」「京都御出立」",{"entities":[(1,4,"PLACE"),(8,10,"PLACE")]}),(" 「東海道名所之内」「下加茂」「☆☆との」「いのうへ社」「二言社」「御供所」「本社御租神」「石☆社」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(21,26,"PLACE"),(28,31,"PLACE"),(33,36,"PLACE"),(38,43,"PLACE"),(45,48,"PLACE")]}),("「東海道名所之内」「宇治」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),("「二見浦」",{"entities":[(1,4,"PLACE")]}),("「長谷川貞信筆　都名所写真鏡　上篇」",{"entities":[(7,8,"PLACE")]}),("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」「愛宕山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(16,20,"PLACE"),(22,25,"PLACE"),(27,30,"PLACE")]}),(" 「都名所之内」「知恩院本堂に傘を見る",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「伏見稲荷社」", { "entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「高台寺秋ノ景」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「東福寺通天橋」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(11,14,"PLACE")]}),("「都名所之内」「西大谷目鏡橋」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(11,14,"PLACE")]}),("「都名所之内」「音羽山清水寺」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(11,14,"PLACE")]}),("「都名所之内」「栂尾門前雨中」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「祇園大鳥居」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「三条大はし」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「竜安寺雪曙」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「四条橋より縄手通大和橋を望」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(13,19,"PLACE")]}),("「都名所之内」「如意嶽大文字」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(11,14,"PLACE")]}),("「都名所之内」「比叡山山上より湖水を望」「三上山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(21,24,"PLACE")]}),("「都名所之内」「三条大はし」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「北野天満宮境内」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「金閣寺雪景」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「妙心寺雪江松」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「竜安寺雪曙」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「御室仁和寺花盛」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」「愛宕山」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」「愛宕山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(16,20,"PLACE"),(22,25,"PLACE"),(27,30,"PLACE")]}),("「都名所之内」「高雄奥の院庭中」「清滝川」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE"),(17,20,"PLACE")]}),("「都名所之内」「栂尾門前雨中」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),("「都名所之内」「愛宕山之図」「清滝川」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(15,18,"PLACE")]}),("「都名所之内」「嵐山 三軒家より眺望」「となせの滝」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE"),(11,14,"PLACE"),(20,25,"PLACE")]}),("「都名所之内」「島原出口光景」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE")]}),("「都名所之内」「島原出口光景」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE")]}),("「京都名所之内」「金閣寺」",{"entities":[(1,2,"PLACE"),(9,12,"PLACE")]}),("「滑稽都名所」「三十三間堂」",{"entities":[(3,4,"PLACE"),(8,13,"PLACE")]}),("「滑稽都名所」「黒谷」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「銀閣寺」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),("「滑稽都名所」「耳塚」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「東寺」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「真葛原」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),("「滑稽都名所」「内裏」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「大仏」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「平野」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「鞍馬山」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),("「滑稽都名所」「広沢」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),("「滑稽都名所」「竜安寺」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),("「諸国名所百景」「京都祇園祭礼」",{"entities":[(9,11,"PLACE")]}),("「京洛名所」「二条橋より大文字を望む」",{"entities":[(1,3,"PLACE"),(7,10,"PLACE"),(12,15,"PLACE")]}),("「東福寺通天橋」",{"entities":[(1,7,"PLACE")]}),("「祇園春宵」",{"entities":[(1,3,"PLACE")]}),("「京都大仏殿大鐘楼」",{"entities":[(1,3,"PLACE"),(3,6,"PLACE"),(6,9,"PLACE")]}),("「大文字山」",{"entities":[(1,5,"PLACE")]}),("「銀閣寺新緑」",{"entities":[(1,4,"PLACE")]}),("「島原大門出口ノ柳」",{"entities":[(1,5,"PLACE")]}),("「三十三間堂」",{"entities":[(1,6,"PLACE")]}),("「嵐山渡月橋の夏雨」",{"entities":[(1,3,"PLACE")]}),("「清水寺秋色」",{"entities":[(1,4,"PLACE")]}),("「平安　諸大家名所画譜",{"entities":[(1,3,"PLACE")]}),("「京都諸大家筆　平安諸大家名所画譜目録」",{"entities":[(1,3,"PLACE"),(7,9,"PLACE")]}),("「平安諸大家名所画譜一」「第一応挙筆宇治橋之図」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE")]}),("「平安諸大家名所画譜ニ」「第二景文筆鳳凰堂之図」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE")]}),("「江戸名所百人美女」「するがだい」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE")]}),("「江戸名所百人美女日本はし」",{"entities":[(1,3,"PLACE"),(9,13,"PLACE")]}),("「江戸名所百人美女」  「柳はし」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),("「江戸名所百人美女」「鎧のわたし」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE")]}),("「江戸名所道戯盡」「二」「両国の夕立」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE")]}),("「江戸名所道戯盡」「四」「御茶の水の釣人」",{"entities":[(1,3,"PLACE"),(13,17,"PLACE")]}),("「江戸名所道戯盡」「五」「飛鳥山の花見」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE")]}),("「江戸名所道外盡」「六」「不忍池」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),("「江戸名所道化盡」「七」「新シ橋の大風」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),("「江戸名所道外盡」「八」「隅田堤の弥生」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),("「江戸名所道化盡」「九」「湯嶋天神の臺」",{"entities":[(1,3,"PLACE"),(13,17,"PLACE")]}),("「江戸名所道外尽」「十」「外神田佐久間町」",{"entities":[(1,3,"PLACE"),(13,20,"PLACE")]}),("「江戸名所道戯盡」「十三」「鎧のわたし七夕祭」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),("「江戸名所道戯盡」「十四」「芝赤羽はしの雪中」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),("「江戸名所道戯盡」「十六」「王子狐火」",{"entities":[(1,3,"PLACE")]}),("「江戸名所道化盡」「十七」「通壹丁目祇園會」",{"entities":[(1,3,"PLACE"),(14,18,"PLACE"),(18,20,"PLACE")]}),("「江戸名所道外盡」「二十」「道灌山虫聞」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),("「江戸名所道外盡」「廿四」「数寄屋かし」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),("「江戸名所道戯盡」「三十五」「吾嬬の森梅見もとり」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),("「江戸名所張交図會」「浅草金龍山」「三囲」「雷門」「すみた川」「向嶋花屋鋪」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE"),(18,20,"PLACE"),(22,24,"PLACE"),(26,30,"PLACE"),(32,37,"PLACE")]}),("「江戸廼花名勝會」「五番組」「ま」「赤坂」「赤坂奴凧平　尾上多見蔵」「赤坂御門外」",{"entities":[(1,3,"PLACE"),(18,20,"PLACE"),(22,24,"PLACE"),(35,37,"PLACE")]}),("「江戸の華名勝會」「五番組」「ま」「三河臺」「三河臺　氷川神社」「関東小六　市川市蔵」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE"),(23,26,"PLACE"),(27,31,"PLACE")]}),("「東京名所四十八景　日本はし夕けしき」",{"entities":[(1,3,"PLACE"),(10,14,"PLACE")]}),("「東京名所四十八景」「神田明神社内年の市」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE"),(16,20,"PLACE")]}),("「東京名所四十八景　谷中諏訪の社廿六夜まち」",{"entities":[(1,3,"PLACE"),(12,16,"PLACE")]})]


In [None]:
Test = [("「東海道五十三次」  「三十八」「藤川」",{"entities":[(1,4,"PLACE"),(17,19,"PLACE")]}),
        ("「東都六玉顔ノ内」  「角田川」",{"entities":[(1,3,"PLACE"),(12,15,"PLACE")]}),
        ("「名所江戸百景」  「猿わか町よるの景」",{"entities":[(3,5,"PLACE"),(11,15,"PLACE")]}),
        ("「江戸名所図会」  「卅二」「三十三間堂」「曽我五郎時宗」",{"entities":[(1,3,"PLACE"),(15,20,"PLACE")]}),
        ("「江戸名所　百人美女」  「今川はし」",{"entities":[(1,3,"PLACE"),(14,18,"PLACE")]}),
        (" 「東海道五十三次の内」  「戸塚藤沢間」「吉田橋」「松若」",{"entities":[(2,4,"PLACE"),(15,19,"PLACE"),(22,25,"PLACE"),(27,29,"PLACE")]}),
        (" 「江戸名所百人美女」  「今戸」",{"entities":[(2,4,"PLACE"),(14,16,"PLACE")]}),
        ("「東都高名会席尽」  「金子」「助六」",{"entities":[(1,3,"PLACE")]}),
        (" 「木曽六十九駅」  「草津」「野路玉川」「清玄尼」",{"entities":[(2,4,"PLACE"),(12,14,"PLACE"),(16,20,"PLACE")]}),
        (" 「東海道」 「程ケ谷戸塚間」「権太坂」「いがみ」",{"entities":[(2,5,"PLACE"),(8,13,"PLACE"),(16,19,"PLACE")]}),
        (" 「両国夕景一ツ目千金」",{"entities":[(2,4,"PLACE"),(9,11,"PLACE")]}),
        ("「見立八景之内」  「清水寺の晩鐘」「清玄阿闍梨」「入間の息女桜姫」",{"entities":[(11,14,"PLACE")]}),
        (" 「江戸自慢三十六興」  「落合　ほたる」",{"entities":[(2,4,"PLACE"),(14,16,"PLACE")]}),
        (" 「曽我八景自筆鏡」  「十郎祐成」「曽我中村」",{"entities":[(2,4,"PLACE")]}),
        (" 「東海道五十三次之内」  「御油」「其二」「山本勘助母」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),
        ("「東海道」「大津三井寺」",{"entities":[(1,4,"PLACE"),(6,11,"PLACE")]}),
        ("「王城加茂社風景」",{"entities":[(3,6,"PLACE")]}),
        ("「東海道名所之内」  「深草乃里」「少将つか」「せう／＼さくら」「元政寺」",{"entities":[(1,4,"PLACE"),(12,16,"PLACE"),(33,36,"PLACE")]}),
        ("「東海道名所之内」  「淀川」",{"entities":[(1,4,"PLACE"),(12,14,"PLACE")]}),
        ("「東京三芝居町繁栄之図」",{"entities":[(1,3,"PLACE"),(4,6,"PLACE")]}),
        ("「津島牛頭天王」「舟発場」「佐屋川」「本陣」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE"),(19,21,"PLACE")]}),
        (" 「東海道」  「浜松」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE")]}),(" 「東海道之内」  「岡部」",{"entities":[(2,5,"PLACE"),(11,13,"PLACE")]}),
        (" 「東海道名所之内」  「豊川」",{"entities":[(2,5,"PLACE"),(13,15,"PLACE")]}),
        ("［川口善光寺開帳参詣之図］",{"entities":[(1,6,"PLACE")]}),
        (" 「東海道」  「島田」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE")]}),
        ("「浅草金竜山之図」",{"entities":[(1,6,"PLACE")]}),
        ("「東京名所の内」  「浅草区金竜山浅草寺境内一覧」",{"entities":[(1,3,"PLACE"),(11,20,"PLACE")]}),
        (" 「しん板車づくし」  「横浜鉄道図」",{"entities":[(13,15,"PLACE")]}),
        (" 「東都名所高輪行粧之図」",{"entities":[(2,4,"PLACE"),(6,8,"PLACE")]}),
        (" 「東都名所」  「隅田川花盛」",{"entities":[(2,4,"PLACE"),(10,13,"PLACE")]}),
        (" 「江戸の花名勝会」  「り」「十番組」「一ツ家の賤の女　尾上菊次郎」「浅茅が原衣掛松」",{"entities":[(2,4,"PLACE"),(36,40,"PLACE")]}),
        ("「江戸の花名勝会」  「ち」「十番組」「一ツ家の姥　市川海老蔵」「猿若芝居町」",{"entities":[(1,3,"PLACE"),(33,38,"PLACE")]}),
        (" 「東京三十六景」  「十五」「両国」「十六」「本所一ツ目之橋」",{"entities":[(2,4,"PLACE"),(16,18,"PLACE"),(26,31,"PLACE")]}),
        (" 「東京十二月之内」  「二月」「亀井戸天神」「亀井戸梅林」",{"entities":[(2,4,"PLACE"),(17,22,"PLACE"),(24,29,"PLACE")]}),
        (" 「東京十二月之内」  「四月」「品川沖之景」",{"entities":[(2,4,"PLACE"),(17,19,"PLACE")]}),
        (" 「東京十二月之内」  「六月」「愛宕之景」「神田神社」",{"entities":[(2,4,"PLACE"),(17,19,"PLACE"),(23,27,"PLACE")]}),
        ("「東京開化名所」  「三代徳川家光公」「三河島之景」",{"entities":[(1,3,"PLACE"),(20,23,"PLACE")]}),
        (" 「東京名所競」  「上野東照宮」",{"entities":[(2,4,"PLACE"),(11,16,"PLACE")]}),
        ("「東京銘勝会」  「不忍の競馬」",{"entities":[(1,3,"PLACE"),(10,15,"PLACE")]}),
        (" 「東海道」  「土山」「鈴ヶ山坂ノ下」",{"entities":[(2,5,"PLACE"),(9,11,"PLACE"),(13,19,"PLACE")]}),
        ("「堀切花菖蒲」",{"entities":[(1,6,"PLACE")]}),
        (" 「両国花火之図」",{"entities":[(2,4,"PLACE")]}),
        (" 「亀戸梅屋敷」",{"entities":[(1,6,"PLACE")]}),
        ("「東京十二月之内」  「一月」「宮城之春」「九段坂」",{"entities":[(1,3,"PLACE"),(16,20,"PLACE"),(24,27,"PLACE")]}),
        (" 「見立十二支」  「丑」「向島」「牛島神社」",{"entities":[(14,16,"PLACE"),(18,22,"PLACE")]}),
        ("「見立十二支」  「辰」「深川八幡」「富士」",{"entities":[(13,17,"PLACE"),(19,21,"PLACE")]}),
        (" 「見立十二支」  「酉」「浅草田甫酉の市」",{"entities":[(14,18,"PLACE")]}),
        (" 「東海道五拾三次之内」  「大尾」「京師」「三条大橋」",{"entities":[(2,5,"PLACE"),(19,21,"PLACE"),(23,27,"PLACE")]}),
        (" 「東海道五拾三次之内」  「日本橋」「行烈振出」",{"entities":[(2,5,"PLACE"),(15,18,"PLACE")]}),
        (" 「東海道五拾三次之内」  「四日市」「三重川」",{"entities":[(2,5,"PLACE"),(15,18,"PLACE"),(20,23,"PLACE")]}),
        (" 「東海道五拾三次之内」  「大津」「走井茶屋」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,21,"PLACE")]}),
        (" 「東海道五拾三次之内」  「平塚」「縄手道」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,22,"PLACE")]}),
        (" 「東海道五拾三次之内」  「土山」「春之雨」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),
        (" 「東海道五拾三次之内」  「日坂」「佐夜ノ中山」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,24,"PLACE")]}),
        ("「東海道五拾三次之内」  「庄野」「白雨」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE")]}),
        (" 「江戸名所」  「両国花火」",{"entities":[(2,4,"PLACE"),(10,12,"PLACE")]}),
        (" 「江戸名所」  「芝増上寺前の景」",{"entities":[(2,4,"PLACE"),(11,14,"PLACE")]}),
        (" 「東都名所」  「新吉原五丁町弥生花盛全図」",{"entities":[(2,4,"PLACE"),(10,16,"PLACE")]}),
        (" 「東海道五拾三次之内」  「袋井」「出茶屋ノ図」",{"entities":[(2,5,"PLACE"),(15,17,"PLACE"),(19,22,"PLACE")]}),
        ("［江都名所浅草観音の図］",{"entities":[(1,3,"PLACE"),(5,9,"PLACE")]}),
        ("「東京市中馬車往来之図」",{"entities":[(1,3,"PLACE")]}),
        ("［東京海運橋兜町為換座五階造リ図］",{"entities":[(1,8,"PLACE")]}),
        ("［久松町劇場久松座繁栄図］",{"entities":[(1,4,"PLACE")]}),
        ("「東京名所」  「スジカイ　万代橋」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),
        ("「東京名所之内」  「高輪海岸鉄道の図」",{"entities":[(1,3,"PLACE"),(11,15,"PLACE")]}),
        (" 「東海道五拾三次之内」  「原」「朝之富士」",{"entities":[(2,5,"PLACE"),(15,16,"PLACE"),(20,22,"PLACE")]}),
        ("「横浜名所図会」  「野毛山下蒸気車」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),
        ("「東京名所之内」  「上野公園清水堂」",{"entities":[(1,3,"PLACE"),(11,18,"PLACE")]}),
        ("「東海道五拾三次之内」  「見附」「天竜川図」",{"entities":[(1,4,"PLACE"),(14,16,"PLACE"),(18,22,"PLACE")]}),
        ("「東京開化三十六景」  「柳橋より浅草橋」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE"),(17,20,"PLACE")]}),
        ("「東京名所之内」  「浅草金竜山」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE")]}),
        ("「伊勢名所」「伊勢土産名所図画」  「二見浦之景」「賓日館及海水景」「六」",{"entities":[(1,3,"PLACE"),(19,22,"PLACE"),(26,32,"PLACE")]}),
        ("「大和名所」  「大仏殿」",{"entities":[(1,3,"PLACE"),(10,13,"PLACE")]}),
        ("「大和名所」  「春日神社」「若草山」",{"entities":[(1,3,"PLACE"),(9,13,"PLACE"),(15,18,"PLACE")]}),
        ("「府県名所図会」  「兵庫県」「神戸布引滝」「県庁之位地　八部郡神戸」",{"entities":[(11,14,"PLACE"),(16,21,"PLACE"),(29,34,"PLACE")]}),
        (" 「神田御社眺望」",{"entities":[(2,6,"PLACE")]}),
        (" 「大阪名所」  「桜乃宮より造幣局を望む」「天神橋之図」「造幣局」",{"entities":[(2,4,"PLACE"),(10,13,"PLACE"),(23,26,"PLACE"),(30,33,"PLACE")]}),
        (" 「東海道之内」  「関」",{"entities":[(2,5,"PLACE")]}),
        ("「観音霊験記」  「秩父順礼廿九番」「笹の戸　見目山　長泉院」",{"entities":[(19,30,"PLACE")]}),
        ("［東京三井組ハウス］",{"entities":[(1,9,"PLACE")]}),
        ("「東京名所　浅草観音之図」",{"entities":[(1,3,"PLACE"),(6,10,"PLACE")]}),
        (" 「名所江戸百景」  「鎧の渡し　小網町」",{"entities":[(4,6,"PLACE"),(16,20,"PLACE")]}),
        ("「諸国滝廻リ」  「木曽海道　小野ノ瀑布」",{"entities":[(10,14,"PLACE")]}),
        ("「五十三次名所図会」「丗二」  「あら井　渡舟着岸御関所」",{"entities":[(17,20,"PLACE")]}),
        ("「五十三次名所図会　四十」「池鯉鮒　八ツ橋むら　杜若の古せき」",{"entities":[(14,23,"PLACE")]}),
        ("「五十三次名所図会」「丗八」  「藤川　山中の里別名宮路山」",{"entities":[(17,19,"PLACE"),(20,24,"PLACE"),(26,29,"PLACE")]}),
        ("「川崎」  「神奈川へ二リ半」",{"entities":[(1,3,"PLACE"),(7,10,"PLACE")]}),
        ("「江都名所」  「洲崎しほ干狩」",{"entities":[(1,3,"PLACE"),(9,11,"PLACE")]}),
        ("「諸国名橋奇覧」  「摂州安治川口天保山」",{"entities":[(11,20,"PLACE")]}),
        ("「東都名所」  「永代橋深川新地」",{"entities":[(1,3,"PLACE"),(9,16,"PLACE")]}),
        ("「五十三次名所図会」「一」  「日本橋　東雲の景",{"entities":[(17,20,"PLACE")]}),
        ("「五十三次名所図会」「六」  「戸塚山道より不二眺望」",{"entities":[(16,20,"PLACE"),(22,24,"PLACE")]}),
        ("「富士三十六景」  「東都佃沖」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),
        ("「江戸名所之内」  「真乳山」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),
        ("「名所江戸百景」  「千住の大はし」",{"entities":[(3,5,"PLACE"),(11,17,"PLACE")]}),
        ("「諸国名所百景」  「遠州秋葉遠景袋井凧」",{"entities":[(13,15,"PLACE"),(17,19,"PLACE")]})
        ]


In [None]:
len(Test)

97

In [None]:
Test_spacy = Test

In [None]:
Train = [("「東海道　京都之内」「大内能上覧図」", {"entities":[(1,4,"PLACE"),(5,7,"PLACE")]}),
         ("「東海道　京都名所之内」「四条河原」", {"entities":[(1,4,"PLACE"),(5,7,"PLACE"),(13,17,"PLACE")]}),
         ("「東海道名所之内」「御能拝見之図」",{"entities":[(1,4,"PLACE")]}),
         ("「東海道」「京都  紫震殿」",{"entities":[(1,4,"PLACE"),(6,8,"PLACE"),(9,12,"PLACE")]}),
         ("「東海道之内」「京都参内」",{"entities":[(1,4,"PLACE"),(8,10,"PLACE")]}),
         ("「東海道之内」「京」「大内蹴鞠之遊覧」",{"entities":[(1,4,"PLACE"),(8,9,"PLACE"),(11,13,"PLACE")]}),
         ("「東海道名所之内」「上加茂」「岩本社」「三本杉」「片岡社」「楼門」「御供所」「若宮」「別雷皇太神宮」「杉尾社」「仮殿」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(15,18,"PLACE"),(20,23,"PLACE"),(25,28,"PLACE"),(30,32,"PLACE"),(34,37,"PLACE"),(39,41,"PLACE"),(43,49,"PLACE"),(51,54,"PLACE"),(56,58,"PLACE")]}),
         ("「東海道名所之内」「京加茂」「山科」「黒谷」「吉田山」「将軍塚」「比叡山」「比良」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(15,17,"PLACE"),(19,21,"PLACE"),(23,26,"PLACE"),(28,31,"PLACE"),(33,36,"PLACE"),(38,40,"PLACE")]}),
         ("「東海道名所之内」「加茂の競馬」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),
         ("「東海道名所之内」「糺河原」「糺川原」「みたらし川」「河合社」",{"entities": [(1,4,"PLACE"),(10,13,"PLACE"),(15,18,"PLACE"),(20,25,"PLACE"),(27,30,"PLACE")]}),
         ("「東海道名所之内」「祇園祭礼」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),
         ("「東海道 京都名所之内」「島原」",{"entities":[(1,4,"PLACE"),(5,7,"PLACE"),(13,15,"PLACE")]}),
         ("「東海道名所」「京洛中ノ内」「五条橋」",{"entities": [(1,4,"PLACE"),(8,10,"PLACE"),(15,18,"PLACE")]}),
         ("「東海道名所之内」「京都等持院足利十五代木像之図」",{"entities":[(1,4,"PLACE"),(10,15,"PLACE"),(12,15,"PLACE")]}),
         ("「東海道名所つゝき」「羅生門之古図」「春雨ノ社」「金札石刀石トモ云」「鬼カミノハシラ」",{"entities":[(1,4,"PLACE"),(11,14,"PLACE"),(19,23,"PLACE")]}),
         ("「東海道之内」「京都御出立」",{"entities":[(1,4,"PLACE"),(8,10,"PLACE")]}),
         ("「東海道名所之内」「下加茂」「☆☆との」「いのうへ社」「二言社」「御供所」「本社御租神」「石☆社」",{"entities":[(1,4,"PLACE"),(10,13,"PLACE"),(21,26,"PLACE"),(28,31,"PLACE"),(33,36,"PLACE"),(38,43,"PLACE"),(45,48,"PLACE")]}),
         ("「東海道名所之内」「宇治」",{"entities":[(1,4,"PLACE"),(10,12,"PLACE")]}),
         ("「二見浦」",{"entities":[(1,4,"PLACE")]}),
         ("「長谷川貞信筆　都名所写真鏡　上篇」",{"entities":[(7,8,"PLACE")]}),
         ("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」「愛宕山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(16,20,"PLACE"),(22,25,"PLACE"),(27,30,"PLACE")]}),
         ("「都名所之内」「知恩院本堂に傘を見る",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「伏見稲荷社」", { "entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),("「都名所之内」「高台寺秋ノ景」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「東福寺通天橋」",{"entities":[(1,2,"PLACE"),(8,14,"PLACE")]}),
         ("「都名所之内」「西大谷目鏡橋」",{"entities":[(1,2,"PLACE"),(8,14,"PLACE")]}),
         ("「都名所之内」「音羽山清水寺」",{"entities":[(1,2,"PLACE"),(8,14,"PLACE")]}),
         ("「都名所之内」「栂尾門前雨中」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「祇園大鳥居」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「三条大はし」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「竜安寺雪曙」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「四条橋より縄手通大和橋を望」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(13,19,"PLACE")]}),
         ("「都名所之内」「如意嶽大文字」",{"entities":[(1,2,"PLACE"),(8,14,"PLACE")]}),
         ("「都名所之内」「比叡山山上より湖水を望」「三上山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(21,24,"PLACE")]}),
         ("「都名所之内」「三条大はし」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「北野天満宮境内」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「金閣寺雪景」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「妙心寺雪江松」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「竜安寺雪曙」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「御室仁和寺花盛」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE")]}),
         ("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(16,20,"PLACE"),(22,25,"PLACE")]}),
         ("「都名所之内」「広沢池秋の月」「遍照寺山」「いけ浦」「愛宕山」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(16,20,"PLACE"),(22,25,"PLACE"),(27,30,"PLACE")]}),
         ("「都名所之内」「高雄奥の院庭中」「清滝川」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE"),(17,20,"PLACE")]}),
         ("「都名所之内」「栂尾門前雨中」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE")]}),
         ("「都名所之内」「愛宕山之図」「清滝川」",{"entities":[(1,2,"PLACE"),(8,11,"PLACE"),(15,18,"PLACE")]}),
         ("「都名所之内」「嵐山三軒家より眺望」「となせの滝」",{"entities":[(1,2,"PLACE"),(8,13,"PLACE"),(19,24,"PLACE")]}),
         ("「都名所之内」「島原出口光景」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE")]}),
         ("「都名所之内」「島原出口光景」",{"entities":[(1,2,"PLACE"),(8,10,"PLACE")]}),
         ("「京都名所之内」「金閣寺」",{"entities":[(1,2,"PLACE"),(9,12,"PLACE")]}),
         ("「滑稽都名所」「三十三間堂」",{"entities":[(3,4,"PLACE"),(8,13,"PLACE")]}),
         ("「滑稽都名所」「黒谷」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「銀閣寺」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),
         ("「滑稽都名所」「耳塚」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「東寺」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「真葛原」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),
         ("「滑稽都名所」「内裏」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「大仏」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「平野」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「鞍馬山」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),
         ("「滑稽都名所」「広沢」",{"entities":[(3,4,"PLACE"),(8,10,"PLACE")]}),
         ("「滑稽都名所」「竜安寺」",{"entities":[(3,4,"PLACE"),(8,11,"PLACE")]}),
         ("「諸国名所百景」「京都祇園祭礼」",{"entities":[(9,11,"PLACE")]}),
         ("「京洛名所」「二条橋より大文字を望む」",{"entities":[(1,3,"PLACE"),(7,10,"PLACE"),(12,15,"PLACE")]}),
         ("「東福寺通天橋」",{"entities":[(1,7,"PLACE")]}),
         ("「祇園春宵」",{"entities":[(1,3,"PLACE")]}),
         ("「京都大仏殿大鐘楼」",{"entities":[(1,9,"PLACE")]}),
         ("「大文字山」",{"entities":[(1,5,"PLACE")]}),
         ("「銀閣寺新緑」",{"entities":[(1,4,"PLACE")]}),
         ("「島原大門出口ノ柳」",{"entities":[(1,5,"PLACE")]}),
         ("「三十三間堂」",{"entities":[(1,6,"PLACE")]}),
         ("「嵐山渡月橋の夏雨」",{"entities":[(1,6,"PLACE")]}),
         ("「清水寺秋色」",{"entities":[(1,4,"PLACE")]}),
         ("「平安　諸大家名所画譜",{"entities":[(1,3,"PLACE")]}),
         ("「京都諸大家筆　平安諸大家名所画譜目録」",{"entities":[(1,3,"PLACE"),(7,9,"PLACE")]}),
         ("「平安諸大家名所画譜一」「第一応挙筆宇治橋之図」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE")]}),
         ("「平安諸大家名所画譜ニ」「第二景文筆鳳凰堂之図」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE")]}),
         ("「江戸名所百人美女」「するがだい」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE")]}),
         ("「江戸名所百人美女日本はし」",{"entities":[(1,3,"PLACE"),(9,13,"PLACE")]}),
         ("「江戸名所百人美女」  「柳はし」",{"entities":[(1,3,"PLACE"),(11,14,"PLACE")]}),
         ("「江戸名所百人美女」「鎧のわたし」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE")]}),
         ("「江戸名所道戯盡」「二」「両国の夕立」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE")]}),
         ("「江戸名所道戯盡」「四」「御茶の水の釣人」",{"entities":[(1,3,"PLACE"),(13,17,"PLACE")]}),
         ("「江戸名所道戯盡」「五」「飛鳥山の花見」",{"entities":[(1,3,"PLACE"),(13,15,"PLACE")]}),
         ("「江戸名所道外盡」「六」「不忍池」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),
         ("「江戸名所道化盡」「七」「新シ橋の大風」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),
         ("「江戸名所道外盡」「八」「隅田堤の弥生」",{"entities":[(1,3,"PLACE"),(13,16,"PLACE")]}),
         ("「江戸名所道化盡」「九」「湯嶋天神の臺」",{"entities":[(1,3,"PLACE"),(13,17,"PLACE")]}),
         ("「江戸名所道外尽」「十」「外神田佐久間町」",{"entities":[(1,3,"PLACE"),(13,20,"PLACE")]}),
         ("「江戸名所道戯盡」「十三」「鎧のわたし七夕祭」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),
         ("「江戸名所道戯盡」「十四」「芝赤羽はしの雪中」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),
         ("「江戸名所道戯盡」「十六」「王子狐火」",{"entities":[(1,3,"PLACE")]}),
         ("「江戸名所道化盡」「十七」「通壹丁目祇園會」",{"entities":[(1,3,"PLACE"),(14,20,"PLACE")]}),
         ("「江戸名所道外盡」「二十」「道灌山虫聞」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),
         ("「江戸名所道外盡」「廿四」「数寄屋かし」",{"entities":[(1,3,"PLACE"),(14,19,"PLACE")]}),
         ("「江戸名所道戯盡」「三十五」「吾嬬の森梅見もとり」",{"entities":[(1,3,"PLACE"),(14,17,"PLACE")]}),
         ("「江戸名所張交図會」「浅草金龍山」「三囲」「雷門」「すみた川」「向嶋花屋鋪」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE"),(18,20,"PLACE"),(22,24,"PLACE"),(26,30,"PLACE"),(32,37,"PLACE")]}),
         ("「江戸廼花名勝會」「五番組」「ま」「赤坂」「赤坂奴凧平　尾上多見蔵」「赤坂御門外」",{"entities":[(1,3,"PLACE"),(18,20,"PLACE"),(22,24,"PLACE"),(35,37,"PLACE")]}),
         ("「江戸の華名勝會」「五番組」「ま」「三河臺」「三河臺　氷川神社」「関東小六　市川市蔵」",{"entities":[(1,3,"PLACE"),(18,21,"PLACE"),(23,26,"PLACE"),(27,31,"PLACE")]}),
         ("「東京名所四十八景　日本はし夕けしき」",{"entities":[(1,3,"PLACE"),(10,14,"PLACE")]}),
         ("「東京名所四十八景」「神田明神社内年の市」",{"entities":[(1,3,"PLACE"),(11,16,"PLACE"),(16,20,"PLACE")]}),
         ("「東京名所四十八景　谷中諏訪の社廿六夜まち」",{"entities":[(1,3,"PLACE"),(12,16,"PLACE")]})
         ]


In [None]:
len(Train)

101

# EDA (Exploratory Data Analysis)

## Labelled Dataset

In [None]:
data_train [-1]

['「東京名所四十八景\u3000谷中諏訪の社廿六夜まち」', {'entities': [[1, 3, 'GPE'], [12, 16, 'GPE']]}]

In [None]:
data_gold[-1]

['「東京名所四十八景\u3000谷中諏訪の社廿六夜まち」', {'entities': [[1, 3, 'GPE'], [12, 16, 'GPE']]}]

In [None]:
if data_gold == data_train :
    print('similar')

similar


---

In [None]:
data_gold[-2]

['「東京名所四十八景」「神田明神社内年の市」',
 {'entities': [[1, 3, 'GPE'], [11, 16, 'GPE'], [16, 20, 'GPE']]}]

In [None]:
print('東京 = Tokyo (PLACE in potision [1,2])')

東京 = Tokyo (PLACE in potision [1,2])


In [None]:
len(data_train )

101

In [None]:
len(data_gold)

101

In [None]:
for entity in data_gold:
    print('Title:',entity[0])
    print('Tags:',entity[1]['entities'])
    print('Number of Tags:', len(entity[1]['entities']))
    print('------------------------------------')

Title: 「東海道　京都之内」「大内能上覧図」
Tags: [[1, 4, 'LOC'], [5, 7, 'GPE']]
Number of Tags: 2
------------------------------------
Title: 「東海道　京都名所之内」「四条河原」
Tags: [[1, 4, 'LOC'], [5, 7, 'GPE'], [13, 17, 'GPE']]
Number of Tags: 3
------------------------------------
Title: 「東海道名所之内」「御能拝見之図」
Tags: [[1, 4, 'LOC']]
Number of Tags: 1
------------------------------------
Title: 「東海道」「京都  紫震殿」
Tags: [[1, 4, 'LOC'], [6, 8, 'GPE'], [9, 12, 'GPE']]
Number of Tags: 3
------------------------------------
Title: 「東海道之内」「京都参内」
Tags: [[1, 4, 'LOC'], [8, 10, 'GPE']]
Number of Tags: 2
------------------------------------
Title: 「東海道之内」「京」「大内蹴鞠之遊覧」
Tags: [[1, 4, 'LOC'], [8, 9, 'GPE'], [11, 13, 'GPE']]
Number of Tags: 3
------------------------------------
Title: 「東海道名所之内」「上加茂」「岩本社」「三本杉」「片岡社」「楼門」「御供所」「若宮」「別雷皇太神宮」「杉尾社」「仮殿」
Tags: [[1, 4, 'LOC'], [10, 13, 'GPE'], [15, 18, 'GPE'], [20, 23, 'GPE'], [25, 28, 'GPE'], [30, 32, 'GPE'], [34, 37, 'GPE'], [39, 41, 'GPE'], [43, 49, 'GPE'], [51, 54, 'GPE'], [56, 58, 'GPE']]
Number

## Main Dataset

In [None]:
train_data.sample(2)

Unnamed: 0,Title,Genre,Artist,Image URL,Permalink
12,「東海道名所」 「京洛中ノ内」「五条橋」,名所絵,芳盛,https://www.arc.ritsumei.ac.jp/archive01/theat...,https://www.dh-jac.net/db/nishikie/arcUP1741/2...
23,「都名所之内」「高台寺秋ノ景」,名所絵 京都関連,貞信,https://www.arc.ritsumei.ac.jp/archive01/theat...,https://www.dh-jac.net/db/nishikie/arcUP1665/2...


In [None]:
train_data.head(2)

Unnamed: 0,Title,Genre,Artist,Image URL,Permalink
0,「東海道　京都之内」 「大内能上覧図」,名所絵 京都 能楽 紅葉狩,芳年,https://www.arc.ritsumei.ac.jp/archive01/theat...,https://www.dh-jac.net/db/nishikie/arcUP0542/2...
1,「東海道　京都名所之内」「四条河原」,京都関連,豊国,https://www.arc.ritsumei.ac.jp/archive01/theat...,https://www.dh-jac.net/db/nishikie/arcUP0544/2...


In [None]:
id = 0

In [None]:
train_data['Image URL'][id]

'https://www.arc.ritsumei.ac.jp/archive01/theater/image/PB/arc/Prints/arcUP/arcUP0542.jpg'

In [None]:
train_data['Permalink'][id]

'https://www.dh-jac.net/db/nishikie/arcUP0542/2021d7/'

In [None]:
print('Title(EN): "Tokaido Kyoto Nouchi" "Ouchi Noh Viewing Map"')
print('Title(JAP):',train_data['Title'][id])

Title(EN): "Tokaido Kyoto Nouchi" "Ouchi Noh Viewing Map"
Title(JAP): 「東海道　京都之内」  「大内能上覧図」


In [None]:
print('Genre(EN): Meisho-e Kyoto Nogaku Momijigari')
print('Genre(JAP):',train_data['Genre'][id])

Genre(EN): Meisho-e Kyoto Nogaku Momijigari
Genre(JAP):  名所絵  京都  能楽  紅葉狩 


In [None]:
print('Artist(EN): Yoshitoshi')
print('Artist(JAP):',train_data['Artist'][id])

Artist(EN): Yoshitoshi
Artist(JAP):  芳年


In [None]:
train_data.groupby('Genre').nunique()

Unnamed: 0_level_0,Title,Artist,Image URL,Permalink
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
京都関連,8,5,8,8
京都関連,3,3,3,3
京都関連,1,1,1,1
京都関連 名所絵,1,1,1,1
各区 本郷 小石川（第四大区） 本郷,1,1,1,1
名所案内記 図絵,1,1,1,1
名所絵 京都 能楽 紅葉狩,1,1,1,1
名所絵 京都関連,16,5,18,18
名所絵 京都関連,2,1,2,2
名所絵 戯画 名所案内記 図絵,1,1,1,1


Count numbers of paintings per unique values:

In [None]:
train_data['Genre'].value_counts().to_frame()

Unnamed: 0,Genre
名所絵 京都関連,18
名所絵 京都関連,18
江戸 名所案内記 図絵,10
京都関連,8
京都関連,7
名所絵,4
京都関連 名所絵,3
江戸 名所案内記 図絵,3
京都関連,3
名所絵 京都関連,3


In [None]:
train_data['Artist'].value_counts().to_frame()

Unnamed: 0,Artist
貞信,20
広景,13
芳梅,9
貞信,7
亀井藤兵衛,7
豊国,6
昇斎一景,3
暁斎,3
豊国,3
芳盛,3


# Tokenizations

In [None]:
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

In [None]:
# #John
# # the empty space is not good enough for tokenasation
# train_data['totalwords'] = train_data['Title'].str.split().str.len()
# print(f"Avg words: {train_data['totalwords'].mean()}")

In [None]:
# # Example
# tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

# text = '朝食にを焼いて食べまし[MASK]。'
# token_ids = tokenizer.encode(text, add_special_tokens=True)
# print(token_ids)
# tokens = tokenizer.convert_ids_to_tokens(token_ids)
# print(tokens)

In [None]:
def extract_tags(data):
    list_title = []
    list_token_title = []
    list_encode_title = []
    list_tags = []

    noTag = 'O'

    for entity in tqdm(data):
        ### Title ###
        text = entity[0]

        print('\nTitle:',text)

        token_ids = tokenizer.encode(text, add_special_tokens=True)
        tokens = tokenizer.convert_ids_to_tokens(token_ids)
        print('Title Encode:',tokens)

        list_title.append(text)             # eg.'朝食にを焼いて食べまし[MASK]。'
        list_token_title.append(tokens)     # eg.['[CLS]', '朝食', 'に', 'を', '焼い', 'て', '食べ', 'まし', '[MASK]', '。', '[SEP]']
        list_encode_title.append(token_ids) # eg.[2, 25965, 7, 11, 16878, 16, 2949, 3913, 4, 8, 3]

        tags = entity[1]['entities']
        print('Tags:',tags)
        print('Number of Tags:', len(tags))

        list_in = [noTag] * len(tokens)     # eg.['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

        # each title has multiple tags
        for tag in tags:
            begin = tag[0] 
            end = tag[1] 

            text_token = text[begin:end]
            name_tag = tag[2]

            print(f'Token: {text_token} -> Tag: {name_tag}')

            #########################################################

            # Finding all indexes of a string in the list
            # We want: 
            # either text_token include in token  eg. '朝食' = '朝食'
            # or token include in text_token 祇園大鳥居 = '祇園' '大' '鳥居' 
            
            #!!!προσοχή δεν εχουμε φτιαξει ακομα την περιπτωση 祇園大鳥居 = '祇園' '大' '##鳥', μεχρι στιγμεις θα βαλει 'PLACE' 'PLACE' '0'!!!
            #!!!προσοχή δεν εχουμε φτιαξει ακομα την περιπτωση 栂尾門= '[UNK]', '門前', μεχρι στιγμεις θα βαλει '0' '0'!!!
            
            indices = [i for i, s in enumerate(tokens) if (text_token in s) or (s in text_token)]
            print(indices)

            # add the tag in the correct token
            for ind in indices:
                list_in[ind] = name_tag
            #########################################################

        list_tags.append(list_in)
        print('------------------------------------')

        # intialise data of lists.
        
    data = {'title':list_title,
    'title_token':list_token_title,
    'title_encode':list_encode_title,
    'tags':list_tags }

    # Create DataFrame
    df = pd.DataFrame(data)
    print(df)


    # Create DataFrame
    data = pd.DataFrame({'sentence':list_title, 'word_labels':list_tags})

    #covert column word_labels from list to string
    #eg [O, O, PLACE, PLACE, O, O, O, O, O, O, O, O, O, O, O] -> 'O,O,PLACE,PLACE,O,O,O,O,O,O,O,O,O,O,O'
    for i in range(len(data)):
        data['word_labels'][i] = ",".join(data['word_labels'][i])

    print(data)

    return data

In [None]:
# data = extract_tags(data_train)

In [None]:
# data.head()

In [None]:
train = extract_tags(Train)

 92%|█████████▏| 93/101 [00:00<00:00, 473.76it/s]


Title: 「東海道　京都之内」「大内能上覧図」
Title Encode: ['[CLS]', '「', '東海道', '京都', '之', '内', '」', '「', '大内', '能', '上', '##覧', '図', '」', '[SEP]']
Tags: [(1, 4, 'PLACE'), (5, 7, 'PLACE')]
Number of Tags: 2
Token: 東海道 -> Tag: PLACE
[2]
Token: 京都 -> Tag: PLACE
[3]
------------------------------------

Title: 「東海道　京都名所之内」「四条河原」
Title Encode: ['[CLS]', '「', '東海道', '京都', '名所', '之', '内', '」', '「', '四条', '河原', '」', '[SEP]']
Tags: [(1, 4, 'PLACE'), (5, 7, 'PLACE'), (13, 17, 'PLACE')]
Number of Tags: 3
Token: 東海道 -> Tag: PLACE
[2]
Token: 京都 -> Tag: PLACE
[3]
Token: 四条河原 -> Tag: PLACE
[9, 10]
------------------------------------

Title: 「東海道名所之内」「御能拝見之図」
Title Encode: ['[CLS]', '「', '東海道', '名所', '之', '内', '」', '「', '御', '能', '拝', '##見', '之', '図', '」', '[SEP]']
Tags: [(1, 4, 'PLACE')]
Number of Tags: 1
Token: 東海道 -> Tag: PLACE
[2]
------------------------------------

Title: 「東海道」「京都  紫震殿」
Title Encode: ['[CLS]', '「', '東海道', '」', '「', '京都', '紫', '震', '殿', '」', '[SEP]']
Tags: [(1, 4, 'PLACE'), (6, 8, 'PLACE'), (9

100%|██████████| 101/101 [00:00<00:00, 471.12it/s]

 [(1, 3, 'PLACE'), (18, 21, 'PLACE'), (23, 26, 'PLACE'), (27, 31, 'PLACE')]
Number of Tags: 4
Token: 江戸 -> Tag: PLACE
[2]
Token: 三河臺 -> Tag: PLACE
[16, 20]
Token: 三河臺 -> Tag: PLACE
[16, 20]
Token: 氷川神社 -> Tag: PLACE
[22]
------------------------------------

Title: 「東京名所四十八景　日本はし夕けしき」
Title Encode: ['[CLS]', '「', '東京', '名所', '四', '十', '八', '##景', '日本', 'はし', '夕', 'け', '##しき', '」', '[SEP]']
Tags: [(1, 3, 'PLACE'), (10, 14, 'PLACE')]
Number of Tags: 2
Token: 東京 -> Tag: PLACE
[2]
Token: 日本はし -> Tag: PLACE
[8, 9]
------------------------------------

Title: 「東京名所四十八景」「神田明神社内年の市」
Title Encode: ['[CLS]', '「', '東京', '名所', '四', '十', '八', '##景', '」', '「', '神田', '明神', '社内', '年', '##の', '##市', '」', '[SEP]']
Tags: [(1, 3, 'PLACE'), (11, 16, 'PLACE'), (16, 20, 'PLACE')]
Number of Tags: 3
Token: 東京 -> Tag: PLACE
[2]
Token: 神田明神社 -> Tag: PLACE
[10, 11]
Token: 内年の市 -> Tag: PLACE
[13]
------------------------------------

Title: 「東京名所四十八景　谷中諏訪の社廿六夜まち」
Title Encode: ['[CLS]', '「', '東京', '名所', '四', '十', 




In [None]:
train.head()

Unnamed: 0,sentence,word_labels
0,「東海道　京都之内」「大内能上覧図」,"O,O,PLACE,PLACE,O,O,O,O,O,O,O,O,O,O,O"
1,「東海道　京都名所之内」「四条河原」,"O,O,PLACE,PLACE,O,O,O,O,O,PLACE,PLACE,O,O"
2,「東海道名所之内」「御能拝見之図」,"O,O,PLACE,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,「東海道」「京都 紫震殿」,"O,O,PLACE,O,O,PLACE,PLACE,PLACE,O,O,O"
4,「東海道之内」「京都参内」,"O,O,PLACE,PLACE,O,O,O,O,PLACE,O,O,O,O"


In [None]:
test = extract_tags(Test)

  0%|          | 0/97 [00:00<?, ?it/s]


Title: 「東海道五十三次」  「三十八」「藤川」
Title Encode: ['[CLS]', '「', '東海道', '五', '十', '三', '次', '」', '「', '三', '十', '八', '」', '「', '藤', '##川', '」', '[SEP]']
Tags: [(1, 4, 'PLACE'), (17, 19, 'PLACE')]
Number of Tags: 2
Token: 東海道 -> Tag: PLACE
[2]
Token: 藤川 -> Tag: PLACE
[14]
------------------------------------

Title: 「東都六玉顔ノ内」  「角田川」
Title Encode: ['[CLS]', '「', '東都', '六', '玉', '顔', 'ノ', '内', '」', '「', '角田', '川', '」', '[SEP]']
Tags: [(1, 3, 'PLACE'), (12, 15, 'PLACE')]
Number of Tags: 2
Token: 東都 -> Tag: PLACE
[2]
Token: 角田川 -> Tag: PLACE
[10, 11]
------------------------------------

Title: 「名所江戸百景」  「猿わか町よるの景」
Title Encode: ['[CLS]', '「', '名所', '江戸', '百', '景', '」', '「', '猿', 'わか', '町', 'よる', 'の', '景', '」', '[SEP]']
Tags: [(3, 5, 'PLACE'), (11, 15, 'PLACE')]
Number of Tags: 2
Token: 江戸 -> Tag: PLACE
[3]
Token: 猿わか町 -> Tag: PLACE
[8, 9, 10]
------------------------------------

Title: 「江戸名所図会」  「卅二」「三十三間堂」「曽我五郎時宗」
Title Encode: ['[CLS]', '「', '江戸', '名所', '図', '##会', '」', '「', '[UNK]', '」', '「',

100%|██████████| 97/97 [00:00<00:00, 570.61it/s]


Tags: [(13, 17, 'PLACE'), (19, 21, 'PLACE')]
Number of Tags: 2
Token: 深川八幡 -> Tag: PLACE
[11, 12]
Token: 富士 -> Tag: PLACE
[15]
------------------------------------

Title:  「見立十二支」  「酉」「浅草田甫酉の市」
Title Encode: ['[CLS]', '「', '見', '##立', '十二', '##支', '」', '「', '[UNK]', '」', '「', '浅草', '田', '甫', '[UNK]', '」', '[SEP]']
Tags: [(14, 18, 'PLACE')]
Number of Tags: 1
Token: 浅草田甫 -> Tag: PLACE
[11, 12, 13]
------------------------------------

Title:  「東海道五拾三次之内」  「大尾」「京師」「三条大橋」
Title Encode: ['[CLS]', '「', '東海道', '五', '##拾', '三', '次', '之', '内', '」', '「', '大', '##尾', '」', '「', '京', '##師', '」', '「', '三条', '大橋', '」', '[SEP]']
Tags: [(2, 5, 'PLACE'), (19, 21, 'PLACE'), (23, 27, 'PLACE')]
Number of Tags: 3
Token: 東海道 -> Tag: PLACE
[2]
Token: 京師 -> Tag: PLACE
[15]
Token: 三条大橋 -> Tag: PLACE
[5, 11, 19, 20]
------------------------------------

Title:  「東海道五拾三次之内」  「日本橋」「行烈振出」
Title Encode: ['[CLS]', '「', '東海道', '五', '##拾', '三', '次', '之', '内', '」', '「', '日本橋', '」', '「', '行', '烈', '振', '##出', '」', '[SEP

In [None]:
test.head()

Unnamed: 0,sentence,word_labels
0,「東海道五十三次」 「三十八」「藤川」,"O,O,PLACE,O,O,O,O,O,O,O,O,O,O,O,PLACE,O,O,O"
1,「東都六玉顔ノ内」 「角田川」,"O,O,PLACE,O,O,O,O,O,O,O,PLACE,PLACE,O,O"
2,「名所江戸百景」 「猿わか町よるの景」,"O,O,O,PLACE,O,O,O,O,PLACE,PLACE,PLACE,O,O,O,O,O"
3,「江戸名所図会」 「卅二」「三十三間堂」「曽我五郎時宗」,"O,O,PLACE,O,O,O,O,O,O,O,O,PLACE,PLACE,PLACE,PL..."
4,「江戸名所　百人美女」 「今川はし」,"O,O,PLACE,O,O,O,O,O,O,PLACE,PLACE,O,O"


## Step by step

In [None]:
list_title = []
list_token_title = []
list_encode_title = []
list_tags = []

noTag = 'O'

for entity in tqdm(data_train):
    ### Title ###
    text = entity[0]

    print('\nTitle:',text)

    token_ids = tokenizer.encode(text, add_special_tokens=True)
    tokens = tokenizer.convert_ids_to_tokens(token_ids)
    print('Title Encode:',tokens)

    list_title.append(text)             # eg.'朝食にを焼いて食べまし[MASK]。'
    list_token_title.append(tokens)     # eg.['[CLS]', '朝食', 'に', 'を', '焼い', 'て', '食べ', 'まし', '[MASK]', '。', '[SEP]']
    list_encode_title.append(token_ids) # eg.[2, 25965, 7, 11, 16878, 16, 2949, 3913, 4, 8, 3]

    tags = entity[1]['entities']
    print('Tags:',tags)
    print('Number of Tags:', len(tags))

    list_in = [noTag] * len(tokens)     # eg.['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']

    # each title has multiple tags
    for tag in tags:
        begin = tag[0] 
        end = tag[1] 

        text_token = text[begin:end]
        name_tag = tag[2]

        print(f'Token: {text_token} -> Tag: {name_tag}')

        #########################################################

        # Finding all indexes of a string in the list
        # We want: 
        # either text_token include in token  eg. '朝食' = '朝食'
        # or token include in text_token 祇園大鳥居 = '祇園' '大' '鳥居' 
        
        #!!!προσοχή δεν εχουμε φτιαξει ακομα την περιπτωση 祇園大鳥居 = '祇園' '大' '##鳥', μεχρι στιγμεις θα βαλει 'PLACE' 'PLACE' '0'!!!
        #!!!προσοχή δεν εχουμε φτιαξει ακομα την περιπτωση 栂尾門= '[UNK]', '門前', μεχρι στιγμεις θα βαλει '0' '0'!!!
        
        indices = [i for i, s in enumerate(tokens) if (text_token in s) or (s in text_token)]
        print(indices)

        # add the tag in the correct token
        for ind in indices:
            list_in[ind] = name_tag
        #########################################################

    list_tags.append(list_in)
    print('------------------------------------')

  0%|          | 0/101 [00:00<?, ?it/s]


Title: 「東海道　京都之内」「大内能上覧図」
Title Encode: ['[CLS]', '「', '東海道', '京都', '之', '内', '」', '「', '大内', '能', '上', '##覧', '図', '」', '[SEP]']
Tags: [[1, 4, 'LOC'], [5, 7, 'GPE']]
Number of Tags: 2
Token: 東海道 -> Tag: LOC
[2]
Token: 京都 -> Tag: GPE
[3]
------------------------------------

Title: 「東海道　京都名所之内」「四条河原」
Title Encode: ['[CLS]', '「', '東海道', '京都', '名所', '之', '内', '」', '「', '四条', '河原', '」', '[SEP]']
Tags: [[1, 4, 'LOC'], [5, 7, 'GPE'], [13, 17, 'GPE']]
Number of Tags: 3
Token: 東海道 -> Tag: LOC
[2]
Token: 京都 -> Tag: GPE
[3]
Token: 四条河原 -> Tag: GPE
[9, 10]
------------------------------------

Title: 「東海道名所之内」「御能拝見之図」
Title Encode: ['[CLS]', '「', '東海道', '名所', '之', '内', '」', '「', '御', '能', '拝', '##見', '之', '図', '」', '[SEP]']
Tags: [[1, 4, 'LOC']]
Number of Tags: 1
Token: 東海道 -> Tag: LOC
[2]
------------------------------------

Title: 「東海道」「京都  紫震殿」
Title Encode: ['[CLS]', '「', '東海道', '」', '「', '京都', '紫', '震', '殿', '」', '[SEP]']
Tags: [[1, 4, 'LOC'], [6, 8, 'GPE'], [9, 12, 'GPE']]
Number of Tags

100%|██████████| 101/101 [00:00<00:00, 645.38it/s]

Token: 都 -> Tag: GPE
[4]
Token: 竜安寺 -> Tag: GPE
[8]
------------------------------------

Title: 「諸国名所百景」「京都祇園祭礼」
Title Encode: ['[CLS]', '「', '諸国', '名所', '百', '景', '」', '「', '京都', '祇園', '祭礼', '」', '[SEP]']
Tags: [[9, 11, 'GPE']]
Number of Tags: 1
Token: 京都 -> Tag: GPE
[8]
------------------------------------

Title: 「京洛名所」「二条橋より大文字を望む」
Title Encode: ['[CLS]', '「', '京', '##洛', '名所', '」', '「', '二', '条', '橋', 'より', '大文字', 'を', '望む', '」', '[SEP]']
Tags: [[1, 3, 'GPE'], [7, 10, 'GPE'], [12, 15, 'GPE']]
Number of Tags: 3
Token: 京洛 -> Tag: GPE
[2]
Token: 二条橋 -> Tag: GPE
[7, 8, 9]
Token: 大文字 -> Tag: GPE
[11]
------------------------------------

Title: 「東福寺通天橋」
Title Encode: ['[CLS]', '「', '東', '##福寺', '通', '天', '橋', '」', '[SEP]']
Tags: [[1, 7, 'GPE']]
Number of Tags: 1
Token: 東福寺通天橋 -> Tag: GPE
[2, 4, 5, 6]
------------------------------------

Title: 「祇園春宵」
Title Encode: ['[CLS]', '「', '祇園', '春', '##宵', '」', '[SEP]']
Tags: [[1, 3, 'GPE']]
Number of Tags: 1
Token: 祇園 -> Tag: GPE
[2]
--------




In [None]:
# intialise data of lists.
data = {'title':list_title,
'title_token':list_token_title,
'title_encode':list_encode_title,
'tags':list_tags }

# Create DataFrame
df = pd.DataFrame(data)

In [None]:
df

Unnamed: 0,title,title_token,title_encode,tags
0,「東海道　京都之内」「大内能上覧図」,"[[CLS], 「, 東海道, 京都, 之, 内, 」, 「, 大内, 能, 上, ##覧,...","[2, 36, 7174, 1316, 3376, 186, 38, 36, 10576, ...","[O, O, LOC, GPE, O, O, O, O, O, O, O, O, O, O, O]"
1,「東海道　京都名所之内」「四条河原」,"[[CLS], 「, 東海道, 京都, 名所, 之, 内, 」, 「, 四条, 河原, 」,...","[2, 36, 7174, 1316, 11117, 3376, 186, 38, 36, ...","[O, O, LOC, GPE, O, O, O, O, O, GPE, GPE, O, O]"
2,「東海道名所之内」「御能拝見之図」,"[[CLS], 「, 東海道, 名所, 之, 内, 」, 「, 御, 能, 拝, ##見, ...","[2, 36, 7174, 11117, 3376, 186, 38, 36, 1351, ...","[O, O, LOC, O, O, O, O, O, O, O, O, O, O, O, O..."
3,「東海道」「京都 紫震殿」,"[[CLS], 「, 東海道, 」, 「, 京都, 紫, 震, 殿, 」, [SEP]]","[2, 36, 7174, 38, 36, 1316, 5007, 7457, 3912, ...","[O, O, LOC, O, O, GPE, GPE, GPE, O, O, O]"
4,「東海道之内」「京都参内」,"[[CLS], 「, 東海, 道, ##之, 内, 」, 「, 京都, 参, ##内, 」,...","[2, 36, 3385, 405, 29494, 186, 38, 36, 1316, 2...","[O, O, LOC, LOC, O, O, O, O, GPE, O, O, O, O]"
...,...,...,...,...
96,「江戸廼花名勝會」「五番組」「ま」「赤坂」「赤坂奴凧平　尾上多見蔵」「赤坂御門外」,"[[CLS], 「, 江戸, [UNK], 名勝, 會, 」, 「, 五, 番組, 」, 「...","[2, 36, 1322, 1, 25946, 21034, 38, 36, 989, 48...","[O, O, GPE, O, O, O, O, O, O, O, O, O, O, O, O..."
97,「江戸の華名勝會」「五番組」「ま」「三河臺」「三河臺　氷川神社」「関東小六　市川市蔵」,"[[CLS], 「, 江戸, の, 華, 名勝, 會, 」, 「, 五, 番組, 」, 「,...","[2, 36, 1322, 5, 3043, 25946, 21034, 38, 36, 9...","[O, O, GPE, O, O, O, O, O, O, O, O, O, O, O, O..."
98,「東京名所四十八景　日本はし夕けしき」,"[[CLS], 「, 東京, 名所, 四, 十, 八, ##景, 日本, はし, 夕, け,...","[2, 36, 391, 11117, 755, 714, 1035, 29430, 91,...","[O, O, GPE, O, O, O, O, O, GPE, GPE, O, O, O, ..."
99,「東京名所四十八景」「神田明神社内年の市」,"[[CLS], 「, 東京, 名所, 四, 十, 八, ##景, 」, 「, 神田, 明神,...","[2, 36, 391, 11117, 755, 714, 1035, 29430, 38,...","[O, O, GPE, O, O, O, O, O, O, O, GPE, GPE, O, ..."


In [None]:
# Create DataFrame
data = pd.DataFrame({'sentence':list_title, 'word_labels':list_tags})

#covert column word_labels from list to string
#eg [O, O, PLACE, PLACE, O, O, O, O, O, O, O, O, O, O, O] -> 'O,O,PLACE,PLACE,O,O,O,O,O,O,O,O,O,O,O'
for i in range(len(data)):
    data['word_labels'][i] = ",".join(data['word_labels'][i])

data.head()

Unnamed: 0,sentence,word_labels
0,「東海道　京都之内」「大内能上覧図」,"O,O,LOC,GPE,O,O,O,O,O,O,O,O,O,O,O"
1,「東海道　京都名所之内」「四条河原」,"O,O,LOC,GPE,O,O,O,O,O,GPE,GPE,O,O"
2,「東海道名所之内」「御能拝見之図」,"O,O,LOC,O,O,O,O,O,O,O,O,O,O,O,O,O"
3,「東海道」「京都 紫震殿」,"O,O,LOC,O,O,GPE,GPE,GPE,O,O,O"
4,「東海道之内」「京都参内」,"O,O,LOC,LOC,O,O,O,O,GPE,O,O,O,O"


# Custom Named Entity Recognition with Japanese BERT

https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb#scrollTo=0jDNXrjr-6BW

In [None]:
labels_to_ids = {k: v for v, k in enumerate(['O','PLACE'])}
ids_to_labels = {v: k for v, k in enumerate(['O','PLACE'])}
labels_to_ids

{'O': 0, 'PLACE': 1}

In [None]:
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification, PreTrainedTokenizerFast

MAX_LEN = 60
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 20
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

# tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
# tokenizer = BertTokenizerFast.from_pretrained("cl-tohoku/bert-base-japanese") # BEST!!!!!!!!!!
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") # BEST!!!!!!!!!!
#tokenizer = PreTrainedTokenizerFast.from_pretrained("cl-tohoku/bert-base-japanese")

In [None]:
from torch.utils.data import Dataset, DataLoader

class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentence[index]#.strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                            #  is_pretokenized=True, 
                            #  return_offsets_mapping=True, 
                             padding='max_length', 
                            #  truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        # encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        encoded_labels = np.ones(MAX_LEN, dtype=int) * -100
        
        # # set labels whose first offset position is 0 and the second is not 0
        # i = 0
        # for idx, mapping in enumerate(encoding["offset_mapping"]):
        #   if mapping[0] == 0 and mapping[1] != 0:
        #     # overwrite label
        #     encoded_labels[idx] = labels[i]
        #     i += 1

        # set labels 
        for idx in range(len(labels)):
            # overwrite label
            encoded_labels[idx] = labels[idx]
           
        # # step 4: turn everything into PyTorch tensors
        # item = {key: torch.as_tensor(val) for key, val in encoding.items()}

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}

        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [None]:
# train_size = 0.9
# train_dataset = data.sample(frac=train_size,random_state=200)
# test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
# train_dataset = train_dataset.reset_index(drop=True)

train_dataset = train
test_dataset = test

# print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

TRAIN Dataset: (101, 2)
TEST Dataset: (97, 2)


In [None]:
# train_dataset.head(2)

In [None]:
# training_set[2]

In [None]:
train_dataset.head(1)

Unnamed: 0,sentence,word_labels
0,「東海道　京都之内」「大内能上覧図」,"O,O,PLACE,PLACE,O,O,O,O,O,O,O,O,O,O,O"


In [None]:
training_set

<__main__.dataset at 0x7f5c6573c050>

In [None]:
training_set[0]['attention_mask'] 

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
# training_set[0]['attention_mask'] = training_set[0]['attention_mask'][0]
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([    2,    36,  7174,  1316,  3376,   186,    38,    36, 10576,  1329,
           109, 29643,   903,    38,     3,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'labels': tensor([   0,    0,    1,    1,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100

In [None]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       0
「           0
東海道         1
京都          1
之           0
内           0
」           0
「           0
大内          0
能           0
上           0
##覧         0
図           0
」           0
[SEP]       0
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100


In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [None]:
from transformers import BertForSequenceClassification
model = BertForTokenClassification.from_pretrained('cl-tohoku/bert-base-japanese', num_labels=len(labels_to_ids), return_dict = False)
# model = AutoModel.from_pretrained('cl-tohoku/bert-base-japanese', num_labels=3)
model.to(device)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at 

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [None]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(0.6236, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 60, 2])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
for epoch in tqdm(range(EPOCHS)):
    print(f"\nTraining epoch: {epoch + 1}")
    train(epoch)

  0%|          | 0/20 [00:00<?, ?it/s]


Training epoch: 1
Training loss per 100 training steps: 0.6510897874832153


  5%|▌         | 1/20 [00:02<00:44,  2.36s/it]

Training loss epoch: 0.3973754311983402
Training accuracy epoch: 0.8093693481914138

Training epoch: 2
Training loss per 100 training steps: 0.21852844953536987


 10%|█         | 2/20 [00:04<00:41,  2.29s/it]

Training loss epoch: 0.22956685435313445
Training accuracy epoch: 0.910710119491519

Training epoch: 3
Training loss per 100 training steps: 0.10526134818792343


 15%|█▌        | 3/20 [00:06<00:38,  2.27s/it]

Training loss epoch: 0.1645023741114598
Training accuracy epoch: 0.9387292724859895

Training epoch: 4
Training loss per 100 training steps: 0.12480373680591583


 20%|██        | 4/20 [00:09<00:36,  2.27s/it]

Training loss epoch: 0.11274480826866168
Training accuracy epoch: 0.9677317423821504

Training epoch: 5
Training loss per 100 training steps: 0.08348846435546875


 25%|██▌       | 5/20 [00:11<00:33,  2.26s/it]

Training loss epoch: 0.08012667606369807
Training accuracy epoch: 0.9761897067502405

Training epoch: 6
Training loss per 100 training steps: 0.0324450246989727


 30%|███       | 6/20 [00:13<00:31,  2.27s/it]

Training loss epoch: 0.05639912746846676
Training accuracy epoch: 0.9849073638446225

Training epoch: 7
Training loss per 100 training steps: 0.03307684138417244


 35%|███▌      | 7/20 [00:15<00:29,  2.26s/it]

Training loss epoch: 0.042075559246138886
Training accuracy epoch: 0.9903414260864581

Training epoch: 8
Training loss per 100 training steps: 0.023602725937962532


 40%|████      | 8/20 [00:18<00:27,  2.26s/it]

Training loss epoch: 0.024883017794658933
Training accuracy epoch: 0.993727371756289

Training epoch: 9
Training loss per 100 training steps: 0.01202044915407896


 45%|████▌     | 9/20 [00:20<00:24,  2.27s/it]

Training loss epoch: 0.01853712612333206
Training accuracy epoch: 0.9937540402552874

Training epoch: 10
Training loss per 100 training steps: 0.004840428475290537


 50%|█████     | 10/20 [00:22<00:22,  2.27s/it]

Training loss epoch: 0.011962966554654906
Training accuracy epoch: 0.9976497343796675

Training epoch: 11
Training loss per 100 training steps: 0.00630099605768919


 55%|█████▌    | 11/20 [00:24<00:20,  2.27s/it]

Training loss epoch: 0.017505946134826027
Training accuracy epoch: 0.9956234707573789

Training epoch: 12
Training loss per 100 training steps: 0.029817061498761177


 60%|██████    | 12/20 [00:27<00:18,  2.27s/it]

Training loss epoch: 0.010567358632285435
Training accuracy epoch: 0.9978119482736445

Training epoch: 13
Training loss per 100 training steps: 0.0019991181325167418


 65%|██████▌   | 13/20 [00:29<00:15,  2.27s/it]

Training loss epoch: 0.012239724995407205
Training accuracy epoch: 0.995171510862422

Training epoch: 14
Training loss per 100 training steps: 0.007565930485725403


 70%|███████   | 14/20 [00:31<00:13,  2.27s/it]

Training loss epoch: 0.012082059970662858
Training accuracy epoch: 0.9974522131337742

Training epoch: 15
Training loss per 100 training steps: 0.0014329726109281182


 75%|███████▌  | 15/20 [00:34<00:11,  2.27s/it]

Training loss epoch: 0.004776202790358534
Training accuracy epoch: 0.9995366079703429

Training epoch: 16
Training loss per 100 training steps: 0.0023495927453041077


 80%|████████  | 16/20 [00:36<00:09,  2.27s/it]

Training loss epoch: 0.004884194412555259
Training accuracy epoch: 0.9993368700265252

Training epoch: 17
Training loss per 100 training steps: 0.004198227543383837


 85%|████████▌ | 17/20 [00:38<00:06,  2.28s/it]

Training loss epoch: 0.0034511820413172245
Training accuracy epoch: 0.9988307971520313

Training epoch: 18
Training loss per 100 training steps: 0.007706819102168083


 90%|█████████ | 18/20 [00:40<00:04,  2.28s/it]

Training loss epoch: 0.002266955068290162
Training accuracy epoch: 1.0

Training epoch: 19
Training loss per 100 training steps: 0.0008130311034619808


 95%|█████████▌| 19/20 [00:43<00:02,  2.28s/it]

Training loss epoch: 0.0016730831031544278
Training accuracy epoch: 1.0

Training epoch: 20
Training loss per 100 training steps: 0.001243495848029852


100%|██████████| 20/20 [00:45<00:00,  2.27s/it]

Training loss epoch: 0.0010493647848936515
Training accuracy epoch: 1.0





In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.09698526561260223
Validation Loss: 0.432152474047236
Validation Accuracy: 0.9115213667219534


In [None]:
from sklearn.metrics import classification_report

print(classification_report(labels, predictions))

              precision    recall  f1-score   support

           O       0.95      0.94      0.94      1339
       PLACE       0.76      0.78      0.77       334

    accuracy                           0.91      1673
   macro avg       0.85      0.86      0.86      1673
weighted avg       0.91      0.91      0.91      1673



In [None]:
# predictions

In [None]:
# labels

# Saving the model for future use

Finally, let's save the vocabulary (.txt) file, model weights (.bin) and the model's configuration (.json) to a directory, so that both the tokenizer and model can be re-loaded using the `from_pretrained()` class method.

In [None]:
import os

directory = "./model_merge_reannotated2"

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')

All files saved


# Error Analysis

Install SpaCy

In [None]:
%%capture
!pip install -U spacy
!python -m spacy download ja_core_news_lg
!pip install -U spacy

In [None]:
import spacy; spacy.prefer_gpu()
nlp = spacy.load('ja_core_news_lg')

Function that predict a sentence tags

In [None]:
def predict_sentence(sentence):

    inputs = tokenizer(sentence,
                    padding='max_length',
                    max_length=MAX_LEN,
                        return_tensors="pt")
    # item = {key: torch.as_tensor(val) for key, val in encoding.items()}

    # move to gpu
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)
    # forward pass
    outputs = model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
    flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

    tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
    token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
    wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

    prediction = []
    prediction_all = []
    # set predicted labels 
    for token_pred in range(len(wp_preds)):
        
        if wp_preds[token_pred][0]=='[CLS]' or wp_preds[token_pred][0]=='[SEP]' or wp_preds[token_pred][0]=='[PAD]':
            continue
        elif wp_preds[token_pred][1]=='O':
            prediction_all .append(wp_preds[token_pred])
        else:
            # predictions
            prediction.append(wp_preds[token_pred])
            prediction_all .append(wp_preds[token_pred])
            # print(wp_preds[token_pred][0])

    return prediction, prediction_all 

## Example a sentence

In [None]:
sentence = "「東海道五十三次」  「三十八」「藤川」"
prediction, prediction_all  = predict_sentence(sentence)

In [None]:
print(sentence)
print(prediction)
print('')
print(prediction_all)

「東海道五十三次」  「三十八」「藤川」
[('東海道', 'PLACE'), ('藤', 'PLACE')]

[('「', 'O'), ('東海道', 'PLACE'), ('五', 'O'), ('十', 'O'), ('三', 'O'), ('次', 'O'), ('」', 'O'), ('「', 'O'), ('三', 'O'), ('十', 'O'), ('八', 'O'), ('」', 'O'), ('「', 'O'), ('藤', 'PLACE'), ('##川', 'O'), ('」', 'O')]


Find start and end positions of all occurrences within a string in Python

In [None]:
import re
# example = '「東海道五十三次」  「三十八」「藤川」'
# for match in re.finditer('東海道', example):
#     print(match.start(), match.end())

In [None]:
spans = []
for pred in prediction: # eg pred = ('東海道', 'PLACE')
    #Find start and end positions of all occurrences within a string in Python
    for match in re.finditer(pred[0], sentence): # eg pred[0] = '東海道'
        temp = [match.start(), match.end(),pred[1]] # eg pred[1] = 'PLACE'
        spans.append(temp)

print(spans)   

[[1, 4, 'PLACE'], [17, 18, 'PLACE']]


In [None]:
from spacy import displacy

# nlp = spacy.load('ja_core_news_sm')
# nlp = spacy.blank('ja')
raw_text = "「東海道　京都之内」「大内能上覧図」"
doc = nlp.make_doc(raw_text)
spans = [[1, 4, "PLACE"],[5, 7, "PLACE"]]
ents = []
for span_start, span_end, label in spans:
    ent = doc.char_span(span_start, span_end, label=label)
    if ent is None:
        continue

    ents.append(ent)

doc.ents = ents
displacy.render(doc, style="ent", jupyter=True)

## Predict all senteces

In [None]:
from termcolor import colored

count = 0
for i in range(len(Test_spacy)):

    count+=1

    # ERROR: I do not know why
    if count==28 or count==73 or count==76:
        continue
    
    sentence = Test_spacy[i][0] #eg "「東海道　京都之内」「大内能上覧図」"
    prediction, prediction_all  = predict_sentence(sentence)

    

    print('Title:', count, colored((sentence), 'red' , attrs=['bold']))
    print('Predictions:',prediction)

    doc2 = nlp.make_doc(sentence)

    spans = []
    for pred in prediction: # eg pred = ('東海道', 'PLACE')
        #Find start and end positions of all occurrences within a string in Python
        text = pred[0] # eg pred[0] = '東海道'
        text = text.replace('##\\','「')
        # text = re.sub('##\\','「',pred[0])
        for match in re.finditer(text, sentence): 
            temp = [match.start(), match.end(),pred[1]] # eg pred[1] = 'PLACE'
            spans.append(temp)

    #spacy
    print(spans)
    ents = []
    for span_start, span_end, label in spans:
        ent = doc2.char_span(span_start, span_end, label=label)
        if ent is None:
            continue

        ents.append(ent)

    doc2.ents = ents
    print('Predicted Tags:')
    displacy.render(doc2, style="ent", jupyter=True)

    ############################################################

    doc = nlp.make_doc(sentence)
    spans = Test_spacy[i][1]['entities']

    #spacy
    ents = []
    for span_start, span_end, label in spans:
        ent = doc.char_span(span_start, span_end, label=label)
        if ent is None:
            continue

        ents.append(ent)
    print(spans)
    doc.ents = ents
    print('Actual Tags:')
    displacy.render(doc, style="ent", jupyter=True)

    ############################################################


    # break

Title: 1 [1m[31m「東海道五十三次」  「三十八」「藤川」[0m
Predictions: [('東海道', 'PLACE'), ('藤', 'PLACE')]
[[1, 4, 'PLACE'], [17, 18, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE'), (17, 19, 'PLACE')]
Actual Tags:


Title: 2 [1m[31m「東都六玉顔ノ内」  「角田川」[0m
Predictions: [('東都', 'PLACE'), ('顔', 'PLACE'), ('角田', 'PLACE'), ('川', 'PLACE')]
[[1, 3, 'PLACE'], [5, 6, 'PLACE'], [12, 14, 'PLACE'], [14, 15, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (12, 15, 'PLACE')]
Actual Tags:


Title: 3 [1m[31m「名所江戸百景」  「猿わか町よるの景」[0m
Predictions: [('猿', 'PLACE'), ('わか', 'PLACE'), ('町', 'PLACE'), ('よる', 'PLACE')]
[[11, 12, 'PLACE'], [12, 14, 'PLACE'], [14, 15, 'PLACE'], [15, 17, 'PLACE']]
Predicted Tags:


[(3, 5, 'PLACE'), (11, 15, 'PLACE')]
Actual Tags:


Title: 4 [1m[31m「江戸名所図会」  「卅二」「三十三間堂」「曽我五郎時宗」[0m
Predictions: [('江戸', 'PLACE'), ('三', 'PLACE'), ('十', 'PLACE'), ('三', 'PLACE'), ('間', 'PLACE'), ('堂', 'PLACE'), ('曽', 'PLACE')]
[[1, 3, 'PLACE'], [15, 16, 'PLACE'], [17, 18, 'PLACE'], [16, 17, 'PLACE'], [15, 16, 'PLACE'], [17, 18, 'PLACE'], [18, 19, 'PLACE'], [19, 20, 'PLACE'], [22, 23, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (15, 20, 'PLACE')]
Actual Tags:


Title: 5 [1m[31m「江戸名所　百人美女」  「今川はし」[0m
Predictions: [('江戸', 'PLACE'), ('今川', 'PLACE'), ('はし', 'PLACE')]
[[1, 3, 'PLACE'], [14, 16, 'PLACE'], [16, 18, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (14, 18, 'PLACE')]
Actual Tags:


Title: 6 [1m[31m 「東海道五十三次の内」  「戸塚藤沢間」「吉田橋」「松若」[0m
Predictions: [('東海道', 'PLACE'), ('戸塚', 'PLACE'), ('藤沢', 'PLACE'), ('間', 'PLACE'), ('吉田', 'PLACE'), ('松', 'PLACE')]
[[2, 5, 'PLACE'], [15, 17, 'PLACE'], [17, 19, 'PLACE'], [19, 20, 'PLACE'], [22, 24, 'PLACE'], [27, 28, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (15, 19, 'PLACE'), (22, 25, 'PLACE'), (27, 29, 'PLACE')]
Actual Tags:


Title: 7 [1m[31m 「江戸名所百人美女」  「今戸」[0m
Predictions: [('江戸', 'PLACE'), ('今', 'PLACE')]
[[2, 4, 'PLACE'], [14, 15, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (14, 16, 'PLACE')]
Actual Tags:


Title: 8 [1m[31m「東都高名会席尽」  「金子」「助六」[0m
Predictions: [('東都', 'PLACE'), ('金子', 'PLACE'), ('助', 'PLACE')]
[[1, 3, 'PLACE'], [12, 14, 'PLACE'], [16, 17, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE')]
Actual Tags:


Title: 9 [1m[31m 「木曽六十九駅」  「草津」「野路玉川」「清玄尼」[0m
Predictions: [('木曽', 'PLACE'), ('草津', 'PLACE'), ('野', 'PLACE'), ('玉川', 'PLACE'), ('清', 'PLACE'), ('尼', 'PLACE')]
[[2, 4, 'PLACE'], [12, 14, 'PLACE'], [16, 17, 'PLACE'], [18, 20, 'PLACE'], [22, 23, 'PLACE'], [24, 25, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (12, 14, 'PLACE'), (16, 20, 'PLACE')]
Actual Tags:


Title: 10 [1m[31m 「東海道」 「程ケ谷戸塚間」「権太坂」「いがみ」[0m
Predictions: [('東海道', 'PLACE'), ('程', 'PLACE'), ('戸塚', 'PLACE'), ('権', 'PLACE'), ('い', 'PLACE'), ('##がみ', 'PLACE')]
[[2, 5, 'PLACE'], [8, 9, 'PLACE'], [11, 13, 'PLACE'], [16, 17, 'PLACE'], [21, 22, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (8, 13, 'PLACE'), (16, 19, 'PLACE')]
Actual Tags:


Title: 11 [1m[31m 「両国夕景一ツ目千金」[0m
Predictions: [('両国', 'PLACE')]
[[2, 4, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (9, 11, 'PLACE')]
Actual Tags:


Title: 12 [1m[31m「見立八景之内」  「清水寺の晩鐘」「清玄阿闍梨」「入間の息女桜姫」[0m
Predictions: [('見', 'PLACE'), ('清水', 'PLACE'), ('清', 'PLACE'), ('入間', 'PLACE')]
[[1, 2, 'PLACE'], [11, 13, 'PLACE'], [11, 12, 'PLACE'], [19, 20, 'PLACE'], [26, 28, 'PLACE']]
Predicted Tags:


[(11, 14, 'PLACE')]
Actual Tags:


Title: 13 [1m[31m 「江戸自慢三十六興」  「落合　ほたる」[0m
Predictions: [('江戸', 'PLACE'), ('落合', 'PLACE'), ('ほ', 'PLACE')]
[[2, 4, 'PLACE'], [14, 16, 'PLACE'], [17, 18, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (14, 16, 'PLACE')]
Actual Tags:


Title: 14 [1m[31m 「曽我八景自筆鏡」  「十郎祐成」「曽我中村」[0m
Predictions: [('曽', 'PLACE'), ('十郎', 'PLACE'), ('曽', 'PLACE'), ('中村', 'PLACE')]
[[2, 3, 'PLACE'], [19, 20, 'PLACE'], [13, 15, 'PLACE'], [2, 3, 'PLACE'], [19, 20, 'PLACE'], [21, 23, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE')]
Actual Tags:


Title: 15 [1m[31m 「東海道五十三次之内」  「御油」「其二」「山本勘助母」[0m
Predictions: [('東海道', 'PLACE'), ('御', 'PLACE'), ('山本', 'PLACE'), ('勘', 'PLACE')]
[[2, 5, 'PLACE'], [15, 16, 'PLACE'], [23, 25, 'PLACE'], [25, 26, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE')]
Actual Tags:


Title: 16 [1m[31m「東海道」「大津三井寺」[0m
Predictions: [('東海道', 'PLACE'), ('大津', 'PLACE'), ('三井', 'PLACE')]
[[1, 4, 'PLACE'], [6, 8, 'PLACE'], [8, 10, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE'), (6, 11, 'PLACE')]
Actual Tags:


Title: 17 [1m[31m「王城加茂社風景」[0m
Predictions: [('王', 'PLACE'), ('加茂', 'PLACE'), ('社', 'PLACE')]
[[1, 2, 'PLACE'], [3, 5, 'PLACE'], [5, 6, 'PLACE']]
Predicted Tags:


[(3, 6, 'PLACE')]
Actual Tags:


Title: 18 [1m[31m「東海道名所之内」  「深草乃里」「少将つか」「せう／＼さくら」「元政寺」[0m
Predictions: [('東海道', 'PLACE'), ('深', 'PLACE'), ('少将', 'PLACE'), ('つか', 'PLACE'), ('せ', 'PLACE'), ('##\\', 'PLACE'), ('さくら', 'PLACE'), ('元', 'PLACE'), ('政', 'PLACE'), ('寺', 'PLACE')]
[[1, 4, 'PLACE'], [12, 13, 'PLACE'], [18, 20, 'PLACE'], [20, 22, 'PLACE'], [24, 25, 'PLACE'], [0, 1, 'PLACE'], [11, 12, 'PLACE'], [17, 18, 'PLACE'], [23, 24, 'PLACE'], [32, 33, 'PLACE'], [28, 31, 'PLACE'], [33, 34, 'PLACE'], [34, 35, 'PLACE'], [35, 36, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE'), (12, 16, 'PLACE'), (33, 36, 'PLACE')]
Actual Tags:


Title: 19 [1m[31m「東海道名所之内」  「淀川」[0m
Predictions: [('東海道', 'PLACE'), ('淀川', 'PLACE')]
[[1, 4, 'PLACE'], [12, 14, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE'), (12, 14, 'PLACE')]
Actual Tags:


Title: 20 [1m[31m「東京三芝居町繁栄之図」[0m
Predictions: [('東京', 'PLACE'), ('芝居', 'PLACE')]
[[1, 3, 'PLACE'], [4, 6, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (4, 6, 'PLACE')]
Actual Tags:


Title: 21 [1m[31m「津島牛頭天王」「舟発場」「佐屋川」「本陣」[0m
Predictions: [('津島', 'PLACE'), ('牛', 'PLACE'), ('舟', 'PLACE'), ('佐', 'PLACE'), ('川', 'PLACE'), ('本陣', 'PLACE')]
[[1, 3, 'PLACE'], [3, 4, 'PLACE'], [9, 10, 'PLACE'], [14, 15, 'PLACE'], [16, 17, 'PLACE'], [19, 21, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (14, 17, 'PLACE'), (19, 21, 'PLACE')]
Actual Tags:


Title: 22 [1m[31m 「東海道」  「浜松」[0m
Predictions: [('東海道', 'PLACE'), ('浜松', 'PLACE')]
[[2, 5, 'PLACE'], [9, 11, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (9, 11, 'PLACE')]
Actual Tags:


Title: 23 [1m[31m 「東海道之内」  「岡部」[0m
Predictions: [('東海', 'PLACE'), ('道', 'PLACE'), ('岡部', 'PLACE')]
[[2, 4, 'PLACE'], [4, 5, 'PLACE'], [11, 13, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (11, 13, 'PLACE')]
Actual Tags:


Title: 24 [1m[31m 「東海道名所之内」  「豊川」[0m
Predictions: [('東海道', 'PLACE'), ('豊川', 'PLACE')]
[[2, 5, 'PLACE'], [13, 15, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (13, 15, 'PLACE')]
Actual Tags:


Title: 25 [1m[31m［川口善光寺開帳参詣之図］[0m
Predictions: [('川口', 'PLACE'), ('善', 'PLACE')]
[[1, 3, 'PLACE'], [3, 4, 'PLACE']]
Predicted Tags:


[(1, 6, 'PLACE')]
Actual Tags:


Title: 26 [1m[31m 「東海道」  「島田」[0m
Predictions: [('東海道', 'PLACE'), ('島田', 'PLACE')]
[[2, 5, 'PLACE'], [9, 11, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (9, 11, 'PLACE')]
Actual Tags:


Title: 27 [1m[31m「浅草金竜山之図」[0m
Predictions: [('浅草', 'PLACE'), ('金', 'PLACE')]
[[1, 3, 'PLACE'], [3, 4, 'PLACE']]
Predicted Tags:


[(1, 6, 'PLACE')]
Actual Tags:


Title: 29 [1m[31m 「しん板車づくし」  「横浜鉄道図」[0m
Predictions: [('しん', 'PLACE'), ('横浜', 'PLACE')]
[[2, 4, 'PLACE'], [13, 15, 'PLACE']]
Predicted Tags:


[(13, 15, 'PLACE')]
Actual Tags:


Title: 30 [1m[31m 「東都名所高輪行粧之図」[0m
Predictions: [('東都', 'PLACE'), ('高', 'PLACE')]
[[2, 4, 'PLACE'], [6, 7, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (6, 8, 'PLACE')]
Actual Tags:


Title: 31 [1m[31m 「東都名所」  「隅田川花盛」[0m
Predictions: [('東都', 'PLACE'), ('隅田川', 'PLACE')]
[[2, 4, 'PLACE'], [10, 13, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (10, 13, 'PLACE')]
Actual Tags:


Title: 32 [1m[31m 「江戸の花名勝会」  「り」「十番組」「一ツ家の賤の女　尾上菊次郎」「浅茅が原衣掛松」[0m
Predictions: [('江戸', 'PLACE'), ('一', 'PLACE'), ('の', 'PLACE'), ('浅', 'PLACE'), ('##茅', 'PLACE')]
[[2, 4, 'PLACE'], [21, 22, 'PLACE'], [4, 5, 'PLACE'], [24, 25, 'PLACE'], [26, 27, 'PLACE'], [36, 37, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (36, 40, 'PLACE')]
Actual Tags:


Title: 33 [1m[31m「江戸の花名勝会」  「ち」「十番組」「一ツ家の姥　市川海老蔵」「猿若芝居町」[0m
Predictions: [('江戸', 'PLACE'), ('一', 'PLACE'), ('猿', 'PLACE'), ('芝居', 'PLACE')]
[[1, 3, 'PLACE'], [20, 21, 'PLACE'], [33, 34, 'PLACE'], [35, 37, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (33, 38, 'PLACE')]
Actual Tags:


Title: 34 [1m[31m 「東京三十六景」  「十五」「両国」「十六」「本所一ツ目之橋」[0m
Predictions: [('東京', 'PLACE'), ('両国', 'PLACE'), ('本所', 'PLACE'), ('一', 'PLACE'), ('目', 'PLACE'), ('之', 'PLACE'), ('橋', 'PLACE')]
[[2, 4, 'PLACE'], [16, 18, 'PLACE'], [24, 26, 'PLACE'], [26, 27, 'PLACE'], [28, 29, 'PLACE'], [29, 30, 'PLACE'], [30, 31, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (16, 18, 'PLACE'), (26, 31, 'PLACE')]
Actual Tags:


Title: 35 [1m[31m 「東京十二月之内」  「二月」「亀井戸天神」「亀井戸梅林」[0m
Predictions: [('東京', 'PLACE'), ('亀井', 'PLACE'), ('天神', 'PLACE'), ('亀井', 'PLACE')]
[[2, 4, 'PLACE'], [17, 19, 'PLACE'], [24, 26, 'PLACE'], [20, 22, 'PLACE'], [17, 19, 'PLACE'], [24, 26, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (17, 22, 'PLACE'), (24, 29, 'PLACE')]
Actual Tags:


Title: 36 [1m[31m 「東京十二月之内」  「四月」「品川沖之景」[0m
Predictions: [('東京', 'PLACE'), ('品川', 'PLACE')]
[[2, 4, 'PLACE'], [17, 19, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (17, 19, 'PLACE')]
Actual Tags:


Title: 37 [1m[31m 「東京十二月之内」  「六月」「愛宕之景」「神田神社」[0m
Predictions: [('東京', 'PLACE'), ('愛宕', 'PLACE'), ('神田', 'PLACE'), ('神社', 'PLACE')]
[[2, 4, 'PLACE'], [17, 19, 'PLACE'], [23, 25, 'PLACE'], [25, 27, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (17, 19, 'PLACE'), (23, 27, 'PLACE')]
Actual Tags:


Title: 38 [1m[31m「東京開化名所」  「三代徳川家光公」「三河島之景」[0m
Predictions: [('東京', 'PLACE'), ('徳川', 'PLACE'), ('三河', 'PLACE')]
[[1, 3, 'PLACE'], [13, 15, 'PLACE'], [20, 22, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (20, 23, 'PLACE')]
Actual Tags:


Title: 39 [1m[31m 「東京名所競」  「上野東照宮」[0m
Predictions: [('東京', 'PLACE'), ('上野', 'PLACE'), ('東', 'PLACE')]
[[2, 4, 'PLACE'], [11, 13, 'PLACE'], [2, 3, 'PLACE'], [13, 14, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (11, 16, 'PLACE')]
Actual Tags:


Title: 40 [1m[31m「東京銘勝会」  「不忍の競馬」[0m
Predictions: [('東京', 'PLACE'), ('銘', 'PLACE'), ('不', 'PLACE')]
[[1, 3, 'PLACE'], [3, 4, 'PLACE'], [10, 11, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (10, 15, 'PLACE')]
Actual Tags:


Title: 41 [1m[31m 「東海道」  「土山」「鈴ヶ山坂ノ下」[0m
Predictions: [('東海道', 'PLACE'), ('土', 'PLACE'), ('鈴', 'PLACE'), ('坂', 'PLACE')]
[[2, 5, 'PLACE'], [9, 10, 'PLACE'], [13, 14, 'PLACE'], [16, 17, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (9, 11, 'PLACE'), (13, 19, 'PLACE')]
Actual Tags:


Title: 42 [1m[31m「堀切花菖蒲」[0m
Predictions: [('堀', 'PLACE')]
[[1, 2, 'PLACE']]
Predicted Tags:




[(1, 6, 'PLACE')]
Actual Tags:


Title: 43 [1m[31m 「両国花火之図」[0m
Predictions: [('両国', 'PLACE')]
[[2, 4, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE')]
Actual Tags:


Title: 44 [1m[31m 「亀戸梅屋敷」[0m
Predictions: [('亀', 'PLACE')]
[[2, 3, 'PLACE']]
Predicted Tags:


[(1, 6, 'PLACE')]
Actual Tags:


Title: 45 [1m[31m「東京十二月之内」  「一月」「宮城之春」「九段坂」[0m
Predictions: [('東京', 'PLACE'), ('宮城', 'PLACE'), ('九', 'PLACE'), ('坂', 'PLACE')]
[[1, 3, 'PLACE'], [16, 18, 'PLACE'], [22, 23, 'PLACE'], [24, 25, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (16, 20, 'PLACE'), (24, 27, 'PLACE')]
Actual Tags:


Title: 46 [1m[31m 「見立十二支」  「丑」「向島」「牛島神社」[0m
Predictions: [('見', 'PLACE'), ('向島', 'PLACE'), ('牛', 'PLACE'), ('##島', 'PLACE')]
[[2, 3, 'PLACE'], [14, 16, 'PLACE'], [18, 19, 'PLACE']]
Predicted Tags:


[(14, 16, 'PLACE'), (18, 22, 'PLACE')]
Actual Tags:


Title: 47 [1m[31m「見立十二支」  「辰」「深川八幡」「富士」[0m
Predictions: [('見', 'PLACE'), ('辰', 'PLACE'), ('深川', 'PLACE'), ('八幡', 'PLACE'), ('富士', 'PLACE')]
[[1, 2, 'PLACE'], [10, 11, 'PLACE'], [13, 15, 'PLACE'], [15, 17, 'PLACE'], [19, 21, 'PLACE']]
Predicted Tags:


[(13, 17, 'PLACE'), (19, 21, 'PLACE')]
Actual Tags:


Title: 48 [1m[31m 「見立十二支」  「酉」「浅草田甫酉の市」[0m
Predictions: [('見', 'PLACE'), ('浅草', 'PLACE'), ('田', 'PLACE')]
[[2, 3, 'PLACE'], [14, 16, 'PLACE'], [16, 17, 'PLACE']]
Predicted Tags:


[(14, 18, 'PLACE')]
Actual Tags:


Title: 49 [1m[31m 「東海道五拾三次之内」  「大尾」「京師」「三条大橋」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('大', 'PLACE'), ('京', 'PLACE'), ('三条', 'PLACE'), ('大橋', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 16, 'PLACE'], [25, 26, 'PLACE'], [19, 20, 'PLACE'], [23, 25, 'PLACE'], [25, 27, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (19, 21, 'PLACE'), (23, 27, 'PLACE')]
Actual Tags:


Title: 50 [1m[31m 「東海道五拾三次之内」  「日本橋」「行烈振出」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('日本橋', 'PLACE'), ('行', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 18, 'PLACE'], [20, 21, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 18, 'PLACE')]
Actual Tags:


Title: 51 [1m[31m 「東海道五拾三次之内」  「四日市」「三重川」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('四日市', 'PLACE'), ('三重', 'PLACE'), ('川', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 18, 'PLACE'], [20, 22, 'PLACE'], [22, 23, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 18, 'PLACE'), (20, 23, 'PLACE')]
Actual Tags:


Title: 52 [1m[31m 「東海道五拾三次之内」  「大津」「走井茶屋」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('大津', 'PLACE'), ('走', 'PLACE'), ('茶屋', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 17, 'PLACE'], [19, 20, 'PLACE'], [21, 23, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE'), (19, 21, 'PLACE')]
Actual Tags:


Title: 53 [1m[31m 「東海道五拾三次之内」  「平塚」「縄手道」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('平塚', 'PLACE'), ('縄', 'PLACE'), ('道', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 17, 'PLACE'], [19, 20, 'PLACE'], [4, 5, 'PLACE'], [21, 22, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE'), (19, 22, 'PLACE')]
Actual Tags:


Title: 54 [1m[31m 「東海道五拾三次之内」  「土山」「春之雨」[0m
Predictions: [('東海道', 'PLACE'), ('土', 'PLACE'), ('春', 'PLACE')]
[[2, 5, 'PLACE'], [15, 16, 'PLACE'], [19, 20, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE')]
Actual Tags:


Title: 55 [1m[31m 「東海道五拾三次之内」  「日坂」「佐夜ノ中山」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('日', 'PLACE'), ('坂', 'PLACE'), ('佐', 'PLACE'), ('中山', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 16, 'PLACE'], [16, 17, 'PLACE'], [19, 20, 'PLACE'], [22, 24, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE'), (19, 24, 'PLACE')]
Actual Tags:


Title: 56 [1m[31m「東海道五拾三次之内」  「庄野」「白雨」[0m
Predictions: [('東海道', 'PLACE'), ('庄', 'PLACE'), ('白', 'PLACE')]
[[1, 4, 'PLACE'], [14, 15, 'PLACE'], [18, 19, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE')]
Actual Tags:


Title: 57 [1m[31m 「江戸名所」  「両国花火」[0m
Predictions: [('江戸', 'PLACE'), ('両国', 'PLACE')]
[[2, 4, 'PLACE'], [10, 12, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (10, 12, 'PLACE')]
Actual Tags:


Title: 58 [1m[31m 「江戸名所」  「芝増上寺前の景」[0m
Predictions: [('江戸', 'PLACE'), ('芝', 'PLACE'), ('増', 'PLACE')]
[[2, 4, 'PLACE'], [10, 11, 'PLACE'], [11, 12, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (11, 14, 'PLACE')]
Actual Tags:


Title: 59 [1m[31m 「東都名所」  「新吉原五丁町弥生花盛全図」[0m
Predictions: [('東都', 'PLACE'), ('新', 'PLACE')]
[[2, 4, 'PLACE'], [10, 11, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (10, 16, 'PLACE')]
Actual Tags:


Title: 60 [1m[31m 「東海道五拾三次之内」  「袋井」「出茶屋ノ図」[0m
Predictions: [('東海道', 'PLACE'), ('五', 'PLACE'), ('袋', 'PLACE'), ('出', 'PLACE')]
[[2, 5, 'PLACE'], [5, 6, 'PLACE'], [15, 16, 'PLACE'], [19, 20, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 17, 'PLACE'), (19, 22, 'PLACE')]
Actual Tags:


Title: 61 [1m[31m［江都名所浅草観音の図］[0m
Predictions: [('江', 'PLACE'), ('都', 'PLACE'), ('浅草', 'PLACE')]
[[1, 2, 'PLACE'], [2, 3, 'PLACE'], [5, 7, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (5, 9, 'PLACE')]
Actual Tags:


Title: 62 [1m[31m「東京市中馬車往来之図」[0m
Predictions: [('東京', 'PLACE')]
[[1, 3, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE')]
Actual Tags:


Title: 63 [1m[31m［東京海運橋兜町為換座五階造リ図］[0m
Predictions: [('東京', 'PLACE'), ('海運', 'PLACE'), ('橋', 'PLACE'), ('兜', 'PLACE'), ('為', 'PLACE'), ('換', 'PLACE')]
[[1, 3, 'PLACE'], [3, 5, 'PLACE'], [5, 6, 'PLACE'], [6, 7, 'PLACE'], [8, 9, 'PLACE'], [9, 10, 'PLACE']]
Predicted Tags:


[(1, 8, 'PLACE')]
Actual Tags:


Title: 64 [1m[31m［久松町劇場久松座繁栄図］[0m
Predictions: [('久', 'PLACE'), ('町', 'PLACE'), ('久', 'PLACE'), ('座', 'PLACE')]
[[1, 2, 'PLACE'], [6, 7, 'PLACE'], [3, 4, 'PLACE'], [1, 2, 'PLACE'], [6, 7, 'PLACE'], [8, 9, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE')]
Actual Tags:


Title: 65 [1m[31m「東京名所」  「スジカイ　万代橋」[0m
Predictions: [('東京', 'PLACE'), ('ス', 'PLACE'), ('万', 'PLACE'), ('代', 'PLACE'), ('橋', 'PLACE')]
[[1, 3, 'PLACE'], [9, 10, 'PLACE'], [14, 15, 'PLACE'], [15, 16, 'PLACE'], [16, 17, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (14, 17, 'PLACE')]
Actual Tags:


Title: 66 [1m[31m「東京名所之内」  「高輪海岸鉄道の図」[0m
Predictions: [('東京', 'PLACE'), ('高', 'PLACE'), ('海岸', 'PLACE')]
[[1, 3, 'PLACE'], [11, 12, 'PLACE'], [13, 15, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 15, 'PLACE')]
Actual Tags:


Title: 67 [1m[31m 「東海道五拾三次之内」  「原」「朝之富士」[0m
Predictions: [('東海道', 'PLACE'), ('原', 'PLACE'), ('朝', 'PLACE'), ('富士', 'PLACE')]
[[2, 5, 'PLACE'], [15, 16, 'PLACE'], [18, 19, 'PLACE'], [20, 22, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE'), (15, 16, 'PLACE'), (20, 22, 'PLACE')]
Actual Tags:


Title: 68 [1m[31m「横浜名所図会」  「野毛山下蒸気車」[0m
Predictions: [('横浜', 'PLACE'), ('野', 'PLACE'), ('山下', 'PLACE'), ('蒸気', 'PLACE')]
[[1, 3, 'PLACE'], [11, 12, 'PLACE'], [13, 15, 'PLACE'], [15, 17, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 14, 'PLACE')]
Actual Tags:


Title: 69 [1m[31m「東京名所之内」  「上野公園清水堂」[0m
Predictions: [('東京', 'PLACE'), ('上野', 'PLACE'), ('清水', 'PLACE')]
[[1, 3, 'PLACE'], [11, 13, 'PLACE'], [15, 17, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 18, 'PLACE')]
Actual Tags:


Title: 70 [1m[31m「東海道五拾三次之内」  「見附」「天竜川図」[0m
Predictions: [('東海道', 'PLACE'), ('見', 'PLACE'), ('天竜', 'PLACE')]
[[1, 4, 'PLACE'], [14, 15, 'PLACE'], [18, 20, 'PLACE']]
Predicted Tags:


[(1, 4, 'PLACE'), (14, 16, 'PLACE'), (18, 22, 'PLACE')]
Actual Tags:


Title: 71 [1m[31m「東京開化三十六景」  「柳橋より浅草橋」[0m
Predictions: [('東京', 'PLACE'), ('柳', 'PLACE'), ('浅草', 'PLACE')]
[[1, 3, 'PLACE'], [13, 14, 'PLACE'], [17, 19, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (13, 15, 'PLACE'), (17, 20, 'PLACE')]
Actual Tags:


Title: 72 [1m[31m「東京名所之内」  「浅草金竜山」[0m
Predictions: [('東京', 'PLACE'), ('浅草', 'PLACE'), ('金', 'PLACE')]
[[1, 3, 'PLACE'], [11, 13, 'PLACE'], [13, 14, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 16, 'PLACE')]
Actual Tags:


Title: 74 [1m[31m「大和名所」  「大仏殿」[0m
Predictions: [('大和', 'PLACE'), ('大仏', 'PLACE')]
[[1, 3, 'PLACE'], [9, 11, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (10, 13, 'PLACE')]
Actual Tags:


Title: 75 [1m[31m「大和名所」  「春日神社」「若草山」[0m
Predictions: [('大和', 'PLACE'), ('春日', 'PLACE'), ('若', 'PLACE')]
[[1, 3, 'PLACE'], [9, 11, 'PLACE'], [15, 16, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (9, 13, 'PLACE'), (15, 18, 'PLACE')]
Actual Tags:


Title: 77 [1m[31m 「神田御社眺望」[0m
Predictions: [('神田', 'PLACE'), ('御', 'PLACE')]
[[2, 4, 'PLACE'], [4, 5, 'PLACE']]
Predicted Tags:


[(2, 6, 'PLACE')]
Actual Tags:


Title: 78 [1m[31m 「大阪名所」  「桜乃宮より造幣局を望む」「天神橋之図」「造幣局」[0m
Predictions: [('大阪', 'PLACE'), ('桜', 'PLACE'), ('造', 'PLACE'), ('天神', 'PLACE'), ('橋', 'PLACE'), ('造', 'PLACE')]
[[2, 4, 'PLACE'], [10, 11, 'PLACE'], [15, 16, 'PLACE'], [30, 31, 'PLACE'], [23, 25, 'PLACE'], [25, 26, 'PLACE'], [15, 16, 'PLACE'], [30, 31, 'PLACE']]
Predicted Tags:


[(2, 4, 'PLACE'), (10, 13, 'PLACE'), (23, 26, 'PLACE'), (30, 33, 'PLACE')]
Actual Tags:


Title: 79 [1m[31m 「東海道之内」  「関」[0m
Predictions: [('東海', 'PLACE'), ('道', 'PLACE'), ('関', 'PLACE')]
[[2, 4, 'PLACE'], [4, 5, 'PLACE'], [11, 12, 'PLACE']]
Predicted Tags:


[(2, 5, 'PLACE')]
Actual Tags:


Title: 80 [1m[31m「観音霊験記」  「秩父順礼廿九番」「笹の戸　見目山　長泉院」[0m
Predictions: [('観音', 'PLACE'), ('秩父', 'PLACE'), ('笹', 'PLACE'), ('見', 'PLACE'), ('長', 'PLACE')]
[[1, 3, 'PLACE'], [10, 12, 'PLACE'], [19, 20, 'PLACE'], [23, 24, 'PLACE'], [27, 28, 'PLACE']]
Predicted Tags:


[(19, 30, 'PLACE')]
Actual Tags:


Title: 81 [1m[31m［東京三井組ハウス］[0m
Predictions: [('東京', 'PLACE'), ('三井', 'PLACE'), ('組', 'PLACE')]
[[1, 3, 'PLACE'], [3, 5, 'PLACE'], [5, 6, 'PLACE']]
Predicted Tags:


[(1, 9, 'PLACE')]
Actual Tags:


Title: 82 [1m[31m「東京名所　浅草観音之図」[0m
Predictions: [('東京', 'PLACE'), ('浅草', 'PLACE')]
[[1, 3, 'PLACE'], [6, 8, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (6, 10, 'PLACE')]
Actual Tags:


Title: 83 [1m[31m 「名所江戸百景」  「鎧の渡し　小網町」[0m
Predictions: [('鎧', 'PLACE'), ('の', 'PLACE'), ('渡し', 'PLACE'), ('小', 'PLACE'), ('町', 'PLACE')]
[[12, 13, 'PLACE'], [13, 14, 'PLACE'], [14, 16, 'PLACE'], [17, 18, 'PLACE'], [19, 20, 'PLACE']]
Predicted Tags:


[(4, 6, 'PLACE'), (16, 20, 'PLACE')]
Actual Tags:


Title: 84 [1m[31m「諸国滝廻リ」  「木曽海道　小野ノ瀑布」[0m
Predictions: [('木曽', 'PLACE')]
[[10, 12, 'PLACE']]
Predicted Tags:


[(10, 14, 'PLACE')]
Actual Tags:


Title: 85 [1m[31m「五十三次名所図会」「丗二」  「あら井　渡舟着岸御関所」[0m
Predictions: [('あら', 'PLACE'), ('井', 'PLACE'), ('渡', 'PLACE'), ('御', 'PLACE')]
[[17, 19, 'PLACE'], [19, 20, 'PLACE'], [21, 22, 'PLACE'], [25, 26, 'PLACE']]
Predicted Tags:


[(17, 20, 'PLACE')]
Actual Tags:


Title: 86 [1m[31m「五十三次名所図会　四十」「池鯉鮒　八ツ橋むら　杜若の古せき」[0m
Predictions: [('池', 'PLACE'), ('八', 'PLACE'), ('杜', 'PLACE')]
[[14, 15, 'PLACE'], [18, 19, 'PLACE'], [24, 25, 'PLACE']]
Predicted Tags:


[(14, 23, 'PLACE')]
Actual Tags:


Title: 87 [1m[31m「五十三次名所図会」「丗八」  「藤川　山中の里別名宮路山」[0m
Predictions: [('藤', 'PLACE'), ('宮', 'PLACE')]
[[17, 18, 'PLACE'], [26, 27, 'PLACE']]
Predicted Tags:


[(17, 19, 'PLACE'), (20, 24, 'PLACE'), (26, 29, 'PLACE')]
Actual Tags:


Title: 88 [1m[31m「川崎」  「神奈川へ二リ半」[0m
Predictions: [('川崎', 'PLACE'), ('神奈川', 'PLACE'), ('二', 'PLACE')]
[[1, 3, 'PLACE'], [7, 10, 'PLACE'], [11, 12, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (7, 10, 'PLACE')]
Actual Tags:


Title: 89 [1m[31m「江都名所」  「洲崎しほ干狩」[0m
Predictions: [('江', 'PLACE'), ('洲', 'PLACE'), ('し', 'PLACE')]
[[1, 2, 'PLACE'], [9, 10, 'PLACE'], [11, 12, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (9, 11, 'PLACE')]
Actual Tags:


Title: 90 [1m[31m「諸国名橋奇覧」  「摂州安治川口天保山」[0m
Predictions: [('諸', 'PLACE'), ('国名', 'PLACE'), ('橋', 'PLACE'), ('摂', 'PLACE'), ('安', 'PLACE'), ('川口', 'PLACE'), ('天保', 'PLACE')]
[[1, 2, 'PLACE'], [2, 4, 'PLACE'], [4, 5, 'PLACE'], [11, 12, 'PLACE'], [13, 14, 'PLACE'], [15, 17, 'PLACE'], [17, 19, 'PLACE']]
Predicted Tags:


[(11, 20, 'PLACE')]
Actual Tags:


Title: 91 [1m[31m「東都名所」  「永代橋深川新地」[0m
Predictions: [('東都', 'PLACE'), ('永', 'PLACE'), ('##代', 'PLACE'), ('橋', 'PLACE'), ('深川', 'PLACE'), ('新', 'PLACE')]
[[1, 3, 'PLACE'], [9, 10, 'PLACE'], [11, 12, 'PLACE'], [12, 14, 'PLACE'], [14, 15, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (9, 16, 'PLACE')]
Actual Tags:


Title: 92 [1m[31m「五十三次名所図会」「一」  「日本橋　東雲の景[0m
Predictions: [('日本橋', 'PLACE'), ('東', 'PLACE')]
[[16, 19, 'PLACE'], [20, 21, 'PLACE']]
Predicted Tags:


[(17, 20, 'PLACE')]
Actual Tags:


Title: 93 [1m[31m「五十三次名所図会」「六」  「戸塚山道より不二眺望」[0m
Predictions: [('戸塚', 'PLACE'), ('山道', 'PLACE')]
[[16, 18, 'PLACE'], [18, 20, 'PLACE']]
Predicted Tags:


[(16, 20, 'PLACE'), (22, 24, 'PLACE')]
Actual Tags:


Title: 94 [1m[31m「富士三十六景」  「東都佃沖」[0m
Predictions: [('富士', 'PLACE'), ('東都', 'PLACE')]
[[1, 3, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 14, 'PLACE')]
Actual Tags:


Title: 95 [1m[31m「江戸名所之内」  「真乳山」[0m
Predictions: [('江戸', 'PLACE'), ('真', 'PLACE'), ('乳', 'PLACE'), ('山', 'PLACE')]
[[1, 3, 'PLACE'], [11, 12, 'PLACE'], [12, 13, 'PLACE'], [13, 14, 'PLACE']]
Predicted Tags:


[(1, 3, 'PLACE'), (11, 14, 'PLACE')]
Actual Tags:


Title: 96 [1m[31m「名所江戸百景」  「千住の大はし」[0m
Predictions: [('千', 'PLACE'), ('の', 'PLACE'), ('大', 'PLACE'), ('はし', 'PLACE')]
[[11, 12, 'PLACE'], [13, 14, 'PLACE'], [14, 15, 'PLACE'], [15, 17, 'PLACE']]
Predicted Tags:


[(3, 5, 'PLACE'), (11, 17, 'PLACE')]
Actual Tags:


Title: 97 [1m[31m「諸国名所百景」  「遠州秋葉遠景袋井凧」[0m
Predictions: [('遠', 'PLACE'), ('袋', 'PLACE')]
[[11, 12, 'PLACE'], [15, 16, 'PLACE'], [17, 18, 'PLACE']]
Predicted Tags:


[(13, 15, 'PLACE'), (17, 19, 'PLACE')]
Actual Tags:


# Hugging Face: Japanese Bert

## Fill-mask

https://huggingface.co/cl-tohoku/bert-base-japanese

https://huggingface.co/docs/transformers/model_doc/bert_japanese

In [None]:
# from transformers import pipeline
unmasker = pipeline('fill-mask', model='cl-tohoku/bert-base-japanese')
unmasker("朝食に[MASK]を焼いて食べました。")

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: ignored

In [None]:
unmasker = pipeline('fill-mask', model='cl-tohoku/bert-base-japanese-char')
unmasker("朝食に[MASK]を焼いて食べました。")

----

In [None]:
# import torch
# from transformers import AutoModel, AutoTokenizer 

bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

In [None]:
text = '朝食にを焼いて食べまし[MASK]。'
token_ids = tokenizer.encode(text, add_special_tokens=True)
token_ids

In [None]:
text = '朝食に[MASK]を焼いて食べました。'
token_ids = tokenizer.encode(text, add_special_tokens=True)
token_ids

In [None]:
tokens = tokenizer.convert_ids_to_tokens(token_ids)
tokens

In [None]:
token_ids = torch.tensor([token_ids])
token_ids

In [None]:
bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")

line = "吾輩は猫である。"
inputs = tokenizer(line, return_tensors="pt")
print(tokenizer.decode(inputs['input_ids'][0]))
# [CLS] 吾輩 は 猫 で ある 。 [SEP]

outputs = bertjapanese(**inputs)
outputs

In [None]:
text = '「東京名所四十八景」「神田明神社内年の市」'
token_ids = tokenizer.encode(text, add_special_tokens=True)

print('----------------- Gold Text -----------------')
print('Text is:',text)
print('Text length:',len(text))
print('Text Encode is:',token_ids)
print('Text Encode length:',len(token_ids))
print('Convert id 391 to:', tokenizer.convert_ids_to_tokens(token_ids[2]))
print('\n東京=Tokyo (PLACE in potision [1,2])')
print(' vs')
print('東=East & 京=Beijing \n')
print("Title:「東京名所四十八景」「神田明神社内年の市」with labels [[1, 3, 'PLACE'], [11, 16, 'PLACE'], [16, 20, 'PLACE']]]")
print('Convert all the tokinazation:',tokenizer.convert_ids_to_tokens(token_ids))

Αρκετα καλο Tokinization `cl-tohoku/bert-base-japanese`, απλα λυνει το προβλημα του fill-mask, οποτε πρεπει να κανουμε tranfer learning

## NER

In [None]:
# https://huggingface.co/ken11/bert-japanese-ner
# from transformers import pipeline
# unmasker = pipeline('ner', model='ken11/bert-japanese-ner')
# unmasker("朝食にを焼いて食べました。")

In [None]:
# https://huggingface.co/ken11/bert-japanese-ner

from transformers import (
    BertForTokenClassification, BertTokenizer, AutoTokenizer
)

# from pyknp import Juman


# jumanpp = Juman()
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")

model = BertForTokenClassification.from_pretrained("ken11/bert-japanese-ner")

tokenized_text = '「東京名所四十八景」「神田明神社内年の市」'
# tokenized_text = '「東海道　京都之内」「大内能上覧図」'
# text = "なにか文章"
# juman_result = jumanpp.analysis(text)
# tokenized_text = [mrph.midasi for mrph in juman_result.mrph_list()]
inputs = tokenizer(tokenized_text, return_tensors="pt") #, padding='max_length', truncation=True, max_length=64, is_split_into_words=True)
pred = model(**inputs).logits[0]
pred = np.argmax(pred.detach().numpy(), axis=-1)
labels = []
for i, label in enumerate(pred):
    if i + 1 > len(tokenized_text):
        continue
    labels.append(model.config.id2label[label])
    print(f"{tokenized_text[i]}: {model.config.id2label[label]}")
print(tokenized_text)
print(labels)

Οχι τοσο καλο `ken11/bert-japanese-ner`, αλλά κανει NER. Απο την αλλη ομως, ουτε ξερουμε ποιον tokinazer χρησιμοποιεί και κανει NER στα Ιαπωνικα αντι στα αγγλικα. Ισως αξιζει να δουμε τον εξτρα tokinizer που εχει (περα του hugging face που δεν δειχνει ποιος ειναι)