# Build dataset of survivor bios


広島平和記念資料館 被爆者証言ビデオ
* 692 bios
* http://a-bombdb.pcf.city.hiroshima.jp/pdbj/list/page/1?cls=col_testify&DISPNUM=100&searchType=5&reSearchKey=&SORT5=&SORTORDER5=&pageDivision=49&sdist=&sage=&ssex=


広島館・長崎館・広島資料館
* 638 bios
* https://www.global-peace.go.jp/picture/pic_kekka.php

長崎市平和・原爆 家族証言者
* https://nagasakipeace.jp/japanese/peace/keisyo/profile.html

In [317]:
import requests
import pandas as pd
import re
import numpy as np
import os

from bs4 import BeautifulSoup

import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
matplotlib.rcParams['pdf.fonttype'] = 42

%matplotlib inline

pd.set_option('display.max_rows', 10000)

# headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}

# Scrape HIROSHIMA bios only

http://a-bombdb.pcf.city.hiroshima.jp/pdbj/list/page/1?cls=col_testify&DISPNUM=100&searchType=5&reSearchKey=&SORT5=&SORTORDER5=&pageDivision=49&sdist=&sage=&ssex=

## Scrape just the basic info for one page

There are 7 pages total, 692 bios

In [5]:
# Fetch and parse the first list page of the Hiroshima testimony database.
page = 1
url = f'http://a-bombdb.pcf.city.hiroshima.jp/pdbj/list/page/{page}?cls=col_testify&DISPNUM=100&searchType=5&reSearchKey=&SORT5=&SORTORDER5=&pageDivision=49&sdist=&sage=&ssex='
# A timeout keeps the cell from hanging if the server stalls, and
# raise_for_status stops an HTTP error page from being parsed as a listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
doc = BeautifulSoup(response.text)

# Site root; the relative detail-page slugs found in the listing are appended to it.
main_url = 'http://a-bombdb.pcf.city.hiroshima.jp'

In [27]:
# Smoke-test the selectors on the first data row of the listing table.

survivors = doc.find_all('tr')[1:]  # drop the first <tr> (presumably the header row)
first_row = survivors[0]
first_cells = first_row.find_all('td')

name = first_row.a.text.replace('\u3000', '|').strip()  # full-width space -> '|'
slug = first_row.a['href']
sex = first_cells[1].text.strip()
age_exposure = first_cells[3].text.replace('歳', '').strip()  # remove the '歳' suffix
status_then = first_cells[4].text.strip()
code = first_cells[5].text.strip()
production_date = first_cells[6].text.strip()

In [28]:
# Parse every survivor row on page 1 into a list of dicts (one dict per row).

rows = []
for survivor in survivors:
    cells = survivor.find_all('td')
    link = survivor.a
    rows.append({
        'name': link.text.replace('\u3000', '|').strip(),  # full-width space -> '|'
        'id': int(survivor.find('th').text.strip()),
        'slug': link['href'],
        'sex': cells[1].text.strip(),
        'age_exposure': cells[3].text.replace('歳', '').strip(),
        'status_then': cells[4].text.strip(),
        'code': cells[5].text.strip(),
        'production_date': cells[6].text.strip(),
    })
rows

[{'name': '青山|恭子',
  'slug': '/pdbj/detail/207680',
  'sex': '女性',
  'age_exposure': '21',
  'status_then': '教職員',
  'code': 'VS00366',
  'production_date': '1993/08/01'},
 {'name': '秋田|サチヱ',
  'slug': '/pdbj/detail/207588',
  'sex': '女性',
  'age_exposure': '14',
  'status_then': '学生',
  'code': 'VS00678',
  'production_date': '2000/03/31'},
 {'name': '秋田|ひとし',
  'slug': '/pdbj/detail/208156',
  'sex': '男性',
  'age_exposure': '15',
  'status_then': '学生',
  'code': 'VS00929',
  'production_date': '2009/03/31'},
 {'name': '秋山|重雄',
  'slug': '/pdbj/detail/208155',
  'sex': '男性',
  'age_exposure': '21',
  'status_then': '学生',
  'code': 'VS00901-13',
  'production_date': '2005/09/08'},
 {'name': '浅野|温生',
  'slug': '/pdbj/detail/208185',
  'sex': '男性',
  'age_exposure': '13',
  'status_then': '学生（中学校2年生）',
  'code': 'VS00937',
  'production_date': '2009/03/31'},
 {'name': '旭|勝子',
  'slug': '/pdbj/detail/207223',
  'sex': '女性',
  'age_exposure': '30',
  'status_then': '主婦',
  'code': 'VS00466

## Scrape all pages

In [47]:
# Scrape the basic listing fields from every list page of the Hiroshima database.
pages = list(range(1,8))  # list pages 1..7
rows = []

for page in pages:
    url = f'http://a-bombdb.pcf.city.hiroshima.jp/pdbj/list/page/{page}?cls=col_testify&DISPNUM=100&searchType=5&reSearchKey=&SORT5=&SORTORDER5=&pageDivision=49&sdist=&sage=&ssex='
    # Time out rather than hang on a stalled connection, and fail loudly on an
    # HTTP error instead of parsing an error page into missing or garbage rows.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = BeautifulSoup(response.text)
    survivors = doc.find_all('tr')[1:]  # drop the first <tr> (presumably the header row)

    for survivor in survivors:
        cells = survivor.find_all('td')  # look the cells up once per row
        row = {}
        row['id'] = int(survivor.find('th').text.strip())
        row['name'] = survivor.a.text.replace('\u3000','|').strip()  # full-width space -> '|'
        row['slug'] = survivor.a['href']
        row['sex'] = cells[1].text.strip()
        row['age_exposure'] = cells[3].text.replace('歳','').strip()
        row['status_then'] = cells[4].text.strip()
        row['code'] = cells[5].text.strip()
        row['production_date'] = cells[6].text.strip()
        rows.append(row)
rows

[{'id': 1,
  'name': '青山|恭子',
  'slug': '/pdbj/detail/207680',
  'sex': '女性',
  'age_exposure': '21',
  'status_then': '教職員',
  'code': 'VS00366',
  'production_date': '1993/08/01'},
 {'id': 2,
  'name': '秋田|サチヱ',
  'slug': '/pdbj/detail/207588',
  'sex': '女性',
  'age_exposure': '14',
  'status_then': '学生',
  'code': 'VS00678',
  'production_date': '2000/03/31'},
 {'id': 3,
  'name': '秋田|ひとし',
  'slug': '/pdbj/detail/208156',
  'sex': '男性',
  'age_exposure': '15',
  'status_then': '学生',
  'code': 'VS00929',
  'production_date': '2009/03/31'},
 {'id': 4,
  'name': '秋山|重雄',
  'slug': '/pdbj/detail/208155',
  'sex': '男性',
  'age_exposure': '21',
  'status_then': '学生',
  'code': 'VS00901-13',
  'production_date': '2005/09/08'},
 {'id': 5,
  'name': '浅野|温生',
  'slug': '/pdbj/detail/208185',
  'sex': '男性',
  'age_exposure': '13',
  'status_then': '学生（中学校2年生）',
  'code': 'VS00937',
  'production_date': '2009/03/31'},
 {'id': 6,
  'name': '旭|勝子',
  'slug': '/pdbj/detail/207223',
  'sex': '女性',

In [48]:
# Sanity check: should match the 692 bios the site reports across its 7 pages.
len(rows)

692

In [51]:
# Tabulate the scraped listing rows and persist them so later steps do not
# depend on re-scraping.
df_basic = pd.DataFrame(rows)
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('data', exist_ok=True)
df_basic.to_csv('data/survivors_bio_basic.csv', index=False)

In [52]:
# Display the Hiroshima listing frame built from the scraped rows.
df_basic

Unnamed: 0,age_exposure,code,id,name,production_date,sex,slug,status_then
0,21,VS00366,1,青山|恭子,1993/08/01,女性,/pdbj/detail/207680,教職員
1,14,VS00678,2,秋田|サチヱ,2000/03/31,女性,/pdbj/detail/207588,学生
2,15,VS00929,3,秋田|ひとし,2009/03/31,男性,/pdbj/detail/208156,学生
3,21,VS00901-13,4,秋山|重雄,2005/09/08,男性,/pdbj/detail/208155,学生
4,13,VS00937,5,浅野|温生,2009/03/31,男性,/pdbj/detail/208185,学生（中学校2年生）
5,30,VS00466,6,旭|勝子,1995/08/01,女性,/pdbj/detail/207223,主婦
6,24,VS01085,7,東|ミヨノ,2019/02/28,女性,/pdbj/detail/273357,専業主婦
7,18,VS00072,8,阿部|静子,1987/08/01,女性,/pdbj/detail/206965,
8,24,VS00968,9,安部|初子,2011/03/31,女性,/pdbj/detail/208302,
9,22,VS00862,10,天野|貢,2006/03/31,男性,/pdbj/detail/208198,公務員


# Scrape details for all survivors

In [83]:
# Visit each survivor's detail page and pull the extended fields.
slugs = df_basic.slug
rows = []

for slug in slugs:
    url = main_url + slug
    # Time out rather than hang on a stalled connection, and fail loudly on an
    # HTTP error instead of mis-parsing an error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = BeautifulSoup(response.text)
    cells = doc.find_all('td')  # collect the cells once per page, not once per field

    row = {}
    row['code'] = cells[0].text.strip()
    # The name cell carries the kana reading inside full-width parentheses;
    # keep only that part and mark the full-width space with '|'.
    row['name_kana'] = cells[1].text.split('（')[1].replace('\u3000','|').replace('）','')
    row['exposure_loc'] = cells[3].text
    row['exposure_dist'] = cells[4].text
    row['age_exposure_detail'] = cells[5].text.replace('歳','').strip()
    row['age_production'] = cells[6].text.replace('歳','').strip()
    row['details'] = cells[7].text
    row['status_then_detail'] = cells[8].text
    row['work_location'] = cells[9].text
    rows.append(row)
rows

[{'code': 'VS00366',
  'name_kana': 'アオヤマ|ヤスコ',
  'exposure_loc': '入市',
  'exposure_dist': '',
  'age_exposure_detail': '21',
  'age_production': '69',
  'details': '当日の朝、女学校（現吉田高校）職員室内から閃光を見、音を聞いた。南方にはきのこ雲。一日中生徒と一緒に勤労奉仕。夕方、芸備線吉田口駅へ出ると広島から逃れてきた人たちで溢れていたので、介護を始めた。翌朝から市内の被災者の看護（本川小、勧銀、逓信病院等）。',
  'status_then_detail': '教職員',
  'work_location': '吉田高等女学校'},
 {'code': 'VS00678',
  'name_kana': 'アキタ|サチエ',
  'exposure_loc': '三篠町',
  'exposure_dist': '2.5km',
  'age_exposure_detail': '14',
  'age_production': '69',
  'details': '作業中明るい光が走った。焼夷弾の直撃を受けたと思い、机の下に隠れたが気づくと下敷きになっていた。瓦礫の外に出ると友人たちの顔は墨で真っ黒だった古市の工場に逃げ、その時配給されたむすびは今でも覚えている。自宅に戻ったが全焼。翌日、地御前小学校に向かい、両親と弟に再会した。',
  'status_then_detail': '学生',
  'work_location': '安田高等女学校'},
 {'code': 'VS00929',
  'name_kana': 'アキタ|ヒトシ',
  'exposure_loc': '入市',
  'exposure_dist': '',
  'age_exposure_detail': '15',
  'age_production': '77',
  'details': '学徒動員先の呉の海軍工廠で作業中、建物の外が何か光ったような感じがあった。しばらくして兄を探すため、実家の父親と一緒に広島市内へ入り入市被爆。市内は建物などはほとんど焼けて、何もないような状態だった。結局お兄さんは行方

In [84]:
# Tabulate the per-survivor detail records and write them to their own CSV.
df_details = pd.DataFrame(rows)
df_details.to_csv('data/survivors_bio_details.csv', index=False)

In [91]:
# Join the detail records to the listing records on the shared testimony code,
# then save the combined Hiroshima table.
df = pd.merge(df_details, df_basic, on='code')
df.to_csv('data/survivors_bio.csv', index=False)

In [95]:
# Browse the merged Hiroshima frame ordered by the `name` column; the sorted
# result is only displayed, not stored.
df.sort_values('name')

Unnamed: 0,age_exposure_detail,age_production,code,details,exposure_dist,exposure_loc,name_kana,status_then_detail,work_location,age_exposure,id,name,production_date,sex,slug,status_then
679,28,72,VS00197,キリスト教の修道院で被爆。現在、上智学院の理事長。ドイツ人。,4.5km,"祇園町長束,修道院内","ルーメル, クラウス",学生,長束修練院,28,680,クラウス・ルーメル,1989/08/01,男性,/pdbj/detail/207055,学生
380,22,67,VS00247,長男が被爆死。大火傷をおった母と縮景園へ逃げる。戦後、アメリカ人の夫と再婚し、現在、アメリカ...,1km,"上流川町,自宅",チエコ|フレイブル|マツウラ,,,22,381,チエコ・Ｆ・マツウラ,1990/08/01,女性,/pdbj/detail/207095,
581,33,73,VS00028,当時33才。爆心地から1．9キロの千田町の電車の中で被爆した。消防士だった三上さんは8月6日...,1.9km,"千田町,広電本社前",ミカミ|ヨサク,公務員（消防士）,西消防署宇品出張所,33,582,三上|與作,1986/08/01,男性,/pdbj/detail/206934,公務員（消防士）
594,14,73,VS00844,日本製鋼の工場に学徒動員として派遣されて働く毎日だったが、その日は偶然休みで家にいた。姉は被...,3.0km,仁保町、自宅,ミホ|トシオ,学生,市立商業,14,595,三保|俊雄,2005/03/31,男性,/pdbj/detail/208266,学生
595,38,84,VS00267,広島駅で軍備輸送にあたっていた。駅の庶務室で机に向かった時、鋭い閃光と大音響を聞いた。すぐに...,2km,"松原町,広島駅構内",ミホ|ヨシト,公務員,広島鉄道局,38,596,三保|芳登,1991/08/01,男性,/pdbj/detail/207110,公務員
593,19,68,VS00446,当時、内務省の土木出張所の職員として産業奨励館（現在の原爆ドーム）に勤務していた。6日の朝は...,2km,"松原町,広島駅",ミハラ|キミエ,公務員,内務省中国四国土木出張所,19,594,三原|君江,1994/08/01,女性,/pdbj/detail/207213,公務員
584,13,63,VS00472,学徒動員で鶴見町にて建物の後片付けをしていた。誰かが「Ｂ29が飛んでいる」の声と同じに目の前...,1.5km,"鶴見町,町内",ミシマ|ミチエ,学生,広島女子商業学校,13,585,三島|美智恵,1995/08/01,女性,/pdbj/detail/207227,学生
582,21,81,VS00901-06,受講中に校舎内で被爆,2.0km,千田町,ミキ|タクヤ,学生,広島高等工業学校,21,583,三木|琢弥,2005/09/08,男性,/pdbj/detail/208096,学生
580,21,81,VS00901-12,受講中に校舎内で被爆,2.0km,千田町,ミウラ|トミタロウ,学生,広島高等工業学校,21,581,三浦|富太郎,2005/09/08,男性,/pdbj/detail/208099,学生
583,28,86,VS00840,結婚後、陸軍に入隊して、島根から広島へ出てきた。工兵隊第五連隊に所属していた。毎日、軍服を縫...,1.5km,牛田町,ミザワ|ジュウイチ,軍人,陸軍工兵隊第五連隊,28,584,三澤|重市,2004/03/31,男性,/pdbj/detail/207444,軍人


---------------------------------------------------------------------------------------

# Scrape survivor bios from Global Network

https://www.global-peace.go.jp/picture/pic_kekka.php?PageNo=1&dt=190805140418

## Scrape just the basic info for one page

There are 32 pages total, 20 results each, 638 bios

In [179]:
# Fetch and parse the first result page of the Global Network database.
page = 1
url = f'https://www.global-peace.go.jp/picture/pic_kekka.php?PageNo={page}&dt=190805140418'
# A timeout keeps the cell from hanging if the server stalls, and
# raise_for_status stops an HTTP error page from being parsed as a listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
# Parse the raw bytes with an explicit UTF-8 encoding. (The alternative noted
# by the author: set response.encoding = 'utf-8' and parse response.text.)
doc = BeautifulSoup(response.content, from_encoding='utf-8')

# NOTE: this rebinds main_url, which the Hiroshima detail scrape earlier in the
# notebook also reads; re-running that cell after this one builds wrong URLs.
main_url = 'https://www.global-peace.go.jp/picture/'

In [174]:
# Smoke-test the selectors on the first data row of the results table.

results_table = doc.find('table', width="800")
survivors = results_table.find_all('tr')[1:]  # drop the first <tr> (presumably the header row)
survivor = survivors[0]
cells = survivor.find_all('td')

number = int(cells[0].text)
name = cells[1].text.replace('\u3000', '|').strip()  # full-width space -> '|'
slug = cells[1].a['href']
sex = cells[2].text.strip()
age_exposure = cells[3].text.replace('歳', '').strip()  # remove the '歳' suffix
exposure_pref = cells[4].text.strip()
exposure_dist = cells[5].text.strip()

In [181]:
# Parse every survivor row on page 1 into a list of dicts (one dict per row).

rows = []
for survivor in survivors:
    cells = survivor.find_all('td')
    name_cell = cells[1]
    rows.append({
        'name': name_cell.text.replace('\u3000', '|').strip(),  # full-width space -> '|'
        'id': int(cells[0].text),
        'slug': name_cell.a['href'],
        'sex': cells[2].text.strip(),
        'age_exposure': cells[3].text.replace('歳', '').strip(),
        'exposure_pref': cells[4].text.strip(),
        'exposure_dist': cells[5].text.strip(),
    })
rows

[{'name': 'アイリーン|トモエ|平野',
  'id': 1,
  'slug': 'pic_syousai.php?gbID=1280&dt=190805144842',
  'sex': '女性',
  'age_exposure': '6',
  'exposure_pref': '広島',
  'exposure_dist': '6.0km'},
 {'name': '青木|茂',
  'id': 2,
  'slug': 'pic_syousai.php?gbID=140&dt=190805144842',
  'sex': '男性',
  'age_exposure': '20',
  'exposure_pref': '長崎',
  'exposure_dist': '2.2km'},
 {'name': '青木|美枝',
  'id': 3,
  'slug': 'pic_syousai.php?gbID=141&dt=190805144842',
  'sex': '女性',
  'age_exposure': '23',
  'exposure_pref': '広島',
  'exposure_dist': '1.8km'},
 {'name': '赤名|正市',
  'id': 4,
  'slug': 'pic_syousai.php?gbID=1130&dt=190805144842',
  'sex': '男性',
  'age_exposure': '21',
  'exposure_pref': '広島',
  'exposure_dist': ''},
 {'name': '秋月|辰一郎',
  'id': 5,
  'slug': 'pic_syousai.php?gbID=87&dt=190805144842',
  'sex': '男性',
  'age_exposure': '29',
  'exposure_pref': '長崎',
  'exposure_dist': '1.8km'},
 {'name': '秋本|三郎',
  'id': 6,
  'slug': 'pic_syousai.php?gbID=847&dt=190805144842',
  'sex': '男性',
  'age_exposur

## Scrape all pages for basic info (Global Peace table)

In [183]:
# Scrape the basic listing fields from every result page of the Global Network database.
pages = list(range(1,33))  # result pages 1..32
rows = []

for page in pages:
    url = f'https://www.global-peace.go.jp/picture/pic_kekka.php?PageNo={page}&dt=190805140418'
    # Time out rather than hang on a stalled connection, and fail loudly on an
    # HTTP error instead of parsing an error page into missing or garbage rows.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    doc = BeautifulSoup(response.content, from_encoding='utf-8')
    survivors = doc.find('table', width="800").find_all('tr')[1:]  # drop the first <tr>

    for survivor in survivors:
        cells = survivor.find_all('td')  # look the cells up once per row
        row = {}
        row['name'] = cells[1].text.replace('\u3000','|').strip()  # full-width space -> '|'
        row['id'] = int(cells[0].text)
        row['slug'] = cells[1].a['href']
        row['sex'] = cells[2].text.strip()
        row['age_exposure'] = cells[3].text.replace('歳','').strip()
        row['exposure_pref'] = cells[4].text.strip()
        row['exposure_dist'] = cells[5].text.strip()
        rows.append(row)
rows

[{'name': 'アイリーン|トモエ|平野',
  'id': 1,
  'slug': 'pic_syousai.php?gbID=1280&dt=190805151358',
  'sex': '女性',
  'age_exposure': '6',
  'exposure_pref': '広島',
  'exposure_dist': '6.0km'},
 {'name': '青木|茂',
  'id': 2,
  'slug': 'pic_syousai.php?gbID=140&dt=190805151358',
  'sex': '男性',
  'age_exposure': '20',
  'exposure_pref': '長崎',
  'exposure_dist': '2.2km'},
 {'name': '青木|美枝',
  'id': 3,
  'slug': 'pic_syousai.php?gbID=141&dt=190805151358',
  'sex': '女性',
  'age_exposure': '23',
  'exposure_pref': '広島',
  'exposure_dist': '1.8km'},
 {'name': '赤名|正市',
  'id': 4,
  'slug': 'pic_syousai.php?gbID=1130&dt=190805151358',
  'sex': '男性',
  'age_exposure': '21',
  'exposure_pref': '広島',
  'exposure_dist': ''},
 {'name': '秋月|辰一郎',
  'id': 5,
  'slug': 'pic_syousai.php?gbID=87&dt=190805151358',
  'sex': '男性',
  'age_exposure': '29',
  'exposure_pref': '長崎',
  'exposure_dist': '1.8km'},
 {'name': '秋本|三郎',
  'id': 6,
  'slug': 'pic_syousai.php?gbID=847&dt=190805151358',
  'sex': '男性',
  'age_exposur

In [184]:
# Build the Global Peace listing frame from the scraped rows.
# NOTE: this reuses the name df_basic, replacing the Hiroshima listing frame
# that was assigned to the same name earlier in the notebook.
df_basic = pd.DataFrame(rows)
df_basic

Unnamed: 0,age_exposure,exposure_dist,exposure_pref,id,name,sex,slug
0,6,6.0km,広島,1,アイリーン|トモエ|平野,女性,pic_syousai.php?gbID=1280&dt=190805151358
1,20,2.2km,長崎,2,青木|茂,男性,pic_syousai.php?gbID=140&dt=190805151358
2,23,1.8km,広島,3,青木|美枝,女性,pic_syousai.php?gbID=141&dt=190805151358
3,21,,広島,4,赤名|正市,男性,pic_syousai.php?gbID=1130&dt=190805151358
4,29,1.8km,長崎,5,秋月|辰一郎,男性,pic_syousai.php?gbID=87&dt=190805151358
5,14,3.2km,長崎,6,秋本|三郎,男性,pic_syousai.php?gbID=847&dt=190805151358
6,16,3.5km,長崎,7,安里|盛繁,男性,pic_syousai.php?gbID=9&dt=190805151358
7,13,,広島,8,朝比奈|隆,男性,pic_syousai.php?gbID=1206&dt=190805151358
8,19,,広島,9,阿部|磨智惠,女性,pic_syousai.php?gbID=1067&dt=190805151358
9,14,,広島,10,天野|文子,女性,pic_syousai.php?gbID=142&dt=190805151358


In [185]:
# Persist the Global Peace listing (without the index column).
df_basic.to_csv('data/survivors_globalpeace_bio_basic.csv', index=False)

# Scrape more data for each survivor

In [231]:
# Scrape details for one survivor
# Take the slug straight from df_basic. The loop-leftover `survivor` variable
# is a parsed <tr> tag, not a record with a 'slug' key, so indexing it with
# 'slug' fails on a fresh run.
slug = df_basic.slug[0]
url = main_url + slug
response = requests.get(url, timeout=30)  # do not hang on a stalled connection
response.raise_for_status()               # do not parse an HTTP error page
response.encoding = 'utf-8'
doc = BeautifulSoup(response.text)

table = doc.find('table', bordercolor="#99CCFF")
cells = table.find_all('td')  # collect the cells once instead of once per field

# The name cell carries the kana reading inside full-width parentheses.
name_kana = cells[0].text.split('（')[1].replace('\u3000','|').replace('）','')
# NOTE(review): the recorded output shows a date in this field
# (e.g. '2018年11月5日'), not an age; the name is kept because later cells use it.
age_production = cells[6].text.strip()
exposure_loc = cells[12].text.strip()
status_then = cells[14].text.strip()
status_then_detail = cells[16].text.strip().replace('\u3000','|')
record_location = cells[18].text.strip()

# Same values packaged as a one-row record list, mirroring the full scrape.
rows = []
row = {}
row['name_kana'] = name_kana
row['age_production'] = age_production
row['exposure_loc'] = exposure_loc
row['status_then'] = status_then
row['status_then_detail'] = status_then_detail
row['record_location'] = record_location

rows.append(row)
rows

[{'name_kana': 'あいりーん|ともえ|ひらの',
  'age_production': '2018年11月5日',
  'exposure_loc': '広島市仁保町堀越［現：広島市南区（堀越）］',
  'status_then': '児童',
  'status_then_detail': '青崎国民学校|一年生',
  'record_location': '国立広島原爆死没者追悼平和祈念館'}]

In [243]:
# Scrape details for ALL survivors in the Global Peace database

slugs = df_basic.slug
rows = []
errors = []  # slugs whose detail page could not be fetched or parsed

for slug in slugs:
    url = main_url + slug
    try:
        response = requests.get(url, timeout=30)  # do not hang on a stalled connection
        response.raise_for_status()
        response.encoding = 'utf-8'
        doc = BeautifulSoup(response.text)
        table = doc.find('table', bordercolor="#99CCFF")
        cells = table.find_all('td')  # collect the cells once per page

        row = {
            # The name cell carries the kana reading inside full-width parentheses.
            'name_kana': cells[0].text.split('（')[1].replace('\u3000','|').replace('）',''),
            'age_production': cells[6].text.strip(),
            'exposure_loc': cells[12].text.strip(),
            'status_then': cells[14].text.strip(),
            'status_then_detail': cells[16].text.strip().replace('\u3000','|'),
            'record_location': cells[18].text.strip(),
            'slug': slug,
        }
    except (requests.RequestException, AttributeError, IndexError):
        # RequestException: fetch failed; AttributeError: table not found (None);
        # IndexError: fewer cells than expected or no '（' in the name cell.
        # Only these are caught so unrelated bugs still surface. The slug is
        # recorded and no partial, slug-less record is added to rows.
        errors.append(slug)
        continue
    rows.append(row)
rows

[{'name_kana': 'あいりーん|ともえ|ひらの',
  'age_production': '2018年11月5日',
  'exposure_loc': '広島市仁保町堀越［現：広島市南区（堀越）］',
  'status_then': '児童',
  'status_then_detail': '青崎国民学校|一年生',
  'record_location': '国立広島原爆死没者追悼平和祈念館',
  'slug': 'pic_syousai.php?gbID=1280&dt=190805151358'},
 {'name_kana': 'あおき|しげる',
  'age_production': '2012年10月13日',
  'exposure_loc': '長崎市東北郷[現在の長崎市]',
  'status_then': '一般就業者',
  'status_then_detail': '三菱重工㈱長崎兵器製作所',
  'record_location': '国立広島原爆死没者追悼平和祈念館',
  'slug': 'pic_syousai.php?gbID=140&dt=190805151358'},
 {'name_kana': 'あおき|みえ',
  'age_production': '2006年9月30日',
  'exposure_loc': '広島市平野町[現：広島市中区平野町]',
  'status_then': '',
  'status_then_detail': '',
  'record_location': '国立広島原爆死没者追悼平和祈念館',
  'slug': 'pic_syousai.php?gbID=141&dt=190805151358'},
 {'name_kana': 'あかな|しょういち',
  'age_production': '2016年10月3日',
  'exposure_loc': '',
  'status_then': '軍人・軍属',
  'status_then_detail': '中国軍管区歩兵第２補充隊（中国第106部隊）|兵長',
  'record_location': '国立広島原爆死没者追悼平和祈念館',
  'slug': 'pic_syousai.php

In [250]:
# Tabulate the Global Peace detail records and write them to their own CSV.
# NOTE: this reuses the name df_details, replacing the Hiroshima detail frame
# that was assigned to the same name earlier in the notebook.
df_details = pd.DataFrame(rows)
df_details.to_csv('data/survivors_globalpeace_bio_details.csv', index=False)

In [258]:
# Join the detail records to the listing records on the detail-page slug,
# then save the combined Global Peace table.
df_globalpeace = pd.merge(df_details, df_basic, on='slug')
df_globalpeace.to_csv('data/survivors_globalpeace_bio.csv', index=False)

# Turn names into romaji

https://github.com/soimort/python-romkan

In [264]:
# !pip install romkan
import romkan

In [278]:
# Romanize the Hiroshima kana names; the romaji column is the key used to
# match people across the two sources.
# A comprehension avoids rebinding the notebook-level `name` variable on
# every iteration.
name_romaji = [romkan.to_roma(kana) for kana in df.name_kana]

df['name_romaji'] = name_romaji
df[['name_kana', 'name_romaji']].tail(3)

Unnamed: 0,name_kana,name_romaji
689,ワタナベ|タツオ,watanabe|tatsuo
690,ワタナベ|ミヨコ,watanabe|miyoko
691,ワダ|イサオ,wada|isao


In [279]:
# Romanize the Global Peace kana names; the romaji column is the key used to
# match people across the two sources.
# A comprehension avoids rebinding the notebook-level `name` variable on
# every iteration.
name_romaji = [romkan.to_roma(kana) for kana in df_globalpeace.name_kana]

df_globalpeace['name_romaji'] = name_romaji
df_globalpeace[['name_kana', 'name_romaji']].tail(3)

Unnamed: 0,name_kana,name_romaji
635,わたなべ|すがこ,watanabe|sugako
636,わたなべ|ちえこ,watanabe|chieko
637,わたなべ|つかさ,watanabe|tsukasa


In [282]:
# Shape of the Hiroshima frame after adding name_romaji.
df.shape

(692, 17)

In [283]:
# Shape of the Global Peace frame after adding name_romaji.
df_globalpeace.shape

(638, 14)

# Merge Hiroshima Peace Museum and Global Peace datasets into one dataframe

In [287]:
# Outer-join the two sources on the romanized name so that people found in
# only one database are kept; overlapping column names get a source suffix.
# NOTE(review): a name match is only a proxy for identity - different people
# sharing a reading would be joined; verify the matched rows.
df_survivors = pd.merge(
    df,
    df_globalpeace,
    how='outer',
    on='name_romaji',
    suffixes=('_hiroshima', '_globalpeace'),
)
df_survivors.shape

(1312, 30)

In [288]:
# Persist the merged (uncleaned) table.
df_survivors.to_csv('data/survivors_bio_complete.csv', index=False)

In [None]:
# Reload the merged table from disk so cleaning can start without re-scraping.
# NOTE(review): read_csv re-infers dtypes, so values may not match the
# in-memory frame exactly (e.g. numeric strings, empty strings) - confirm.
df_survivors = pd.read_csv('data/survivors_bio_complete.csv')

# Clean the dataframe by filling in values for common columns (age, etc)

In [298]:
# List the merged columns to see which fields exist in both sources.
df_survivors.columns

Index(['age_exposure_detail', 'age_production_hiroshima', 'code', 'details',
       'exposure_dist_hiroshima', 'exposure_loc_hiroshima',
       'name_kana_hiroshima', 'status_then_detail_hiroshima', 'work_location',
       'age_exposure_hiroshima', 'id_hiroshima', 'name_hiroshima',
       'production_date', 'sex_hiroshima', 'slug_hiroshima',
       'status_then_hiroshima', 'name_romaji', 'age_production_globalpeace',
       'exposure_loc_globalpeace', 'name_kana_globalpeace', 'record_location',
       'slug_globalpeace', 'status_then_globalpeace',
       'status_then_detail_globalpeace', 'age_exposure_globalpeace',
       'exposure_dist_globalpeace', 'exposure_pref', 'id_globalpeace',
       'name_globalpeace', 'sex_globalpeace'],
      dtype='object')

In [435]:
# Compare the two sources' age-at-exposure columns side by side.
df_survivors[['age_exposure_hiroshima','age_exposure_globalpeace']]

Unnamed: 0,age_exposure_hiroshima,age_exposure_globalpeace
0,21.0,
1,14.0,
2,15.0,
3,21.0,
4,13.0,
5,30.0,
6,24.0,
7,18.0,
8,24.0,
9,22.0,


In [299]:
# NOTE: this value_counts result is not displayed, because only the last
# expression of a cell is shown.
df_survivors.age_exposure_globalpeace.value_counts(dropna=False)
# Prefer the Global Peace age at exposure; fall back to the Hiroshima value
# where it is missing.
df_survivors['age_exposure_filled'] = df_survivors.age_exposure_globalpeace.combine_first(df_survivors.age_exposure_hiroshima)

In [311]:
# For each field present in both sources, prefer the Global Peace value and
# fall back to the Hiroshima value where it is missing.
# NOTE(review): per the recorded outputs, 'age_production' is an age in the
# Hiroshima data ('69') but a date in the Global Peace data ('2018年11月5日'),
# so age_production_filled mixes the two - confirm the intended meaning.
shared_fields = [
    'name', 'name_kana',
    'exposure_loc', 'exposure_dist',
    'sex', 'slug',
    'age_production',
    'status_then', 'status_then_detail',
]

for field in shared_fields:
    preferred = df_survivors[f'{field}_globalpeace']
    fallback = df_survivors[f'{field}_hiroshima']
    df_survivors[f'{field}_filled'] = preferred.combine_first(fallback)

In [418]:
# Drop the per-source columns now that each has a combined *_filled column.

hiroshima_cols = [
    'age_production_hiroshima',
    'exposure_dist_hiroshima',
    'exposure_loc_hiroshima',
    'name_kana_hiroshima',
    'status_then_detail_hiroshima',
    'age_exposure_hiroshima',
    'name_hiroshima',
    'sex_hiroshima',
    'slug_hiroshima',
    'status_then_hiroshima',
]
globalpeace_cols = [
    'age_production_globalpeace',
    'exposure_loc_globalpeace',
    'name_kana_globalpeace',
    'slug_globalpeace',
    'status_then_globalpeace',
    'status_then_detail_globalpeace',
    'age_exposure_globalpeace',
    'exposure_dist_globalpeace',
    'name_globalpeace',
    'sex_globalpeace',
]
cols = hiroshima_cols + globalpeace_cols

df_cleaned = df_survivors.drop(columns=cols)

In [419]:
# One record gives a distance for each city in a single string, which the
# numeric conversion in the next step cannot parse. Select it by that value
# rather than by a hard-coded row label, so the fix still hits the right row
# if the row order changes.
both_cities = df_cleaned.exposure_dist_filled == '広島: 3.0km   長崎: 3.0km'
df_cleaned.loc[both_cities, 'exposure_dist_filled'] = '3.0km'
df_cleaned.loc[both_cities, 'details'] = 'TKTKTK'  # presumably a placeholder to be replaced later
df_cleaned.loc[both_cities, 'work_location'] = 'Hiroshima and Nagasaki'

In [420]:
# Strip the 'km' unit, turn empty values into NaN, and convert to float
# (float rather than int, because NaN cannot be stored in an integer column).
distance_text = df_cleaned.exposure_dist_filled.replace('', np.nan)
# Remove only a trailing 'km' instead of cutting the last two characters
# unconditionally, so a value without the unit is not silently truncated.
df_cleaned['exposure_dist_cleaned'] = distance_text.str.replace(r'km$', '', regex=True).astype(float)
df_cleaned['age_exposure_cleaned'] = df_cleaned.age_exposure_filled.replace('',np.nan).astype(float)

In [421]:
# Check which values the combined sex column takes, including missing ones.
df_cleaned.sex_filled.value_counts(dropna=False)

男性       768
女性       541
           2
男性,女性      1
Name: sex_filled, dtype: int64

In [425]:
# Manual fixes for records without a usable sex value:
# fill in 男性 for 今井|敏郎 and 女性 for 丸山|巴.
# NOTE(review): rows are addressed by hard-coded index labels, which are only
# valid for this exact row order - confirm after any re-scrape.

df_cleaned['sex_cleaned'] = df_cleaned.sex_filled
df_cleaned.loc[45, 'sex_cleaned'] = '男性'
df_cleaned.loc[578, 'sex_cleaned'] = '女性'


# The record with sex '男性,女性' (CODE: VS00473) is a husband-and-wife entry.
# It can be found with: df_cleaned[df_cleaned.sex_filled == '男性,女性']
# Plan noted by the author: clean it manually in a text editor, duplicating
# the row so that the husband and wife have a row each.
# Until then its sex is set to missing.
df_cleaned.loc[421, 'sex_cleaned'] = np.nan

# Turn sex into 0/1 (女性 -> 1, 男性 -> 0); values not in the mapping are left unchanged.
df_cleaned['is_female'] = df_cleaned.sex_cleaned.replace({'女性':1, '男性':0})
df_cleaned.is_female.value_counts(dropna=False)

男性    769
女性    542
Name: sex_cleaned, dtype: int64

In [427]:
# Inspect column dtypes after cleaning.
df_cleaned.dtypes

age_exposure_detail           object
code                          object
details                       object
work_location                 object
id_hiroshima                 float64
production_date               object
name_romaji                   object
record_location               object
exposure_pref                 object
id_globalpeace               float64
age_exposure_filled           object
name_kana_filled              object
exposure_loc_filled           object
status_then_filled            object
status_then_detail_filled     object
name_filled                   object
exposure_dist_filled          object
sex_filled                    object
slug_filled                   object
age_production_filled         object
exposure_dist_cleaned        float64
age_exposure_cleaned         float64
sex_cleaned                   object
is_female                    float64
dtype: object

## Tsutomu Yamaguchi

https://www.global-peace.go.jp/picture/pic_syousai.php?gbID=119&dt=190805151430

He was 29 and worked as a 設計技師 (design engineer for ships) in Nagasaki; he was in Hiroshima on business.
He suffered severe burns to his face, arms, etc. in Hiroshima.
He went back to Nagasaki two days later; the next day the bomb was dropped there and he was irradiated again.
1:37~

TRANSCRIBE
"XXXX"

In [428]:
# Manual corrections for row 1266; the same three assignments also appear in
# an earlier cell of this notebook.
df_cleaned.loc[1266, 'exposure_dist_filled'] = '3.0km'
df_cleaned.loc[1266, 'details'] = 'TKTKTK'
df_cleaned.loc[1266, 'work_location'] = 'Hiroshima and Nagasaki'

# Show the corrected record.
# NOTE(review): this reads by position (.iloc) while the assignments above
# write by label (.loc); both address the same row only if the index is the
# default 0..n-1 range - confirm.
df_cleaned.iloc[1266]

age_exposure_detail                                               NaN
code                                                              NaN
details                                                        TKTKTK
work_location                                  Hiroshima and Nagasaki
id_hiroshima                                                      NaN
production_date                                                   NaN
name_romaji                                         yamaguchi|tsutomu
record_location                                      国立長崎原爆死没者追悼平和祈念館
exposure_pref                                                      両市
id_globalpeace                                                    592
age_exposure_filled                                                29
name_kana_filled                                             やまぐち|つとむ
exposure_loc_filled           広島:広島市江波町[現:広島市中区]長崎:長崎市水の浦町[現:長崎市飽の浦町]
status_then_filled                                              一般就業者
status_then_detail_f

In [430]:
# List the columns to decide which intermediate ones to drop.
df_cleaned.columns

Index(['age_exposure_detail', 'code', 'details', 'work_location',
       'id_hiroshima', 'production_date', 'name_romaji', 'record_location',
       'exposure_pref', 'id_globalpeace', 'age_exposure_filled',
       'name_kana_filled', 'exposure_loc_filled', 'status_then_filled',
       'status_then_detail_filled', 'name_filled', 'exposure_dist_filled',
       'sex_filled', 'slug_filled', 'age_production_filled',
       'exposure_dist_cleaned', 'age_exposure_cleaned', 'sex_cleaned',
       'is_female'],
      dtype='object')

In [431]:
# Drop the intermediate columns that now have a *_cleaned / is_female
# replacement, plus name_kana_filled. Each label is listed once.

cols = ['age_exposure_filled',
       'name_kana_filled',
       'sex_filled',
       'exposure_dist_filled',
       'sex_cleaned']

df_final = df_cleaned.drop(columns=cols)
df_final.head(2)

Unnamed: 0,age_exposure_detail,code,details,work_location,id_hiroshima,production_date,name_romaji,record_location,exposure_pref,id_globalpeace,exposure_loc_filled,status_then_filled,status_then_detail_filled,name_filled,slug_filled,age_production_filled,exposure_dist_cleaned,age_exposure_cleaned,is_female
0,21,VS00366,当日の朝、女学校（現吉田高校）職員室内から閃光を見、音を聞いた。南方にはきのこ雲。一日中生徒...,吉田高等女学校,1.0,1993/08/01,aoyama|yasuko,,,,入市,教職員,教職員,青山|恭子,/pdbj/detail/207680,69,,21.0,1.0
1,14,VS00678,作業中明るい光が走った。焼夷弾の直撃を受けたと思い、机の下に隠れたが気づくと下敷きになってい...,安田高等女学校,2.0,2000/03/31,akita|sachie,,,,三篠町,学生,学生,秋田|サチヱ,/pdbj/detail/207588,69,2.5,14.0,1.0


# TWO people who were irradiated twice
https://www.global-peace.go.jp/picture/pic_syousai.php?gbID=119&dt=190805151430

https://www.global-peace.go.jp/picture/pic_syousai.php?gbID=1151&dt=190805151412
    

In [433]:
# NOTE: this value_counts result is not displayed, because only the last
# expression of a cell is shown.
df_final.exposure_pref.value_counts(dropna=False)
# Show the records whose exposure prefecture is 両市.
df_final[df_final.exposure_pref == '両市']

Unnamed: 0,age_exposure_detail,code,details,work_location,id_hiroshima,production_date,name_romaji,record_location,exposure_pref,id_globalpeace,exposure_loc_filled,status_then_filled,status_then_detail_filled,name_filled,slug_filled,age_production_filled,exposure_dist_cleaned,age_exposure_cleaned,is_female
950,,,,,,,shiraishi|shun'ichi,国立長崎原爆死没者追悼平和祈念館,両市,264.0,広島:長崎:,生徒・学生,東京教育農業専門学校 1年生,白石|俊一,pic_syousai.php?gbID=1151&dt=190805151412,2017年5月11日,,17.0,0.0
1266,,,TKTKTK,Hiroshima and Nagasaki,,,yamaguchi|tsutomu,国立長崎原爆死没者追悼平和祈念館,両市,592.0,広島:広島市江波町[現:広島市中区]長崎:長崎市水の浦町[現:長崎市飽の浦町],一般就業者,三菱重工業㈱長崎造船所|造機設計部,山口|疆,pic_syousai.php?gbID=119&dt=190805151430,2004年7月1日,3.0,29.0,0.0


In [438]:
# Persist the cleaned, merged table (without the index column).
df_final.to_csv('data/survivors_bio_complete_cleaned.csv', index=False)