In [1]:
import pandas as pd
import numpy as np
import json
from pathlib import Path
import sys

BASE_DIR = Path().resolve().parent.parent
# sys.path.append((BASE_DIR / 'vietnamadminunits/parser').as_posix())
# from utils import key_normalize

from vietnamadminunits.parser.utils import key_normalize

import warnings
warnings.filterwarnings("ignore")

In [2]:
def create_sort(text, level=1):
    if isinstance(text, str):
        if level == 1:
            text = re.sub(r'^Tỉnh\s|Thành phố\s|Thủ đô\s', '', text, flags=re.IGNORECASE)
        else:
            text = re.sub(r'^Phường\s|Đặc khu\s|Xã\s', '', text, flags=re.IGNORECASE)

        return text.strip()
    return text

district_type_acronym = {
    'Quận': 'q',
    'Thị xã': 'tx',
    'Thành phố': 'tp',
    'Huyện': 'h',
}
ward_type_acronym = {
    'Phường': 'p',
    'Đặc khu': 'dk',
    'Xã': 'x'
}
def create_keywords(row, level=1):
    keywords = []
    if level == 1:
        keywords.append(row['provinceKey'])
        keywords.append(row['provinceShortKey'])
        if pd.notnull(row['provinceAlias']):
            aliases = json.loads(row['provinceAlias'])
            for a in aliases:
                keywords.append(key_normalize(a))

    else:
        keywords.append(row['wardKey'])
        if not row['wardShortKeyDuplicated']:
            keywords.append(row['wardShortKey'])
        else:
            keywords.append(key_normalize(f"{row['wardShortKey']} {row['wardType']}"))
            keywords.append(key_normalize(f"{ward_type_acronym[row['wardType']]} {row['wardShortKey']}"))

        if pd.notnull(row['wardAlias']):
            aliases = json.loads(row['wardAlias'])
            for a in aliases:
                keywords.append(key_normalize(a))

    keywords = list(set(keywords))
    keywords = sorted(keywords, key=len, reverse=True)
    return json.dumps(keywords)

In [3]:
df = pd.read_csv(BASE_DIR / 'data/processed/convert_legacy_2025_with_location_and_default_ward.csv')

In [4]:
df_63 = pd.read_csv(BASE_DIR / 'data/interim/legacy_63-province-10040-ward_with_location_and_key.csv')
df_63 = df_63[['province', 'district', 'ward', 'provinceKey', 'districtKey', 'wardKey', 'wardKeyDuplicated']]

In [5]:
df_34 = df[['newProvince', 'newWard']].drop_duplicates().copy()

# ENRICH DATA
unit_cols = ['newProvince', 'newWard']
for col in unit_cols:
    # Create key
    df_34[f"{col}Key"] = df_34[f"{col}"].apply(key_normalize)


# Check ward key
count_ward_key = df_34.groupby(['newProvinceKey', 'newWardKey']).size().reset_index(name='count').sort_values(by=['count'], ascending=False)
duplicated_ward_key = count_ward_key[count_ward_key['count']>1].copy()
duplicated_ward_key['newWardKeyDuplicated'] = True
duplicated_ward_key.drop(columns=['count'], inplace=True)

# Add data to df_34
df_34 = pd.merge(df_34, duplicated_ward_key, on=['newProvinceKey', 'newWardKey'], how='left')
df_34['newWardKeyDuplicated'].fillna(False, inplace=True)

# Change key and short key to accented if newWardKeyDuplicated = True
df_34['newWardKey'] = np.where(df_34['newWardKeyDuplicated']==True, df_34['newWard'].apply(key_normalize, args=([], False)), df_34['newWardKey'])

In [6]:
df = pd.merge(df, df_63, on=['province', 'district', 'ward'], how='left')
df = pd.merge(df, df_34, on=['newProvince', 'newWard'], how='left')
df['provinceDistrictWardKey'] =  df['provinceKey'] + '_' + df['districtKey'] + '_' + df['wardKey'].fillna('')

In [7]:
df_no_divided = df[df['isDividedWard']==False]
df_no_divided.groupby(['newProvinceKey', 'provinceDistrictWardKey']).size().reset_index(name='count').sort_values(by=['count'], ascending=False)

Unnamed: 0,newProvinceKey,provinceDistrictWardKey,count
0,thanhphocantho,thanhphocantho_huyencodo_thitrancodo,1
6374,tinhninhbinh,tinhninhbinh_huyennhoquan_xadongphong,1
6376,tinhninhbinh,tinhninhbinh_huyennhoquan_xagialam,1
6377,tinhninhbinh,tinhninhbinh_huyennhoquan_xagiason,1
6378,tinhninhbinh,tinhninhbinh_huyennhoquan_xagiathuy,1
...,...,...,...
3190,tinhdongnai,tinhdongnai_huyenthongnhat_xalo25,1
3191,tinhdongnai,tinhdongnai_huyenthongnhat_xaquangtrung,1
3192,tinhdongnai,tinhdongnai_huyenthongnhat_xaxuanthien,1
3193,tinhdongnai,tinhdongnai_huyentrangbom_thitrantrangbom,1


In [8]:
df_divided = df[df['isDividedWard']==True]
df_divided.groupby(['newProvinceKey', 'provinceDistrictWardKey']).size().reset_index(name='count').sort_values(by=['count'], ascending=False)

Unnamed: 0,newProvinceKey,provinceDistrictWardKey,count
265,thanhphohanoi,thanhphohanoi_quannamtuliem_phuongdaimo,5
260,thanhphohanoi,thanhphohanoi_quanlongbien_phuongphucdong,4
168,thanhphohanoi,thanhphohanoi_huyenthanhtri_xatantrieu,4
262,thanhphohanoi,thanhphohanoi_quanlongbien_phuongthachban,4
213,thanhphohanoi,thanhphohanoi_quanhadong_phuongduongnoi,4
...,...,...,...
152,thanhphohanoi,thanhphohanoi_huyenthachthat_xabinhyen,2
151,thanhphohanoi,thanhphohanoi_huyensocson_xaquangtien,2
150,thanhphohanoi,thanhphohanoi_huyensocson_xaphuminh,2
149,thanhphohanoi,thanhphohanoi_huyensocson_xamaidinh,2


In [9]:
df_province = df[['newProvinceKey', 'provinceKey']].drop_duplicates().reset_index(drop=True)
DICT_PROVINCE = {}
for newProvinceKey, group in df_province.groupby('newProvinceKey'):
    DICT_PROVINCE[newProvinceKey] = group['provinceKey'].values.tolist()

In [10]:
DICT_PROVINCE_WARD_NO_DIVIDED = {}

for newProvinceKey, group in df_no_divided.groupby('newProvinceKey'):
    ward_dict = {}

    for newWardKey, group in group.groupby('newWardKey'):
        ward_dict[newWardKey] = group['provinceDistrictWardKey'].values.tolist()

    DICT_PROVINCE_WARD_NO_DIVIDED[newProvinceKey] = ward_dict

In [11]:
DICT_PROVINCE_WARD_NO_DIVIDED

{'thanhphocantho': {'phuonganbinh': ['thanhphocantho_quanninhkieu_phuonganbinh',
   'thanhphocantho_huyenphongdien_xamykhanh'],
  'phuongbinhthuy': ['thanhphocantho_quanbinhthuy_phuongbinhthuy',
   'thanhphocantho_quanbinhthuy_phuonganthoi'],
  'phuongcaikhe': ['thanhphocantho_quanninhkieu_phuongcaikhe',
   'thanhphocantho_quanninhkieu_phuonganhoa'],
  'phuongcairang': ['thanhphocantho_quancairang_phuonglebinh',
   'thanhphocantho_quancairang_phuonghungthanh',
   'thanhphocantho_quancairang_phuongbalang',
   'thanhphocantho_quancairang_phuongthuongthanh'],
  'phuongdaithanh': ['tinhhaugiang_thanhphongabay_phuonghieploi',
   'tinhhaugiang_thanhphongabay_xadaithanh',
   'tinhhaugiang_thanhphongabay_xatanthanh'],
  'phuonghungphu': ['thanhphocantho_quancairang_phuonghungphu',
   'thanhphocantho_quancairang_phuongphuthu',
   'thanhphocantho_quancairang_phuongtanphu'],
  'phuongkhanhhoa': ['tinhsoctrang_thixavinhchau_xahoadong',
   'tinhsoctrang_thixavinhchau_phuongkhanhhoa',
   'tinhsoctra

In [12]:
DICT_PROVINCE_WARD_DIVIDED = {}
for newProvinceKey, group in df_divided.groupby('newProvinceKey'):
    keyword_dict = {}

    for provinceDistrictWardKey, group in group.groupby('provinceDistrictWardKey'):
        ward_dict = []

        for _, group in group.groupby('newWardKey'):
            ward = {
                'newWardKey': group['newWardKey'].iloc[0],
                'isDefaultNewWard': group['isDefaultNewWard'].iloc[0],
                'newWardLat': group['newWardLat'].iloc[0],
                'newWardLon': group['newWardLon'].iloc[0],
                'newWardAreaKm2': group['newWardAreaKm2'].iloc[0],
            }
            ward_dict.append(ward)

        keyword_dict[provinceDistrictWardKey] = ward_dict

    DICT_PROVINCE_WARD_DIVIDED[newProvinceKey] = keyword_dict

In [13]:
DICT_PROVINCE_WARD_DIVIDED

{'thanhphocantho': {'thanhphocantho_huyenthoilai_xatanthanh': [{'newWardKey': 'xatanthanh',
    'isDefaultNewWard': False,
    'newWardLat': 9.62301,
    'newWardLon': 106.064,
    'newWardAreaKm2': 70.8},
   {'newWardKey': 'xatruongthanh',
    'isDefaultNewWard': True,
    'newWardLat': 10.0423,
    'newWardLon': 105.61,
    'newWardAreaKm2': 59.09}],
  'thanhphocantho_huyenvinhthanh_xathanhquoi': [{'newWardKey': 'xagiahoa',
    'isDefaultNewWard': False,
    'newWardLat': 9.43175,
    'newWardLon': 105.8,
    'newWardAreaKm2': 77.06},
   {'newWardKey': 'xathanhquoi',
    'isDefaultNewWard': True,
    'newWardLat': 10.2075,
    'newWardLon': 105.348,
    'newWardAreaKm2': 103.86}],
  'thanhphocantho_quanbinhthuy_phuongbuihuunghia': [{'newWardKey': 'phuongbinhthuy',
    'isDefaultNewWard': True,
    'newWardLat': 10.0712,
    'newWardLon': 105.752,
    'newWardAreaKm2': 15.17},
   {'newWardKey': 'phuongcaikhe',
    'isDefaultNewWard': False,
    'newWardLat': 10.0517,
    'newWardLon':

In [14]:
converter_data = {
    'DICT_PROVINCE': DICT_PROVINCE,
    'DICT_PROVINCE_WARD_NO_DIVIDED': DICT_PROVINCE_WARD_NO_DIVIDED,
    'DICT_PROVINCE_WARD_DIVIDED': DICT_PROVINCE_WARD_DIVIDED
}

In [15]:
with open(BASE_DIR / 'vietnamadminunits/data/converter_2025.json', 'w') as f:
    json.dump(converter_data, f)

In [16]:
df[df['isDividedWard'] & df['wardKeyDuplicated']]
# May quá không có =))

Unnamed: 0,provinceCode,isMergedProvince,districtCode,districtType,districtShortDuplicated,wardCode,wardType,wardShortDuplicated,isMergedWard,isDividedWard,...,isDefaultNewWard,newProvinceShort,provinceKey,districtKey,wardKey,wardKeyDuplicated,newProvinceKey,newWardKey,newWardKeyDuplicated,provinceDistrictWardKey
