# JSON Parser
---

## 1. environment setting
---

### 1. import package

In [3]:
import json
import os
import glob
import folium
import csv
import geopandas as gpd
import pandas as pd
import matplotlib.pyplot as plt

from shapely import wkt
from shapely import affinity
from tqdm.notebook import tqdm

ModuleNotFoundError: No module named 'geopandas'

### 2. set path

In [None]:
# Resolve the home directory with os.path.expanduser: os.getenv('HOME') returns
# None when HOME is not set, which would make os.path.join raise TypeError.
BASE_PATH = os.path.join(os.path.expanduser('~'), 'workspace/Hackerton') # project folder
DATA_PATH = os.path.join(BASE_PATH, 'RSI_OP_NIA_AIHUB') # data folder
SAMPLE_GEOJSON_PATH = os.path.join(DATA_PATH, 'buildings/training/label/BLD00001_PS3_K3A_NIA0276.json') # sample geojson file path
MERGED_PATH = os.path.join(DATA_PATH, 'merged') # summary dir folder

# Label folders for each (object kind, split) combination.
BUILDINGS_TRAIN_LABEL_PATH = os.path.join(DATA_PATH, 'buildings/training/label')
BUILDINGS_VAL_LABEL_PATH = os.path.join(DATA_PATH, 'buildings/validation/label')
ROADS_TRAIN_LABEL_PATH = os.path.join(DATA_PATH, 'roads/training/label')
ROADS_VAL_LABEL_PATH = os.path.join(DATA_PATH, 'roads/validation/label')

In [None]:
# Display the resolved sample file path as the cell output.
SAMPLE_GEOJSON_PATH

### 3. load json file

In [None]:
# Read the sample label file. The encoding is given explicitly so the result
# does not depend on the platform's default text encoding (JSON is UTF-8).
with open(SAMPLE_GEOJSON_PATH, "r", encoding="utf-8") as st_json:
    st_python = json.load(st_json)

# Show which Python type the top-level JSON value was decoded into.
type(st_python)

## 2. Parse Json to class
---

### 1. define class :: type1

![CD_geojson.png](./assets/CD_geojson.png)

In [None]:
class Property():
    """Wrapper around the ``properties`` dict of one GeoJSON feature.

    The three ``*_imcoords`` attributes fall back to the string ``"EMPTY"``
    when their key is absent; every other attribute falls back to ``None``.
    """

    def __init__(self, raw):
        # Coordinate strings: a missing key is recorded as the "EMPTY" marker.
        self.object_imcoords = raw.get('object_imcoords', "EMPTY")
        self.building_imcoords = raw.get('building_imcoords', "EMPTY")
        self.road_imcoords = raw.get('road_imcoords', "EMPTY")
        # Descriptive metadata: a missing key becomes None.
        self.image_id = raw.get('image_id')
        self.ingest_time = raw.get('ingest_time')
        self.type_id = raw.get('type_id')
        self.type_name = raw.get('type_name')

    def is_building(self):
        """Return True when only building coordinates are present."""
        return self.building_imcoords != "EMPTY" and self.road_imcoords == "EMPTY"

    def is_road(self):
        """Return True when only road coordinates are present."""
        return self.road_imcoords != "EMPTY" and self.building_imcoords == "EMPTY"

    def __str__(self):
        """Return a multi-line, brace-wrapped dump of every attribute."""
        names = (
            "object_imcoords",
            "building_imcoords",
            "road_imcoords",
            "image_id",
            "ingest_time",
            "type_id",
            "type_name",
        )
        rows = ",\n".join(f"    {name} : {getattr(self, name)}" for name in names)
        # The trailing newline plus eight spaces is part of the established output.
        return "{\n" + rows + "\n}\n        "

    def __repr__(self):
        """Return a one-line summary; anything that is not a building is labelled road."""
        if self.is_building():
            return f'Property : data type(building), coordinates({self.building_imcoords})'
        return f'Property : data type(road), coordinates({self.road_imcoords})'

In [None]:
class Geometry():
    """Wrapper around the ``geometry`` dict of one GeoJSON feature."""

    def __init__(self, raw):
        # Missing keys fall back to an empty coordinate list / empty type name.
        self.coordinates = raw.get('coordinates', [])
        self.type = raw.get('type', "")

    def __str__(self):
        """Return a multi-line, brace-wrapped dump of type and coordinates."""
        rows = (
            f"    type : {self.type},",
            f"    coordinates : {self.coordinates}",
        )
        # The trailing newline plus eight spaces is part of the established output.
        return "{\n" + "\n".join(rows) + "\n}\n        "

    def __repr__(self):
        """Return a one-line summary of type and coordinates."""
        return 'Geometry : data type({}), coordinates({})'.format(self.type, self.coordinates)

In [None]:
class Feature():
    """One GeoJSON feature: a ``Geometry`` paired with a ``Property``.

    Either part is ``None`` when the corresponding key is missing from the
    raw dict.
    """

    def __init__(self, raw):
        self.type = "Feature"
        self.geometry = Geometry(raw['geometry']) if 'geometry' in raw else None
        self.properties = Property(raw['properties']) if 'properties' in raw else None

    def __str__(self):
        """Return a one-line summary that also works for incomplete features."""
        # __init__ allows geometry/properties to be None, so guard the
        # attribute access instead of raising AttributeError while printing.
        geometry_type = self.geometry.type if self.geometry is not None else None
        if self.properties is None:
            data_type = "unknown"
        else:
            data_type = "building" if self.properties.is_building() else "road"
        return 'Feature : Geometry({}), data type({})'.format(geometry_type, data_type)

    def __repr__(self):
        return self.__str__()

In [None]:
class FeatureCollection():
    """Top-level GeoJSON object holding a list of ``Feature`` instances."""

    def __init__(self, raw):
        self.type = "FeatureCollection"
        # Only an actual list under 'features' is parsed; anything else
        # (missing key, wrong type) leaves the collection empty.
        entries = raw['features'] if 'features' in raw else None
        if type(entries) is list:
            self.features = [Feature(entry) for entry in entries]
        else:
            self.features = []

    def __str__(self):
        """Return a summary containing the number of parsed features."""
        count = len(self.features)
        return f'FeatureCollection with {count} features'

    def __repr__(self):
        return str(self)

In [None]:
# Parse the sample GeoJSON dict into the FeatureCollection class hierarchy.
fc = FeatureCollection(st_python)

In [None]:
# Display the collection (rendered through FeatureCollection.__repr__).
fc

In [None]:
# Display the first parsed feature of the sample file.
fc.features[0]

In [None]:
# Display the geometry of the first feature.
fc.features[0].geometry

In [None]:
# Display the properties of the first feature (one-line __repr__ form).
fc.features[0].properties

In [None]:
# Print the same properties through __str__ to get the multi-line dump.
print(fc.features[0].properties)

In [None]:
# Collect the image_id of every feature in the sample file and display the
# distinct values.
img = [feature.properties.image_id for feature in fc.features]
set(img)

### 2. define class :: type 2

![image](./assets/CD_object.png)

In [None]:
class Object():
    """Flat record for one labelled object, built from a raw dict.

    Reads the keys ``image_id``, ``imcoords_pix``, ``imcoords_geo`` and
    ``type``; a missing key yields ``None``.
    """

    def __init__(self, raw):
        # image_id previously received the value of 'imcoords_geo'; it now
        # holds the raw 'image_id' as its name says.
        self.image_id = raw.get('image_id')
        # Kept for backward compatibility: area_id has always mirrored 'image_id'.
        self.area_id = self.image_id
        self.imcoords = raw.get('imcoords_pix')
        # The 'imcoords_geo' value now has an attribute of its own.
        self.imcoords_geo = raw.get('imcoords_geo')
        self.type = raw.get('type')

In [None]:
class Site():
    """A site footprint: an identifier together with its coordinates."""

    def __init__(self, coodinates, area_id):
        # The parameter spelling 'coodinates' is kept so keyword callers keep
        # working; the value is stored under the correctly spelled attribute
        # that __str__ reads, and under the legacy spelling as an alias.
        self.coordinates = coodinates
        self.coodinates = coodinates
        # The constructor used to read an undefined name `image_id` here.
        self.image_id = area_id

    def __str__(self):
        return f'Site({self.image_id}) : {self.coordinates}'

### 3. parsing

#### A. load all json file

In [None]:
def load_json(file_paths):
    """Load several JSON files into a dict keyed by file base name.

    Args:
        file_paths: iterable of paths to JSON files.

    Returns:
        dict mapping ``os.path.basename(path)`` to the decoded JSON value.
        When two paths share a base name, the first one wins and a warning
        is printed for each later one.
    """
    res = {}
    for file_path in tqdm(file_paths):
        name = os.path.basename(file_path)
        if name in res:
            # The warning used to reference a misspelled variable and raised
            # NameError instead of reporting the duplicate.
            print(f'WARNING : file name {name} is duplicated')
            continue
        # Explicit encoding so decoding does not depend on the platform default.
        with open(file_path, "r", encoding="utf-8") as f:
            res[name] = json.load(f)
    return res

In [None]:
# Glob patterns matching every JSON label file in each label folder.
buildings_train_label_pattern = f"{BUILDINGS_TRAIN_LABEL_PATH}/*.json"
buildings_val_label_pattern = f"{BUILDINGS_VAL_LABEL_PATH}/*.json"
roads_train_label_pattern = f"{ROADS_TRAIN_LABEL_PATH}/*.json"
roads_val_label_pattern = f"{ROADS_VAL_LABEL_PATH}/*.json"

In [None]:
# Expand each pattern into the list of matching label file paths.
buildings_train_label_paths = glob.glob(buildings_train_label_pattern)
buildings_val_label_paths = glob.glob(buildings_val_label_pattern)
roads_train_label_paths = glob.glob(roads_train_label_pattern)
roads_val_label_paths = glob.glob(roads_val_label_pattern)

# Report how many files were found for each split.
print(f'len(buildings_train_label_paths) : {len(buildings_train_label_paths)}')
print(f'len(buildings_val_label_paths) : {len(buildings_val_label_paths)}')
print(f'len(roads_train_label_paths) : {len(roads_train_label_paths)}')
print(f'len(roads_val_label_paths) : {len(roads_val_label_paths)}')

In [None]:
# Decode every label file; each result is a dict keyed by file base name.
buildings_train_label_json = load_json(buildings_train_label_paths)
buildings_val_label_json = load_json(buildings_val_label_paths)
roads_train_label_json = load_json(roads_train_label_paths)
roads_val_label_json = load_json(roads_val_label_paths)

In [None]:
# Report how many files were loaded per split, for comparison with the path counts.
print(f'len(buildings_train_label_json) : {len(buildings_train_label_json)}')
print(f'len(buildings_val_label_json) : {len(buildings_val_label_json)}')
print(f'len(roads_train_label_json) : {len(roads_train_label_json)}')
print(f'len(roads_val_label_json) : {len(roads_val_label_json)}')

In [None]:
# Gather the file names of all four splits and compare the total count with
# the number of distinct names.
all_keys = [
    *buildings_train_label_json,
    *buildings_val_label_json,
    *roads_train_label_json,
    *roads_val_label_json,
]
print(len(all_keys), len(set(all_keys)))
# Author's observation: duplicates exist among the keys.

#### B. parse json as class

In [None]:
# Concatenate the decoded JSON values of all four splits, in split order.
all_json = [
    *buildings_train_label_json.values(),
    *buildings_val_label_json.values(),
    *roads_train_label_json.values(),
    *roads_val_label_json.values(),
]
# Parse every decoded file into a FeatureCollection, with a progress bar.
feature_collections = list(map(FeatureCollection, tqdm(all_json)))

In [None]:
# Display the first parsed collection.
feature_collections[0]

In [None]:
# Display the first feature of the first collection.
feature_collections[0].features[0]

In [None]:
# Display that feature's geometry.
feature_collections[0].features[0].geometry

In [None]:
# Print that feature's properties in the multi-line __str__ form.
print(feature_collections[0].features[0].properties)

## 3. Abstract Area
---

- wkt
    - 정의 : https://en.wikipedia.org/wiki/Well-known_text_representation_of_geometry
    - 데이터 확인 : https://clydedacruz.github.io/openstreetmap-wkt-playground/

### 1. Geometry 추출

In [None]:
# Flatten all collections into [image_id, Geometry] pairs, where image_id is
# cut at its first '.' (everything from the first dot on is dropped).
geometreis = [ [f.properties.image_id.split('.')[0], f.geometry] for fc in feature_collections for f in fc.features ]
# Display the number of pairs plus the id and coordinates of the first one.
len(geometreis), geometreis[0][0], geometreis[0][1].coordinates

### 2. list 형태로 되어 있는 좌표정보(coordinates)를 wkt로 변환

In [None]:
def coor_to_polygon(coordinates):
    """Convert a list of points into a closed WKT polygon string.

    [[x1, y1, z1],
     [x2, y2, z2],
     [x3, y3, z3],
     [x4, y4, z4]]
    to
    POLYGON((x1 y1, x2 y2, x3 y3, x4 y4, x1 y1))

    The last component of every point is dropped and the first point is
    repeated at the end to close the ring.

    Args:
        coordinates: non-empty list of points; it is not modified.

    Returns:
        The WKT text of the polygon.
    """
    # Close the ring on a copy: appending to the argument itself would grow
    # the caller's list by one point on every call.
    ring = list(coordinates) + [coordinates[0]]
    points = ", ".join(" ".join(map(str, coor[:-1])) for coor in ring)
    return f"POLYGON(({points}))"

In [None]:
# Build (image_id, WKT polygon) tuples for every extracted geometry.
areas = [(image_id, coor_to_polygon(geometry.coordinates)) for image_id, geometry in tqdm(geometreis)]
# Drop exact duplicate tuples; the order of the resulting list is arbitrary.
areas = list(set(areas))
print(len(areas), areas[0])

### 3. csv 저장

In [None]:
def save_csv(path, data, fields):
    """Write rows to a UTF-8 CSV file with a header line.

    Args:
        path: destination file path (overwritten if it exists).
        data: non-empty list of row sequences.
        fields: column names; must match the width of the first row.

    Raises:
        TypeError: if ``data`` is not a list.
        ValueError: if ``data`` is empty or its first row does not have one
            value per field.
    """
    # The checks used `assert "<message>"`, which is always true and so never
    # rejected anything; raise explicitly instead.
    if not isinstance(data, list):
        raise TypeError('[save_csv] The type of input parameter data must be list.')
    if not data:
        raise ValueError('[save_csv] The parameter data should not empty.')
    if len(data[0]) != len(fields):
        raise ValueError('[save_csv] The number of data column and fields must be same.')

    # newline="" lets the csv module control line endings itself.
    with open(path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(fields)
        writer.writerows(data)

In [None]:
MERGED_PATH = os.path.join(DATA_PATH, 'merged')
# Make sure the output folder exists; opening a file inside a missing folder
# would fail with FileNotFoundError.
os.makedirs(MERGED_PATH, exist_ok=True)
# Persist the distinct (image_id, WKT footprint) pairs.
save_csv(os.path.join(MERGED_PATH, 'Site.csv'), areas, ['image_id', 'coordinates'])

### 4. 저장된 파일 확인

In [None]:
# Read the site file back to check what was written.
area_path = os.path.join(MERGED_PATH, 'Site.csv')
df = pd.read_csv(area_path, encoding='utf-8')
df.head(5)

In [None]:
# Display the first footprint; at this point it is still the WKT text read from the CSV.
df['coordinates'][0]

In [None]:
# Summary statistics over all columns, including the non-numeric ones.
df.describe(include='all')

In [None]:
# Parse the WKT text into geometry objects and use that column as the
# GeoDataFrame's active geometry.
df['coordinates'] = gpd.GeoSeries.from_wkt(df['coordinates'])
gdf = gpd.GeoDataFrame(df, geometry='coordinates')
gdf.head(5)

In [None]:
# Display the exterior ring of the first footprint as a list of points.
list(df['coordinates'][0].exterior.coords)

In [None]:
# Display the Python type of a parsed geometry value.
type(gdf['coordinates'][0])

#### A. **AREA 분포 확인**

In [None]:



# Draw every site footprint on an interactive map.
m = folium.Map(zoom_start=8, tiles='CartoDB positron')

for _, r in gdf.iterrows():
    # Simplify the footprint (tolerance 0.001) before serializing it to GeoJSON.
    sim_geo = gpd.GeoSeries(r['coordinates']).simplify(tolerance=0.001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    # Attach the image id as the layer's popup.
    folium.Popup(r['image_id']).add_to(geo_j)
    geo_j.add_to(m)
# Display the map as the cell output.
m

## 4. Abstract Building Object
---

### 1. Buildings 추출

In [None]:
def imcoords_to_wkt(coordinates):
    """Convert an "x1,y1,x2,y2,..." string into a closed WKT polygon string.

    Every value is parsed as a float and rounded to the nearest integer; the
    first point is repeated at the end to close the ring.

    Args:
        coordinates: comma-separated pixel coordinates.

    Returns:
        The WKT text, or None when the input is empty, is not a string,
        holds fewer than three points, or holds an odd number of values.

    Raises:
        ValueError: if a value cannot be parsed as a number.
    """
    if not coordinates or not isinstance(coordinates, str):
        return None  # nothing to convert

    values = coordinates.split(',')
    # A polygon needs at least three points, and every x needs a matching y;
    # an odd count used to raise IndexError instead of being rejected.
    if len(values) < 6 or len(values) % 2:
        return None

    points = [
        f'{round(float(x))} {round(float(y))}'
        for x, y in zip(values[0::2], values[1::2])
    ]
    points.append(points[0])
    return 'POLYGON((' + ", ".join(points) + '))'

In [None]:
def get_building_attributes(feature):
    """Build the flat record for a building feature.

    ``image_id`` is cut at its first '.', ``coordinates_pix`` is the WKT form
    of the building pixel coordinates (None when they cannot be converted)
    and ``coordinates_geo`` starts out as an empty string.
    """
    props = feature.properties
    image_stem = props.image_id.split('.')[0]
    pixel_wkt = imcoords_to_wkt(props.building_imcoords)
    return dict(
        image_id=image_stem,
        type="building",
        coordinates_pix=pixel_wkt,
        coordinates_geo="",
    )

In [None]:
# Build a record for every building feature and keep only those whose pixel
# coordinates could be converted to WKT.
buildings = [
    record
    for record in (
        get_building_attributes(feat)
        for coll in feature_collections
        for feat in coll.features
        if feat.properties.is_building()
    )
    if record['coordinates_pix']
]
print(len(buildings))

### 2. add site
- coordinates_geo 계산을 위하여 site의 좌표 정보를 불러온다.

In [None]:
# Load the site table from the file that area_path currently points to.
site_df = pd.read_csv(area_path, encoding='utf-8')
site_df.head(5)

In [None]:
# Turn the list of building records into a DataFrame (one row per building).
building_df = pd.DataFrame(buildings)
building_df.head(5)

In [None]:
# Attach the site footprint to every building by matching on image_id;
# buildings without a matching site are dropped (inner join).
merged = building_df.merge(
    site_df,
    how="inner",
    left_on="image_id",
    right_on="image_id",
)
merged.head(5)

In [None]:
# Compare the row counts before and after the join.
len(building_df), len(merged)

In [None]:
# Parse both WKT columns into geometry objects; the pixel-space polygon
# becomes the GeoDataFrame's active geometry.
merged['coordinates'] = gpd.GeoSeries.from_wkt(merged['coordinates'])
merged['coordinates_pix'] = gpd.GeoSeries.from_wkt(merged['coordinates_pix'])
merged_gdf = gpd.GeoDataFrame(merged, geometry='coordinates_pix')
merged_gdf.head(5)

### 3. coordinates_geo 계산

In [None]:
def pix_coor_to_real_coor(site_geo, src, pix_size):
    """Map a pixel-space geometry onto the bounding box of its site.

    The geometry is first mirrored vertically (y -> pix_size - y), then
    scaled so that ``pix_size`` pixels span the bounding box of ``site_geo``
    and finally shifted to the box's lower-left corner.

    Args:
        site_geo: geometry whose ``bounds`` define the target rectangle.
        src: geometry expressed in pixel coordinates.
        pix_size: side length of the square image in pixels (fixed at 1024*1024).

    Returns:
        The transformed geometry.
    """
    min_x, min_y, max_x, max_y = site_geo.bounds

    # Size of one pixel along each axis, in site coordinate units.
    x_per_pixel = abs((max_x - min_x) / pix_size)
    y_per_pixel = abs((max_y - min_y) / pix_size)

    # Matrix layout is [a, b, d, e, xoff, yoff]: flip y, then scale and shift.
    flip_matrix = [1, 0, 0, -1, 0, pix_size]
    place_matrix = [x_per_pixel, 0, 0, y_per_pixel, min_x, min_y]

    flipped = affinity.affine_transform(src, flip_matrix)
    return affinity.affine_transform(flipped, place_matrix)

In [None]:
# Display the exterior ring of the site footprint attached to the first row.
list(merged_gdf.loc[0]['coordinates'].exterior.coords)

In [None]:
# Compute coordinates_geo row by row from the site footprint and the pixel
# polygon, passing 1024 as pix_size.
merged_gdf['coordinates_geo'] = merged_gdf.apply(lambda x: pix_coor_to_real_coor(x.coordinates, x.coordinates_pix, 1024), axis='columns')
merged_gdf.head(5)

#### A. 계산된 데이터 확인 1 - matplot

In [None]:
# Plot the computed polygons of the first 15 rows as a quick visual check.
tmp = gpd.GeoSeries(merged_gdf[:15]['coordinates_geo'])
tmp.plot(figsize=(12,12))
plt.show()

#### B. 계산된 데이터 확인 2 - folium

In [None]:
# Draw the computed polygons of the first 50 rows on an interactive map
# centred on a hard-coded location.
m = folium.Map(zoom_start=17, location=[39.7213631342, -104.8953397997], tiles='CartoDB positron')

for i, r in merged_gdf[:50].iterrows():    
    # Simplify slightly (tolerance 0.00001) before serializing to GeoJSON.
    sim_geo = gpd.GeoSeries(r['coordinates_geo']).simplify(tolerance=0.00001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    # The popup shows the row index so a shape can be traced back to merged_gdf.
    folium.Popup(f'idx({i})').add_to(geo_j)
    geo_j.add_to(m)
# Display the map as the cell output.
m

### 4. csv 저장

In [None]:
# Export the buildings without the site footprint column. The separator is
# passed by keyword: positional arguments after the path are deprecated in
# pandas 2.x and become keyword-only in pandas 3.0.
merged_gdf.drop('coordinates', axis=1).to_csv(os.path.join(MERGED_PATH, 'Buildings.csv'), sep=',', index=False)

### 5. 저장된 파일 확인

In [None]:
# Read the building file back to check what was written. A dedicated variable
# is used so that area_path keeps pointing at Site.csv; otherwise re-running
# the earlier site-loading cell would read the buildings file instead.
building_path = os.path.join(MERGED_PATH, 'Buildings.csv')
df = pd.read_csv(building_path, encoding='utf-8')
df.head(5)

In [None]:
# Parse both WKT columns of the reloaded table; the pixel-space polygon is
# used as the active geometry.
df['coordinates_pix'] = gpd.GeoSeries.from_wkt(df['coordinates_pix'])
df['coordinates_geo'] = gpd.GeoSeries.from_wkt(df['coordinates_geo'])
gdf = gpd.GeoDataFrame(df, geometry='coordinates_pix')
gdf.tail(5)

## 5. Abstract Road Object
---

### 1. Road 추출

In [None]:
def get_road_attributes(feature):
    """Build the flat record for a road feature.

    ``image_id`` is cut at its first '.', ``coordinates_pix`` is the WKT form
    of the road pixel coordinates (None when they cannot be converted) and
    ``coordinates_geo`` starts out as an empty string.
    """
    props = feature.properties
    image_stem = props.image_id.split('.')[0]
    pixel_wkt = imcoords_to_wkt(props.road_imcoords)
    return dict(
        image_id=image_stem,
        type="road",
        coordinates_pix=pixel_wkt,
        coordinates_geo="",
    )

In [None]:
# Build a record for every road feature and keep only those whose pixel
# coordinates could be converted to WKT.
roads = [
    record
    for record in (
        get_road_attributes(feat)
        for coll in feature_collections
        for feat in coll.features
        if feat.properties.is_road()
    )
    if record['coordinates_pix']
]
print(len(roads))

### 2. add site

In [None]:
# Load the site table again from Site.csv for the road join.
area_path = os.path.join(MERGED_PATH, 'Site.csv')
site_df = pd.read_csv(area_path, encoding='utf-8')
site_df.head(5)

In [None]:
# Turn the list of road records into a DataFrame (one row per road).
road_df = pd.DataFrame(roads)
road_df.head(5)

In [None]:
# Attach the site footprint to every road by matching on image_id;
# roads without a matching site are dropped (inner join).
merged = road_df.merge(
    site_df,
    how="inner",
    left_on="image_id",
    right_on="image_id",
)
merged.head(5)

In [None]:
# Compare the row counts before and after the join.
len(road_df), len(merged)

In [None]:
# Parse both WKT columns into geometry objects; the pixel-space polygon
# becomes the GeoDataFrame's active geometry.
merged['coordinates'] = gpd.GeoSeries.from_wkt(merged['coordinates'])
merged['coordinates_pix'] = gpd.GeoSeries.from_wkt(merged['coordinates_pix'])
merged_gdf = gpd.GeoDataFrame(merged, geometry='coordinates_pix')
merged_gdf.head(5)

### 3. coordinates_geo 계산

In [None]:
# Compute coordinates_geo row by row from the site footprint and the pixel
# polygon, passing 1024 as pix_size.
merged_gdf['coordinates_geo'] = merged_gdf.apply(lambda x: pix_coor_to_real_coor(x.coordinates, x.coordinates_pix, 1024), axis='columns')
merged_gdf.head(5)

#### A. 계산된 데이터 확인 1 - matplot

In [None]:
# Plot the computed polygons of the first 15 rows as a quick visual check.
tmp = gpd.GeoSeries(merged_gdf[:15]['coordinates_geo'])
tmp.plot(figsize=(12,12))
plt.show()

#### B. 계산된 데이터 확인 2 - folium

In [None]:
# Draw the computed polygons of the first 50 rows on an interactive map
# centred on a hard-coded location.
m = folium.Map(zoom_start=17, location=[33.9516227282, -118.2718820291], tiles='cartodb positron')

for i, r in merged_gdf[:50].iterrows():    
    # Simplify slightly (tolerance 0.00001) before serializing to GeoJSON.
    sim_geo = gpd.GeoSeries(r['coordinates_geo']).simplify(tolerance=0.00001)
    geo_j = sim_geo.to_json()
    geo_j = folium.GeoJson(data=geo_j,
                           style_function=lambda x: {'fillColor': 'orange'})
    # The popup shows the row index so a shape can be traced back to merged_gdf.
    folium.Popup(f'idx({i})').add_to(geo_j)
    geo_j.add_to(m)
# Display the map as the cell output.
m

### 4. csv 저장

In [None]:
# Export the roads without the site footprint column. The separator is
# passed by keyword: positional arguments after the path are deprecated in
# pandas 2.x and become keyword-only in pandas 3.0.
merged_gdf.drop('coordinates', axis=1).to_csv(os.path.join(MERGED_PATH, 'Roads.csv'), sep=',', index=False)

### 5. 저장된 파일 확인

In [None]:
# Read the road file back to check what was written.
road_path = os.path.join(MERGED_PATH, 'Roads.csv')
df = pd.read_csv(road_path, encoding='utf-8')
df.head(5)

In [None]:
# Parse both WKT columns of the reloaded table; the pixel-space polygon is
# used as the active geometry.
df['coordinates_pix'] = gpd.GeoSeries.from_wkt(df['coordinates_pix'])
df['coordinates_geo'] = gpd.GeoSeries.from_wkt(df['coordinates_geo'])
gdf = gpd.GeoDataFrame(df, geometry='coordinates_pix')
gdf.tail(5)

## ref
- [An example of polygon plotting with folium](https://geopandas.org/gallery/polygon_plotting_with_folium.html)