In [None]:
import pandas as pd
import geopandas as gpd
import leafmap
import zipfile
import os
import time

In [None]:
# Table of US states; the per-state processing loop reads its 'id' and
# 'name' columns for every row.
url = 'https://raw.githubusercontent.com/giswqs/data/main/us/us_states.csv'
states = pd.read_csv(url)
# Preview the first rows to confirm the download worked.
states.head()

In [None]:
# Default state code; the per-state loop reassigns state_id on every iteration.
state_id = 'WY'
# Root folder holding the USA_Structures archives and the
# MS_USBuildingFootprints files that are read as inputs.
in_dir = '/media/hdd/Team-Drives/Buildings/'
# Root folder for extracted geodatabases and the generated shapefiles.
out_dir = '/media/hdd/Data/Buildings'


In [None]:
# Create the output root if needed. exist_ok=True avoids the
# check-then-create race of the exists()/makedirs() pair and makes the
# cell safe to re-run.
os.makedirs(out_dir, exist_ok=True)

In [None]:
# Collect every zipped USA Structures deliverable under the input folder.
structures_dir = os.path.join(in_dir, 'USA_Structures')
ornl_files = leafmap.find_files(structures_dir, ext='.zip')

In [None]:
# Build one height-attributed building-footprint shapefile per state.
#
# For every row of `states`:
#   1. extract the state's USA Structures archive and load its geodatabase;
#   2. join MS building footprints with the centroids of the ORNL structures
#      and average HEIGHT / SQMETERS per footprint;
#   3. for footprints that received no height, retry the join against the
#      ORNL polygons themselves;
#   4. write the combined result to <out_dir>/Height/<state_id>.shp.
#
# States whose output already exists are skipped, so the loop can be resumed.
# A state is also skipped (with a message) when its archive or geodatabase
# cannot be found or read; a missing MS footprint file raises.
for index, row in states.iterrows():
    state_id = row['id']
    state_name = row['name']
    out_shp = os.path.join(out_dir, f'Height/{state_id}.shp')

    if os.path.exists(out_shp):
        continue

    start_time = time.time()
    print(f'Processing {state_id} ...')

    # Find this state's archive. Using a None default (instead of relying on
    # a loop variable) prevents silently reusing the previous state's archive
    # -- or hitting a NameError on the first iteration -- when nothing matches.
    state = next(
        (path for path in ornl_files if path.endswith(state_id + '.zip')),
        None,
    )
    if state is None:
        print(f'No USA Structures archive found for {state_id}; skipping.')
        continue

    ornl_dir = os.path.join(out_dir, 'USA_Structures')
    print(f'Extracting USA Structures {state_id} ...')
    with zipfile.ZipFile(state, 'r') as zip_ref:
        zip_ref.extractall(ornl_dir)

    # Default layout: <archive name>/<state_id>_Structures.gdb
    basename = os.path.basename(state).replace('.zip', '')
    db_name = f'{state_id}_Structures.gdb'
    db_path = os.path.join(ornl_dir, basename, db_name)
    if not os.path.exists(db_path):
        # Some deliverables use a different folder / geodatabase name.
        # NOTE(review): the LA/TX folder suffix has no underscore before
        # 'v2' while the geodatabase name does -- kept as in the original;
        # confirm against the extracted archives.
        if state_id == 'CA':
            basename = basename + '_OCC'
        elif state_id in ('LA', 'TX'):
            basename = basename + 'v2_OCC'
            db_name = f'{state_id}_Structures_v2_OCC.gdb'
        else:
            basename = basename + '_OCC'
            db_name = f'{state_id}_Structures_OCC.gdb'
        db_path = os.path.join(ornl_dir, basename, db_name)

        # Actually verify the fallback path; the original wrapped plain
        # assignments in try/except, which could never detect a missing file.
        if not os.path.exists(db_path):
            print(f'File {db_path} does not exist; skipping {state_id}.')
            continue

    print(f'Loading USA Structures {state_id} ...')
    try:
        # The layer inside the geodatabase is named after the .gdb itself.
        ornl_gdf = gpd.read_file(db_path, layer=db_name.replace('.gdb', ''))
    except Exception as e:
        # Best effort: report the read failure and move on to the next state.
        print(e)
        continue

    print('Creating centroids ...')
    centroid_points = ornl_gdf.centroid
    centroid_gdf = gpd.GeoDataFrame(ornl_gdf.drop('geometry', axis=1), geometry=centroid_points)
    centroids = centroid_gdf[['geometry', 'HEIGHT', 'SQMETERS', 'IMAGE_DATE']]

    # os.path.join works whether or not in_dir ends with a path separator.
    geojson = os.path.join(
        in_dir,
        'MS_USBuildingFootprints',
        f"{state_name.replace(' ', '')}.geojson.zip",
    )
    if not os.path.exists(geojson):
        raise Exception(f'File {geojson} does not exists.')

    print(f'Loading MS Buildings {state_id} ...')
    ms_gdf = gpd.read_file(geojson)
    # Keep the footprint row id as a column so it can be used as the
    # group/merge key after the spatial joins.
    ms_gdf['index'] = ms_gdf.index

    print('Joining MS Buildings with ORNL centroids ...')
    gdf_joined = gpd.sjoin(ms_gdf, centroids, how='left', predicate='intersects')
    gdf_joined['index'] = gdf_joined.index

    # A footprint may contain several centroids: average their attributes.
    # Columns are selected with a list; tuple selection on a groupby was
    # removed in pandas 2.0.
    gdf_joined_mean = gdf_joined.groupby('index')[['HEIGHT', 'SQMETERS']].mean()
    gdf_pts_height = ms_gdf.merge(gdf_joined_mean, on='index', how='left')

    gdf_pts_height_notnull = gdf_pts_height[gdf_pts_height['HEIGHT'].notnull()]
    gdf_pts_height_null = gdf_pts_height[gdf_pts_height['HEIGHT'].isnull()].drop(['HEIGHT', 'SQMETERS'], axis=1)

    # Second pass: footprints with no height from the centroid join are
    # matched against the full ORNL polygons instead.
    print('Joining MS Buildings with ORNL polygons ...')
    gdf_poly_height_sj = gpd.sjoin(gdf_pts_height_null, ornl_gdf, how='left', predicate='intersects')
    gdf_poly_height_mean = gdf_poly_height_sj.groupby('index')[['HEIGHT', 'SQMETERS']].mean()

    gdf_poly_height = gdf_pts_height_null.merge(gdf_poly_height_mean, on='index', how='left')
    gdf_height = pd.concat([gdf_pts_height_notnull, gdf_poly_height])

    # Restore the original footprint order before writing.
    gdf_height = gdf_height.sort_values('index')

    print(f'Writing {state_id}.shp ...')
    os.makedirs(os.path.dirname(out_shp), exist_ok=True)
    gdf_height.drop('index', axis=1).to_file(out_shp)

    end_time = time.time()

    elapsed_minutes = (end_time - start_time) / 60
    print(f'Finished {state_id} in {elapsed_minutes:.2f} minutes.')

In [None]:
# Single-state walkthrough (Wyoming) of the same steps as the loop above.
# NOTE: this rebinds `state` and `out_dir`, which the per-state loop also
# uses; re-running the loop after this cell would use these values.
state = '/media/hdd/Team-Drives/Buildings/USA_Structures/Deliverable20211203WY.zip'
# Folder the archive is extracted into.
out_dir = os.path.expanduser('~/Downloads')
# Folder for the centroid-point shapefile.
out_pts_dir = os.path.expanduser('~/Downloads/USA_Structures_PTS')
# Folder for the building-height shapefiles.
out_building_height = os.path.expanduser('~/Downloads/Building_Height')

In [None]:
# Create the centroid output folder; exist_ok=True makes this race-free
# and safe to re-run.
os.makedirs(out_pts_dir, exist_ok=True)

In [None]:
# Create the building-height output folder; exist_ok=True makes this
# race-free and safe to re-run.
os.makedirs(out_building_height, exist_ok=True)

In [None]:
# Unpack the state's deliverable archive into the working folder.
with zipfile.ZipFile(state, mode='r') as archive:
    archive.extractall(path=out_dir)

In [None]:
# Derive the geodatabase location from the archive name: the last two
# characters of the archive's base name are used as the state abbreviation.
basename = os.path.basename(state).replace('.zip', '')
state_abbr = basename[-2:]
db_name = state_abbr + '_Structures.gdb'
db_path = os.path.join(out_dir, basename, db_name)

# Fail early when the expected geodatabase is not where we computed it.
gdb_found = os.path.exists(db_path)
if not gdb_found:
    raise Exception(f'File {db_path} does not exists.')

In [None]:
# The layer to read is named after the geodatabase without its extension.
layer_name = db_name.replace('.gdb', '')
ornl_gdf = gpd.read_file(db_path, layer=layer_name)

In [None]:
# Preview the first rows of the loaded structures layer.
ornl_gdf.head()

In [None]:
# Convert every datetime column to strings.
# is_datetime64_any_dtype covers all datetime64 columns (any time zone or
# resolution), not only the two dtype names the original compared against,
# and avoids comparing dtype objects with strings.
# NOTE(review): presumably needed so the frame can be mapped / written to
# shapefile without datetime serialisation problems -- confirm.
for col in ornl_gdf.columns:
    if pd.api.types.is_datetime64_any_dtype(ornl_gdf[col]):
        ornl_gdf[col] = ornl_gdf[col].astype(str)

In [None]:
# Interactive map preview of the first 50 features only.
ornl_gdf.head(n=50).explore()

In [None]:
# One centroid point per structure geometry.
centroid_points = ornl_gdf.centroid

# Pair the original attribute columns (polygon geometry removed) with the
# centroid points as the new geometry.
attributes_only = ornl_gdf.drop(columns='geometry')
centroid_gdf = gpd.GeoDataFrame(attributes_only, geometry=centroid_points)

In [None]:
# Keep only the geometry and the attributes used downstream.
centroid_columns = ['geometry', 'HEIGHT', 'SQMETERS', 'IMAGE_DATE']
centroids = centroid_gdf[centroid_columns]
centroids.head()

In [None]:
# Number of centroid points (one per structure row).
len(centroids)

In [None]:
# Name the centroid shapefile after the geodatabase, with a _PTS suffix.
shp_name = f"{db_name.replace('.gdb', '')}_PTS.shp"
shp_path = os.path.join(out_pts_dir, shp_name)

In [None]:
# Write the centroid points to the shapefile path built from db_name.
centroids.to_file(shp_path)

In [None]:
# Zipped GeoJSON of MS building footprints for Wyoming.
geojson = '/media/hdd/Team-Drives/Buildings/MS_USBuildingFootprints/Wyoming.geojson.zip'

In [None]:
# Load the building footprints straight from the zipped GeoJSON.
ms_gdf = gpd.read_file(geojson)

In [None]:
# Keep the row index as a regular column: the later groupby and merge
# steps use 'index' as their key.
ms_gdf['index'] = ms_gdf.index
ms_gdf.head()

In [None]:
# Number of building footprints loaded.
len(ms_gdf)

In [None]:
# Left spatial join: every footprint is kept and paired with each centroid
# it intersects.
gdf_joined = gpd.sjoin(
    left_df=ms_gdf,
    right_df=centroids,
    how='left',
    predicate='intersects',
)
# Refresh the key column from the join result's index.
gdf_joined['index'] = gdf_joined.index
len(gdf_joined)

In [None]:
# Preview the joined footprint/centroid rows.
gdf_joined.head()

In [None]:
# Average HEIGHT and SQMETERS over all centroids matched to each footprint.
# The columns are selected with a list: selecting several groupby columns
# with a bare tuple (['HEIGHT', 'SQMETERS'] without the inner brackets) was
# deprecated and raises in pandas >= 2.0.
gdf_joined_mean = gdf_joined.groupby('index')[['HEIGHT', 'SQMETERS']].mean()
gdf_joined_mean.head()

In [None]:
# Attach the per-footprint averages back onto the footprints; footprints
# without a match keep missing HEIGHT / SQMETERS values.
gdf_pts_height = ms_gdf.merge(
    right=gdf_joined_mean,
    how='left',
    on='index',
)
gdf_pts_height.head()

In [None]:
# Row count after the merge, for comparison with len(ms_gdf).
len(gdf_pts_height)

In [None]:
# Intermediate export of the centroid-based heights (helper key column
# removed). The final export at the end of the notebook writes to this
# same path.
pts_height_shp = os.path.join(out_building_height, f'{state_abbr}.shp')
gdf_pts_height.drop(columns='index').to_file(pts_height_shp)

In [None]:
# Footprints that received a height from the centroid join.
has_height = gdf_pts_height['HEIGHT'].notnull()
gdf_pts_height_notnull = gdf_pts_height[has_height]
len(gdf_pts_height_notnull)

In [None]:
# Preview footprints that already have a height.
gdf_pts_height_notnull.head()

In [None]:
# Footprints still lacking a height; their empty attribute columns are
# dropped so the polygon join can supply fresh ones.
missing_height = gdf_pts_height['HEIGHT'].isnull()
gdf_pts_height_null = gdf_pts_height[missing_height].drop(columns=['HEIGHT', 'SQMETERS'])
len(gdf_pts_height_null)

In [None]:
# Preview footprints that still need a height.
gdf_pts_height_null.head()

In [None]:
# Second pass: match the remaining footprints against the structure
# polygons themselves rather than their centroids.
gdf_poly_height_sj = gpd.sjoin(
    left_df=gdf_pts_height_null,
    right_df=ornl_gdf,
    how='left',
    predicate='intersects',
)

In [None]:
# Average HEIGHT and SQMETERS over all polygons matched to each footprint.
# The columns are selected with a list: tuple-style selection of several
# groupby columns raises in pandas >= 2.0.
gdf_poly_height_mean = gdf_poly_height_sj.groupby('index')[['HEIGHT', 'SQMETERS']].mean()
gdf_poly_height_mean

In [None]:
# Attach the polygon-based averages to the footprints that lacked a height.
gdf_poly_height = gdf_pts_height_null.merge(
    right=gdf_poly_height_mean,
    how='left',
    on='index',
)
len(gdf_poly_height)

In [None]:
# Preview the polygon-based results.
gdf_poly_height.head()

In [None]:

# Stack the centroid-based and polygon-based results into one frame.
height_parts = [gdf_pts_height_notnull, gdf_poly_height]
gdf_height = pd.concat(height_parts)
len(gdf_height)


In [None]:
# Put the rows back into the original footprint order.
gdf_height = gdf_height.sort_values(by='index')

In [None]:
# Preview the combined, ordered result.
gdf_height.head()

In [None]:
# Final export: combined heights without the helper key column, written to
# <out_building_height>/<state_abbr>.shp.
height_shp = os.path.join(out_building_height, f'{state_abbr}.shp')
gdf_height.drop(columns='index').to_file(height_shp)